Merge tag 'for-v4.3-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux...
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Sep 2015 19:25:42 +0000 (12:25 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Sep 2015 19:25:42 +0000 (12:25 -0700)
Pull power supply fixes from Sebastian Reichel:
 "twl4030-charger fixes"

* tag 'for-v4.3-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply:
  twl4030_charger: fix another compile error
  Revert "twl4030_charger: correctly handle -EPROBE_DEFER from devm_usb_get_phy_by_node"

901 files changed:
CREDITS
Documentation/cgroups/blkio-controller.txt
Documentation/cgroups/unified-hierarchy.txt
Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/soc/qcom/qcom,smd.txt
Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt [new file with mode: 0644]
Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt [new file with mode: 0644]
Documentation/gpio/board.txt
Documentation/gpio/consumer.txt
Documentation/hwmon/nct6775
Documentation/static-keys.txt
Documentation/thermal/sysfs-api.txt
Documentation/virtual/kvm/api.txt
Documentation/vm/00-INDEX
Documentation/vm/idle_page_tracking.txt [new file with mode: 0644]
Documentation/vm/pagemap.txt
Documentation/vm/zswap.txt
Documentation/watchdog/src/watchdog-test.c
MAINTAINERS
Makefile
arch/Kconfig
arch/alpha/include/asm/dma-mapping.h
arch/alpha/kernel/pci-noop.c
arch/alpha/kernel/pci_iommu.c
arch/arc/plat-axs10x/axs10x.c
arch/arm/Kconfig
arch/arm/Makefile
arch/arm/boot/compressed/decompress.c
arch/arm/boot/dts/exynos3250-monk.dts
arch/arm/boot/dts/exynos3250-rinato.dts
arch/arm/boot/dts/exynos3250.dtsi
arch/arm/boot/dts/exynos4.dtsi
arch/arm/boot/dts/exynos4212.dtsi
arch/arm/boot/dts/exynos4412-odroid-common.dtsi
arch/arm/boot/dts/exynos4412-odroidu3.dts
arch/arm/boot/dts/exynos4412-origen.dts
arch/arm/boot/dts/exynos4412-trats2.dts
arch/arm/boot/dts/exynos4412.dtsi
arch/arm/boot/dts/exynos5250-arndale.dts
arch/arm/boot/dts/exynos5250-smdk5250.dts
arch/arm/boot/dts/exynos5250-snow.dts
arch/arm/boot/dts/exynos5250-spring.dts
arch/arm/boot/dts/exynos5250.dtsi
arch/arm/boot/dts/exynos5422-cpus.dtsi [new file with mode: 0644]
arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
arch/arm/boot/dts/qcom-apq8064-cm-qs600.dts
arch/arm/boot/dts/qcom-apq8064-ifc6410.dts
arch/arm/boot/dts/qcom-apq8074-dragonboard.dts
arch/arm/boot/dts/qcom-apq8084-ifc6540.dts
arch/arm/boot/dts/qcom-apq8084-mtp.dts
arch/arm/boot/dts/qcom-apq8084.dtsi
arch/arm/boot/dts/qcom-ipq8064-ap148.dts
arch/arm/boot/dts/qcom-ipq8064.dtsi
arch/arm/boot/dts/qcom-msm8660-surf.dts
arch/arm/boot/dts/qcom-msm8660.dtsi
arch/arm/boot/dts/qcom-msm8960-cdp.dts
arch/arm/boot/dts/qcom-msm8960.dtsi
arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts
arch/arm/boot/dts/qcom-msm8974.dtsi
arch/arm/configs/exynos_defconfig
arch/arm/configs/multi_v7_defconfig
arch/arm/include/asm/assembler.h
arch/arm/include/asm/bug.h
arch/arm/include/asm/dma-mapping.h
arch/arm/include/asm/domain.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/thread_info.h
arch/arm/include/asm/xen/page.h
arch/arm/kernel/process.c
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts.S
arch/arm/kvm/reset.c
arch/arm/mach-exynos/Kconfig
arch/arm/mach-exynos/exynos.c
arch/arm/mm/dma-mapping.c
arch/arm/nwfpe/entry.S
arch/arm/xen/enlighten.c
arch/arm/xen/hypercall.S
arch/arm/xen/mm.c
arch/arm64/Kconfig
arch/arm64/Makefile
arch/arm64/include/asm/dma-mapping.h
arch/arm64/include/asm/hw_breakpoint.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/pgtable.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/head.S
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kernel/module.c
arch/arm64/kernel/signal32.c
arch/arm64/kvm/Makefile
arch/arm64/kvm/debug.c [new file with mode: 0644]
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp.S
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/arm64/kvm/sys_regs_generic_v8.c
arch/arm64/kvm/trace.h
arch/arm64/mm/dma-mapping.c
arch/cris/Kconfig
arch/cris/arch-v10/kernel/entry.S
arch/cris/arch-v10/lib/dmacopy.c [deleted file]
arch/cris/arch-v10/lib/old_checksum.c [deleted file]
arch/cris/arch-v32/drivers/Kconfig
arch/cris/arch-v32/drivers/axisflashmap.c
arch/cris/arch-v32/drivers/mach-a3/gpio.c
arch/cris/arch-v32/drivers/mach-fs/gpio.c
arch/cris/arch-v32/kernel/entry.S
arch/cris/arch-v32/kernel/process.c
arch/cris/arch-v32/kernel/signal.c
arch/cris/arch-v32/mach-fs/pinmux.c
arch/cris/configs/artpec_3_defconfig
arch/cris/configs/etraxfs_defconfig
arch/cris/include/arch-v10/arch/elf.h [deleted file]
arch/cris/include/arch-v10/arch/ptrace.h [deleted file]
arch/cris/include/arch-v32/arch/bug.h
arch/cris/include/arch-v32/arch/elf.h [deleted file]
arch/cris/include/arch-v32/arch/irqflags.h
arch/cris/include/arch-v32/arch/ptrace.h [deleted file]
arch/cris/include/asm/Kbuild
arch/cris/include/asm/elf.h [deleted file]
arch/cris/include/asm/mmu_context.h
arch/cris/include/asm/stacktrace.h [new file with mode: 0644]
arch/cris/include/asm/types.h [deleted file]
arch/cris/include/asm/unistd.h
arch/cris/include/uapi/asm/Kbuild
arch/cris/include/uapi/asm/auxvec.h [deleted file]
arch/cris/include/uapi/asm/bitsperlong.h [deleted file]
arch/cris/include/uapi/asm/elf.h [new file with mode: 0644]
arch/cris/include/uapi/asm/elf_v10.h [new file with mode: 0644]
arch/cris/include/uapi/asm/elf_v32.h [new file with mode: 0644]
arch/cris/include/uapi/asm/errno.h [deleted file]
arch/cris/include/uapi/asm/fcntl.h [deleted file]
arch/cris/include/uapi/asm/ioctl.h [deleted file]
arch/cris/include/uapi/asm/ipcbuf.h [deleted file]
arch/cris/include/uapi/asm/kvm_para.h [deleted file]
arch/cris/include/uapi/asm/mman.h [deleted file]
arch/cris/include/uapi/asm/msgbuf.h [deleted file]
arch/cris/include/uapi/asm/poll.h [deleted file]
arch/cris/include/uapi/asm/ptrace.h
arch/cris/include/uapi/asm/ptrace_v10.h [new file with mode: 0644]
arch/cris/include/uapi/asm/ptrace_v32.h [new file with mode: 0644]
arch/cris/include/uapi/asm/resource.h [deleted file]
arch/cris/include/uapi/asm/sembuf.h [deleted file]
arch/cris/include/uapi/asm/shmbuf.h [deleted file]
arch/cris/include/uapi/asm/siginfo.h [deleted file]
arch/cris/include/uapi/asm/socket.h [deleted file]
arch/cris/include/uapi/asm/sockios.h [deleted file]
arch/cris/include/uapi/asm/statfs.h [deleted file]
arch/cris/include/uapi/asm/types.h [deleted file]
arch/cris/include/uapi/asm/unistd.h
arch/cris/kernel/Makefile
arch/cris/kernel/irq.c
arch/cris/kernel/stacktrace.c [new file with mode: 0644]
arch/h8300/boot/compressed/misc.c
arch/h8300/include/asm/dma-mapping.h
arch/hexagon/include/asm/dma-mapping.h
arch/hexagon/include/uapi/asm/signal.h
arch/hexagon/kernel/dma.c
arch/hexagon/kernel/time.c
arch/ia64/Kconfig
arch/ia64/include/asm/dma-mapping.h
arch/ia64/include/asm/unistd.h
arch/ia64/include/uapi/asm/unistd.h
arch/ia64/kernel/entry.S
arch/m32r/boot/compressed/misc.c
arch/m68k/Kconfig
arch/microblaze/include/asm/dma-mapping.h
arch/microblaze/include/uapi/asm/elf.h
arch/mips/Kconfig
arch/mips/boot/compressed/decompress.c
arch/mips/cavium-octeon/dma-octeon.c
arch/mips/include/asm/dma-mapping.h
arch/mips/loongson64/common/dma-swiotlb.c
arch/mips/mm/dma-default.c
arch/mips/netlogic/common/nlm-dma.c
arch/openrisc/include/asm/dma-mapping.h
arch/powerpc/Kconfig
arch/powerpc/include/asm/dma-mapping.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_paired_singles.c
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/powerpc.c
arch/s390/Kconfig
arch/s390/boot/compressed/misc.c
arch/s390/include/asm/dma-mapping.h
arch/s390/pci/pci_dma.c
arch/sh/Kconfig
arch/sh/boot/compressed/misc.c
arch/sh/include/asm/dma-mapping.h
arch/sparc/include/asm/dma-mapping.h
arch/tile/Kconfig
arch/tile/include/asm/dma-mapping.h
arch/unicore32/boot/compressed/misc.c
arch/unicore32/include/asm/dma-mapping.h
arch/x86/Kconfig
arch/x86/boot/compressed/misc.c
arch/x86/boot/header.S
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/entry/vsyscall/vsyscall_64.c
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/qspinlock.h
arch/x86/include/asm/xen/page.h
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/perf_event_intel_bts.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/ldt.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/tsc.c
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/mm/mpx.c
arch/x86/mm/srat.c
arch/x86/platform/efi/efi.c
arch/x86/platform/uv/uv_nmi.c
arch/x86/xen/mmu.c
arch/x86/xen/smp.c
arch/xtensa/include/asm/dma-mapping.h
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-throttle.c
block/blk.h
block/cfq-iosched.c
crypto/testmgr.c
drivers/acpi/thermal.c
drivers/android/binder.c
drivers/base/power/domain.c
drivers/base/power/opp.c
drivers/base/property.c
drivers/block/rbd.c
drivers/block/xen-blkfront.c
drivers/clk/h8300/clk-h8s2678.c
drivers/clk/hisilicon/Kconfig
drivers/clk/hisilicon/Makefile
drivers/clk/rockchip/clk-rk3188.c
drivers/clk/samsung/clk-exynos4.c
drivers/cpufreq/Kconfig.arm
drivers/cpufreq/Makefile
drivers/cpufreq/cpufreq-dt.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/exynos-cpufreq.c [deleted file]
drivers/cpufreq/exynos-cpufreq.h [deleted file]
drivers/cpufreq/exynos4x12-cpufreq.c [deleted file]
drivers/cpufreq/exynos5250-cpufreq.c [deleted file]
drivers/cpufreq/intel_pstate.c
drivers/cpuidle/coupled.c
drivers/cpuidle/cpuidle.h
drivers/cpuidle/driver.c
drivers/crypto/Kconfig
drivers/crypto/qat/qat_common/adf_transport_debug.c
drivers/crypto/sunxi-ss/sun4i-ss-cipher.c
drivers/edac/sb_edac.c
drivers/firmware/efi/Kconfig
drivers/gpio/Kconfig
drivers/gpio/gpio-mxc.c
drivers/gpio/gpio-mxs.c
drivers/gpio/gpio-omap.c
drivers/gpio/gpio-sx150x.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/drm_atomic.c
drivers/gpu/drm/drm_dp_helper.c
drivers/gpu/drm/exynos/Kconfig
drivers/gpu/drm/exynos/exynos_drm_g2d.c
drivers/gpu/drm/exynos/exynos_drm_gem.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem_execbuffer.c
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/intel_csr.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp_mst.c
drivers/gpu/drm/i915/intel_dsi.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c
drivers/gpu/drm/nouveau/nvkm/subdev/clk/gt215.c
drivers/gpu/drm/qxl/qxl_display.c
drivers/gpu/drm/qxl/qxl_drv.h
drivers/gpu/drm/vgem/vgem_drv.c
drivers/hsi/clients/cmt_speech.c
drivers/hwmon/Kconfig
drivers/hwmon/lm75.c
drivers/hwmon/nct6775.c
drivers/hwmon/ntc_thermistor.c
drivers/hwmon/tmp102.c
drivers/infiniband/Kconfig
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/ehca/Kconfig [deleted file]
drivers/infiniband/hw/ehca/Makefile [deleted file]
drivers/infiniband/hw/ehca/ehca_av.c [deleted file]
drivers/infiniband/hw/ehca/ehca_classes.h [deleted file]
drivers/infiniband/hw/ehca/ehca_classes_pSeries.h [deleted file]
drivers/infiniband/hw/ehca/ehca_cq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_eq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_hca.c [deleted file]
drivers/infiniband/hw/ehca/ehca_irq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_irq.h [deleted file]
drivers/infiniband/hw/ehca/ehca_iverbs.h [deleted file]
drivers/infiniband/hw/ehca/ehca_main.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mcast.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mrmw.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mrmw.h [deleted file]
drivers/infiniband/hw/ehca/ehca_pd.c [deleted file]
drivers/infiniband/hw/ehca/ehca_qes.h [deleted file]
drivers/infiniband/hw/ehca/ehca_qp.c [deleted file]
drivers/infiniband/hw/ehca/ehca_reqs.c [deleted file]
drivers/infiniband/hw/ehca/ehca_sqp.c [deleted file]
drivers/infiniband/hw/ehca/ehca_tools.h [deleted file]
drivers/infiniband/hw/ehca/ehca_uverbs.c [deleted file]
drivers/infiniband/hw/ehca/hcp_if.c [deleted file]
drivers/infiniband/hw/ehca/hcp_if.h [deleted file]
drivers/infiniband/hw/ehca/hcp_phyp.c [deleted file]
drivers/infiniband/hw/ehca/hcp_phyp.h [deleted file]
drivers/infiniband/hw/ehca/hipz_fns.h [deleted file]
drivers/infiniband/hw/ehca/hipz_fns_core.h [deleted file]
drivers/infiniband/hw/ehca/hipz_hw.h [deleted file]
drivers/infiniband/hw/ehca/ipz_pt_fn.c [deleted file]
drivers/infiniband/hw/ehca/ipz_pt_fn.h [deleted file]
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_mmap.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/input/evdev.c
drivers/input/keyboard/imx_keypad.c
drivers/input/misc/ab8500-ponkey.c
drivers/input/misc/pwm-beeper.c
drivers/input/misc/regulator-haptic.c
drivers/input/misc/sparcspkr.c
drivers/input/misc/xen-kbdfront.c
drivers/input/mouse/elan_i2c_core.c
drivers/input/serio/i8042.c
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/input/touchscreen/colibri-vf50-ts.c [new file with mode: 0644]
drivers/input/touchscreen/cyttsp4_i2c.c
drivers/input/touchscreen/cyttsp_i2c.c
drivers/input/touchscreen/elants_i2c.c
drivers/input/touchscreen/imx6ul_tsc.c [new file with mode: 0644]
drivers/input/touchscreen/sun4i-ts.c
drivers/iommu/omap-iommu-debug.c
drivers/md/Kconfig
drivers/md/dm-mpath.c
drivers/media/platform/omap/Kconfig
drivers/media/platform/omap/omap_vout.c
drivers/media/v4l2-core/Kconfig
drivers/media/v4l2-core/videobuf2-core.c
drivers/media/v4l2-core/videobuf2-dma-contig.c
drivers/media/v4l2-core/videobuf2-dma-sg.c
drivers/media/v4l2-core/videobuf2-memops.c
drivers/media/v4l2-core/videobuf2-vmalloc.c
drivers/misc/genwqe/card_dev.c
drivers/misc/mei/wd.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/bcm_sf2.h
drivers/net/dsa/mv88e6171.c
drivers/net/ethernet/altera/altera_tse_main.c
drivers/net/ethernet/cavium/liquidio/lio_main.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/sge.c
drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
drivers/net/ethernet/davicom/dm9000.c
drivers/net/ethernet/emulex/benet/be_cmds.c
drivers/net/ethernet/ethoc.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/jme.c
drivers/net/ethernet/marvell/mv643xx_eth.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c
drivers/net/ethernet/realtek/r8169.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/synopsys/Kconfig
drivers/net/ntb_netdev.c
drivers/net/phy/Kconfig
drivers/net/phy/Makefile
drivers/net/phy/fixed_phy.c
drivers/net/phy/microchip.c [new file with mode: 0644]
drivers/net/usb/lan78xx.c
drivers/net/usb/r8152.c
drivers/net/usb/usbnet.c
drivers/net/vxlan.c
drivers/net/wan/sbni.c
drivers/net/wireless/ath/wil6210/debugfs.c
drivers/net/xen-netback/common.h
drivers/net/xen-netback/netback.c
drivers/net/xen-netfront.c
drivers/ntb/hw/intel/ntb_hw_intel.c
drivers/ntb/hw/intel/ntb_hw_intel.h
drivers/ntb/ntb_transport.c
drivers/parisc/ccio-dma.c
drivers/parisc/sba_iommu.c
drivers/pci/pci-driver.c
drivers/pinctrl/core.c
drivers/pinctrl/pinctrl-digicolor.c
drivers/pinctrl/pinmux.c
drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c
drivers/pinctrl/samsung/pinctrl-s3c24xx.c
drivers/platform/x86/acerhdf.c
drivers/platform/x86/intel_mid_thermal.c
drivers/power/charger-manager.c
drivers/power/power_supply_core.c
drivers/reset/reset-ath79.c
drivers/s390/crypto/zcrypt_api.c
drivers/scsi/Makefile
drivers/scsi/aic94xx/aic94xx_sds.c
drivers/scsi/bfa/bfa_ioc.c
drivers/scsi/device_handler/Kconfig
drivers/scsi/device_handler/Makefile
drivers/scsi/device_handler/scsi_dh.c [deleted file]
drivers/scsi/device_handler/scsi_dh_alua.c
drivers/scsi/device_handler/scsi_dh_emc.c
drivers/scsi/device_handler/scsi_dh_hp_sw.c
drivers/scsi/device_handler/scsi_dh_rdac.c
drivers/scsi/fcoe/fcoe.c
drivers/scsi/ipr.c
drivers/scsi/libiscsi.c
drivers/scsi/lpfc/lpfc_mbox.c
drivers/scsi/mpt2sas/mpt2sas_base.c
drivers/scsi/mpt2sas/mpt2sas_base.h
drivers/scsi/mpt2sas/mpt2sas_ctl.c
drivers/scsi/mpt2sas/mpt2sas_scsih.c
drivers/scsi/mpt2sas/mpt2sas_transport.c
drivers/scsi/mpt3sas/mpi/mpi2.h
drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h
drivers/scsi/mpt3sas/mpi/mpi2_ioc.h
drivers/scsi/mpt3sas/mpi/mpi2_tool.h
drivers/scsi/mpt3sas/mpt3sas_base.c
drivers/scsi/mpt3sas/mpt3sas_base.h
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/mpt3sas/mpt3sas_transport.c
drivers/scsi/pm8001/pm8001_hwi.c
drivers/scsi/pm8001/pm80xx_hwi.c
drivers/scsi/qla2xxx/Kconfig
drivers/scsi/qla2xxx/tcm_qla2xxx.c
drivers/scsi/scsi_common.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsi_dh.c [new file with mode: 0644]
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_priv.h
drivers/scsi/scsi_sysfs.c
drivers/scsi/scsi_transport_sas.c
drivers/scsi/xen-scsifront.c
drivers/soc/qcom/smd.c
drivers/soc/qcom/smem.c
drivers/staging/android/ion/ion.c
drivers/staging/board/armadillo800eva.c
drivers/staging/board/board.c
drivers/staging/comedi/comedi_fops.c
drivers/staging/rdma/Kconfig
drivers/staging/rdma/Makefile
drivers/staging/rdma/ehca/Kconfig [new file with mode: 0644]
drivers/staging/rdma/ehca/Makefile [new file with mode: 0644]
drivers/staging/rdma/ehca/TODO [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_av.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_classes.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_classes_pSeries.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_cq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_eq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_hca.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_irq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_irq.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_iverbs.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_main.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mcast.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mrmw.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mrmw.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_pd.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_qes.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_qp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_reqs.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_sqp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_tools.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_uverbs.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_if.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_if.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_phyp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_phyp.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_fns.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_fns_core.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_hw.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ipz_pt_fn.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ipz_pt_fn.h [new file with mode: 0644]
drivers/target/iscsi/iscsi_target.c
drivers/target/iscsi/iscsi_target.h
drivers/target/iscsi/iscsi_target_configfs.c
drivers/target/iscsi/iscsi_target_device.c
drivers/target/iscsi/iscsi_target_login.c
drivers/target/iscsi/iscsi_target_login.h
drivers/target/iscsi/iscsi_target_nego.c
drivers/target/iscsi/iscsi_target_stat.c
drivers/target/iscsi/iscsi_target_tmr.c
drivers/target/iscsi/iscsi_target_tpg.c
drivers/target/iscsi/iscsi_target_tpg.h
drivers/target/iscsi/iscsi_target_util.c
drivers/target/loopback/tcm_loop.c
drivers/target/target_core_device.c
drivers/target/target_core_fabric_configfs.c
drivers/target/target_core_hba.c
drivers/target/target_core_sbc.c
drivers/target/target_core_spc.c
drivers/target/target_core_tpg.c
drivers/target/target_core_transport.c
drivers/target/target_core_user.c
drivers/target/target_core_xcopy.c
drivers/target/tcm_fc/tfc_cmd.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/armada_thermal.c
drivers/thermal/db8500_thermal.c
drivers/thermal/dove_thermal.c
drivers/thermal/fair_share.c
drivers/thermal/gov_bang_bang.c
drivers/thermal/hisi_thermal.c
drivers/thermal/imx_thermal.c
drivers/thermal/int340x_thermal/int3400_thermal.c
drivers/thermal/int340x_thermal/int340x_thermal_zone.c
drivers/thermal/int340x_thermal/int340x_thermal_zone.h
drivers/thermal/int340x_thermal/processor_thermal_device.c
drivers/thermal/intel_pch_thermal.c [new file with mode: 0644]
drivers/thermal/intel_powerclamp.c
drivers/thermal/intel_quark_dts_thermal.c
drivers/thermal/intel_soc_dts_iosf.c
drivers/thermal/kirkwood_thermal.c
drivers/thermal/of-thermal.c
drivers/thermal/power_allocator.c
drivers/thermal/qcom-spmi-temp-alarm.c
drivers/thermal/rcar_thermal.c
drivers/thermal/rockchip_thermal.c
drivers/thermal/samsung/exynos_tmu.c
drivers/thermal/spear_thermal.c
drivers/thermal/st/st_thermal.c
drivers/thermal/step_wise.c
drivers/thermal/tegra_soctherm.c
drivers/thermal/thermal_core.c
drivers/thermal/thermal_hwmon.c
drivers/thermal/ti-soc-thermal/ti-thermal-common.c
drivers/thermal/x86_pkg_temp_thermal.c
drivers/tty/hvc/hvc_xen.c
drivers/video/fbdev/omap2/omapfb/omapfb-main.c
drivers/video/fbdev/xen-fbfront.c
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/at91rm9200_wdt.c
drivers/watchdog/at91sam9_wdt.c
drivers/watchdog/at91sam9_wdt.h
drivers/watchdog/bcm2835_wdt.c
drivers/watchdog/bcm47xx_wdt.c
drivers/watchdog/bcm_kona_wdt.c
drivers/watchdog/booke_wdt.c
drivers/watchdog/coh901327_wdt.c
drivers/watchdog/da9052_wdt.c
drivers/watchdog/da9055_wdt.c
drivers/watchdog/da9062_wdt.c
drivers/watchdog/da9063_wdt.c
drivers/watchdog/davinci_wdt.c
drivers/watchdog/digicolor_wdt.c
drivers/watchdog/ep93xx_wdt.c
drivers/watchdog/gpio_wdt.c
drivers/watchdog/ie6xx_wdt.c
drivers/watchdog/imgpdc_wdt.c
drivers/watchdog/intel-mid_wdt.c
drivers/watchdog/jz4740_wdt.c
drivers/watchdog/lpc18xx_wdt.c [new file with mode: 0644]
drivers/watchdog/mena21_wdt.c
drivers/watchdog/menf21bmc_wdt.c
drivers/watchdog/mpc8xxx_wdt.c
drivers/watchdog/mtk_wdt.c
drivers/watchdog/nv_tco.c
drivers/watchdog/omap_wdt.c
drivers/watchdog/orion_wdt.c
drivers/watchdog/pnx4008_wdt.c
drivers/watchdog/qcom-wdt.c
drivers/watchdog/retu_wdt.c
drivers/watchdog/rt2880_wdt.c
drivers/watchdog/s3c2410_wdt.c
drivers/watchdog/sama5d4_wdt.c [new file with mode: 0644]
drivers/watchdog/shwdt.c
drivers/watchdog/sirfsoc_wdt.c
drivers/watchdog/sp805_wdt.c
drivers/watchdog/st_lpc_wdt.c
drivers/watchdog/stmp3xxx_rtc_wdt.c
drivers/watchdog/sunxi_wdt.c
drivers/watchdog/tegra_wdt.c
drivers/watchdog/twl4030_wdt.c
drivers/watchdog/txx9wdt.c
drivers/watchdog/ux500_wdt.c
drivers/watchdog/via_wdt.c
drivers/watchdog/wm831x_wdt.c
drivers/watchdog/wm8350_wdt.c
drivers/xen/balloon.c
drivers/xen/biomerge.c
drivers/xen/events/events_base.c
drivers/xen/events/events_fifo.c
drivers/xen/gntalloc.c
drivers/xen/gntdev.c
drivers/xen/manage.c
drivers/xen/privcmd.c
drivers/xen/swiotlb-xen.c
drivers/xen/tmem.c
drivers/xen/xenbus/xenbus_client.c
drivers/xen/xenbus/xenbus_dev_backend.c
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xlate_mmu.c
fs/affs/super.c
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/inode.c
fs/btrfs/scrub.c
fs/btrfs/tree-defrag.c
fs/btrfs/volumes.c
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/snap.c
fs/ceph/super.c
fs/cifs/cifsfs.c
fs/cifs/file.c
fs/cifs/ioctl.c
fs/coda/upcall.c
fs/coredump.c
fs/fs-writeback.c
fs/gfs2/glock.c
fs/gfs2/glops.c
fs/gfs2/incore.h
fs/gfs2/lock_dlm.c
fs/gfs2/lops.c
fs/gfs2/meta_io.c
fs/gfs2/meta_io.h
fs/gfs2/quota.c
fs/gfs2/rgrp.c
fs/gfs2/trace_gfs2.h
fs/gfs2/trans.c
fs/hfs/bnode.c
fs/hfs/brec.c
fs/hfsplus/bnode.c
fs/kernfs/dir.c
fs/namei.c
fs/nsfs.c
fs/ocfs2/dlm/dlmrecovery.c
fs/proc/base.c
fs/proc/generic.c
fs/proc/page.c
fs/proc/task_mmu.c
fs/seq_file.c
include/asm-generic/dma-mapping-common.h
include/asm-generic/qspinlock.h
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blkdev.h
include/linux/ceph/libceph.h
include/linux/ceph/messenger.h
include/linux/ceph/msgr.h
include/linux/cgroup_subsys.h
include/linux/clockchips.h
include/linux/irqchip/arm-gic-v3.h
include/linux/irqchip/arm-gic.h
include/linux/jump_label.h
include/linux/kernfs.h
include/linux/kexec.h
include/linux/kmod.h
include/linux/kvm_host.h
include/linux/memcontrol.h
include/linux/microchipphy.h [new file with mode: 0644]
include/linux/mm.h
include/linux/mmu_notifier.h
include/linux/netlink.h
include/linux/ntb.h
include/linux/ntb_transport.h
include/linux/page-flags.h
include/linux/page_ext.h
include/linux/page_idle.h [new file with mode: 0644]
include/linux/pm_opp.h
include/linux/poison.h
include/linux/printk.h
include/linux/reset.h
include/linux/seq_file.h
include/linux/string_helpers.h
include/linux/syscalls.h
include/linux/thermal.h
include/linux/tick.h
include/linux/zpool.h
include/media/videobuf2-memops.h
include/net/fib_rules.h
include/net/mac80211.h
include/net/netfilter/br_netfilter.h
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_tables.h
include/scsi/scsi_common.h
include/scsi/scsi_device.h
include/scsi/scsi_dh.h
include/scsi/scsi_eh.h
include/target/iscsi/iscsi_target_core.h
include/target/iscsi/iscsi_target_stat.h
include/target/iscsi/iscsi_transport.h
include/target/target_core_backend.h
include/target/target_core_base.h
include/target/target_core_fabric.h
include/trace/events/kvm.h
include/trace/events/thermal_power_allocator.h
include/trace/events/writeback.h
include/uapi/asm-generic/unistd.h
include/uapi/drm/i915_drm.h
include/uapi/linux/Kbuild
include/uapi/linux/elf-em.h
include/uapi/linux/if_ether.h
include/uapi/linux/kernel-page-flags.h
include/uapi/linux/kvm.h
include/uapi/linux/membarrier.h [new file with mode: 0644]
include/uapi/linux/target_core_user.h
include/uapi/xen/privcmd.h
include/xen/page.h
include/xen/xen-ops.h
init/Kconfig
init/initramfs.c
init/main.c
ipc/msgutil.c
ipc/shm.c
kernel/Makefile
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cpu_pm.c
kernel/cred.c
kernel/events/core.c
kernel/extable.c
kernel/kexec.c
kernel/kexec_core.c [new file with mode: 0644]
kernel/kexec_file.c [new file with mode: 0644]
kernel/kexec_internal.h [new file with mode: 0644]
kernel/kmod.c
kernel/ksysfs.c
kernel/locking/qspinlock.c
kernel/membarrier.c [new file with mode: 0644]
kernel/printk/printk.c
kernel/reboot.c
kernel/sched/core.c
kernel/sys_ni.c
kernel/sysctl.c
kernel/time/clockevents.c
kernel/time/tick-common.c
kernel/time/tick-sched.c
kernel/time/timekeeping.c
kernel/time/timer_list.c
lib/bitmap.c
lib/decompress_bunzip2.c
lib/decompress_inflate.c
lib/decompress_unlz4.c
lib/decompress_unlzma.c
lib/decompress_unlzo.c
lib/decompress_unxz.c
lib/kstrtox.c
lib/string_helpers.c
lib/test-kstrtox.c
lib/test_kasan.c
lib/zlib_deflate/deftree.c
lib/zlib_deflate/defutil.h
mm/Kconfig
mm/Makefile
mm/backing-dev.c
mm/debug.c
mm/early_ioremap.c
mm/frame_vector.c [new file with mode: 0644]
mm/huge_memory.c
mm/hwpoison-inject.c
mm/kmemleak.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/migrate.c
mm/mmap.c
mm/mmu_notifier.c
mm/nommu.c
mm/page-writeback.c
mm/page_ext.c
mm/page_idle.c [new file with mode: 0644]
mm/rmap.c
mm/swap.c
mm/zpool.c
mm/zswap.c
net/bridge/br_netlink.c
net/bridge/br_vlan.c
net/ceph/ceph_common.c
net/ceph/crypto.c
net/ceph/messenger.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/core/fib_rules.c
net/decnet/dn_rules.c
net/ipv4/fib_rules.c
net/ipv4/ipmr.c
net/ipv4/tcp_cubic.c
net/ipv4/tcp_output.c
net/ipv6/fib6_rules.c
net/ipv6/ip6mr.c
net/ipv6/route.c
net/mac80211/cfg.c
net/mac80211/mlme.c
net/mac80211/rate.c
net/mac80211/tdls.c
net/mac80211/vht.c
net/netfilter/ipset/ip_set_hash_gen.h
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipset/ip_set_hash_netportnet.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_synproxy_core.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_queue_core.c
net/netfilter/xt_CT.c
net/netlink/af_netlink.c
net/openvswitch/Kconfig
net/openvswitch/Makefile
net/openvswitch/conntrack.h
net/rds/connection.c
net/rfkill/core.c
net/sctp/protocol.c
net/switchdev/switchdev.c
net/tipc/bcast.c
net/wireless/reg.c
scripts/checkpatch.pl
scripts/extract-cert.c
scripts/sign-file.c
security/device_cgroup.c
security/selinux/selinuxfs.c
sound/pci/hda/patch_realtek.c
sound/sparc/amd7930.c
sound/usb/stream.c
tools/perf/builtin-script.c
tools/perf/tests/sw-clock.c
tools/perf/tests/task-exit.c
tools/perf/ui/browsers/hists.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/header.c
tools/perf/util/intel-bts.c
tools/perf/util/intel-pt.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.y
tools/testing/selftests/Makefile
tools/testing/selftests/membarrier/.gitignore [new file with mode: 0644]
tools/testing/selftests/membarrier/Makefile [new file with mode: 0644]
tools/testing/selftests/membarrier/membarrier_test.c [new file with mode: 0644]
tools/testing/selftests/x86/entry_from_vm86.c
virt/kvm/arm/arch_timer.c
virt/kvm/arm/vgic-v2.c
virt/kvm/arm/vgic-v3.c
virt/kvm/arm/vgic.c
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

diff --git a/CREDITS b/CREDITS
index bcb8efa..8207cc6 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2992,6 +2992,10 @@ S: 2200 Mission College Blvd
 S: Santa Clara, CA 95052
 S: USA
 
+N: Anil Ravindranath
+E: anil_ravindranath@pmc-sierra.com
+D: PMC-Sierra MaxRAID driver
+
 N: Eric S. Raymond
 E: esr@thyrsus.com
 W: http://www.tuxedo.org/~esr/
index 68b6a6a..12686be 100644 (file)
@@ -201,7 +201,7 @@ Proportional weight policy files
          specifies the number of bytes.
 
 - blkio.io_serviced
-       - Number of IOs completed to/from the disk by the group. These
+       - Number of IOs (bio) issued to the disk by the group. These
          are further divided by the type of operation - read or write, sync
          or async. First two fields specify the major and minor number of the
          device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
       subjected to both the constraints.
 
 - blkio.throttle.io_serviced
-       - Number of IOs (bio) completed to/from the disk by the group (as
-         seen by throttling policy). These are further divided by the type
-         of operation - read or write, sync or async. First two fields specify
-         the major and minor number of the device, third field specifies the
-         operation type and the fourth field specifies the number of IOs.
-
-         blkio.io_serviced does accounting as seen by CFQ and counts are in
-         number of requests (struct request). On the other hand,
-         blkio.throttle.io_serviced counts number of IO in terms of number
-         of bios as seen by throttling policy.  These bios can later be
-         merged by elevator and total number of requests completed can be
-         lesser.
+       - Number of IOs (bio) issued to the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
 
 - blkio.throttle.io_service_bytes
        - Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
          device, third field specifies the operation type and the fourth field
          specifies the number of bytes.
 
-         These numbers should roughly be same as blkio.io_service_bytes as
-         updated by CFQ. The difference between two is that
-         blkio.io_service_bytes will not be updated if CFQ is not operating
-         on request queue.
-
 Common files among various policies
 -----------------------------------
 - blkio.reset_stats
index 1ee9caf..e0975c2 100644 (file)
@@ -27,7 +27,7 @@ CONTENTS
     5-3-1. Format
     5-3-2. Control Knobs
   5-4. Per-Controller Changes
-    5-4-1. blkio
+    5-4-1. io
     5-4-2. cpuset
     5-4-3. memory
 6. Planned Changes
@@ -203,7 +203,7 @@ other issues.  The mapping from nice level to weight isn't obvious or
 universal, and there are various other knobs which simply aren't
 available for tasks.
 
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
 cgroup to host the tasks.  The hidden leaf has its own copies of all
 the knobs with "leaf_" prefixed.  While this allows equivalent control
 over internal tasks, it's with serious drawbacks.  It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
 
 5-4. Per-Controller Changes
 
-5-4-1. blkio
+5-4-1. io
 
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io.  The interface is overhauled anyway.  The
+  new name is more in line with the other two major controllers, cpu
+  and memory, and better suited given that it may be used for cgroup
+  writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+  recursive stat files pointless and, as no internal node can have
+  tasks, leaf weights are meaningless.  The operation model is
+  simplified and the interface is overhauled accordingly.
+
+  io.stat
+
+       The stat file.  The reported stats are from the point where
+       bio's are issued to request_queue.  The stats are counted
+       independent of which policies are enabled.  Each line in the
+       file follows the following format.  More fields may later be
+       added at the end.
+
+         $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
+
+  io.weight
+
+       The weight setting, currently only available and effective if
+       cfq-iosched is in use for the target device.  The weight is
+       between 1 and 10000 and defaults to 100.  The first line
+       always contains the default weight in the following format to
+       use when per-device setting is missing.
+
+         default $WEIGHT
+
+       Subsequent lines list per-device weights of the following
+       format.
+
+         $MAJ:$MIN $WEIGHT
+
+       Writing "$WEIGHT" or "default $WEIGHT" changes the default
+       setting.  Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+       while "$MAJ:$MIN default" clears it.
+
+       This file is available only on non-root cgroups.
+
+  io.max
+
+       The maximum bandwidth and/or iops setting, only available if
+       blk-throttle is enabled.  The file is of the following format.
+
+         $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+       ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+       read/write IOs per second.  "max" indicates no limit.  Writing
+       to the file follows the same format but the individual
+       settings may be ommitted or specified in any order.
+
+       This file is available only on non-root cgroups.
 
 
 5-4-2. cpuset
diff --git a/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt b/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt
new file mode 100644 (file)
index 0000000..9d9e930
--- /dev/null
@@ -0,0 +1,36 @@
+* Toradex Colibri VF50 Touchscreen driver
+
+Required Properties:
+- compatible must be toradex,vf50-touchscreen
+- io-channels: adc channels being used by the Colibri VF50 module
+- xp-gpios: FET gate driver for input of X+
+- xm-gpios: FET gate driver for input of X-
+- yp-gpios: FET gate driver for input of Y+
+- ym-gpios: FET gate driver for input of Y-
+- interrupt-parent: phandle for the interrupt controller
+- interrupts: pen irq interrupt for touch detection
+- pinctrl-names: "idle", "default", "gpios"
+- pinctrl-0: pinctrl node for pen/touch detection state pinmux
+- pinctrl-1: pinctrl node for X/Y and pressure measurement (ADC) state pinmux
+- pinctrl-2: pinctrl node for gpios functioning as FET gate drivers
+- vf50-ts-min-pressure: pressure level at which to stop measuring X/Y values
+
+Example:
+
+       touchctrl: vf50_touchctrl {
+               compatible = "toradex,vf50-touchscreen";
+               io-channels = <&adc1 0>,<&adc0 0>,
+                               <&adc0 1>,<&adc1 2>;
+               xp-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>;
+               xm-gpios = <&gpio2 29 GPIO_ACTIVE_HIGH>;
+               yp-gpios = <&gpio0 12 GPIO_ACTIVE_LOW>;
+               ym-gpios = <&gpio0 4 GPIO_ACTIVE_HIGH>;
+               interrupt-parent = <&gpio0>;
+               interrupts = <8 IRQ_TYPE_LEVEL_LOW>;
+               pinctrl-names = "idle","default","gpios";
+               pinctrl-0 = <&pinctrl_touchctrl_idle>;
+               pinctrl-1 = <&pinctrl_touchctrl_default>;
+               pinctrl-2 = <&pinctrl_touchctrl_gpios>;
+               vf50-ts-min-pressure = <200>;
+               status = "disabled";
+       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt b/Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt
new file mode 100644 (file)
index 0000000..853dff9
--- /dev/null
@@ -0,0 +1,36 @@
+* Freescale i.MX6UL Touch Controller
+
+Required properties:
+- compatible: must be "fsl,imx6ul-tsc".
+- reg: this touch controller address and the ADC2 address.
+- interrupts: the interrupt of this touch controller and ADC2.
+- clocks: the root clock of touch controller and ADC2.
+- clock-names; must be "tsc" and "adc".
+- xnur-gpio: the X- gpio this controller connect to.
+  This xnur-gpio returns to low once the finger leave the touch screen (The
+  last touch event the touch controller capture).
+
+Optional properties:
+- measure-delay-time: the value of measure delay time.
+  Before X-axis or Y-axis measurement, the screen need some time before
+  even potential distribution ready.
+  This value depends on the touch screen.
+- pre-charge-time: the touch screen need some time to precharge.
+  This value depends on the touch screen.
+
+Example:
+       tsc: tsc@02040000 {
+               compatible = "fsl,imx6ul-tsc";
+               reg = <0x02040000 0x4000>, <0x0219c000 0x4000>;
+               interrupts = <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&clks IMX6UL_CLK_IPG>,
+                        <&clks IMX6UL_CLK_ADC2>;
+               clock-names = "tsc", "adc";
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_tsc>;
+               xnur-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>;
+               measure-delay-time = <0xfff>;
+               pre-charge-time = <0xffff>;
+               status = "okay";
+       };
index f65c76d..97d9b3e 100644 (file)
@@ -37,6 +37,12 @@ The edge is described by the following properties:
        Definition: the identifier of the remote processor in the smd channel
                    allocation table
 
+- qcom,remote-pid:
+       Usage: optional
+       Value type: <u32>
+       Definition: the identifier for the remote processor as known by the rest
+                   of the system.
+
 = SMD DEVICES
 
 In turn, subnodes of the "edges" represent devices tied to SMD channels on that
diff --git a/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt b/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt
new file mode 100644 (file)
index 0000000..f7cc7c0
--- /dev/null
@@ -0,0 +1,35 @@
+* Atmel SAMA5D4 Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible: "atmel,sama5d4-wdt"
+- reg: base physical address and length of memory mapped region.
+
+Optional properties:
+- timeout-sec: watchdog timeout value (in seconds).
+- interrupts: interrupt number to the CPU.
+- atmel,watchdog-type: should be "hardware" or "software".
+       "hardware": enable watchdog fault reset. A watchdog fault triggers
+                   watchdog reset.
+       "software": enable watchdog fault interrupt. A watchdog fault asserts
+                   watchdog interrupt.
+- atmel,idle-halt: present if you want to stop the watchdog when the CPU is
+                  in idle state.
+       CAUTION: This property should be used with care, it actually makes the
+       watchdog not counting when the CPU is in idle state, therefore the
+       watchdog reset time depends on mean CPU usage and will not reset at all
+       if the CPU stop working while it is in idle state, which is probably
+       not what you want.
+- atmel,dbg-halt: present if you want to stop the watchdog when the CPU is
+                 in debug state.
+
+Example:
+       watchdog@fc068640 {
+               compatible = "atmel,sama5d4-wdt";
+               reg = <0xfc068640 0x10>;
+               interrupts = <4 IRQ_TYPE_LEVEL_HIGH 5>;
+               timeout-sec = <10>;
+               atmel,watchdog-type = "hardware";
+               atmel,dbg-halt;
+               atmel,idle-halt;
+               status = "okay";
+       };
diff --git a/Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt b/Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt
new file mode 100644 (file)
index 0000000..09f6b24
--- /dev/null
@@ -0,0 +1,19 @@
+* NXP LPC18xx Watchdog Timer (WDT)
+
+Required properties:
+- compatible: Should be "nxp,lpc1850-wwdt"
+- reg: Should contain WDT registers location and length
+- clocks: Must contain an entry for each entry in clock-names.
+- clock-names: Should contain "wdtclk" and "reg"; the watchdog counter
+               clock and register interface clock respectively.
+- interrupts: Should contain WDT interrupt
+
+Examples:
+
+watchdog@40080000 {
+       compatible = "nxp,lpc1850-wwdt";
+       reg = <0x40080000 0x24>;
+       clocks = <&cgu BASE_SAFE_CLK>, <&ccu1 CLK_CPU_WWDT>;
+       clock-names = "wdtclk", "reg";
+       interrupts = <49>;
+};
index b80606d..f59c43b 100644 (file)
@@ -21,8 +21,8 @@ exact way to do it depends on the GPIO controller providing the GPIOs, see the
 device tree bindings for your controller.
 
 GPIOs mappings are defined in the consumer device's node, in a property named
-<function>-gpios, where <function> is the function the driver will request
-through gpiod_get(). For example:
+either <function>-gpios or <function>-gpio, where <function> is the function
+the driver will request through gpiod_get(). For example:
 
        foo_device {
                compatible = "acme,foo";
@@ -31,7 +31,7 @@ through gpiod_get(). For example:
                            <&gpio 16 GPIO_ACTIVE_HIGH>, /* green */
                            <&gpio 17 GPIO_ACTIVE_HIGH>; /* blue */
 
-               power-gpios = <&gpio 1 GPIO_ACTIVE_LOW>;
+               power-gpio = <&gpio 1 GPIO_ACTIVE_LOW>;
        };
 
 This property will make GPIOs 15, 16 and 17 available to the driver under the
@@ -39,15 +39,24 @@ This property will make GPIOs 15, 16 and 17 available to the driver under the
 
        struct gpio_desc *red, *green, *blue, *power;
 
-       red = gpiod_get_index(dev, "led", 0);
-       green = gpiod_get_index(dev, "led", 1);
-       blue = gpiod_get_index(dev, "led", 2);
+       red = gpiod_get_index(dev, "led", 0, GPIOD_OUT_HIGH);
+       green = gpiod_get_index(dev, "led", 1, GPIOD_OUT_HIGH);
+       blue = gpiod_get_index(dev, "led", 2, GPIOD_OUT_HIGH);
 
-       power = gpiod_get(dev, "power");
+       power = gpiod_get(dev, "power", GPIOD_OUT_HIGH);
 
 The led GPIOs will be active-high, while the power GPIO will be active-low (i.e.
 gpiod_is_active_low(power) will be true).
 
+The second parameter of the gpiod_get() functions, the con_id string, has to be
+the <function>-prefix of the GPIO suffixes ("gpios" or "gpio", automatically
+looked up by the gpiod functions internally) used in the device tree. With above
+"led-gpios" example, use the prefix without the "-" as con_id parameter: "led".
+
+Internally, the GPIO subsystem prefixes the GPIO suffix ("gpios" or "gpio")
+with the string passed in con_id to get the resulting string
+(snprintf(... "%s-%s", con_id, gpio_suffixes[]).
+
 ACPI
 ----
 ACPI also supports function names for GPIOs in a similar fashion to DT.
@@ -142,13 +151,14 @@ The driver controlling "foo.0" will then be able to obtain its GPIOs as follows:
 
        struct gpio_desc *red, *green, *blue, *power;
 
-       red = gpiod_get_index(dev, "led", 0);
-       green = gpiod_get_index(dev, "led", 1);
-       blue = gpiod_get_index(dev, "led", 2);
+       red = gpiod_get_index(dev, "led", 0, GPIOD_OUT_HIGH);
+       green = gpiod_get_index(dev, "led", 1, GPIOD_OUT_HIGH);
+       blue = gpiod_get_index(dev, "led", 2, GPIOD_OUT_HIGH);
 
-       power = gpiod_get(dev, "power");
-       gpiod_direction_output(power, 1);
+       power = gpiod_get(dev, "power", GPIOD_OUT_HIGH);
 
-Since the "power" GPIO is mapped as active-low, its actual signal will be 0
-after this code. Contrary to the legacy integer GPIO interface, the active-low
-property is handled during mapping and is thus transparent to GPIO consumers.
+Since the "led" GPIOs are mapped as active-high, this example will switch their
+signals to 1, i.e. enabling the LEDs. And for the "power" GPIO, which is mapped
+as active-low, its actual signal will be 0 after this code. Contrary to the legacy
+integer GPIO interface, the active-low property is handled during mapping and is
+thus transparent to GPIO consumers.
index a206639..e000502 100644 (file)
@@ -39,6 +39,9 @@ device that displays digits), an additional index argument can be specified:
                                          const char *con_id, unsigned int idx,
                                          enum gpiod_flags flags)
 
+For a more detailed description of the con_id parameter in the DeviceTree case
+see Documentation/gpio/board.txt
+
 The flags parameter is used to optionally specify a direction and initial value
 for the GPIO. Values can be:
 
index f0dd3d2..76add4c 100644 (file)
@@ -32,6 +32,10 @@ Supported chips:
     Prefix: 'nct6792'
     Addresses scanned: ISA address retrieved from Super I/O registers
     Datasheet: Available from Nuvoton upon request
+  * Nuvoton NCT6793D
+    Prefix: 'nct6793'
+    Addresses scanned: ISA address retrieved from Super I/O registers
+    Datasheet: Available from Nuvoton upon request
 
 Authors:
         Guenter Roeck <linux@roeck-us.net>
index f4cb0b2..477927b 100644 (file)
@@ -15,8 +15,8 @@ The updated API replacements are:
 
 DEFINE_STATIC_KEY_TRUE(key);
 DEFINE_STATIC_KEY_FALSE(key);
-static_key_likely()
-statick_key_unlikely()
+static_branch_likely()
+static_branch_unlikely()
 
 0) Abstract
 
index c1f6864..10f062e 100644 (file)
@@ -180,6 +180,7 @@ Thermal zone device sys I/F, created once it's registered:
     |---temp:                  Current temperature
     |---mode:                  Working mode of the thermal zone
     |---policy:                        Thermal governor used for this zone
+    |---available_policies:    Available thermal governors for this zone
     |---trip_point_[0-*]_temp: Trip point temperature
     |---trip_point_[0-*]_type: Trip point type
     |---trip_point_[0-*]_hyst: Hysteresis value for this trip point
@@ -256,6 +257,10 @@ policy
        One of the various thermal governors used for a particular zone.
        RW, Required
 
+available_policies
+       Available thermal governors which can be used for a particular zone.
+       RO, Required
+
 trip_point_[0-*]_temp
        The temperature above which trip point will be fired.
        Unit: millidegree Celsius
@@ -417,6 +422,7 @@ method, the sys I/F structure will be built like this:
     |---temp:                  37000
     |---mode:                  enabled
     |---policy:                        step_wise
+    |---available_policies:    step_wise fair_share
     |---trip_point_0_temp:     100000
     |---trip_point_0_type:     critical
     |---trip_point_1_temp:     80000
index a4ebcb7..d9eccee 100644 (file)
@@ -2671,7 +2671,7 @@ handled.
 4.87 KVM_SET_GUEST_DEBUG
 
 Capability: KVM_CAP_SET_GUEST_DEBUG
-Architectures: x86, s390, ppc
+Architectures: x86, s390, ppc, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_guest_debug (in)
 Returns: 0 on success; -1 on error
@@ -2693,8 +2693,8 @@ when running. Common control bits are:
 The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
-  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -2709,6 +2709,11 @@ updated to the correct (supplied) values.
 The second part of the structure is architecture specific and
 typically contains a set of debug registers.
 
+For arm64 the number of debug registers is implementation defined and
+can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
+KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
+indicating the number of supported registers.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
@@ -3111,11 +3116,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
 KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
 
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
 
-Unused.
+If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event
+for which architecture specific information is returned.
 
                /* KVM_EXIT_MMIO */
                struct {
index 081c497..6a5e2a1 100644 (file)
@@ -14,6 +14,8 @@ hugetlbpage.txt
        - a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
        - explains what hwpoison is
+idle_page_tracking.txt
+       - description of the idle page tracking feature.
 ksm.txt
        - how to use the Kernel Samepage Merging feature.
 numa
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644 (file)
index 0000000..85dcc3b
--- /dev/null
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature allows to track which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of the only read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the amount of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list, other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
index 3cd3843..0e1e555 100644 (file)
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -70,6 +70,11 @@ There are three components to pagemap:
     22. THP
     23. BALLOON
     24. ZERO_PAGE
+    25. IDLE
+
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
 
 Short descriptions to the page flags:
 
@@ -116,6 +121,12 @@ Short descriptions to the page flags:
 24. ZERO_PAGE
     zero page for pfn_zero or huge_zero page
 
+25. IDLE
+    page has not been accessed since it was marked idle (see
+    Documentation/vm/idle_page_tracking.txt). Note that this flag may be
+    stale in case the page was accessed via a PTE. To make sure the flag
+    is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
+
     [IO related page flags]
  1. ERROR     IO error occurred
  3. UPTODATE  page has up-to-date data
index 8458c08..89fff7d 100644 (file)
@@ -32,7 +32,7 @@ can also be enabled and disabled at runtime using the sysfs interface.
 An example command to enable zswap at runtime, assuming sysfs is mounted
 at /sys, is:
 
-echo 1 > /sys/modules/zswap/parameters/enabled
+echo 1 > /sys/module/zswap/parameters/enabled
 
 When zswap is disabled at runtime it will stop storing pages that are
 being swapped out.  However, it will _not_ immediately write out or fault
@@ -49,14 +49,26 @@ Zswap receives pages for compression through the Frontswap API and is able to
 evict pages from its own compressed pool on an LRU basis and write them back to
 the backing swap device in the case that the compressed pool is full.
 
-Zswap makes use of zbud for the managing the compressed memory pool.  Each
-allocation in zbud is not directly accessible by address.  Rather, a handle is
+Zswap makes use of zpool for the managing the compressed memory pool.  Each
+allocation in zpool is not directly accessible by address.  Rather, a handle is
 returned by the allocation routine and that handle must be mapped before being
 accessed.  The compressed memory pool grows on demand and shrinks as compressed
-pages are freed.  The pool is not preallocated.
+pages are freed.  The pool is not preallocated.  By default, a zpool of type
+zbud is created, but it can be selected at boot time by setting the "zpool"
+attribute, e.g. zswap.zpool=zbud.  It can also be changed at runtime using the
+sysfs "zpool" attribute, e.g.
+
+echo zbud > /sys/module/zswap/parameters/zpool
+
+The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
+means the compression ratio will always be 2:1 or worse (because of half-full
+zbud pages).  The zsmalloc type zpool has a more complex compressed page
+storage method, and it can achieve greater storage densities.  However,
+zsmalloc does not implement compressed page eviction, so once zswap fills it
+cannot evict the oldest page, it can only reject new pages.
 
 When a swap page is passed from frontswap to zswap, zswap maintains a mapping
-of the swap entry, a combination of the swap type and swap offset, to the zbud
+of the swap entry, a combination of the swap type and swap offset, to the zpool
 handle that references that compressed swap page.  This mapping is achieved
 with a red-black tree per swap type.  The swap offset is the search key for the
 tree nodes.
@@ -74,9 +86,17 @@ controlled policy:
 * max_pool_percent - The maximum percentage of memory that the compressed
     pool can occupy.
 
-Zswap allows the compressor to be selected at kernel boot time by setting the
-“compressor” attribute.  The default compressor is lzo.  e.g.
-zswap.compressor=deflate
+The default compressor is lzo, but it can be selected at boot time by setting
+the “compressor” attribute, e.g. zswap.compressor=lzo.  It can also be changed
+at runtime using the sysfs "compressor" attribute, e.g.
+
+echo lzo > /sys/module/zswap/parameters/compressor
+
+When the zpool and/or compressor parameter is changed at runtime, any existing
+compressed pages are not modified; they are left in their own zpool.  When a
+request is made for a page in an old zpool, it is uncompressed using its
+original compressor.  Once all pages are removed from an old zpool, the zpool
+and its compressor are freed.
 
 A debugfs interface is provided for various statistic about pool size, number
 of pages stored, and various counters for the reasons pages are rejected.
index 3da8229..fcdde8f 100644 (file)
@@ -41,6 +41,7 @@ static void term(int sig)
 int main(int argc, char *argv[])
 {
     int flags;
+    unsigned int ping_rate = 1;
 
     fd = open("/dev/watchdog", O_WRONLY);
 
@@ -63,22 +64,33 @@ int main(int argc, char *argv[])
            fprintf(stderr, "Watchdog card enabled.\n");
            fflush(stderr);
            goto end;
+       } else if (!strncasecmp(argv[1], "-t", 2) && argv[2]) {
+           flags = atoi(argv[2]);
+           ioctl(fd, WDIOC_SETTIMEOUT, &flags);
+           fprintf(stderr, "Watchdog timeout set to %u seconds.\n", flags);
+           fflush(stderr);
+           goto end;
+       } else if (!strncasecmp(argv[1], "-p", 2) && argv[2]) {
+           ping_rate = strtoul(argv[2], NULL, 0);
+           fprintf(stderr, "Watchdog ping rate set to %u seconds.\n", ping_rate);
+           fflush(stderr);
        } else {
-           fprintf(stderr, "-d to disable, -e to enable.\n");
+           fprintf(stderr, "-d to disable, -e to enable, -t <n> to set " \
+               "the timeout,\n-p <n> to set the ping rate, and \n");
            fprintf(stderr, "run by itself to tick the card.\n");
            fflush(stderr);
            goto end;
        }
-    } else {
-       fprintf(stderr, "Watchdog Ticking Away!\n");
-       fflush(stderr);
     }
 
+    fprintf(stderr, "Watchdog Ticking Away!\n");
+    fflush(stderr);
+
     signal(SIGINT, term);
 
     while(1) {
        keep_alive();
-       sleep(1);
+       sleep(ping_rate);
     }
 end:
     close(fd);
index 67a4443..7ba7ab7 100644 (file)
@@ -6789,6 +6789,14 @@ W:       http://www.mellanox.com
 Q:     http://patchwork.ozlabs.org/project/netdev/list/
 F:     drivers/net/ethernet/mellanox/mlxsw/
 
+MEMBARRIER SUPPORT
+M:     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M:     "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+L:     linux-kernel@vger.kernel.org
+S:     Supported
+F:     kernel/membarrier.c
+F:     include/uapi/linux/membarrier.h
+
 MEMORY MANAGEMENT
 L:     linux-mm@kvack.org
 W:     http://www.linux-mm.org
@@ -7396,6 +7404,7 @@ NTB DRIVER CORE
 M:     Jon Mason <jdmason@kudzu.us>
 M:     Dave Jiang <dave.jiang@intel.com>
 M:     Allen Hubbe <Allen.Hubbe@emc.com>
+L:     linux-ntb@googlegroups.com
 S:     Supported
 W:     https://github.com/jonmason/ntb/wiki
 T:     git git://github.com/jonmason/ntb.git
@@ -7407,6 +7416,7 @@ F:        include/linux/ntb_transport.h
 NTB INTEL DRIVER
 M:     Jon Mason <jdmason@kudzu.us>
 M:     Dave Jiang <dave.jiang@intel.com>
+L:     linux-ntb@googlegroups.com
 S:     Supported
 W:     https://github.com/jonmason/ntb/wiki
 T:     git git://github.com/jonmason/ntb.git
@@ -8199,10 +8209,9 @@ F:       drivers/hwmon/pmbus/
 F:     include/linux/i2c/pmbus.h
 
 PMC SIERRA MaxRAID DRIVER
-M:     Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
 L:     linux-scsi@vger.kernel.org
 W:     http://www.pmc-sierra.com/
-S:     Supported
+S:     Orphan
 F:     drivers/scsi/pmcraid.*
 
 PMC SIERRA PM8001 DRIVER
index f2d2706..1a132ea 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
-PATCHLEVEL = 2
+PATCHLEVEL = 3
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Hurr durr I'ma sheep
 
 # *DOCUMENTATION*
index 8f35649..4e949e5 100644 (file)
@@ -2,6 +2,9 @@
 # General architecture dependent options
 #
 
+config KEXEC_CORE
+       bool
+
 config OPROFILE
        tristate "OProfile system profiling"
        depends on PROFILING
index dfa32f0..72a8ca7 100644 (file)
@@ -12,42 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f)      dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h)       dma_free_coherent(d, s, v, h)
-
 #define dma_cache_sync(dev, va, size, dir)               ((void)0)
 
 #endif /* _ALPHA_DMA_MAPPING_H */
index df24b76..2b1f4a1 100644 (file)
@@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
        return mask < 0x00ffffffUL ? 0 : 1;
 }
 
-static int alpha_noop_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_noop_ops = {
        .alloc                  = alpha_noop_alloc_coherent,
        .free                   = alpha_noop_free_coherent,
@@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = {
        .map_sg                 = alpha_noop_map_sg,
        .mapping_error          = alpha_noop_mapping_error,
        .dma_supported          = alpha_noop_supported,
-       .set_dma_mask           = alpha_noop_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_noop_ops;
index eddee77..8969bf2 100644 (file)
@@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
        return dma_addr == 0;
 }
 
-static int alpha_pci_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask ||
-           !pci_dma_supported(alpha_gendev_to_pci(dev), mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_pci_ops = {
        .alloc                  = alpha_pci_alloc_coherent,
        .free                   = alpha_pci_free_coherent,
@@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = {
        .unmap_sg               = alpha_pci_unmap_sg,
        .mapping_error          = alpha_pci_mapping_error,
        .dma_supported          = alpha_pci_supported,
-       .set_dma_mask           = alpha_pci_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_pci_ops;
index ad9825d..0a77b19 100644 (file)
@@ -402,6 +402,8 @@ static void __init axs103_early_init(void)
        unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
        if (num_cores > 2)
                arc_set_core_freq(50 * 1000000);
+       else if (num_cores == 2)
+               arc_set_core_freq(75 * 1000000);
 #endif
 
        switch (arc_get_core_freq()/1000000) {
index 0d1b717..72ad724 100644 (file)
@@ -2020,6 +2020,7 @@ config KEXEC
        bool "Kexec system call (EXPERIMENTAL)"
        depends on (!SMP || PM_SLEEP_SMP)
        depends on !CPU_V7M
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 7451b44..2c2b28e 100644 (file)
@@ -54,6 +54,14 @@ AS           += -EL
 LD             += -EL
 endif
 
+#
+# The Scalar Replacement of Aggregates (SRA) optimization pass in GCC 4.9 and
+# later may result in code being generated that handles signed short and signed
+# char struct members incorrectly. So disable it.
+# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65932)
+#
+KBUILD_CFLAGS  += $(call cc-option,-fno-ipa-sra)
+
 # This selects which instruction set is used.
 # Note that GCC does not numerically define an architecture version
 # macro, but instead defines a whole series of macros which makes
index bd245d3..a0765e7 100644 (file)
@@ -57,5 +57,5 @@ extern char * strstr(const char * s1, const char *s2);
 
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
-       return decompress(input, len, NULL, NULL, output, NULL, error);
+       return __decompress(input, len, NULL, NULL, output, 0, NULL, error);
 }
index a5863ac..540a0ad 100644 (file)
                min-microvolt = <1100000>;
                max-microvolt = <2700000>;
        };
+
+       thermal-zones {
+               cpu_thermal: cpu-thermal {
+                       cooling-maps {
+                               map0 {
+                                       /* Correspond to 500MHz at freq_table */
+                                       cooling-device = <&cpu0 5 5>;
+                               };
+                               map1 {
+                                       /* Correspond to 200MHz at freq_table */
+                                       cooling-device = <&cpu0 8 8>;
+                               };
+                       };
+               };
+       };
 };
 
 &adc {
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &exynos_usbphy {
        status = "okay";
 };
index baa9b2f..41a5faf 100644 (file)
                min-microvolt = <1100000>;
                max-microvolt = <2700000>;
        };
+
+       thermal-zones {
+               cpu_thermal: cpu-thermal {
+                       cooling-maps {
+                               map0 {
+                                       /* Corresponds to 500MHz */
+                                       cooling-device = <&cpu0 5 5>;
+                               };
+                               map1 {
+                                       /* Corresponds to 200MHz */
+                                       cooling-device = <&cpu0 8 8>;
+                               };
+                       };
+               };
+       };
 };
 
 &adc {
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &exynos_usbphy {
        status = "okay";
 };
index 2db9943..033def4 100644 (file)
                        compatible = "arm,cortex-a7";
                        reg = <0>;
                        clock-frequency = <1000000000>;
+                       clocks = <&cmu CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       #cooling-cells = <2>;
+
+                       operating-points = <
+                               1000000 1150000
+                               900000  1112500
+                               800000  1075000
+                               700000  1037500
+                               600000  1000000
+                               500000  962500
+                               400000  925000
+                               300000  887500
+                               200000  850000
+                               100000  850000
+                       >;
                };
 
                cpu1: cpu@1 {
index b0d52b1..98c0a36 100644 (file)
                clocks = <&clock CLK_JPEG>;
                clock-names = "jpeg";
                power-domains = <&pd_cam>;
+               iommus = <&sysmmu_jpeg>;
        };
 
        hdmi: hdmi@12D00000 {
index d9c8efe..5389011 100644 (file)
@@ -30,6 +30,9 @@
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA00>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       operating-points-v2 = <&cpu0_opp_table>;
                        cooling-min-level = <13>;
                        cooling-max-level = <7>;
                        #cooling-cells = <2>; /* min followed by max */
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA01>;
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+       };
+
+       cpu0_opp_table: opp_table0 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <300000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <925000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <500000000>;
+                       opp-microvolt = <950000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <600000000>;
+                       opp-microvolt = <975000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp05 {
+                       opp-hz = /bits/ 64 <700000000>;
+                       opp-microvolt = <987500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp06 {
+                       opp-hz = /bits/ 64 <800000000>;
+                       opp-microvolt = <1000000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp07 {
+                       opp-hz = /bits/ 64 <900000000>;
+                       opp-microvolt = <1037500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp08 {
+                       opp-hz = /bits/ 64 <1000000000>;
+                       opp-microvolt = <1087500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp09 {
+                       opp-hz = /bits/ 64 <1100000000>;
+                       opp-microvolt = <1137500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp10 {
+                       opp-hz = /bits/ 64 <1200000000>;
+                       opp-microvolt = <1187500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp11 {
+                       opp-hz = /bits/ 64 <1300000000>;
+                       opp-microvolt = <1250000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp12 {
+                       opp-hz = /bits/ 64 <1400000000>;
+                       opp-microvolt = <1287500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp13 {
+                       opp-hz = /bits/ 64 <1500000000>;
+                       opp-microvolt = <1350000>;
+                       clock-latency-ns = <200000>;
+                       turbo-mode;
                };
        };
 };
index ca7d168..db52841 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 /* RSTN signal for eMMC */
 &sd1_cd {
        samsung,pin-pud = <0>;
index 44684e5..8632f35 100644 (file)
@@ -13,6 +13,7 @@
 
 /dts-v1/;
 #include "exynos4412-odroid-common.dtsi"
+#include <dt-bindings/gpio/gpio.h>
 
 / {
        model = "Hardkernel ODROID-U3 board based on Exynos4412";
                "Speakers", "SPKL",
                "Speakers", "SPKR";
 };
+
+&spi_1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&spi1_bus>;
+       cs-gpios = <&gpb 5 GPIO_ACTIVE_HIGH>;
+       status = "okay";
+};
index 84c7631..9d528af 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &fimd {
        pinctrl-0 = <&lcd_clk &lcd_data24 &pwm1_out>;
        pinctrl-names = "default";
index 8848400..2a1ebb7 100644 (file)
        status = "okay";
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &csis_0 {
        status = "okay";
        vddcore-supply = <&ldo8_reg>;
index b78ada7..ca0e3c1 100644 (file)
@@ -30,6 +30,9 @@
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA00>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       operating-points-v2 = <&cpu0_opp_table>;
                        cooling-min-level = <13>;
                        cooling-max-level = <7>;
                        #cooling-cells = <2>; /* min followed by max */
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA01>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
 
                cpu@A02 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA02>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
 
                cpu@A03 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA03>;
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+       };
+
+       cpu0_opp_table: opp_table0 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <300000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <925000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <500000000>;
+                       opp-microvolt = <950000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <600000000>;
+                       opp-microvolt = <975000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp05 {
+                       opp-hz = /bits/ 64 <700000000>;
+                       opp-microvolt = <987500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp06 {
+                       opp-hz = /bits/ 64 <800000000>;
+                       opp-microvolt = <1000000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp07 {
+                       opp-hz = /bits/ 64 <900000000>;
+                       opp-microvolt = <1037500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp08 {
+                       opp-hz = /bits/ 64 <1000000000>;
+                       opp-microvolt = <1087500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp09 {
+                       opp-hz = /bits/ 64 <1100000000>;
+                       opp-microvolt = <1137500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp10 {
+                       opp-hz = /bits/ 64 <1200000000>;
+                       opp-microvolt = <1187500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp11 {
+                       opp-hz = /bits/ 64 <1300000000>;
+                       opp-microvolt = <1250000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp12 {
+                       opp-hz = /bits/ 64 <1400000000>;
+                       opp-microvolt = <1287500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp13 {
+                       opp-hz = /bits/ 64 <1500000000>;
+                       opp-microvolt = <1350000>;
+                       clock-latency-ns = <200000>;
+                       turbo-mode;
                };
        };
 
index 7e728a1..db3f65f 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        samsung,color-space = <0>;
index 4fe186d..15aea76 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        samsung,color-space = <0>;
        samsung,dynamic-range = <0>;
index b7f4122..0720caa 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        pinctrl-names = "default";
        status = "okay";
        samsung,spi-src-clk = <0>;
        num-cs = <1>;
+       cs-gpios = <&gpa2 5 GPIO_ACTIVE_HIGH>;
 };
 
 &usbdrd_dwc3 {
index d03f9b8..c1edd6d 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        pinctrl-names = "default";
index 4a1f883..b24610e 100644 (file)
                        compatible = "arm,cortex-a15";
                        reg = <0>;
                        clock-frequency = <1700000000>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       clock-latency = <140000>;
+
+                       operating-points = <
+                               1700000 1300000
+                               1600000 1250000
+                               1500000 1225000
+                               1400000 1200000
+                               1300000 1150000
+                               1200000 1125000
+                               1100000 1100000
+                               1000000 1075000
+                                900000 1050000
+                                800000 1025000
+                                700000 1012500
+                                600000 1000000
+                                500000  975000
+                                400000  950000
+                                300000  937500
+                                200000  925000
+                       >;
                        cooling-min-level = <15>;
                        cooling-max-level = <9>;
                        #cooling-cells = <2>; /* min followed by max */
diff --git a/arch/arm/boot/dts/exynos5422-cpus.dtsi b/arch/arm/boot/dts/exynos5422-cpus.dtsi
new file mode 100644 (file)
index 0000000..b7f60c8
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * SAMSUNG EXYNOS5422 SoC cpu device tree source
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com
+ *
+ * The only difference between EXYNOS5422 and EXYNOS5800 is cpu ordering. The
+ * EXYNOS5422 is booting from Cortex-A7 core while the EXYNOS5800 is booting
+ * from Cortex-A15 core.
+ *
+ * EXYNOS5422 based board files can include this file to provide cpu ordering
+ * which could boot a cortex-a7 from cpu0.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+&cpu0 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x100>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu1 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x101>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu2 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x102>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu3 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x103>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu4 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x0>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu5 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x1>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu6 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x2>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu7 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x3>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
index 1565667..79ffdfe 100644 (file)
@@ -15,6 +15,7 @@
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/sound/samsung-i2s.h>
 #include "exynos5800.dtsi"
+#include "exynos5422-cpus.dtsi"
 #include "exynos5422-cpu-thermal.dtsi"
 
 / {
index 34ccb26..47c0282 100644 (file)
@@ -4,6 +4,14 @@
        model = "CompuLab CM-QS600";
        compatible = "qcom,apq8064-cm-qs600", "qcom,apq8064";
 
+       aliases {
+               serial0 = &gsbi7_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                pinctrl@800000 {
                        i2c1_pins: i2c1 {
index 88d6655..f3100da 100644 (file)
                serial1 = &gsbi6_serial;
        };
 
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                pinctrl@800000 {
                        card_detect: card_detect {
index d484d08..835bdc7 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm APQ8074 Dragonboard";
        compatible = "qcom,apq8074-dragonboard", "qcom,apq8074";
 
+       aliases {
+               serial0 = &blsp1_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f991e000 {
                        status = "ok";
index f7725b9..c9c2b76 100644 (file)
@@ -5,6 +5,14 @@
        model = "Qualcomm APQ8084/IFC6540";
        compatible = "qcom,apq8084-ifc6540", "qcom,apq8084";
 
+       aliases {
+               serial0 = &blsp2_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f995e000 {
                        status = "okay";
index cb43acf..3016c70 100644 (file)
@@ -5,6 +5,14 @@
        model = "Qualcomm APQ 8084-MTP";
        compatible = "qcom,apq8084-mtp", "qcom,apq8084";
 
+       aliases {
+               serial0 = &blsp2_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f995e000 {
                        status = "okay";
index 7084010..0554fbd 100644 (file)
                        interrupts = <0 208 0>;
                };
 
-               serial@f995e000 {
+               blsp2_uart2: serial@f995e000 {
                        compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
                        reg = <0xf995e000 0x1000>;
                        interrupts = <0 114 0x0>;
index 55b2910..d501382 100644 (file)
@@ -4,6 +4,14 @@
        model = "Qualcomm IPQ8064/AP148";
        compatible = "qcom,ipq8064-ap148", "qcom,ipq8064";
 
+       aliases {
+               serial0 = &gsbi4_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        reserved-memory {
                #address-cells = <1>;
                #size-cells = <1>;
index 9f727d8..fa69863 100644 (file)
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@16340000 {
+                       gsbi4_serial: serial@16340000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x16340000 0x1000>,
                                      <0x16300000 0x1000>;
index e0883c3..b17f379 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm MSM8660 SURF";
        compatible = "qcom,msm8660-surf", "qcom,msm8660";
 
+       aliases {
+               serial0 = &gsbi12_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                gsbi@19c00000 {
                        status = "ok";
index ef2fe72..e5f7f33 100644 (file)
@@ -98,7 +98,7 @@
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@19c40000 {
+                       gsbi12_serial: serial@19c40000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x19c40000 0x1000>,
                                      <0x19c00000 0x1000>;
index fad71d5..b72a554 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm MSM8960 CDP";
        compatible = "qcom,msm8960-cdp", "qcom,msm8960";
 
+       aliases {
+               serial0 = &gsbi5_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                gsbi@16400000 {
                        status = "ok";
index 2096a94..134cd91 100644 (file)
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@16440000 {
+                       gsbi5_serial: serial@16440000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x16440000 0x1000>,
                                      <0x16400000 0x1000>;
index 9bc72a3..016f9ad 100644 (file)
@@ -6,6 +6,14 @@
        model = "Sony Xperia Z1";
        compatible = "sony,xperia-honami", "qcom,msm8974";
 
+       aliases {
+               serial0 = &blsp1_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        memory@0 {
                reg = <0 0x40000000>, <0x40000000 0x40000000>;
                device_type = "memory";
index d7c99b8..ab8e572 100644 (file)
                        hwlocks = <&tcsr_mutex 3>;
                };
 
-               serial@f991e000 {
+               blsp1_uart2: serial@f991e000 {
                        compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
                        reg = <0xf991e000 0x1000>;
                        interrupts = <0 108 0x0>;
index 3eaf8fb..1ff2bfa 100644 (file)
@@ -27,6 +27,8 @@ CONFIG_ARM_APPENDED_DTB=y
 CONFIG_ARM_ATAG_DTB_COMPAT=y
 CONFIG_CMDLINE="root=/dev/ram0 rw ramdisk=8192 initrd=0x41000000,8M console=ttySAC1,115200 init=/linuxrc mem=256M"
 CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPUFREQ_DT=y
 CONFIG_CPU_IDLE=y
 CONFIG_ARM_EXYNOS_CPUIDLE=y
 CONFIG_VFP=y
@@ -94,6 +96,7 @@ CONFIG_CHARGER_MAX14577=y
 CONFIG_CHARGER_MAX77693=y
 CONFIG_CHARGER_TPS65090=y
 CONFIG_SENSORS_LM90=y
+CONFIG_SENSORS_NTC_THERMISTOR=y
 CONFIG_SENSORS_PWM_FAN=y
 CONFIG_SENSORS_INA2XX=y
 CONFIG_THERMAL=y
@@ -144,6 +147,8 @@ CONFIG_SND=y
 CONFIG_SND_SOC=y
 CONFIG_SND_SOC_SAMSUNG=y
 CONFIG_SND_SOC_SNOW=y
+CONFIG_SND_SOC_ODROIDX2=y
+CONFIG_SND_SIMPLE_CARD=y
 CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_XHCI_HCD=y
index f84471d..03deb7f 100644 (file)
@@ -362,6 +362,7 @@ CONFIG_POWER_RESET_KEYSTONE=y
 CONFIG_POWER_RESET_RMOBILE=y
 CONFIG_SENSORS_LM90=y
 CONFIG_SENSORS_LM95245=y
+CONFIG_SENSORS_NTC_THERMISTOR=m
 CONFIG_THERMAL=y
 CONFIG_CPU_THERMAL=y
 CONFIG_RCAR_THERMAL=y
@@ -410,7 +411,9 @@ CONFIG_REGULATOR_MAX8907=y
 CONFIG_REGULATOR_MAX8973=y
 CONFIG_REGULATOR_MAX77686=y
 CONFIG_REGULATOR_MAX77693=m
+CONFIG_REGULATOR_MAX77802=m
 CONFIG_REGULATOR_PALMAS=y
+CONFIG_REGULATOR_PBIAS=y
 CONFIG_REGULATOR_PWM=m
 CONFIG_REGULATOR_S2MPS11=y
 CONFIG_REGULATOR_S5M8767=y
@@ -509,8 +512,6 @@ CONFIG_USB_CHIPIDEA_HOST=y
 CONFIG_AB8500_USB=y
 CONFIG_KEYSTONE_USB_PHY=y
 CONFIG_OMAP_USB3=y
-CONFIG_SAMSUNG_USB2PHY=y
-CONFIG_SAMSUNG_USB3PHY=y
 CONFIG_USB_GPIO_VBUS=y
 CONFIG_USB_ISP1301=y
 CONFIG_USB_MXS_PHY=y
@@ -635,6 +636,7 @@ CONFIG_EXTCON=y
 CONFIG_TI_AEMIF=y
 CONFIG_IIO=y
 CONFIG_AT91_ADC=m
+CONFIG_EXYNOS_ADC=m
 CONFIG_XILINX_XADC=y
 CONFIG_AK8975=y
 CONFIG_PWM=y
index 7bbf325..b2bc8e1 100644 (file)
@@ -491,11 +491,6 @@ THUMB(     orr     \reg , \reg , #PSR_T_BIT        )
 #endif
        .endm
 
-       .macro  uaccess_save_and_disable, tmp
-       uaccess_save \tmp
-       uaccess_disable \tmp
-       .endm
-
        .irp    c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
        .macro  ret\c, reg
 #if __LINUX_ARM_ARCH__ < 6
index b274bde..e7335a9 100644 (file)
@@ -40,6 +40,7 @@ do {                                                          \
                "2:\t.asciz " #__file "\n"                      \
                ".popsection\n"                                 \
                ".pushsection __bug_table,\"a\"\n"              \
+               ".align 2\n"                                    \
                "3:\t.word 1b, 2b\n"                            \
                "\t.hword " #__line ", 0\n"                     \
                ".popsection");                                 \
index a68b9d8..ccb3aa6 100644 (file)
@@ -8,7 +8,6 @@
 #include <linux/dma-attrs.h>
 #include <linux/dma-debug.h>
 
-#include <asm-generic/dma-coherent.h>
 #include <asm/memory.h>
 
 #include <xen/xen.h>
@@ -39,12 +38,15 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
        dev->archdata.dma_ops = ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
+#define HAVE_ARCH_DMA_SUPPORTED 1
+extern int dma_supported(struct device *dev, u64 mask);
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
+/*
+ * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent
+ * implementations, we don't provide a dma_cache_sync function so drivers using
+ * this API are highlighted with build warnings.
+ */
+#include <asm-generic/dma-mapping-common.h>
 
 #ifdef __arch_page_to_dma
 #error Please update to __arch_pfn_to_dma
@@ -167,32 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline void dma_mark_clean(void *addr, size_t size) { }
 
-/*
- * DMA errors are defined by all-bits-set in the DMA address.
- */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-/*
- * Dummy noncoherent implementation.  We don't provide a dma_cache_sync
- * function so drivers using this API are highlighted with build warnings.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-               dma_addr_t *handle, gfp_t gfp)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-               void *cpu_addr, dma_addr_t handle)
-{
-}
-
-extern int dma_supported(struct device *dev, u64 mask);
-
 extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 
 /**
@@ -209,21 +185,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                           gfp_t gfp, struct dma_attrs *attrs);
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                      dma_addr_t *dma_handle, gfp_t flag,
-                                      struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
 /**
  * arm_dma_free - free memory allocated by arm_dma_alloc
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -241,19 +202,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                         dma_addr_t handle, struct dma_attrs *attrs);
 
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                    void *cpu_addr, dma_addr_t dma_handle,
-                                    struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 /**
  * arm_dma_mmap - map a coherent DMA allocation into user space
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
index e878129..fc8ba16 100644 (file)
@@ -12,6 +12,7 @@
 
 #ifndef __ASSEMBLY__
 #include <asm/barrier.h>
+#include <asm/thread_info.h>
 #endif
 
 /*
@@ -89,7 +90,8 @@ static inline unsigned int get_domain(void)
 
        asm(
        "mrc    p15, 0, %0, c3, c0      @ get domain"
-        : "=r" (domain));
+        : "=r" (domain)
+        : "m" (current_thread_info()->cpu_domain));
 
        return domain;
 }
@@ -98,7 +100,7 @@ static inline void set_domain(unsigned val)
 {
        asm volatile(
        "mcr    p15, 0, %0, c3, c0      @ set domain"
-         : : "r" (val));
+         : : "r" (val) : "memory");
        isb();
 }
 
index e896d2c..dcba0fa 100644 (file)
@@ -231,4 +231,9 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+static inline void kvm_arm_init_debug(void) {}
+static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
+
 #endif /* __ARM_KVM_HOST_H__ */
index d0a1119..776757d 100644 (file)
@@ -25,7 +25,6 @@
 struct task_struct;
 
 #include <asm/types.h>
-#include <asm/domain.h>
 
 typedef unsigned long mm_segment_t;
 
index 98b1084..1279563 100644 (file)
@@ -34,7 +34,19 @@ typedef struct xpaddr {
 unsigned long __pfn_to_mfn(unsigned long pfn);
 extern struct rb_root phys_to_mach;
 
-static inline unsigned long pfn_to_mfn(unsigned long pfn)
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+       return pfn;
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+       return gfn;
+}
+
+/* Pseudo-physical <-> BUS conversion */
+static inline unsigned long pfn_to_bfn(unsigned long pfn)
 {
        unsigned long mfn;
 
@@ -47,16 +59,16 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
        return pfn;
 }
 
-static inline unsigned long mfn_to_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_pfn(unsigned long bfn)
 {
-       return mfn;
+       return bfn;
 }
 
-#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
+#define bfn_to_local_pfn(bfn)  bfn_to_pfn(bfn)
 
-/* VIRT <-> MACHINE conversion */
-#define virt_to_mfn(v)         (pfn_to_mfn(virt_to_pfn(v)))
-#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v)         (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(m)         (__va(gfn_to_pfn(m) << PAGE_SHIFT))
 
 /* Only used in PV code. But ARM guests are always HVM. */
 static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
@@ -96,7 +108,7 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 bool xen_arch_need_swiotlb(struct device *dev,
                           unsigned long pfn,
-                          unsigned long mfn);
+                          unsigned long bfn);
 unsigned long xen_get_swiotlb_free_pages(unsigned int order);
 
 #endif /* _ASM_ARM_XEN_PAGE_H */
index a3089ba..7a7c4ce 100644 (file)
@@ -226,6 +226,7 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
 
        memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
 
+#ifdef CONFIG_CPU_USE_DOMAINS
        /*
         * Copy the initial value of the domain access control register
         * from the current thread: thread->addr_limit will have been
@@ -233,6 +234,7 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
         * kernel/fork.c
         */
        thread->cpu_domain = get_domain();
+#endif
 
        if (likely(!(p->flags & PF_KTHREAD))) {
                *childregs = *current_pt_regs();
index bc738d2..ce404a5 100644 (file)
@@ -125,6 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_free_stage2_pgd;
 
+       kvm_vgic_early_init(kvm);
        kvm_timer_init(kvm);
 
        /* Mark the initial VMID generation invalid */
@@ -249,6 +250,7 @@ out:
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
+       kvm_vgic_vcpu_early_init(vcpu);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -278,6 +280,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        /* Set up the timer */
        kvm_timer_vcpu_init(vcpu);
 
+       kvm_arm_reset_debug_ptr(vcpu);
+
        return 0;
 }
 
@@ -301,13 +305,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        kvm_arm_set_running_vcpu(NULL);
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                       struct kvm_guest_debug *dbg)
-{
-       return -EINVAL;
-}
-
-
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
@@ -528,10 +525,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                if (vcpu->arch.pause)
                        vcpu_pause(vcpu);
 
-               kvm_vgic_flush_hwstate(vcpu);
+               /*
+                * Disarming the background timer must be done in a
+                * preemptible context, as this call may sleep.
+                */
                kvm_timer_flush_hwstate(vcpu);
 
+               /*
+                * Preparing the interrupts to be injected also
+                * involves poking the GIC, which must be done in a
+                * non-preemptible context.
+                */
                preempt_disable();
+               kvm_vgic_flush_hwstate(vcpu);
+
                local_irq_disable();
 
                /*
@@ -544,12 +551,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
+                       kvm_vgic_sync_hwstate(vcpu);
                        preempt_enable();
                        kvm_timer_sync_hwstate(vcpu);
-                       kvm_vgic_sync_hwstate(vcpu);
                        continue;
                }
 
+               kvm_arm_setup_debug(vcpu);
+
                /**************************************************************
                 * Enter the guest
                 */
@@ -564,6 +573,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Back from guest
                 *************************************************************/
 
+               kvm_arm_clear_debug(vcpu);
+
                /*
                 * We may have taken a host interrupt in HYP mode (ie
                 * while executing the guest). This interrupt is still
@@ -586,11 +597,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 */
                kvm_guest_exit();
                trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-               preempt_enable();
 
+               kvm_vgic_sync_hwstate(vcpu);
+
+               preempt_enable();
 
                kvm_timer_sync_hwstate(vcpu);
-               kvm_vgic_sync_hwstate(vcpu);
 
                ret = handle_exit(vcpu, run, ret);
        }
@@ -921,6 +933,8 @@ static void cpu_init_hyp_mode(void *dummy)
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
        __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+       kvm_arm_init_debug();
 }
 
 static int hyp_init_cpu_notify(struct notifier_block *self,
index d503fbb..96e935b 100644 (file)
@@ -290,3 +290,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       return -EINVAL;
+}
index 568494d..900ef6d 100644 (file)
@@ -361,10 +361,6 @@ hyp_hvc:
        @ Check syndrome register
        mrc     p15, 4, r1, c5, c2, 0   @ HSR
        lsr     r0, r1, #HSR_EC_SHIFT
-#ifdef CONFIG_VFPv3
-       cmp     r0, #HSR_EC_CP_0_13
-       beq     switch_to_guest_vfp
-#endif
        cmp     r0, #HSR_EC_HVC
        bne     guest_trap              @ Not HVC instr.
 
@@ -378,7 +374,10 @@ hyp_hvc:
        cmp     r2, #0
        bne     guest_trap              @ Guest called HVC
 
-host_switch_to_hyp:
+       /*
+        * Getting here means host called HVC, we shift parameters and branch
+        * to Hyp function.
+        */
        pop     {r0, r1, r2}
 
        /* Check for __hyp_get_vectors */
@@ -409,6 +408,10 @@ guest_trap:
 
        @ Check if we need the fault information
        lsr     r1, r1, #HSR_EC_SHIFT
+#ifdef CONFIG_VFPv3
+       cmp     r1, #HSR_EC_CP_0_13
+       beq     switch_to_guest_vfp
+#endif
        cmp     r1, #HSR_EC_IABT
        mrceq   p15, 4, r2, c6, c0, 2   @ HIFAR
        beq     2f
@@ -477,7 +480,6 @@ guest_trap:
  */
 #ifdef CONFIG_VFPv3
 switch_to_guest_vfp:
-       load_vcpu                       @ Load VCPU pointer to r0
        push    {r3-r7}
 
        @ NEON/VFP used.  Turn on VFP access.
index f558c07..eeb8585 100644 (file)
@@ -77,7 +77,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_coprocs(vcpu);
 
        /* Reset arch_timer context */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index 4c4858c..3a10f1a 100644 (file)
@@ -15,6 +15,7 @@ menuconfig ARCH_EXYNOS
        select ARM_AMBA
        select ARM_GIC
        select COMMON_CLK_SAMSUNG
+       select EXYNOS_THERMAL
        select HAVE_ARM_SCU if SMP
        select HAVE_S3C2410_I2C if I2C
        select HAVE_S3C2410_WATCHDOG if WATCHDOG
@@ -24,6 +25,7 @@ menuconfig ARCH_EXYNOS
        select PM_GENERIC_DOMAINS if PM
        select S5P_DEV_MFC
        select SRAM
+       select THERMAL
        select MFD_SYSCON
        help
          Support for SAMSUNG EXYNOS SoCs (EXYNOS4/5)
index 5f8ddcd..1c47aee 100644 (file)
@@ -225,7 +225,11 @@ static void __init exynos_init_irq(void)
 }
 
 static const struct of_device_id exynos_cpufreq_matches[] = {
+       { .compatible = "samsung,exynos3250", .data = "cpufreq-dt" },
        { .compatible = "samsung,exynos4210", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos4212", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos4412", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos5250", .data = "cpufreq-dt" },
        { /* sentinel */ }
 };
 
index bf35abc..e626043 100644 (file)
@@ -676,10 +676,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                    gfp_t gfp, struct dma_attrs *attrs)
 {
        pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
 
        return __dma_alloc(dev, size, handle, gfp, prot, false,
                           attrs, __builtin_return_address(0));
@@ -688,11 +684,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
        dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
-
        return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
                           attrs, __builtin_return_address(0));
 }
@@ -752,9 +743,6 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
        struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
        bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);
 
-       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
-               return;
-
        size = PAGE_ALIGN(size);
 
        if (nommu()) {
index 71df435..39c20af 100644 (file)
@@ -95,9 +95,10 @@ emulate:
        reteq   r4                      @ no, return failure
 
 next:
+       uaccess_enable r3
 .Lx1:  ldrt    r6, [r5], #4            @ get the next instruction and
                                        @ increment PC
-
+       uaccess_disable r3
        and     r2, r6, #0x0F000000     @ test for FP insns
        teq     r2, #0x0C000000
        teqne   r2, #0x0D000000
index c50c8d3..eeeab07 100644 (file)
@@ -49,35 +49,35 @@ static __read_mostly unsigned int xen_events_irq;
 
 static __initdata struct device_node *xen_node;
 
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t *mfn, int nr,
+                              xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid,
                               struct page **pages)
 {
-       return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+       return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
                                         prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 /* Not used by XENFEAT_auto_translated guests. */
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t mfn, int nr,
+                              xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages)
 {
        return -ENOSYS;
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
 
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int nr, struct page **pages)
 {
        return xen_xlate_unmap_gfn_range(vma, nr, pages);
 }
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
 
 static void xen_percpu_init(void)
 {
index f00e080..10fd99c 100644 (file)
@@ -98,8 +98,23 @@ ENTRY(privcmd_call)
        mov r1, r2
        mov r2, r3
        ldr r3, [sp, #8]
+       /*
+        * Privcmd calls are issued by the userspace. We need to allow the
+        * kernel to access the userspace memory before issuing the hypercall.
+        */
+       uaccess_enable r4
+
+       /* r4 is loaded now as we use it as scratch register before */
        ldr r4, [sp, #4]
        __HVC(XEN_IMM)
+
+       /*
+        * Disable userspace access from kernel. This is fine to do it
+        * unconditionally as no set_fs(KERNEL_DS)/set_fs(get_ds()) is
+        * called before.
+        */
+       uaccess_disable r4
+
        ldm sp!, {r4}
        ret lr
 ENDPROC(privcmd_call);
index 03e75fe..6dd911d 100644 (file)
@@ -139,9 +139,9 @@ void __xen_dma_sync_single_for_device(struct device *hwdev,
 
 bool xen_arch_need_swiotlb(struct device *dev,
                           unsigned long pfn,
-                          unsigned long mfn)
+                          unsigned long bfn)
 {
-       return (!hypercall_cflush && (pfn != mfn) && !is_device_dma_coherent(dev));
+       return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev));
 }
 
 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
index 7d95663..07d1811 100644 (file)
@@ -32,6 +32,7 @@ config ARM64
        select GENERIC_CLOCKEVENTS_BROADCAST
        select GENERIC_CPU_AUTOPROBE
        select GENERIC_EARLY_IOREMAP
+       select GENERIC_IDLE_POLL_SETUP
        select GENERIC_IRQ_PROBE
        select GENERIC_IRQ_SHOW
        select GENERIC_IRQ_SHOW_LEVEL
@@ -331,6 +332,22 @@ config ARM64_ERRATUM_845719
 
          If unsure, say Y.
 
+config ARM64_ERRATUM_843419
+       bool "Cortex-A53: 843419: A load or store might access an incorrect address"
+       depends on MODULES
+       default y
+       help
+         This option builds kernel modules using the large memory model in
+         order to avoid the use of the ADRP instruction, which can cause
+         a subsequent memory access to use an incorrect address on Cortex-A53
+         parts up to r0p4.
+
+         Note that the kernel itself must be linked with a version of ld
+         which fixes potentially affected ADRP instructions through the
+         use of veneers.
+
+         If unsure, say Y.
+
 endmenu
 
 
index 15ff5b4..f9914d7 100644 (file)
@@ -41,6 +41,10 @@ endif
 
 CHECKFLAGS     += -D__aarch64__
 
+ifeq ($(CONFIG_ARM64_ERRATUM_843419), y)
+CFLAGS_MODULE  += -mcmodel=large
+endif
+
 # Default value
 head-y         := arch/arm64/kernel/head.o
 
index f0d6d0b..cfdb34b 100644 (file)
@@ -22,8 +22,6 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
@@ -86,28 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
        return (phys_addr_t)dev_addr;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dev_addr);
-       return ops->mapping_error(dev, dev_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -120,50 +96,5 @@ static inline void dma_mark_clean(void *addr, size_t size)
 {
 }
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, h, f)  dma_free_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *vaddr;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
-               return vaddr;
-
-       vaddr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
-       return vaddr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dev_addr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dev_addr);
-       ops->free(dev, size, vaddr, dev_addr, attrs);
-}
-
-/*
- * There is no dma_cache_sync() implementation, so just return NULL here.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *handle, gfp_t flags)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                       void *cpu_addr, dma_addr_t handle)
-{
-}
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_DMA_MAPPING_H */
index 52b484b..4c47cb2 100644 (file)
@@ -16,6 +16,8 @@
 #ifndef __ASM_HW_BREAKPOINT_H
 #define __ASM_HW_BREAKPOINT_H
 
+#include <asm/cputype.h>
+
 #ifdef __KERNEL__
 
 struct arch_hw_breakpoint_ctrl {
@@ -132,5 +134,17 @@ static inline void ptrace_hw_copy_thread(struct task_struct *task)
 
 extern struct pmu perf_ops_bp;
 
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_BREAKPOINT_H */
index ac6fafb..7605e09 100644 (file)
 #define HSTR_EL2_TTEE  (1 << 16)
 #define HSTR_EL2_T(x)  (1 << x)
 
+/* Hyp Coproccessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC (1 << 31)
 #define CPTR_EL2_TTA   (1 << 20)
-#define CPTR_EL2_TFP   (1 << 10)
+#define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA          (1 << 11)
index 3c5fe68..67fa0de 100644 (file)
 #define        CNTKCTL_EL1     20      /* Timer Control Register (EL1) */
 #define        PAR_EL1         21      /* Physical Address Register */
 #define MDSCR_EL1      22      /* Monitor Debug System Control Register */
-#define DBGBCR0_EL1    23      /* Debug Breakpoint Control Registers (0-15) */
-#define DBGBCR15_EL1   38
-#define DBGBVR0_EL1    39      /* Debug Breakpoint Value Registers (0-15) */
-#define DBGBVR15_EL1   54
-#define DBGWCR0_EL1    55      /* Debug Watchpoint Control Registers (0-15) */
-#define DBGWCR15_EL1   70
-#define DBGWVR0_EL1    71      /* Debug Watchpoint Value Registers (0-15) */
-#define DBGWVR15_EL1   86
-#define MDCCINT_EL1    87      /* Monitor Debug Comms Channel Interrupt Enable Reg */
+#define MDCCINT_EL1    23      /* Monitor Debug Comms Channel Interrupt Enable Reg */
 
 /* 32bit specific registers. Keep them at the end of the range */
-#define        DACR32_EL2      88      /* Domain Access Control Register */
-#define        IFSR32_EL2      89      /* Instruction Fault Status Register */
-#define        FPEXC32_EL2     90      /* Floating-Point Exception Control Register */
-#define        DBGVCR32_EL2    91      /* Debug Vector Catch Register */
-#define        TEECR32_EL1     92      /* ThumbEE Configuration Register */
-#define        TEEHBR32_EL1    93      /* ThumbEE Handler Base Register */
-#define        NR_SYS_REGS     94
+#define        DACR32_EL2      24      /* Domain Access Control Register */
+#define        IFSR32_EL2      25      /* Instruction Fault Status Register */
+#define        FPEXC32_EL2     26      /* Floating-Point Exception Control Register */
+#define        DBGVCR32_EL2    27      /* Debug Vector Catch Register */
+#define        TEECR32_EL1     28      /* ThumbEE Configuration Register */
+#define        TEEHBR32_EL1    29      /* ThumbEE Handler Base Register */
+#define        NR_SYS_REGS     30
 
 /* 32bit mapping */
 #define c0_MPIDR       (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
@@ -132,6 +124,8 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 
+extern u32 __kvm_get_mdcr_el2(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index 2709db2..415938d 100644 (file)
@@ -103,15 +103,34 @@ struct kvm_vcpu_arch {
 
        /* HYP configuration */
        u64 hcr_el2;
+       u32 mdcr_el2;
 
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
 
-       /* Debug state */
+       /* Guest debug state */
        u64 debug_flags;
 
+       /*
+        * We maintain more than a single set of debug registers to support
+        * debugging the guest from the host and to maintain separate host and
+        * guest state during world switches. vcpu_debug_state are the debug
+        * registers of the vcpu as the guest sees them.  host_debug_state are
+        * the host registers which are saved and restored during
+        * world switches. external_debug_state contains the debug
+        * values we want to debug the guest. This is set via the
+        * KVM_SET_GUEST_DEBUG ioctl.
+        *
+        * debug_ptr points to the set of debug registers that should be loaded
+        * onto the hardware when running the guest.
+        */
+       struct kvm_guest_debug_arch *debug_ptr;
+       struct kvm_guest_debug_arch vcpu_debug_state;
+       struct kvm_guest_debug_arch external_debug_state;
+
        /* Pointer to host CPU context */
        kvm_cpu_context_t *host_cpu_context;
+       struct kvm_guest_debug_arch host_debug_state;
 
        /* VGIC state */
        struct vgic_cpu vgic_cpu;
@@ -122,6 +141,17 @@ struct kvm_vcpu_arch {
         * here.
         */
 
+       /*
+        * Guest registers we preserve during guest debugging.
+        *
+        * These shadow registers are updated by the kvm_handle_sys_reg
+        * trap handler if the guest accesses or updates them while we
+        * are using guest debug.
+        */
+       struct {
+               u32     mdscr_el1;
+       } guest_debug_preserved;
+
        /* Don't run the guest */
        bool pause;
 
@@ -216,15 +246,15 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
                     hyp_stack_ptr, vector_ptr);
 }
 
-struct vgic_sr_vectors {
-       void    *save_vgic;
-       void    *restore_vgic;
-};
-
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+void kvm_arm_init_debug(void);
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */
index 6900b2d..b0329be 100644 (file)
  * Software defined PTE bits definition.
  */
 #define PTE_VALID              (_AT(pteval_t, 1) << 0)
+#define PTE_WRITE              (PTE_DBM)                /* same as DBM (51) */
 #define PTE_DIRTY              (_AT(pteval_t, 1) << 55)
 #define PTE_SPECIAL            (_AT(pteval_t, 1) << 56)
-#ifdef CONFIG_ARM64_HW_AFDBM
-#define PTE_WRITE              (PTE_DBM)                /* same as DBM */
-#else
-#define PTE_WRITE              (_AT(pteval_t, 1) << 57)
-#endif
 #define PTE_PROT_NONE          (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
 
 /*
@@ -146,7 +142,7 @@ extern struct page *empty_zero_page;
 #define pte_exec(pte)          (!(pte_val(pte) & PTE_UXN))
 
 #ifdef CONFIG_ARM64_HW_AFDBM
-#define pte_hw_dirty(pte)      (!(pte_val(pte) & PTE_RDONLY))
+#define pte_hw_dirty(pte)      (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
 #else
 #define pte_hw_dirty(pte)      (0)
 #endif
@@ -238,7 +234,7 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);
  * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via
  * the page fault mechanism. Checking the dirty status of a pte becomes:
  *
- *   PTE_DIRTY || !PTE_RDONLY
+ *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
  */
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pte)
@@ -503,7 +499,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
                              PTE_PROT_NONE | PTE_WRITE | PTE_TYPE_MASK;
        /* preserve the hardware dirty information */
        if (pte_hw_dirty(pte))
-               newprot |= PTE_DIRTY;
+               pte = pte_mkdirty(pte);
        pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
        return pte;
 }
index d268320..0cd7b59 100644 (file)
@@ -53,14 +53,20 @@ struct kvm_regs {
        struct user_fpsimd_state fp_regs;
 };
 
-/* Supported Processor Types */
+/*
+ * Supported CPU Targets - Adding a new target type is not recommended,
+ * unless there are some special registers not supported by the
+ * genericv8 syreg table.
+ */
 #define KVM_ARM_TARGET_AEM_V8          0
 #define KVM_ARM_TARGET_FOUNDATION_V8   1
 #define KVM_ARM_TARGET_CORTEX_A57      2
 #define KVM_ARM_TARGET_XGENE_POTENZA   3
 #define KVM_ARM_TARGET_CORTEX_A53      4
+/* Generic ARM v8 target */
+#define KVM_ARM_TARGET_GENERIC_V8      5
 
-#define KVM_ARM_NUM_TARGETS            5
+#define KVM_ARM_NUM_TARGETS            6
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT      0
@@ -100,12 +106,39 @@ struct kvm_sregs {
 struct kvm_fpu {
 };
 
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type although
+ * in practice there are usually less (see ID_AA64DFR0_EL1).
+ *
+ * Although the control registers are architecturally defined as 32
+ * bits wide we use a 64 bit structure here to keep parity with
+ * KVM_GET/SET_ONE_REG behaviour which treats all system registers as
+ * 64 bit values. It also allows for the possibility of the
+ * architecture expanding the control registers without having to
+ * change the userspace ABI.
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
 struct kvm_guest_debug_arch {
+       __u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
 };
 
 struct kvm_debug_exit_arch {
+       __u32 hsr;
+       __u64 far;      /* used for watchpoints */
 };
 
+/*
+ * Architecture specific defines for kvm_guest_debug->control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP         (1 << 16)
+#define KVM_GUESTDBG_USE_HW            (1 << 17)
+
 struct kvm_sync_regs {
 };
 
index c99701a..8d89cf8 100644 (file)
@@ -116,17 +116,22 @@ int main(void)
   DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,     offsetof(struct kvm_vcpu, arch.debug_flags));
+  DEFINE(VCPU_DEBUG_PTR,       offsetof(struct kvm_vcpu, arch.debug_ptr));
+  DEFINE(DEBUG_BCR,            offsetof(struct kvm_guest_debug_arch, dbg_bcr));
+  DEFINE(DEBUG_BVR,            offsetof(struct kvm_guest_debug_arch, dbg_bvr));
+  DEFINE(DEBUG_WCR,            offsetof(struct kvm_guest_debug_arch, dbg_wcr));
+  DEFINE(DEBUG_WVR,            offsetof(struct kvm_guest_debug_arch, dbg_wvr));
   DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
+  DEFINE(VCPU_MDCR_EL2,        offsetof(struct kvm_vcpu, arch.mdcr_el2));
   DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HOST_CONTEXT,    offsetof(struct kvm_vcpu, arch.host_cpu_context));
+  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
   DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
   DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
   DEFINE(KVM_TIMER_CNTVOFF,    offsetof(struct kvm, arch.timer.cntvoff));
   DEFINE(KVM_TIMER_ENABLED,    offsetof(struct kvm, arch.timer.enabled));
   DEFINE(VCPU_KVM,             offsetof(struct kvm_vcpu, kvm));
   DEFINE(VCPU_VGIC_CPU,                offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_SAVE_FN,         offsetof(struct vgic_sr_vectors, save_vgic));
-  DEFINE(VGIC_RESTORE_FN,      offsetof(struct vgic_sr_vectors, restore_vgic));
   DEFINE(VGIC_V2_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
   DEFINE(VGIC_V2_CPU_MISR,     offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
index 9b3b62a..cebf786 100644 (file)
@@ -134,7 +134,7 @@ static int os_lock_notify(struct notifier_block *self,
                                    unsigned long action, void *data)
 {
        int cpu = (unsigned long)data;
-       if (action == CPU_ONLINE)
+       if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
                smp_call_function_single(cpu, clear_os_lock, NULL, 1);
        return NOTIFY_OK;
 }
index a055be6..90d09ed 100644 (file)
@@ -523,6 +523,11 @@ CPU_LE(    movk    x0, #0x30d0, lsl #16    )       // Clear EE and E0E on LE systems
        msr     hstr_el2, xzr                   // Disable CP15 traps to EL2
 #endif
 
+       /* EL2 debug */
+       mrs     x0, pmcr_el0                    // Disable debug access traps
+       ubfx    x0, x0, #11, #5                 // to EL2 and allow access to
+       msr     mdcr_el2, x0                    // all PMU counters from EL1
+
        /* Stage-2 translation */
        msr     vttbr_el2, xzr
 
index 003bc3d..bba85c8 100644 (file)
@@ -48,18 +48,6 @@ static DEFINE_PER_CPU(int, stepping_kernel_bp);
 static int core_num_brps;
 static int core_num_wrps;
 
-/* Determine number of BRP registers available. */
-static int get_num_brps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
-}
-
-/* Determine number of WRP registers available. */
-static int get_num_wrps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
-}
-
 int hw_breakpoint_slots(int type)
 {
        /*
@@ -884,7 +872,7 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self,
                                                void *hcpu)
 {
        int cpu = (long)hcpu;
-       if (action == CPU_ONLINE)
+       if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
                smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1);
        return NOTIFY_OK;
 }
index 67bf410..876eb8d 100644 (file)
@@ -332,12 +332,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                        ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21,
                                             AARCH64_INSN_IMM_ADR);
                        break;
+#ifndef CONFIG_ARM64_ERRATUM_843419
                case R_AARCH64_ADR_PREL_PG_HI21_NC:
                        overflow_check = false;
                case R_AARCH64_ADR_PREL_PG_HI21:
                        ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21,
                                             AARCH64_INSN_IMM_ADR);
                        break;
+#endif
                case R_AARCH64_ADD_ABS_LO12_NC:
                case R_AARCH64_LDST8_ABS_LO12_NC:
                        overflow_check = false;
index 948f0ad..71ef6dc 100644 (file)
@@ -212,14 +212,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 
 /*
  * VFP save/restore code.
+ *
+ * We have to be careful with endianness, since the fpsimd context-switch
+ * code operates on 128-bit (Q) register values whereas the compat ABI
+ * uses an array of 64-bit (D) registers. Consequently, we need to swap
+ * the two halves of each Q register when running on a big-endian CPU.
  */
+union __fpsimd_vreg {
+       __uint128_t     raw;
+       struct {
+#ifdef __AARCH64EB__
+               u64     hi;
+               u64     lo;
+#else
+               u64     lo;
+               u64     hi;
+#endif
+       };
+};
+
 static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 {
        struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
        compat_ulong_t magic = VFP_MAGIC;
        compat_ulong_t size = VFP_STORAGE_SIZE;
        compat_ulong_t fpscr, fpexc;
-       int err = 0;
+       int i, err = 0;
 
        /*
         * Save the hardware registers to the fpsimd_state structure.
@@ -235,10 +253,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
        /*
         * Now copy the FP registers. Since the registers are packed,
         * we can copy the prefix we want (V0-V15) as it is.
-        * FIXME: Won't work if big endian.
         */
-       err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs,
-                             sizeof(frame->ufp.fpregs));
+       for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+               union __fpsimd_vreg vreg = {
+                       .raw = fpsimd->vregs[i >> 1],
+               };
+
+               __put_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+               __put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+       }
 
        /* Create an AArch32 fpscr from the fpsr and the fpcr. */
        fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) |
@@ -263,7 +286,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
        compat_ulong_t magic = VFP_MAGIC;
        compat_ulong_t size = VFP_STORAGE_SIZE;
        compat_ulong_t fpscr;
-       int err = 0;
+       int i, err = 0;
 
        __get_user_error(magic, &frame->magic, err);
        __get_user_error(size, &frame->size, err);
@@ -273,12 +296,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
        if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE)
                return -EINVAL;
 
-       /*
-        * Copy the FP registers into the start of the fpsimd_state.
-        * FIXME: Won't work if big endian.
-        */
-       err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs,
-                               sizeof(frame->ufp.fpregs));
+       /* Copy the FP registers into the start of the fpsimd_state. */
+       for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+               union __fpsimd_vreg vreg;
+
+               __get_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+               __get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+               fpsimd.vregs[i >> 1] = vreg.raw;
+       }
 
        /* Extract the fpsr and the fpcr from the fpscr */
        __get_user_error(fpscr, &frame->ufp.fpscr, err);
index f90f4aa..1949fe5 100644 (file)
@@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
new file mode 100644 (file)
index 0000000..47e5f0f
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * Debug and Guest Debug support
+ *
+ * Copyright (C) 2015 - Linaro Ltd
+ * Author: Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hw_breakpoint.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+
+#include "trace.h"
+
+/* These are the bits of MDSCR_EL1 we may manipulate */
+#define MDSCR_EL1_DEBUG_MASK   (DBG_MDSCR_SS | \
+                               DBG_MDSCR_KDE | \
+                               DBG_MDSCR_MDE)
+
+static DEFINE_PER_CPU(u32, mdcr_el2);
+
+/**
+ * save/restore_guest_debug_regs
+ *
+ * For some debug operations we need to tweak some guest registers. As
+ * a result we need to save the state of those registers before we
+ * make those modifications.
+ *
+ * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
+ * after we have restored the preserved value to the main context.
+ */
+static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1);
+
+       trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
+                               vcpu->arch.guest_debug_preserved.mdscr_el1);
+}
+
+static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1;
+
+       trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
+                               vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+/**
+ * kvm_arm_init_debug - grab what we need for debug
+ *
+ * Currently the sole task of this function is to retrieve the initial
+ * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
+ * presumably been set-up by some knowledgeable bootcode.
+ *
+ * It is called once per-cpu during CPU hyp initialisation.
+ */
+
+void kvm_arm_init_debug(void)
+{
+       __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2));
+}
+
+/**
+ * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
+ */
+
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+}
+
+/**
+ * kvm_arm_setup_debug - set up debug related stuff
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * This is called before each entry into the hypervisor to setup any
+ * debug related registers. Currently this just ensures we will trap
+ * access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *
+ * Additionally, KVM only traps guest accesses to the debug registers if
+ * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
+ * flag on vcpu->arch.debug_flags).  Since the guest must not interfere
+ * with the hardware state when debugging the guest, we must ensure that
+ * trapping is enabled whenever we are debugging the guest using the
+ * debug registers.
+ */
+
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+{
+       bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
+
+       trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+
+       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMCR |
+                               MDCR_EL2_TDRA |
+                               MDCR_EL2_TDOSA);
+
+       /* Is Guest debugging in effect? */
+       if (vcpu->guest_debug) {
+               /* Route all software debug exceptions to EL2 */
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+               /* Save guest debug state */
+               save_guest_debug_regs(vcpu);
+
+               /*
+                * Single Step (ARM ARM D2.12.3 The software step state
+                * machine)
+                *
+                * If we are doing Single Step we need to manipulate
+                * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
+                * step has occurred the hypervisor will trap the
+                * debug exception and we return to userspace.
+                *
+                * If the guest attempts to single step its userspace
+                * we would have to deal with a trapped exception
+                * while in the guest kernel. Because this would be
+                * hard to unwind we suppress the guest's ability to
+                * do so by masking MDSCR_EL.SS.
+                *
+                * This confuses guest debuggers which use
+                * single-step behind the scenes but everything
+                * returns to normal once the host is no longer
+                * debugging the system.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+                       *vcpu_cpsr(vcpu) |=  DBG_SPSR_SS;
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS;
+               } else {
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS;
+               }
+
+               trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
+
+               /*
+                * HW Breakpoints and watchpoints
+                *
+                * We simply switch the debug_ptr to point to our new
+                * external_debug_state which has been populated by the
+                * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY
+                * mechanism ensures the registers are updated on the
+                * world switch.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       /* Enable breakpoints/watchpoints */
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE;
+
+                       vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
+                       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+                       trap_debug = true;
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+
+       BUG_ON(!vcpu->guest_debug &&
+               vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
+
+       /* Trap debug register access */
+       if (trap_debug)
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+       trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+{
+       trace_kvm_arm_clear_debug(vcpu->guest_debug);
+
+       if (vcpu->guest_debug) {
+               restore_guest_debug_regs(vcpu);
+
+               /*
+                * If we were using HW debug we need to restore the
+                * debug_ptr to the guest debug state.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       kvm_arm_reset_debug_ptr(vcpu);
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+}
index 9535bd5..d250160 100644 (file)
@@ -32,6 +32,8 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 
+#include "trace.h"
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
@@ -293,7 +295,8 @@ int __attribute_const__ kvm_target_cpu(void)
                break;
        };
 
-       return -EINVAL;
+       /* Return a default generic target */
+       return KVM_ARM_TARGET_GENERIC_V8;
 }
 
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
@@ -331,3 +334,41 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
+                           KVM_GUESTDBG_USE_SW_BP | \
+                           KVM_GUESTDBG_USE_HW | \
+                           KVM_GUESTDBG_SINGLESTEP)
+
+/**
+ * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
+ * @kvm:       pointer to the KVM struct
+ * @kvm_guest_debug: the ioctl data buffer
+ *
+ * This sets up and enables the VM for guest debugging. Userspace
+ * passes in a control flag to enable different debug types and
+ * potentially other architecture specific information in the rest of
+ * the structure.
+ */
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       trace_kvm_set_guest_debug(vcpu, dbg->control);
+
+       if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+               return -EINVAL;
+
+       if (dbg->control & KVM_GUESTDBG_ENABLE) {
+               vcpu->guest_debug = dbg->control;
+
+               /* Hardware assisted Break and Watch points */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       vcpu->arch.external_debug_state = dbg->arch;
+               }
+
+       } else {
+               /* If not enabled clear all flags */
+               vcpu->guest_debug = 0;
+       }
+       return 0;
+}
index 524fa25..68a0759 100644 (file)
@@ -82,6 +82,45 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
+/**
+ * kvm_handle_guest_debug - handle a debug exception instruction
+ *
+ * @vcpu:      the vcpu pointer
+ * @run:       access to the kvm_run structure for results
+ *
+ * We route all debug exceptions through the same handler. If both the
+ * guest and host are using the same debug facilities it will be up to
+ * userspace to re-inject the correct exception for guest delivery.
+ *
+ * @return: 0 (while setting run->exit_reason), -1 for error
+ */
+static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       u32 hsr = kvm_vcpu_get_hsr(vcpu);
+       int ret = 0;
+
+       run->exit_reason = KVM_EXIT_DEBUG;
+       run->debug.arch.hsr = hsr;
+
+       switch (hsr >> ESR_ELx_EC_SHIFT) {
+       case ESR_ELx_EC_WATCHPT_LOW:
+               run->debug.arch.far = vcpu->arch.fault.far_el2;
+               /* fall through */
+       case ESR_ELx_EC_SOFTSTP_LOW:
+       case ESR_ELx_EC_BREAKPT_LOW:
+       case ESR_ELx_EC_BKPT32:
+       case ESR_ELx_EC_BRK64:
+               break;
+       default:
+               kvm_err("%s: un-handled case hsr: %#08x\n",
+                       __func__, (unsigned int) hsr);
+               ret = -1;
+               break;
+       }
+
+       return ret;
+}
+
 static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_WFx]        = kvm_handle_wfx,
        [ESR_ELx_EC_CP15_32]    = kvm_handle_cp15_32,
@@ -96,6 +135,11 @@ static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_SYS64]      = kvm_handle_sys_reg,
        [ESR_ELx_EC_IABT_LOW]   = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_LOW]   = kvm_handle_guest_abort,
+       [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BKPT32]     = kvm_handle_guest_debug,
+       [ESR_ELx_EC_BRK64]      = kvm_handle_guest_debug,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
index 10915aa..37c89ea 100644 (file)
        stp     x24, x25, [x3, #160]
 .endm
 
-.macro save_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgbcr15_el1
-       mrs     x19, dbgbcr14_el1
-       mrs     x18, dbgbcr13_el1
-       mrs     x17, dbgbcr12_el1
-       mrs     x16, dbgbcr11_el1
-       mrs     x15, dbgbcr10_el1
-       mrs     x14, dbgbcr9_el1
-       mrs     x13, dbgbcr8_el1
-       mrs     x12, dbgbcr7_el1
-       mrs     x11, dbgbcr6_el1
-       mrs     x10, dbgbcr5_el1
-       mrs     x9, dbgbcr4_el1
-       mrs     x8, dbgbcr3_el1
-       mrs     x7, dbgbcr2_el1
-       mrs     x6, dbgbcr1_el1
-       mrs     x5, dbgbcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+.macro save_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       mrs     x20, dbgbvr15_el1
-       mrs     x19, dbgbvr14_el1
-       mrs     x18, dbgbvr13_el1
-       mrs     x17, dbgbvr12_el1
-       mrs     x16, dbgbvr11_el1
-       mrs     x15, dbgbvr10_el1
-       mrs     x14, dbgbvr9_el1
-       mrs     x13, dbgbvr8_el1
-       mrs     x12, dbgbvr7_el1
-       mrs     x11, dbgbvr6_el1
-       mrs     x10, dbgbvr5_el1
-       mrs     x9, dbgbvr4_el1
-       mrs     x8, dbgbvr3_el1
-       mrs     x7, dbgbvr2_el1
-       mrs     x6, dbgbvr1_el1
-       mrs     x5, dbgbvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwcr15_el1
-       mrs     x19, dbgwcr14_el1
-       mrs     x18, dbgwcr13_el1
-       mrs     x17, dbgwcr12_el1
-       mrs     x16, dbgwcr11_el1
-       mrs     x15, dbgwcr10_el1
-       mrs     x14, dbgwcr9_el1
-       mrs     x13, dbgwcr8_el1
-       mrs     x12, dbgwcr7_el1
-       mrs     x11, dbgwcr6_el1
-       mrs     x10, dbgwcr5_el1
-       mrs     x9, dbgwcr4_el1
-       mrs     x8, dbgwcr3_el1
-       mrs     x7, dbgwcr2_el1
-       mrs     x6, dbgwcr1_el1
-       mrs     x5, dbgwcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwvr15_el1
-       mrs     x19, dbgwvr14_el1
-       mrs     x18, dbgwvr13_el1
-       mrs     x17, dbgwvr12_el1
-       mrs     x16, dbgwvr11_el1
-       mrs     x15, dbgwvr10_el1
-       mrs     x14, dbgwvr9_el1
-       mrs     x13, dbgwvr8_el1
-       mrs     x12, dbgwvr7_el1
-       mrs     x11, dbgwvr6_el1
-       mrs     x10, dbgwvr5_el1
-       mrs     x9, dbgwvr4_el1
-       mrs     x8, dbgwvr3_el1
-       mrs     x7, dbgwvr2_el1
-       mrs     x6, dbgwvr1_el1
-       mrs     x5, dbgwvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
+       mrs     x21, \type\()15_el1
+       mrs     x20, \type\()14_el1
+       mrs     x19, \type\()13_el1
+       mrs     x18, \type\()12_el1
+       mrs     x17, \type\()11_el1
+       mrs     x16, \type\()10_el1
+       mrs     x15, \type\()9_el1
+       mrs     x14, \type\()8_el1
+       mrs     x13, \type\()7_el1
+       mrs     x12, \type\()6_el1
+       mrs     x11, \type\()5_el1
+       mrs     x10, \type\()4_el1
+       mrs     x9, \type\()3_el1
+       mrs     x8, \type\()2_el1
+       mrs     x7, \type\()1_el1
+       mrs     x6, \type\()0_el1
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       mrs     x21, mdccint_el1
-       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       str     x21, [x4, #(15 * 8)]
+       str     x20, [x4, #(14 * 8)]
+       str     x19, [x4, #(13 * 8)]
+       str     x18, [x4, #(12 * 8)]
+       str     x17, [x4, #(11 * 8)]
+       str     x16, [x4, #(10 * 8)]
+       str     x15, [x4, #(9 * 8)]
+       str     x14, [x4, #(8 * 8)]
+       str     x13, [x4, #(7 * 8)]
+       str     x12, [x4, #(6 * 8)]
+       str     x11, [x4, #(5 * 8)]
+       str     x10, [x4, #(4 * 8)]
+       str     x9, [x4, #(3 * 8)]
+       str     x8, [x4, #(2 * 8)]
+       str     x7, [x4, #(1 * 8)]
+       str     x6, [x4, #(0 * 8)]
 .endm
 
 .macro restore_sysregs
        msr     mdscr_el1,      x25
 .endm
 
-.macro restore_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
+.macro restore_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
 
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       msr     dbgbcr15_el1, x20
-       msr     dbgbcr14_el1, x19
-       msr     dbgbcr13_el1, x18
-       msr     dbgbcr12_el1, x17
-       msr     dbgbcr11_el1, x16
-       msr     dbgbcr10_el1, x15
-       msr     dbgbcr9_el1, x14
-       msr     dbgbcr8_el1, x13
-       msr     dbgbcr7_el1, x12
-       msr     dbgbcr6_el1, x11
-       msr     dbgbcr5_el1, x10
-       msr     dbgbcr4_el1, x9
-       msr     dbgbcr3_el1, x8
-       msr     dbgbcr2_el1, x7
-       msr     dbgbcr1_el1, x6
-       msr     dbgbcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       ldr     x21, [x4, #(15 * 8)]
+       ldr     x20, [x4, #(14 * 8)]
+       ldr     x19, [x4, #(13 * 8)]
+       ldr     x18, [x4, #(12 * 8)]
+       ldr     x17, [x4, #(11 * 8)]
+       ldr     x16, [x4, #(10 * 8)]
+       ldr     x15, [x4, #(9 * 8)]
+       ldr     x14, [x4, #(8 * 8)]
+       ldr     x13, [x4, #(7 * 8)]
+       ldr     x12, [x4, #(6 * 8)]
+       ldr     x11, [x4, #(5 * 8)]
+       ldr     x10, [x4, #(4 * 8)]
+       ldr     x9, [x4, #(3 * 8)]
+       ldr     x8, [x4, #(2 * 8)]
+       ldr     x7, [x4, #(1 * 8)]
+       ldr     x6, [x4, #(0 * 8)]
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       msr     dbgbvr15_el1, x20
-       msr     dbgbvr14_el1, x19
-       msr     dbgbvr13_el1, x18
-       msr     dbgbvr12_el1, x17
-       msr     dbgbvr11_el1, x16
-       msr     dbgbvr10_el1, x15
-       msr     dbgbvr9_el1, x14
-       msr     dbgbvr8_el1, x13
-       msr     dbgbvr7_el1, x12
-       msr     dbgbvr6_el1, x11
-       msr     dbgbvr5_el1, x10
-       msr     dbgbvr4_el1, x9
-       msr     dbgbvr3_el1, x8
-       msr     dbgbvr2_el1, x7
-       msr     dbgbvr1_el1, x6
-       msr     dbgbvr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwcr15_el1, x20
-       msr     dbgwcr14_el1, x19
-       msr     dbgwcr13_el1, x18
-       msr     dbgwcr12_el1, x17
-       msr     dbgwcr11_el1, x16
-       msr     dbgwcr10_el1, x15
-       msr     dbgwcr9_el1, x14
-       msr     dbgwcr8_el1, x13
-       msr     dbgwcr7_el1, x12
-       msr     dbgwcr6_el1, x11
-       msr     dbgwcr5_el1, x10
-       msr     dbgwcr4_el1, x9
-       msr     dbgwcr3_el1, x8
-       msr     dbgwcr2_el1, x7
-       msr     dbgwcr1_el1, x6
-       msr     dbgwcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwvr15_el1, x20
-       msr     dbgwvr14_el1, x19
-       msr     dbgwvr13_el1, x18
-       msr     dbgwvr12_el1, x17
-       msr     dbgwvr11_el1, x16
-       msr     dbgwvr10_el1, x15
-       msr     dbgwvr9_el1, x14
-       msr     dbgwvr8_el1, x13
-       msr     dbgwvr7_el1, x12
-       msr     dbgwvr6_el1, x11
-       msr     dbgwvr5_el1, x10
-       msr     dbgwvr4_el1, x9
-       msr     dbgwvr3_el1, x8
-       msr     dbgwvr2_el1, x7
-       msr     dbgwvr1_el1, x6
-       msr     dbgwvr0_el1, x5
-
-       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-       msr     mdccint_el1, x21
+       msr     \type\()15_el1, x21
+       msr     \type\()14_el1, x20
+       msr     \type\()13_el1, x19
+       msr     \type\()12_el1, x18
+       msr     \type\()11_el1, x17
+       msr     \type\()10_el1, x16
+       msr     \type\()9_el1, x15
+       msr     \type\()8_el1, x14
+       msr     \type\()7_el1, x13
+       msr     \type\()6_el1, x12
+       msr     \type\()5_el1, x11
+       msr     \type\()4_el1, x10
+       msr     \type\()3_el1, x9
+       msr     \type\()2_el1, x8
+       msr     \type\()1_el1, x7
+       msr     \type\()0_el1, x6
 .endm
 
 .macro skip_32bit_state tmp, target
        tbz     \tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm
 
+/*
+ * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
+ */
+.macro skip_fpsimd_state tmp, target
+       mrs     \tmp, cptr_el2
+       tbnz    \tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
        // Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
        // is set, we do a full save/restore cycle and disable trapping.
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        mrs     x4, dacr32_el2
        mrs     x5, ifsr32_el2
-       mrs     x6, fpexc32_el2
        stp     x4, x5, [x3]
-       str     x6, [x3, #16]
 
+       skip_fpsimd_state x8, 3f
+       mrs     x6, fpexc32_el2
+       str     x6, [x3, #16]
+3:
        skip_debug_state x8, 2f
        mrs     x7, dbgvcr32_el2
        str     x7, [x3, #24]
 
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        ldp     x4, x5, [x3]
-       ldr     x6, [x3, #16]
        msr     dacr32_el2, x4
        msr     ifsr32_el2, x5
-       msr     fpexc32_el2, x6
 
        skip_debug_state x8, 2f
        ldr     x7, [x3, #24]
 
 .macro activate_traps
        ldr     x2, [x0, #VCPU_HCR_EL2]
+
+       /*
+        * We are about to set CPTR_EL2.TFP to trap all floating point
+        * register accesses to EL2, however, the ARM ARM clearly states that
+        * traps are only taken to EL2 if the operation would not otherwise
+        * trap to EL1.  Therefore, always make sure that for 32-bit guests,
+        * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+        */
+       tbnz    x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
+       mov     x3, #(1 << 30)
+       msr     fpexc32_el2, x3
+       isb
+99:
        msr     hcr_el2, x2
        mov     x2, #CPTR_EL2_TTA
+       orr     x2, x2, #CPTR_EL2_TFP
        msr     cptr_el2, x2
 
        mov     x2, #(1 << 15)  // Trap CP15 Cr=15
        msr     hstr_el2, x2
 
-       mrs     x2, mdcr_el2
-       and     x2, x2, #MDCR_EL2_HPMN_MASK
-       orr     x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
-       orr     x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
-
-       // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
-       // if not dirty.
-       ldr     x3, [x0, #VCPU_DEBUG_FLAGS]
-       tbnz    x3, #KVM_ARM64_DEBUG_DIRTY_SHIFT, 1f
-       orr     x2, x2,  #MDCR_EL2_TDA
-1:
+       // Monitor Debug Config - see kvm_arm_setup_debug()
+       ldr     x2, [x0, #VCPU_MDCR_EL2]
        msr     mdcr_el2, x2
 .endm
 
 .macro deactivate_traps
        mov     x2, #HCR_RW
        msr     hcr_el2, x2
-       msr     cptr_el2, xzr
        msr     hstr_el2, xzr
 
        mrs     x2, mdcr_el2
@@ -900,21 +622,101 @@ __restore_sysregs:
        restore_sysregs
        ret
 
+/* Save debug state */
 __save_debug:
-       save_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       save_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       save_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       save_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       save_debug dbgwvr
+
+       mrs     x21, mdccint_el1
+       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
        ret
 
+/* Restore debug state */
 __restore_debug:
-       restore_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       restore_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       restore_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       restore_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       restore_debug dbgwvr
+
+       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       msr     mdccint_el1, x21
+
        ret
 
 __save_fpsimd:
+       skip_fpsimd_state x3, 1f
        save_fpsimd
-       ret
+1:     ret
 
 __restore_fpsimd:
+       skip_fpsimd_state x3, 1f
        restore_fpsimd
-       ret
+1:     ret
+
+switch_to_guest_fpsimd:
+       push    x4, lr
+
+       mrs     x2, cptr_el2
+       bic     x2, x2, #CPTR_EL2_TFP
+       msr     cptr_el2, x2
+       isb
+
+       mrs     x0, tpidr_el2
+
+       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
+       kern_hyp_va x2
+       bl __save_fpsimd
+
+       add     x2, x0, #VCPU_CONTEXT
+       bl __restore_fpsimd
+
+       skip_32bit_state x3, 1f
+       ldr     x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
+       msr     fpexc32_el2, x4
+1:
+       pop     x4, lr
+       pop     x2, x3
+       pop     x0, x1
+
+       eret
 
 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -936,10 +738,10 @@ ENTRY(__kvm_vcpu_run)
        kern_hyp_va x2
 
        save_host_regs
-       bl __save_fpsimd
        bl __save_sysregs
 
        compute_debug_state 1f
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __save_debug
 1:
        activate_traps
@@ -952,9 +754,10 @@ ENTRY(__kvm_vcpu_run)
        add     x2, x0, #VCPU_CONTEXT
 
        bl __restore_sysregs
-       bl __restore_fpsimd
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __restore_debug
 1:
        restore_guest_32bit_state
@@ -975,6 +778,8 @@ __kvm_vcpu_return:
        bl __save_sysregs
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __save_debug
 1:
        save_guest_32bit_state
@@ -991,12 +796,15 @@ __kvm_vcpu_return:
 
        bl __restore_sysregs
        bl __restore_fpsimd
+       /* Clear FPSIMD and Trace trapping */
+       msr     cptr_el2, xzr
 
        skip_debug_state x3, 1f
        // Clear the dirty flag for the next run, as all the state has
        // already been saved. Note that we nuke the whole 64bit word.
        // If we ever add more flags, we'll have to be more careful...
        str     xzr, [x0, #VCPU_DEBUG_FLAGS]
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __restore_debug
 1:
        restore_host_regs
@@ -1199,6 +1007,11 @@ el1_trap:
         * x1: ESR
         * x2: ESR_EC
         */
+
+       /* Guest accessed VFP/SIMD registers, save host, restore Guest */
+       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       b.eq    switch_to_guest_fpsimd
+
        cmp     x2, #ESR_ELx_EC_DABT_LOW
        mov     x0, #ESR_ELx_EC_IABT_LOW
        ccmp    x2, x0, #4, ne
@@ -1293,4 +1106,10 @@ ENTRY(__kvm_hyp_vector)
        ventry  el1_error_invalid               // Error 32-bit EL1
 ENDPROC(__kvm_hyp_vector)
 
+
+ENTRY(__kvm_get_mdcr_el2)
+       mrs     x0, mdcr_el2
+       ret
+ENDPROC(__kvm_get_mdcr_el2)
+
        .popsection
index 0b43265..91cf535 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/hw_breakpoint.h>
 
 #include <kvm/arm_arch_timer.h>
 
@@ -56,6 +57,12 @@ static bool cpu_has_32bit_el1(void)
        return !!(pfr0 & 0x20);
 }
 
+/**
+ * kvm_arch_dev_ioctl_check_extension
+ *
+ * We currently assume that the number of HW registers is uniform
+ * across all CPUs (see cpuinfo_sanity_check).
+ */
 int kvm_arch_dev_ioctl_check_extension(long ext)
 {
        int r;
@@ -64,6 +71,15 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
        case KVM_CAP_ARM_EL1_32BIT:
                r = cpu_has_32bit_el1();
                break;
+       case KVM_CAP_GUEST_DEBUG_HW_BPS:
+               r = get_num_brps();
+               break;
+       case KVM_CAP_GUEST_DEBUG_HW_WPS:
+               r = get_num_wrps();
+               break;
+       case KVM_CAP_SET_GUEST_DEBUG:
+               r = 1;
+               break;
        default:
                r = 0;
        }
@@ -105,7 +121,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_sys_regs(vcpu);
 
        /* Reset timer */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index c370b40..b41607d 100644 (file)
@@ -38,6 +38,8 @@
 
 #include "sys_regs.h"
 
+#include "trace.h"
+
 /*
  * All of this file is extremly similar to the ARM coproc.c, but the
  * types are different. My gut feeling is that it should be pretty
@@ -208,9 +210,217 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
                *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg);
        }
 
+       trace_trap_reg(__func__, r->reg, p->is_write, *vcpu_reg(vcpu, p->Rt));
+
+       return true;
+}
+
+/*
+ * reg_to_dbg/dbg_to_reg
+ *
+ * A 32 bit write to a debug register leave top bits alone
+ * A 32 bit read from a debug register only returns the bottom bits
+ *
+ * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the
+ * hyp.S code switches between host and guest values in future.
+ */
+static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *vcpu_reg(vcpu, p->Rt);
+
+       if (p->is_32bit) {
+               val &= 0xffffffffUL;
+               val |= ((*dbg_reg >> 32) << 32);
+       }
+
+       *dbg_reg = val;
+       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+}
+
+static inline void dbg_to_reg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *dbg_reg;
+
+       if (p->is_32bit)
+               val &= 0xffffffffUL;
+
+       *vcpu_reg(vcpu, p->Rt) = val;
+}
+
+static inline bool trap_bvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_bcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+
+       return 0;
+}
+
+static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write,
+               vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
+
        return true;
 }
 
+static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
+}
+
 static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
        u64 amair;
@@ -240,16 +450,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 #define DBG_BCR_BVR_WCR_WVR_EL1(n)                                     \
        /* DBGBVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100),     \
-         trap_debug_regs, reset_val, (DBGBVR0_EL1 + (n)), 0 },         \
+         trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr },                \
        /* DBGBCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101),     \
-         trap_debug_regs, reset_val, (DBGBCR0_EL1 + (n)), 0 },         \
+         trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr },                \
        /* DBGWVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110),     \
-         trap_debug_regs, reset_val, (DBGWVR0_EL1 + (n)), 0 },         \
+         trap_wvr, reset_wvr, n, 0,  get_wvr, set_wvr },               \
        /* DBGWCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111),     \
-         trap_debug_regs, reset_val, (DBGWCR0_EL1 + (n)), 0 }
+         trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
 /*
  * Architected system registers.
@@ -516,28 +726,57 @@ static bool trap_debug32(struct kvm_vcpu *vcpu,
        return true;
 }
 
-#define DBG_BCR_BVR_WCR_WVR(n)                                 \
-       /* DBGBVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_debug32,    \
-         NULL, (cp14_DBGBVR0 + (n) * 2) },                     \
-       /* DBGBCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_debug32,    \
-         NULL, (cp14_DBGBCR0 + (n) * 2) },                     \
-       /* DBGWVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_debug32,    \
-         NULL, (cp14_DBGWVR0 + (n) * 2) },                     \
-       /* DBGWCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_debug32,    \
-         NULL, (cp14_DBGWCR0 + (n) * 2) }
-
-#define DBGBXVR(n)                                             \
-       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_debug32,    \
-         NULL, cp14_DBGBXVR0 + n * 2 }
+/* AArch32 debug register mappings
+ *
+ * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
+ * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
+ *
+ * All control registers and watchpoint value registers are mapped to
+ * the lower 32 bits of their AArch64 equivalents. We share the trap
+ * handlers with the above AArch64 code which checks what mode the
+ * system is in.
+ */
+
+static inline bool trap_xvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write) {
+               u64 val = *dbg_reg;
+
+               val &= 0xffffffffUL;
+               val |= *vcpu_reg(vcpu, p->Rt) << 32;
+               *dbg_reg = val;
+
+               vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+       } else {
+               *vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32;
+       }
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+#define DBG_BCR_BVR_WCR_WVR(n)                                         \
+       /* DBGBVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n },     \
+       /* DBGBCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n },     \
+       /* DBGWVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n },     \
+       /* DBGWCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
+
+#define DBGBXVR(n)                                                     \
+       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n }
 
 /*
  * Trapped cp14 registers. We generally ignore most of the external
  * debug, on the principle that they don't really make sense to a
- * guest. Revisit this one day, whould this principle change.
+ * guest. Revisit this one day, would this principle change.
  */
 static const struct sys_reg_desc cp14_regs[] = {
        /* DBGIDR */
@@ -999,6 +1238,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+       trace_kvm_handle_sys_reg(esr);
+
        params.is_aarch32 = false;
        params.is_32bit = false;
        params.Op0 = (esr >> 20) & 3;
@@ -1303,6 +1544,9 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return get_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->get_user)
+               return (r->get_user)(vcpu, r, reg, uaddr);
+
        return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
 }
 
@@ -1321,6 +1565,9 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return set_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->set_user)
+               return (r->set_user)(vcpu, r, reg, uaddr);
+
        return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
 }
 
index d411e25..eaa324e 100644 (file)
@@ -55,6 +55,12 @@ struct sys_reg_desc {
 
        /* Value (usually reset value) */
        u64 val;
+
+       /* Custom get/set_user functions, fallback to generic if NULL */
+       int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
+       int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
 };
 
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)
index 475fd29..1e45768 100644 (file)
@@ -94,6 +94,8 @@ static int __init sys_reg_genericv8_init(void)
                                          &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
                                          &genericv8_target_table);
+       kvm_register_target_sys_reg_table(KVM_ARM_TARGET_GENERIC_V8,
+                                         &genericv8_target_table);
 
        return 0;
 }
index 157416e..7fb0008 100644 (file)
@@ -44,6 +44,129 @@ TRACE_EVENT(kvm_hvc_arm64,
                  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
+TRACE_EVENT(kvm_arm_setup_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+       TP_PROTO(__u32 guest_debug),
+       TP_ARGS(guest_debug),
+
+       TP_STRUCT__entry(
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+       TP_PROTO(const char *name, __u32 value),
+       TP_ARGS(name, value),
+
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(__u32, value)
+       ),
+
+       TP_fast_assign(
+               __entry->name = name;
+               __entry->value = value;
+       ),
+
+       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_EVENT(kvm_arm_set_regset,
+       TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+       TP_ARGS(type, len, control, value),
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(int, len)
+               __array(u64, ctrls, 16)
+               __array(u64, values, 16)
+       ),
+       TP_fast_assign(
+               __entry->name = type;
+               __entry->len = len;
+               memcpy(__entry->ctrls, control, len << 3);
+               memcpy(__entry->values, value, len << 3);
+       ),
+       TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+               __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+               __print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+       TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+       TP_ARGS(fn, reg, is_write, write_value),
+
+       TP_STRUCT__entry(
+               __field(const char *, fn)
+               __field(int, reg)
+               __field(bool, is_write)
+               __field(u64, write_value)
+       ),
+
+       TP_fast_assign(
+               __entry->fn = fn;
+               __entry->reg = reg;
+               __entry->is_write = is_write;
+               __entry->write_value = write_value;
+       ),
+
+       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+       TP_PROTO(unsigned long hsr),
+       TP_ARGS(hsr),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  hsr)
+       ),
+
+       TP_fast_assign(
+               __entry->hsr = hsr;
+       ),
+
+       TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+
 #endif /* _TRACE_ARM64_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index 0bcc4bc..99224dc 100644 (file)
@@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
-       if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) {
+       if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
                struct page *page;
                void *addr;
 
index 0314e32..8da5653 100644 (file)
@@ -36,6 +36,17 @@ config FORCE_MAX_ZONEORDER
        int
        default 6
 
+config TRACE_IRQFLAGS_SUPPORT
+       depends on ETRAX_ARCH_V32
+       def_bool y
+
+config STACKTRACE_SUPPORT
+       def_bool y
+
+config LOCKDEP_SUPPORT
+       depends on ETRAX_ARCH_V32
+       def_bool y
+
 config CRIS
        bool
        default y
@@ -58,6 +69,7 @@ config CRIS
        select CLKSRC_MMIO if ETRAX_ARCH_V32
        select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
        select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+       select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
 
 config HZ
        int
index 81570fc..b562252 100644 (file)
@@ -955,6 +955,14 @@ sys_call_table:
        .long sys_process_vm_writev
        .long sys_kcmp                  /* 350 */
        .long sys_finit_module
+       .long sys_sched_setattr
+       .long sys_sched_getattr
+       .long sys_renameat2
+       .long sys_seccomp               /* 355 */
+       .long sys_getrandom
+       .long sys_memfd_create
+       .long sys_bpf
+       .long sys_execveat
 
         /*
          * NOTE!! This doesn't have to be exact - we just have
diff --git a/arch/cris/arch-v10/lib/dmacopy.c b/arch/cris/arch-v10/lib/dmacopy.c
deleted file mode 100644 (file)
index 49f5b8c..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * memcpy for large blocks, using memory-memory DMA channels 6 and 7 in Etrax
- */
-
-#include <asm/svinto.h>
-#include <asm/io.h>
-
-#define D(x)
-
-void *dma_memcpy(void *pdst,
-                const void *psrc,
-                unsigned int pn)
-{
-       static etrax_dma_descr indma, outdma;
-
-       D(printk(KERN_DEBUG "dma_memcpy %d bytes... ", pn));
-
-#if 0
-       *R_GEN_CONFIG = genconfig_shadow =
-               (genconfig_shadow & ~0x3c0000) |
-               IO_STATE(R_GEN_CONFIG, dma6, intdma7) |
-               IO_STATE(R_GEN_CONFIG, dma7, intdma6);
-#endif
-       indma.sw_len = outdma.sw_len = pn;
-       indma.ctrl = d_eol | d_eop;
-       outdma.ctrl = d_eol;
-       indma.buf = psrc;
-       outdma.buf = pdst;
-
-       *R_DMA_CH6_FIRST = &indma;
-       *R_DMA_CH7_FIRST = &outdma;
-       *R_DMA_CH6_CMD = IO_STATE(R_DMA_CH6_CMD, cmd, start);
-       *R_DMA_CH7_CMD = IO_STATE(R_DMA_CH7_CMD, cmd, start);
-
-       while (*R_DMA_CH7_CMD == 1)
-               /* wait for completion */;
-
-       D(printk(KERN_DEBUG "done\n"));
-}
-
-
-
diff --git a/arch/cris/arch-v10/lib/old_checksum.c b/arch/cris/arch-v10/lib/old_checksum.c
deleted file mode 100644 (file)
index 8f79163..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * INET                An implementation of the TCP/IP protocol suite for the LINUX
- *             operating system.  INET is implemented using the  BSD Socket
- *             interface as the means of communication with the user level.
- *
- *             IP/TCP/UDP checksumming routines
- *
- * Authors:    Jorge Cwik, <jorge@laser.satlink.net>
- *             Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- *             Tom May, <ftom@netcom.com>
- *             Lots of code moved from tcp.c and ip.c; see those files
- *             for more names.
- *
- *             This program is free software; you can redistribute it and/or
- *             modify it under the terms of the GNU General Public License
- *             as published by the Free Software Foundation; either version
- *             2 of the License, or (at your option) any later version.
- */
-
-#include <net/checksum.h>
-#include <net/module.h>
-
-#undef PROFILE_CHECKSUM
-
-#ifdef PROFILE_CHECKSUM
-/* these are just for profiling the checksum code with an oscillioscope.. uh */
-#if 0
-#define BITOFF *((unsigned char *)0xb0000030) = 0xff
-#define BITON *((unsigned char *)0xb0000030) = 0x0
-#endif
-#include <asm/io.h>
-#define CBITON LED_ACTIVE_SET(1)
-#define CBITOFF LED_ACTIVE_SET(0)
-#define BITOFF
-#define BITON
-#else
-#define BITOFF
-#define BITON
-#define CBITOFF
-#define CBITON
-#endif
-
-/*
- * computes a partial checksum, e.g. for TCP/UDP fragments
- */
-
-#include <asm/delay.h>
-
-__wsum csum_partial(const void *p, int len, __wsum __sum)
-{
-       u32 sum = (__force u32)__sum;
-       const u16 *buff = p;
-       /*
-       * Experiments with ethernet and slip connections show that buff
-       * is aligned on either a 2-byte or 4-byte boundary.
-       */
-       const void *endMarker = p + len;
-       const void *marker = endMarker - (len % 16);
-#if 0
-       if((int)buff & 0x3)
-               printk("unaligned buff %p\n", buff);
-       __delay(900); /* extra delay of 90 us to test performance hit */
-#endif
-       BITON;
-       while (buff < marker) {
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-       }
-       marker = endMarker - (len % 2);
-       while (buff < marker)
-               sum += *buff++;
-
-       if (endMarker > buff)
-               sum += *(const u8 *)buff;       /* add extra byte separately */
-
-       BITOFF;
-       return (__force __wsum)sum;
-}
-
-EXPORT_SYMBOL(csum_partial);
index 4fc16b4..e6c523c 100644 (file)
@@ -202,7 +202,7 @@ config ETRAX_PA_CHANGEABLE_DIR
        default "0x00" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (8 bits) with information of what bits in PA that a
+         This is a bitmask with information of what bits in PA that a
          user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -213,7 +213,7 @@ config ETRAX_PA_CHANGEABLE_BITS
        default "0x00" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (8 bits) with information of what bits in PA
+         This is a bitmask with information of what bits in PA
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -223,7 +223,7 @@ config ETRAX_PB_CHANGEABLE_DIR
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PB
+         This is a bitmask with information of what bits in PB
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -234,7 +234,7 @@ config ETRAX_PB_CHANGEABLE_BITS
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PB
+         This is a bitmask with information of what bits in PB
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -244,7 +244,7 @@ config ETRAX_PC_CHANGEABLE_DIR
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PC
+         This is a bitmask with information of what bits in PC
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -253,9 +253,9 @@ config ETRAX_PC_CHANGEABLE_BITS
        hex "PC user changeable bits mask"
        depends on ETRAX_GPIO
        default "0x00000" if ETRAXFS
-       default "0x00000000" if ETRAXFS
+       default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PC
+         This is a bitmask with information of what bits in PC
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -264,7 +264,7 @@ config ETRAX_PD_CHANGEABLE_DIR
        depends on ETRAX_GPIO && ETRAXFS
        default "0x00000"
        help
-         This is a bitmask (18 bits) with information of what bits in PD
+         This is a bitmask with information of what bits in PD
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0x00000 here, but it depends on your hardware.
index 28dd771..5387424 100644 (file)
@@ -313,6 +313,7 @@ static int __init init_axis_flash(void)
        size_t len;
        int ram_rootfs_partition = -1; /* -1 => no RAM rootfs partition */
        int part;
+       struct mtd_partition *partition;
 
        /* We need a root fs. If it resides in RAM, we need to use an
         * MTDRAM device, so it must be enabled in the kernel config,
@@ -329,7 +330,7 @@ static int __init init_axis_flash(void)
 
        main_mtd = flash_probe();
        if (main_mtd)
-               printk(KERN_INFO "%s: 0x%08x bytes of NOR flash memory.\n",
+               printk(KERN_INFO "%s: 0x%08llx bytes of NOR flash memory.\n",
                       main_mtd->name, main_mtd->size);
 
 #ifdef CONFIG_ETRAX_NANDFLASH
@@ -388,10 +389,10 @@ static int __init init_axis_flash(void)
 #endif
 
        if (main_mtd) {
+               loff_t ptable_sector = CONFIG_ETRAX_PTABLE_SECTOR;
                main_mtd->owner = THIS_MODULE;
                axisflash_mtd = main_mtd;
 
-               loff_t ptable_sector = CONFIG_ETRAX_PTABLE_SECTOR;
 
                /* First partition (rescue) is always set to the default. */
                pidx++;
@@ -517,7 +518,7 @@ static int __init init_axis_flash(void)
        /* Decide whether to use default partition table. */
        /* Only use default table if we actually have a device (main_mtd) */
 
-       struct mtd_partition *partition = &axis_partitions[0];
+       partition = &axis_partitions[0];
        if (main_mtd && !ptable_ok) {
                memcpy(axis_partitions, axis_default_partitions,
                       sizeof(axis_default_partitions));
@@ -580,7 +581,7 @@ static int __init init_axis_flash(void)
                        printk(KERN_INFO "axisflashmap: Adding RAM partition "
                               "for rootfs image.\n");
                        err = mtdram_init_device(mtd_ram,
-                                                (void *)partition[part].offset,
+                                                (void *)(u_int32_t)partition[part].offset,
                                                 partition[part].size,
                                                 partition[part].name);
                        if (err)
index 74f9fe8..c92e1da 100644 (file)
@@ -957,7 +957,7 @@ static void __init virtual_gpio_init(void)
 
 static int __init gpio_init(void)
 {
-       int res;
+       int res, res2;
 
        printk(KERN_INFO "ETRAX FS GPIO driver v2.7, (c) 2003-2008 "
                "Axis Communications AB\n");
@@ -977,7 +977,7 @@ static int __init gpio_init(void)
        CRIS_LED_DISK_READ(0);
        CRIS_LED_DISK_WRITE(0);
 
-       int res2 = request_irq(GIO_INTR_VECT, gpio_interrupt,
+       res2 = request_irq(GIO_INTR_VECT, gpio_interrupt,
                IRQF_SHARED, "gpio", &alarmlist);
        if (res2) {
                printk(KERN_ERR "err: irq for gpio\n");
index 009f4ee..72968fb 100644 (file)
@@ -425,12 +425,11 @@ gpio_open(struct inode *inode, struct file *filp)
        if (p > GPIO_MINOR_LAST)
                return -EINVAL;
 
-       priv = kmalloc(sizeof(struct gpio_private), GFP_KERNEL);
+       priv = kzalloc(sizeof(struct gpio_private), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;
 
        mutex_lock(&gpio_mutex);
-       memset(priv, 0, sizeof(*priv));
 
        priv->minor = p;
 
index 026a0b2..b17a209 100644 (file)
@@ -240,6 +240,17 @@ ret_from_sys_call:
 
        .type   _Rexit,@function
 _Rexit:
+#if defined(CONFIG_TRACE_IRQFLAGS)
+       addoq   +PT_ccs, $sp, $acr
+       move.d  [$acr], $r0
+       btstq   15, $r0         ; I1
+       bpl     1f
+       nop
+       jsr     trace_hardirqs_on
+       nop
+1:
+#endif
+
        ;; This epilogue MUST match the prologues in multiple_interrupt, irq.h
        ;; and ptregs.h.
        addq    4, $sp          ; Skip orig_r10.
@@ -875,6 +886,14 @@ sys_call_table:
        .long sys_process_vm_writev
        .long sys_kcmp                  /* 350 */
        .long sys_finit_module
+       .long sys_sched_setattr
+       .long sys_sched_getattr
+       .long sys_renameat2
+       .long sys_seccomp               /* 355 */
+       .long sys_getrandom
+       .long sys_memfd_create
+       .long sys_bpf
+       .long sys_execveat
 
        /*
         * NOTE!! This doesn't have to be exact - we just have
index cebd32e..c7ce784 100644 (file)
@@ -23,9 +23,9 @@ extern void stop_watchdog(void);
 /* We use this if we don't have any better idle routine. */
 void default_idle(void)
 {
+       local_irq_enable();
        /* Halt until exception. */
-       __asm__ volatile("ei    \n\t"
-                        "halt      ");
+       __asm__ volatile("halt");
 }
 
 /*
index 3a36ae6..150d1d7 100644 (file)
@@ -19,7 +19,6 @@
 #include <asm/processor.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
-#include <arch/ptrace.h>
 #include <arch/hwregs/cpu_vect.h>
 
 extern unsigned long cris_signal_return_page;
index 05a0470..d8a3a3c 100644 (file)
@@ -46,6 +46,8 @@ static int __crisv32_pinmux_alloc(int port, int first_pin, int last_pin,
                pins[port][i] = mode;
 
        crisv32_pinmux_set(port);
+
+       return 0;
 }
 
 static int crisv32_pinmux_init(void)
@@ -93,6 +95,7 @@ int crisv32_pinmux_alloc_fixed(enum fixed_function function)
        int ret = -EINVAL;
        char saved[sizeof pins];
        unsigned long flags;
+       reg_pinmux_rw_hwprot hwprot;
 
        spin_lock_irqsave(&pinmux_lock, flags);
 
@@ -101,7 +104,7 @@ int crisv32_pinmux_alloc_fixed(enum fixed_function function)
 
        crisv32_pinmux_init();  /* Must be done before we read rw_hwprot */
 
-       reg_pinmux_rw_hwprot hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
+       hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
 
        switch (function) {
        case pinmux_ser1:
@@ -227,6 +230,7 @@ int crisv32_pinmux_dealloc_fixed(enum fixed_function function)
        int ret = -EINVAL;
        char saved[sizeof pins];
        unsigned long flags;
+       reg_pinmux_rw_hwprot hwprot;
 
        spin_lock_irqsave(&pinmux_lock, flags);
 
@@ -235,7 +239,7 @@ int crisv32_pinmux_dealloc_fixed(enum fixed_function function)
 
        crisv32_pinmux_init();  /* Must be done before we read rw_hwprot */
 
-       reg_pinmux_rw_hwprot hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
+       hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
 
        switch (function) {
        case pinmux_ser1:
index 71854d4..70e497e 100644 (file)
@@ -12,10 +12,6 @@ CONFIG_ETRAX_FAST_TIMER=y
 CONFIG_CRIS_MACH_ARTPEC3=y
 CONFIG_ETRAX_DRAM_SIZE=32
 CONFIG_ETRAX_FLASH1_SIZE=4
-CONFIG_ETRAX_DEF_GIO_PA_OE=1c
-CONFIG_ETRAX_DEF_GIO_PA_OUT=00
-CONFIG_ETRAX_DEF_GIO_PB_OE=00000
-CONFIG_ETRAX_DEF_GIO_PB_OUT=00000
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -42,3 +38,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
+CONFIG_ETRAX_GPIO=y
index 87c7227..9123268 100644 (file)
@@ -38,3 +38,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
+CONFIG_ETRAX_GPIO=y
diff --git a/arch/cris/include/arch-v10/arch/elf.h b/arch/cris/include/arch-v10/arch/elf.h
deleted file mode 100644 (file)
index 1eb638a..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef __ASMCRIS_ARCH_ELF_H
-#define __ASMCRIS_ARCH_ELF_H
-
-#include <arch/system.h>
-
-#define ELF_MACH EF_CRIS_VARIANT_ANY_V0_V10
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x)                      \
- ((x)->e_machine == EM_CRIS                    \
-  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_ANY_V0_V10     \
-      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/ptrace.h>
-
-/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program
-   starts (a register; assume first param register for CRIS)
-   contains a pointer to a function which might be
-   registered using `atexit'.  This provides a mean for the
-   dynamic linker to call DT_FINI functions for shared libraries
-   that have been loaded before the code runs.
-
-   A value of 0 tells we have no such handler.  */
-
-/* Explicitly set registers to 0 to increase determinism.  */
-#define ELF_PLAT_INIT(_r, load_addr)   do { \
-       (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
-       (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
-       (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
-       (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
-} while (0)
-
-/* The additional layer below is because the stack pointer is missing in 
-   the pt_regs struct, but needed in a core dump. pr_reg is a elf_gregset_t,
-   and should be filled in according to the layout of the user_regs_struct
-   struct; regs is a pt_regs struct. We dump all registers, though several are
-   obviously unnecessary. That way there's less need for intelligence at 
-   the receiving end (i.e. gdb). */
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
-       pr_reg[0] = regs->r0;                              \
-       pr_reg[1] = regs->r1;                              \
-       pr_reg[2] = regs->r2;                              \
-       pr_reg[3] = regs->r3;                              \
-       pr_reg[4] = regs->r4;                              \
-       pr_reg[5] = regs->r5;                              \
-       pr_reg[6] = regs->r6;                              \
-       pr_reg[7] = regs->r7;                              \
-       pr_reg[8] = regs->r8;                              \
-       pr_reg[9] = regs->r9;                              \
-       pr_reg[10] = regs->r10;                            \
-       pr_reg[11] = regs->r11;                            \
-       pr_reg[12] = regs->r12;                            \
-       pr_reg[13] = regs->r13;                            \
-       pr_reg[14] = rdusp();               /* sp */       \
-       pr_reg[15] = regs->irp;             /* pc */       \
-       pr_reg[16] = 0;                     /* p0 */       \
-       pr_reg[17] = rdvr();                /* vr */       \
-       pr_reg[18] = 0;                     /* p2 */       \
-       pr_reg[19] = 0;                     /* p3 */       \
-       pr_reg[20] = 0;                     /* p4 */       \
-       pr_reg[21] = (regs->dccr & 0xffff); /* ccr */      \
-       pr_reg[22] = 0;                     /* p6 */       \
-       pr_reg[23] = regs->mof;             /* mof */      \
-       pr_reg[24] = 0;                     /* p8 */       \
-       pr_reg[25] = 0;                     /* ibr */      \
-       pr_reg[26] = 0;                     /* irp */      \
-       pr_reg[27] = regs->srp;             /* srp */      \
-       pr_reg[28] = 0;                     /* bar */      \
-       pr_reg[29] = regs->dccr;            /* dccr */     \
-       pr_reg[30] = 0;                     /* brp */      \
-       pr_reg[31] = rdusp();               /* usp */      \
-       pr_reg[32] = 0;                     /* csrinstr */ \
-       pr_reg[33] = 0;                     /* csraddr */  \
-       pr_reg[34] = 0;                     /* csrdata */
-
-
-#endif
diff --git a/arch/cris/include/arch-v10/arch/ptrace.h b/arch/cris/include/arch-v10/arch/ptrace.h
deleted file mode 100644 (file)
index 1a23273..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef _CRIS_ARCH_PTRACE_H
-#define _CRIS_ARCH_PTRACE_H
-
-/* Frame types */
-
-#define CRIS_FRAME_NORMAL   0 /* normal frame without SBFS stacking */
-#define CRIS_FRAME_BUSFAULT 1 /* frame stacked using SBFS, need RBF return
-                                path */
-
-/* Register numbers in the ptrace system call interface */
-
-#define PT_FRAMETYPE 0
-#define PT_ORIG_R10  1
-#define PT_R13       2
-#define PT_R12       3
-#define PT_R11       4
-#define PT_R10       5
-#define PT_R9        6
-#define PT_R8        7
-#define PT_R7        8
-#define PT_R6        9
-#define PT_R5        10
-#define PT_R4        11
-#define PT_R3        12
-#define PT_R2        13
-#define PT_R1        14
-#define PT_R0        15
-#define PT_MOF       16
-#define PT_DCCR      17
-#define PT_SRP       18
-#define PT_IRP       19    /* This is actually the debugged process' PC */
-#define PT_CSRINSTR  20    /* CPU Status record remnants -
-                             valid if frametype == busfault */
-#define PT_CSRADDR   21
-#define PT_CSRDATA   22
-#define PT_USP       23    /* special case - USP is not in the pt_regs */
-#define PT_MAX       23
-
-/* Condition code bit numbers.  The same numbers apply to CCR of course,
-   but we use DCCR everywhere else, so let's try and be consistent.  */
-#define C_DCCR_BITNR 0
-#define V_DCCR_BITNR 1
-#define Z_DCCR_BITNR 2
-#define N_DCCR_BITNR 3
-#define X_DCCR_BITNR 4
-#define I_DCCR_BITNR 5
-#define B_DCCR_BITNR 6
-#define M_DCCR_BITNR 7
-#define U_DCCR_BITNR 8
-#define P_DCCR_BITNR 9
-#define F_DCCR_BITNR 10
-
-/* pt_regs not only specifices the format in the user-struct during
- * ptrace but is also the frame format used in the kernel prologue/epilogues 
- * themselves
- */
-
-struct pt_regs {
-       unsigned long frametype;  /* type of stackframe */
-       unsigned long orig_r10;
-       /* pushed by movem r13, [sp] in SAVE_ALL, movem pushes backwards */
-       unsigned long r13;
-       unsigned long r12;
-       unsigned long r11;
-       unsigned long r10;
-       unsigned long r9;
-       unsigned long r8;
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-       unsigned long mof;
-       unsigned long dccr;
-       unsigned long srp;
-       unsigned long irp; /* This is actually the debugged process' PC */
-       unsigned long csrinstr;
-       unsigned long csraddr;
-       unsigned long csrdata;
-};
-
-/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
- * when doing a context-switch. it is used (apart from in resume) when a new
- * thread is made and we need to make _resume (which is starting it for the
- * first time) realise what is going on.
- *
- * Actually, the use is very close to the thread struct (TSS) in that both the
- * switch_stack and the TSS are used to keep thread stuff when switching in
- * _resume.
- */
-
-struct switch_stack {
-       unsigned long r9;
-       unsigned long r8;
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-       unsigned long return_ip; /* ip that _resume will return to */
-};
-
-#ifdef __KERNEL__
-
-/* bit 8 is user-mode flag */
-#define user_mode(regs) (((regs)->dccr & 0x100) != 0)
-#define instruction_pointer(regs) ((regs)->irp)
-#define profile_pc(regs) instruction_pointer(regs)
-
-#endif  /*  __KERNEL__  */
-
-#endif
index 0f211e1..fb59faa 100644 (file)
@@ -10,6 +10,7 @@
  * All other stuff is done out-of-band with exception handlers.
  */
 #define BUG()                                                          \
+do {                                                                   \
        __asm__ __volatile__ ("0: break 14\n\t"                         \
                              ".section .fixup,\"ax\"\n"                \
                              "1:\n\t"                                  \
                              ".section __ex_table,\"a\"\n\t"           \
                              ".dword 0b, 1b\n\t"                       \
                              ".previous\n\t"                           \
-                             : : "ri" (__FILE__), "i" (__LINE__))
+                             : : "ri" (__FILE__), "i" (__LINE__));     \
+       unreachable();                          \
+} while (0)
 #else
-#define BUG() __asm__ __volatile__ ("break 14\n\t")
+#define BUG()                                  \
+do {                                           \
+       __asm__ __volatile__ ("break 14\n\t");  \
+       unreachable();                          \
+} while (0)
 #endif
 
 #define HAVE_ARCH_BUG
diff --git a/arch/cris/include/arch-v32/arch/elf.h b/arch/cris/include/arch-v32/arch/elf.h
deleted file mode 100644 (file)
index c46d582..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _ASM_CRIS_ELF_H
-#define _ASM_CRIS_ELF_H
-
-#include <arch/system.h>
-
-#define ELF_CORE_EFLAGS EF_CRIS_VARIANT_V32
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x)                      \
- ((x)->e_machine == EM_CRIS                    \
-  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_V32    \
-      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
-
-/* CRISv32 ELF register definitions. */
-
-#include <asm/ptrace.h>
-
-/* Explicitly zero out registers to increase determinism. */
-#define ELF_PLAT_INIT(_r, load_addr)    do { \
-        (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
-        (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
-        (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
-        (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
-        (_r)->acr = 0; \
-} while (0)
-
-/*
- * An executable for which elf_read_implies_exec() returns TRUE will
- * have the READ_IMPLIES_EXEC personality flag set automatically.
- */
-#define elf_read_implies_exec_binary(ex, have_pt_gnu_stack)    (!(have_pt_gnu_stack))
-
-/*
- * This is basically a pt_regs with the additional definition
- * of the stack pointer since it's needed in a core dump.
- * pr_regs is a elf_gregset_t and should be filled according
- * to the layout of user_regs_struct.
- */
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
-        pr_reg[0] = regs->r0;                              \
-        pr_reg[1] = regs->r1;                              \
-        pr_reg[2] = regs->r2;                              \
-        pr_reg[3] = regs->r3;                              \
-        pr_reg[4] = regs->r4;                              \
-        pr_reg[5] = regs->r5;                              \
-        pr_reg[6] = regs->r6;                              \
-        pr_reg[7] = regs->r7;                              \
-        pr_reg[8] = regs->r8;                              \
-        pr_reg[9] = regs->r9;                              \
-        pr_reg[10] = regs->r10;                            \
-        pr_reg[11] = regs->r11;                            \
-        pr_reg[12] = regs->r12;                            \
-        pr_reg[13] = regs->r13;                            \
-        pr_reg[14] = rdusp();               /* SP */       \
-        pr_reg[15] = regs->acr;             /* ACR */      \
-        pr_reg[16] = 0;                     /* BZ */       \
-        pr_reg[17] = rdvr();                /* VR */       \
-        pr_reg[18] = 0;                     /* PID */      \
-        pr_reg[19] = regs->srs;             /* SRS */      \
-        pr_reg[20] = 0;                     /* WZ */       \
-        pr_reg[21] = regs->exs;             /* EXS */      \
-        pr_reg[22] = regs->eda;             /* EDA */      \
-        pr_reg[23] = regs->mof;             /* MOF */      \
-        pr_reg[24] = 0;                     /* DZ */       \
-        pr_reg[25] = 0;                     /* EBP */      \
-        pr_reg[26] = regs->erp;             /* ERP */      \
-        pr_reg[27] = regs->srp;             /* SRP */      \
-        pr_reg[28] = 0;                     /* NRP */      \
-        pr_reg[29] = regs->ccs;             /* CCS */      \
-        pr_reg[30] = rdusp();               /* USP */      \
-        pr_reg[31] = regs->spc;             /* SPC */      \
-
-#endif /* _ASM_CRIS_ELF_H */
index 041851f..5f6fddf 100644 (file)
@@ -2,7 +2,7 @@
 #define __ASM_CRIS_ARCH_IRQFLAGS_H
 
 #include <linux/types.h>
-#include <arch/ptrace.h>
+#include <asm/ptrace.h>
 
 static inline unsigned long arch_local_save_flags(void)
 {
diff --git a/arch/cris/include/arch-v32/arch/ptrace.h b/arch/cris/include/arch-v32/arch/ptrace.h
deleted file mode 100644 (file)
index 19773d3..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef _CRIS_ARCH_PTRACE_H
-#define _CRIS_ARCH_PTRACE_H
-
-/* Register numbers in the ptrace system call interface */
-
-#define PT_ORIG_R10  0
-#define PT_R0        1
-#define PT_R1        2
-#define PT_R2        3
-#define PT_R3        4
-#define PT_R4        5
-#define PT_R5        6
-#define PT_R6        7
-#define PT_R7        8
-#define PT_R8        9
-#define PT_R9        10
-#define PT_R10       11
-#define PT_R11       12
-#define PT_R12       13
-#define PT_R13       14
-#define PT_ACR       15
-#define PT_SRS       16
-#define PT_MOF       17
-#define PT_SPC       18
-#define PT_CCS       19
-#define PT_SRP       20
-#define PT_ERP       21    /* This is actually the debugged process' PC */
-#define PT_EXS       22
-#define PT_EDA       23
-#define PT_USP       24    /* special case - USP is not in the pt_regs */
-#define PT_PPC       25    /* special case - pseudo PC */
-#define PT_BP        26    /* Base number for BP registers. */
-#define PT_BP_CTRL   26    /* BP control register. */
-#define PT_MAX       40
-
-/* Condition code bit numbers. */
-#define C_CCS_BITNR 0
-#define V_CCS_BITNR 1
-#define Z_CCS_BITNR 2
-#define N_CCS_BITNR 3
-#define X_CCS_BITNR 4
-#define I_CCS_BITNR 5
-#define U_CCS_BITNR 6
-#define P_CCS_BITNR 7
-#define R_CCS_BITNR 8
-#define S_CCS_BITNR 9
-#define M_CCS_BITNR 30
-#define Q_CCS_BITNR 31
-#define CCS_SHIFT   10 /* Shift count for each level in CCS */
-
-/* pt_regs not only specifices the format in the user-struct during
- * ptrace but is also the frame format used in the kernel prologue/epilogues
- * themselves
- */
-
-struct pt_regs {
-       unsigned long orig_r10;
-       /* pushed by movem r13, [sp] in SAVE_ALL. */
-       unsigned long r0;
-       unsigned long r1;
-       unsigned long r2;
-       unsigned long r3;
-       unsigned long r4;
-       unsigned long r5;
-       unsigned long r6;
-       unsigned long r7;
-       unsigned long r8;
-       unsigned long r9;
-       unsigned long r10;
-       unsigned long r11;
-       unsigned long r12;
-       unsigned long r13;
-       unsigned long acr;
-       unsigned long srs;
-       unsigned long mof;
-       unsigned long spc;
-       unsigned long ccs;
-       unsigned long srp;
-       unsigned long erp; /* This is actually the debugged process' PC */
-       /* For debugging purposes; saved only when needed. */
-       unsigned long exs;
-       unsigned long eda;
-};
-
-/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
- * when doing a context-switch. it is used (apart from in resume) when a new
- * thread is made and we need to make _resume (which is starting it for the
- * first time) realise what is going on.
- *
- * Actually, the use is very close to the thread struct (TSS) in that both the
- * switch_stack and the TSS are used to keep thread stuff when switching in
- * _resume.
- */
-
-struct switch_stack {
-       unsigned long r0;
-       unsigned long r1;
-       unsigned long r2;
-       unsigned long r3;
-       unsigned long r4;
-       unsigned long r5;
-       unsigned long r6;
-       unsigned long r7;
-       unsigned long r8;
-       unsigned long r9;
-       unsigned long return_ip; /* ip that _resume will return to */
-};
-
-#ifdef __KERNEL__
-
-#define arch_has_single_step() (1)
-#define user_mode(regs) (((regs)->ccs & (1 << (U_CCS_BITNR + CCS_SHIFT))) != 0)
-#define instruction_pointer(regs) ((regs)->erp)
-#define profile_pc(regs) instruction_pointer(regs)
-
-#endif  /*  __KERNEL__  */
-
-#endif
index ad2244f..b7f6819 100644 (file)
@@ -1,14 +1,20 @@
 generic-y += atomic.h
+generic-y += auxvec.h
 generic-y += barrier.h
+generic-y += bitsperlong.h
 generic-y += clkdev.h
 generic-y += cmpxchg.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
+generic-y += errno.h
 generic-y += exec.h
 generic-y += emergency-restart.h
+generic-y += fcntl.h
 generic-y += futex.h
 generic-y += hardirq.h
+generic-y += ioctl.h
+generic-y += ipcbuf.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
@@ -19,11 +25,22 @@ generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
+generic-y += mman.h
 generic-y += module.h
+generic-y += msgbuf.h
 generic-y += percpu.h
+generic-y += poll.h
 generic-y += preempt.h
+generic-y += resource.h
 generic-y += sections.h
+generic-y += sembuf.h
+generic-y += shmbuf.h
+generic-y += siginfo.h
+generic-y += socket.h
+generic-y += sockios.h
+generic-y += statfs.h
 generic-y += topology.h
 generic-y += trace_clock.h
+generic-y += types.h
 generic-y += vga.h
 generic-y += xor.h
diff --git a/arch/cris/include/asm/elf.h b/arch/cris/include/asm/elf.h
deleted file mode 100644 (file)
index c2a394f..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef __ASMCRIS_ELF_H
-#define __ASMCRIS_ELF_H
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/user.h>
-
-#define R_CRIS_NONE             0
-#define R_CRIS_8                1
-#define R_CRIS_16               2
-#define R_CRIS_32               3
-#define R_CRIS_8_PCREL          4
-#define R_CRIS_16_PCREL         5
-#define R_CRIS_32_PCREL         6
-#define R_CRIS_GNU_VTINHERIT    7
-#define R_CRIS_GNU_VTENTRY      8
-#define R_CRIS_COPY             9
-#define R_CRIS_GLOB_DAT         10
-#define R_CRIS_JUMP_SLOT        11
-#define R_CRIS_RELATIVE         12
-#define R_CRIS_16_GOT           13
-#define R_CRIS_32_GOT           14
-#define R_CRIS_16_GOTPLT        15
-#define R_CRIS_32_GOTPLT        16
-#define R_CRIS_32_GOTREL        17
-#define R_CRIS_32_PLT_GOTREL    18
-#define R_CRIS_32_PLT_PCREL     19
-
-typedef unsigned long elf_greg_t;
-
-/* Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
-   thus exposed to user-space. */
-#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-/* A placeholder; CRIS does not have any fp regs.  */
-typedef unsigned long elf_fpregset_t;
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS      ELFCLASS32
-#define ELF_DATA       ELFDATA2LSB
-#define ELF_ARCH       EM_CRIS
-
-#include <arch/elf.h>
-
-/* The master for these definitions is {binutils}/include/elf/cris.h:  */
-/* User symbols in this file have a leading underscore.  */
-#define EF_CRIS_UNDERSCORE             0x00000001
-
-/* This is a mask for different incompatible machine variants.  */
-#define EF_CRIS_VARIANT_MASK           0x0000000e
-
-/* Variant 0; may contain v0..10 object.  */
-#define EF_CRIS_VARIANT_ANY_V0_V10     0x00000000
-
-/* Variant 1; contains v32 object.  */
-#define EF_CRIS_VARIANT_V32            0x00000002
-
-/* Variant 2; contains object compatible with v32 and v10.  */
-#define EF_CRIS_VARIANT_COMMON_V10_V32 0x00000004
-/* End of excerpt from {binutils}/include/elf/cris.h.  */
-
-#define ELF_EXEC_PAGESIZE      8192
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this CPU supports.  This could be done in user space,
-   but it's not easy, and we've already done it here.  */
-
-#define ELF_HWCAP       (0)
-
-/* This yields a string that ld.so will use to load implementation
-   specific libraries for optimization.  This is more specific in
-   intent than poking at uname or /proc/cpuinfo.
-*/
-
-#define ELF_PLATFORM  (NULL)
-
-#endif
index 1d45fd6..349acfd 100644 (file)
@@ -11,7 +11,14 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 #define deactivate_mm(tsk,mm)  do { } while (0)
 
-#define activate_mm(prev,next) switch_mm((prev),(next),NULL)
+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       switch_mm(prev, next, NULL);
+       local_irq_restore(flags);
+}
 
 /* current active pgd - this is similar to other processors pgd 
  * registers like cr3 on the i386
diff --git a/arch/cris/include/asm/stacktrace.h b/arch/cris/include/asm/stacktrace.h
new file mode 100644 (file)
index 0000000..2d90856
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef __CRIS_STACKTRACE_H
+#define __CRIS_STACKTRACE_H
+
+void walk_stackframe(unsigned long sp,
+                    int (*fn)(unsigned long addr, void *data),
+                    void *data);
+
+#endif
diff --git a/arch/cris/include/asm/types.h b/arch/cris/include/asm/types.h
deleted file mode 100644 (file)
index a3cac77..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ETRAX_TYPES_H
-#define _ETRAX_TYPES_H
-
-#include <uapi/asm/types.h>
-
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-
-#define BITS_PER_LONG 32
-
-#endif
index 0f40fed..9c23535 100644 (file)
@@ -4,7 +4,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls 360
+#define NR_syscalls 365
 
 #include <arch/unistd.h>
 
index 01f66b8..d5564a0 100644 (file)
@@ -6,6 +6,9 @@ header-y += ../arch-v32/arch/
 header-y += auxvec.h
 header-y += bitsperlong.h
 header-y += byteorder.h
+header-y += elf.h
+header-y += elf_v10.h
+header-y += elf_v32.h
 header-y += errno.h
 header-y += ethernet.h
 header-y += etraxgpio.h
@@ -19,6 +22,8 @@ header-y += param.h
 header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
+header-y += ptrace_v10.h
+header-y += ptrace_v32.h
 header-y += resource.h
 header-y += rs485.h
 header-y += sembuf.h
diff --git a/arch/cris/include/uapi/asm/auxvec.h b/arch/cris/include/uapi/asm/auxvec.h
deleted file mode 100644 (file)
index cb30b01..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __ASMCRIS_AUXVEC_H
-#define __ASMCRIS_AUXVEC_H
-
-#endif
diff --git a/arch/cris/include/uapi/asm/bitsperlong.h b/arch/cris/include/uapi/asm/bitsperlong.h
deleted file mode 100644 (file)
index 6dc0bb0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bitsperlong.h>
diff --git a/arch/cris/include/uapi/asm/elf.h b/arch/cris/include/uapi/asm/elf.h
new file mode 100644 (file)
index 0000000..a5df05b
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef __ASMCRIS_ELF_H
+#define __ASMCRIS_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+
+#ifdef __arch_v32
+#include <asm/elf_v32.h>
+#else
+#include <asm/elf_v10.h>
+#endif
+
+#define R_CRIS_NONE             0
+#define R_CRIS_8                1
+#define R_CRIS_16               2
+#define R_CRIS_32               3
+#define R_CRIS_8_PCREL          4
+#define R_CRIS_16_PCREL         5
+#define R_CRIS_32_PCREL         6
+#define R_CRIS_GNU_VTINHERIT    7
+#define R_CRIS_GNU_VTENTRY      8
+#define R_CRIS_COPY             9
+#define R_CRIS_GLOB_DAT         10
+#define R_CRIS_JUMP_SLOT        11
+#define R_CRIS_RELATIVE         12
+#define R_CRIS_16_GOT           13
+#define R_CRIS_32_GOT           14
+#define R_CRIS_16_GOTPLT        15
+#define R_CRIS_32_GOTPLT        16
+#define R_CRIS_32_GOTREL        17
+#define R_CRIS_32_PLT_GOTREL    18
+#define R_CRIS_32_PLT_PCREL     19
+
+typedef unsigned long elf_greg_t;
+
+/* Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
+   thus exposed to user-space. */
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/* A placeholder; CRIS does not have any fp regs.  */
+typedef unsigned long elf_fpregset_t;
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS      ELFCLASS32
+#define ELF_DATA       ELFDATA2LSB
+#define ELF_ARCH       EM_CRIS
+
+/* The master for these definitions is {binutils}/include/elf/cris.h:  */
+/* User symbols in this file have a leading underscore.  */
+#define EF_CRIS_UNDERSCORE             0x00000001
+
+/* This is a mask for different incompatible machine variants.  */
+#define EF_CRIS_VARIANT_MASK           0x0000000e
+
+/* Variant 0; may contain v0..10 object.  */
+#define EF_CRIS_VARIANT_ANY_V0_V10     0x00000000
+
+/* Variant 1; contains v32 object.  */
+#define EF_CRIS_VARIANT_V32            0x00000002
+
+/* Variant 2; contains object compatible with v32 and v10.  */
+#define EF_CRIS_VARIANT_COMMON_V10_V32 0x00000004
+/* End of excerpt from {binutils}/include/elf/cris.h.  */
+
+#define ELF_EXEC_PAGESIZE      8192
+
+/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
+   use of this is to invoke "./ld.so someprog" to test out a new version of
+   the loader.  We need to make sure that it is out of the way of the program
+   that it will "exec", and that there is sufficient room for the brk.  */
+
+#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
+
+/* This yields a mask that user programs can use to figure out what
+   instruction set this CPU supports.  This could be done in user space,
+   but it's not easy, and we've already done it here.  */
+
+#define ELF_HWCAP       (0)
+
+/* This yields a string that ld.so will use to load implementation
+   specific libraries for optimization.  This is more specific in
+   intent than poking at uname or /proc/cpuinfo.
+*/
+
+#define ELF_PLATFORM  (NULL)
+
+#endif
diff --git a/arch/cris/include/uapi/asm/elf_v10.h b/arch/cris/include/uapi/asm/elf_v10.h
new file mode 100644 (file)
index 0000000..3ea65ce
--- /dev/null
@@ -0,0 +1,84 @@
+#ifndef __ASMCRIS_ARCH_ELF_H
+#define __ASMCRIS_ARCH_ELF_H
+
+#define ELF_MACH EF_CRIS_VARIANT_ANY_V0_V10
+
+/* Matches struct user_regs_struct */
+#define ELF_NGREG 35
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x)                      \
+ ((x)->e_machine == EM_CRIS                    \
+  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_ANY_V0_V10     \
+      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+
+/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program
+   starts (a register; assume first param register for CRIS)
+   contains a pointer to a function which might be
+   registered using `atexit'.  This provides a mean for the
+   dynamic linker to call DT_FINI functions for shared libraries
+   that have been loaded before the code runs.
+
+   A value of 0 tells we have no such handler.  */
+
+/* Explicitly set registers to 0 to increase determinism.  */
+#define ELF_PLAT_INIT(_r, load_addr)   do { \
+       (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
+       (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
+       (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
+       (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
+} while (0)
+
+/* The additional layer below is because the stack pointer is missing in 
+   the pt_regs struct, but needed in a core dump. pr_reg is a elf_gregset_t,
+   and should be filled in according to the layout of the user_regs_struct
+   struct; regs is a pt_regs struct. We dump all registers, though several are
+   obviously unnecessary. That way there's less need for intelligence at 
+   the receiving end (i.e. gdb). */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
+       pr_reg[0] = regs->r0;                              \
+       pr_reg[1] = regs->r1;                              \
+       pr_reg[2] = regs->r2;                              \
+       pr_reg[3] = regs->r3;                              \
+       pr_reg[4] = regs->r4;                              \
+       pr_reg[5] = regs->r5;                              \
+       pr_reg[6] = regs->r6;                              \
+       pr_reg[7] = regs->r7;                              \
+       pr_reg[8] = regs->r8;                              \
+       pr_reg[9] = regs->r9;                              \
+       pr_reg[10] = regs->r10;                            \
+       pr_reg[11] = regs->r11;                            \
+       pr_reg[12] = regs->r12;                            \
+       pr_reg[13] = regs->r13;                            \
+       pr_reg[14] = rdusp();               /* sp */       \
+       pr_reg[15] = regs->irp;             /* pc */       \
+       pr_reg[16] = 0;                     /* p0 */       \
+       pr_reg[17] = rdvr();                /* vr */       \
+       pr_reg[18] = 0;                     /* p2 */       \
+       pr_reg[19] = 0;                     /* p3 */       \
+       pr_reg[20] = 0;                     /* p4 */       \
+       pr_reg[21] = (regs->dccr & 0xffff); /* ccr */      \
+       pr_reg[22] = 0;                     /* p6 */       \
+       pr_reg[23] = regs->mof;             /* mof */      \
+       pr_reg[24] = 0;                     /* p8 */       \
+       pr_reg[25] = 0;                     /* ibr */      \
+       pr_reg[26] = 0;                     /* irp */      \
+       pr_reg[27] = regs->srp;             /* srp */      \
+       pr_reg[28] = 0;                     /* bar */      \
+       pr_reg[29] = regs->dccr;            /* dccr */     \
+       pr_reg[30] = 0;                     /* brp */      \
+       pr_reg[31] = rdusp();               /* usp */      \
+       pr_reg[32] = 0;                     /* csrinstr */ \
+       pr_reg[33] = 0;                     /* csraddr */  \
+       pr_reg[34] = 0;                     /* csrdata */
+
+
+#endif
diff --git a/arch/cris/include/uapi/asm/elf_v32.h b/arch/cris/include/uapi/asm/elf_v32.h
new file mode 100644 (file)
index 0000000..f09fe49
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef _ASM_CRIS_ELF_H
+#define _ASM_CRIS_ELF_H
+
+#define ELF_CORE_EFLAGS EF_CRIS_VARIANT_V32
+
+/* Matches struct user_regs_struct */
+#define ELF_NGREG 32
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x)                      \
+ ((x)->e_machine == EM_CRIS                    \
+  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_V32    \
+      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
+
+/* CRISv32 ELF register definitions. */
+
+#include <asm/ptrace.h>
+
+/* Explicitly zero out registers to increase determinism. */
+#define ELF_PLAT_INIT(_r, load_addr)    do { \
+        (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
+        (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
+        (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
+        (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
+        (_r)->acr = 0; \
+} while (0)
+
+/*
+ * An executable for which elf_read_implies_exec() returns TRUE will
+ * have the READ_IMPLIES_EXEC personality flag set automatically.
+ */
+#define elf_read_implies_exec_binary(ex, have_pt_gnu_stack)    (!(have_pt_gnu_stack))
+
+/*
+ * This is basically a pt_regs with the additional definition
+ * of the stack pointer since it's needed in a core dump.
+ * pr_regs is a elf_gregset_t and should be filled according
+ * to the layout of user_regs_struct.
+ */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
+        pr_reg[0] = regs->r0;                              \
+        pr_reg[1] = regs->r1;                              \
+        pr_reg[2] = regs->r2;                              \
+        pr_reg[3] = regs->r3;                              \
+        pr_reg[4] = regs->r4;                              \
+        pr_reg[5] = regs->r5;                              \
+        pr_reg[6] = regs->r6;                              \
+        pr_reg[7] = regs->r7;                              \
+        pr_reg[8] = regs->r8;                              \
+        pr_reg[9] = regs->r9;                              \
+        pr_reg[10] = regs->r10;                            \
+        pr_reg[11] = regs->r11;                            \
+        pr_reg[12] = regs->r12;                            \
+        pr_reg[13] = regs->r13;                            \
+        pr_reg[14] = rdusp();               /* SP */       \
+        pr_reg[15] = regs->acr;             /* ACR */      \
+        pr_reg[16] = 0;                     /* BZ */       \
+        pr_reg[17] = rdvr();                /* VR */       \
+        pr_reg[18] = 0;                     /* PID */      \
+        pr_reg[19] = regs->srs;             /* SRS */      \
+        pr_reg[20] = 0;                     /* WZ */       \
+        pr_reg[21] = regs->exs;             /* EXS */      \
+        pr_reg[22] = regs->eda;             /* EDA */      \
+        pr_reg[23] = regs->mof;             /* MOF */      \
+        pr_reg[24] = 0;                     /* DZ */       \
+        pr_reg[25] = 0;                     /* EBP */      \
+        pr_reg[26] = regs->erp;             /* ERP */      \
+        pr_reg[27] = regs->srp;             /* SRP */      \
+        pr_reg[28] = 0;                     /* NRP */      \
+        pr_reg[29] = regs->ccs;             /* CCS */      \
+        pr_reg[30] = rdusp();               /* USP */      \
+        pr_reg[31] = regs->spc;             /* SPC */      \
+
+#endif /* _ASM_CRIS_ELF_H */
diff --git a/arch/cris/include/uapi/asm/errno.h b/arch/cris/include/uapi/asm/errno.h
deleted file mode 100644 (file)
index 2bf5eb5..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_ERRNO_H
-#define _CRIS_ERRNO_H
-
-#include <asm-generic/errno.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/fcntl.h b/arch/cris/include/uapi/asm/fcntl.h
deleted file mode 100644 (file)
index 46ab12d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
diff --git a/arch/cris/include/uapi/asm/ioctl.h b/arch/cris/include/uapi/asm/ioctl.h
deleted file mode 100644 (file)
index b279fe0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/cris/include/uapi/asm/ipcbuf.h b/arch/cris/include/uapi/asm/ipcbuf.h
deleted file mode 100644 (file)
index 84c7e51..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/cris/include/uapi/asm/kvm_para.h b/arch/cris/include/uapi/asm/kvm_para.h
deleted file mode 100644 (file)
index 14fab8f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kvm_para.h>
diff --git a/arch/cris/include/uapi/asm/mman.h b/arch/cris/include/uapi/asm/mman.h
deleted file mode 100644 (file)
index 8eebf89..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mman.h>
diff --git a/arch/cris/include/uapi/asm/msgbuf.h b/arch/cris/include/uapi/asm/msgbuf.h
deleted file mode 100644 (file)
index ada63df..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _CRIS_MSGBUF_H
-#define _CRIS_MSGBUF_H
-
-/* verbatim copy of asm-i386 version */
-
-/* 
- * The msqid64_ds structure for CRIS architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct msqid64_ds {
-       struct ipc64_perm msg_perm;
-       __kernel_time_t msg_stime;      /* last msgsnd time */
-       unsigned long   __unused1;
-       __kernel_time_t msg_rtime;      /* last msgrcv time */
-       unsigned long   __unused2;
-       __kernel_time_t msg_ctime;      /* last change time */
-       unsigned long   __unused3;
-       unsigned long  msg_cbytes;      /* current number of bytes on queue */
-       unsigned long  msg_qnum;        /* number of messages in queue */
-       unsigned long  msg_qbytes;      /* max number of bytes on queue */
-       __kernel_pid_t msg_lspid;       /* pid of last msgsnd */
-       __kernel_pid_t msg_lrpid;       /* last receive pid */
-       unsigned long  __unused4;
-       unsigned long  __unused5;
-};
-
-#endif /* _CRIS_MSGBUF_H */
diff --git a/arch/cris/include/uapi/asm/poll.h b/arch/cris/include/uapi/asm/poll.h
deleted file mode 100644 (file)
index c98509d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
index c689c9b..bd8946f 100644 (file)
@@ -1 +1,5 @@
-#include <arch/ptrace.h>
+#ifdef __arch_v32
+#include <asm/ptrace_v32.h>
+#else
+#include <asm/ptrace_v10.h>
+#endif
diff --git a/arch/cris/include/uapi/asm/ptrace_v10.h b/arch/cris/include/uapi/asm/ptrace_v10.h
new file mode 100644 (file)
index 0000000..1a23273
--- /dev/null
@@ -0,0 +1,118 @@
+#ifndef _CRIS_ARCH_PTRACE_H
+#define _CRIS_ARCH_PTRACE_H
+
+/* Frame types */
+
+#define CRIS_FRAME_NORMAL   0 /* normal frame without SBFS stacking */
+#define CRIS_FRAME_BUSFAULT 1 /* frame stacked using SBFS, need RBF return
+                                path */
+
+/* Register numbers in the ptrace system call interface */
+
+#define PT_FRAMETYPE 0
+#define PT_ORIG_R10  1
+#define PT_R13       2
+#define PT_R12       3
+#define PT_R11       4
+#define PT_R10       5
+#define PT_R9        6
+#define PT_R8        7
+#define PT_R7        8
+#define PT_R6        9
+#define PT_R5        10
+#define PT_R4        11
+#define PT_R3        12
+#define PT_R2        13
+#define PT_R1        14
+#define PT_R0        15
+#define PT_MOF       16
+#define PT_DCCR      17
+#define PT_SRP       18
+#define PT_IRP       19    /* This is actually the debugged process' PC */
+#define PT_CSRINSTR  20    /* CPU Status record remnants -
+                             valid if frametype == busfault */
+#define PT_CSRADDR   21
+#define PT_CSRDATA   22
+#define PT_USP       23    /* special case - USP is not in the pt_regs */
+#define PT_MAX       23
+
+/* Condition code bit numbers.  The same numbers apply to CCR of course,
+   but we use DCCR everywhere else, so let's try and be consistent.  */
+#define C_DCCR_BITNR 0
+#define V_DCCR_BITNR 1
+#define Z_DCCR_BITNR 2
+#define N_DCCR_BITNR 3
+#define X_DCCR_BITNR 4
+#define I_DCCR_BITNR 5
+#define B_DCCR_BITNR 6
+#define M_DCCR_BITNR 7
+#define U_DCCR_BITNR 8
+#define P_DCCR_BITNR 9
+#define F_DCCR_BITNR 10
+
+/* pt_regs not only specifices the format in the user-struct during
+ * ptrace but is also the frame format used in the kernel prologue/epilogues 
+ * themselves
+ */
+
+struct pt_regs {
+       unsigned long frametype;  /* type of stackframe */
+       unsigned long orig_r10;
+       /* pushed by movem r13, [sp] in SAVE_ALL, movem pushes backwards */
+       unsigned long r13;
+       unsigned long r12;
+       unsigned long r11;
+       unsigned long r10;
+       unsigned long r9;
+       unsigned long r8;
+       unsigned long r7;
+       unsigned long r6;
+       unsigned long r5;
+       unsigned long r4;
+       unsigned long r3;
+       unsigned long r2;
+       unsigned long r1;
+       unsigned long r0;
+       unsigned long mof;
+       unsigned long dccr;
+       unsigned long srp;
+       unsigned long irp; /* This is actually the debugged process' PC */
+       unsigned long csrinstr;
+       unsigned long csraddr;
+       unsigned long csrdata;
+};
+
+/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
+ * when doing a context-switch. it is used (apart from in resume) when a new
+ * thread is made and we need to make _resume (which is starting it for the
+ * first time) realise what is going on.
+ *
+ * Actually, the use is very close to the thread struct (TSS) in that both the
+ * switch_stack and the TSS are used to keep thread stuff when switching in
+ * _resume.
+ */
+
+struct switch_stack {
+       unsigned long r9;
+       unsigned long r8;
+       unsigned long r7;
+       unsigned long r6;
+       unsigned long r5;
+       unsigned long r4;
+       unsigned long r3;
+       unsigned long r2;
+       unsigned long r1;
+       unsigned long r0;
+       unsigned long return_ip; /* ip that _resume will return to */
+};
+
+#ifdef __KERNEL__
+
+/* bit 8 is user-mode flag */
+#define user_mode(regs) (((regs)->dccr & 0x100) != 0)
+#define instruction_pointer(regs) ((regs)->irp)
+#define profile_pc(regs) instruction_pointer(regs)
+
+#endif  /*  __KERNEL__  */
+
+#endif
diff --git a/arch/cris/include/uapi/asm/ptrace_v32.h b/arch/cris/include/uapi/asm/ptrace_v32.h
new file mode 100644 (file)
index 0000000..19773d3
--- /dev/null
@@ -0,0 +1,118 @@
+#ifndef _CRIS_ARCH_PTRACE_H
+#define _CRIS_ARCH_PTRACE_H
+
+/* Register numbers in the ptrace system call interface */
+
+#define PT_ORIG_R10  0
+#define PT_R0        1
+#define PT_R1        2
+#define PT_R2        3
+#define PT_R3        4
+#define PT_R4        5
+#define PT_R5        6
+#define PT_R6        7
+#define PT_R7        8
+#define PT_R8        9
+#define PT_R9        10
+#define PT_R10       11
+#define PT_R11       12
+#define PT_R12       13
+#define PT_R13       14
+#define PT_ACR       15
+#define PT_SRS       16
+#define PT_MOF       17
+#define PT_SPC       18
+#define PT_CCS       19
+#define PT_SRP       20
+#define PT_ERP       21    /* This is actually the debugged process' PC */
+#define PT_EXS       22
+#define PT_EDA       23
+#define PT_USP       24    /* special case - USP is not in the pt_regs */
+#define PT_PPC       25    /* special case - pseudo PC */
+#define PT_BP        26    /* Base number for BP registers. */
+#define PT_BP_CTRL   26    /* BP control register. */
+#define PT_MAX       40
+
+/* Condition code bit numbers. */
+#define C_CCS_BITNR 0
+#define V_CCS_BITNR 1
+#define Z_CCS_BITNR 2
+#define N_CCS_BITNR 3
+#define X_CCS_BITNR 4
+#define I_CCS_BITNR 5
+#define U_CCS_BITNR 6
+#define P_CCS_BITNR 7
+#define R_CCS_BITNR 8
+#define S_CCS_BITNR 9
+#define M_CCS_BITNR 30
+#define Q_CCS_BITNR 31
+#define CCS_SHIFT   10 /* Shift count for each level in CCS */
+
+/* pt_regs not only specifices the format in the user-struct during
+ * ptrace but is also the frame format used in the kernel prologue/epilogues
+ * themselves
+ */
+
+struct pt_regs {
+       unsigned long orig_r10;
+       /* pushed by movem r13, [sp] in SAVE_ALL. */
+       unsigned long r0;
+       unsigned long r1;
+       unsigned long r2;
+       unsigned long r3;
+       unsigned long r4;
+       unsigned long r5;
+       unsigned long r6;
+       unsigned long r7;
+       unsigned long r8;
+       unsigned long r9;
+       unsigned long r10;
+       unsigned long r11;
+       unsigned long r12;
+       unsigned long r13;
+       unsigned long acr;
+       unsigned long srs;
+       unsigned long mof;
+       unsigned long spc;
+       unsigned long ccs;
+       unsigned long srp;
+       unsigned long erp; /* This is actually the debugged process' PC */
+       /* For debugging purposes; saved only when needed. */
+       unsigned long exs;
+       unsigned long eda;
+};
+
+/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
+ * when doing a context-switch. it is used (apart from in resume) when a new
+ * thread is made and we need to make _resume (which is starting it for the
+ * first time) realise what is going on.
+ *
+ * Actually, the use is very close to the thread struct (TSS) in that both the
+ * switch_stack and the TSS are used to keep thread stuff when switching in
+ * _resume.
+ */
+
+struct switch_stack {
+       unsigned long r0;
+       unsigned long r1;
+       unsigned long r2;
+       unsigned long r3;
+       unsigned long r4;
+       unsigned long r5;
+       unsigned long r6;
+       unsigned long r7;
+       unsigned long r8;
+       unsigned long r9;
+       unsigned long return_ip; /* ip that _resume will return to */
+};
+
+#ifdef __KERNEL__
+
+#define arch_has_single_step() (1)
+#define user_mode(regs) (((regs)->ccs & (1 << (U_CCS_BITNR + CCS_SHIFT))) != 0)
+#define instruction_pointer(regs) ((regs)->erp)
+#define profile_pc(regs) instruction_pointer(regs)
+
+#endif  /*  __KERNEL__  */
+
+#endif
diff --git a/arch/cris/include/uapi/asm/resource.h b/arch/cris/include/uapi/asm/resource.h
deleted file mode 100644 (file)
index b5d2944..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_RESOURCE_H
-#define _CRIS_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/sembuf.h b/arch/cris/include/uapi/asm/sembuf.h
deleted file mode 100644 (file)
index 7fed984..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _CRIS_SEMBUF_H
-#define _CRIS_SEMBUF_H
-
-/* 
- * The semid64_ds structure for CRIS architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct semid64_ds {
-       struct ipc64_perm sem_perm;             /* permissions .. see ipc.h */
-       __kernel_time_t sem_otime;              /* last semop time */
-       unsigned long   __unused1;
-       __kernel_time_t sem_ctime;              /* last change time */
-       unsigned long   __unused2;
-       unsigned long   sem_nsems;              /* no. of semaphores in array */
-       unsigned long   __unused3;
-       unsigned long   __unused4;
-};
-
-#endif /* _CRIS_SEMBUF_H */
diff --git a/arch/cris/include/uapi/asm/shmbuf.h b/arch/cris/include/uapi/asm/shmbuf.h
deleted file mode 100644 (file)
index 3239e3f..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _CRIS_SHMBUF_H
-#define _CRIS_SHMBUF_H
-
-/* 
- * The shmid64_ds structure for CRIS architecture (same as for i386)
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct shmid64_ds {
-       struct ipc64_perm       shm_perm;       /* operation perms */
-       size_t                  shm_segsz;      /* size of segment (bytes) */
-       __kernel_time_t         shm_atime;      /* last attach time */
-       unsigned long           __unused1;
-       __kernel_time_t         shm_dtime;      /* last detach time */
-       unsigned long           __unused2;
-       __kernel_time_t         shm_ctime;      /* last change time */
-       unsigned long           __unused3;
-       __kernel_pid_t          shm_cpid;       /* pid of creator */
-       __kernel_pid_t          shm_lpid;       /* pid of last operator */
-       unsigned long           shm_nattch;     /* no. of current attaches */
-       unsigned long           __unused4;
-       unsigned long           __unused5;
-};
-
-struct shminfo64 {
-       unsigned long   shmmax;
-       unsigned long   shmmin;
-       unsigned long   shmmni;
-       unsigned long   shmseg;
-       unsigned long   shmall;
-       unsigned long   __unused1;
-       unsigned long   __unused2;
-       unsigned long   __unused3;
-       unsigned long   __unused4;
-};
-
-#endif /* _CRIS_SHMBUF_H */
diff --git a/arch/cris/include/uapi/asm/siginfo.h b/arch/cris/include/uapi/asm/siginfo.h
deleted file mode 100644 (file)
index c1cd6d1..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_SIGINFO_H
-#define _CRIS_SIGINFO_H
-
-#include <asm-generic/siginfo.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
deleted file mode 100644 (file)
index e2503d9..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _ASM_SOCKET_H
-#define _ASM_SOCKET_H
-
-/* almost the same as asm-i386/socket.h */
-
-#include <asm/sockios.h>
-
-/* For setsockoptions(2) */
-#define SOL_SOCKET     1
-
-#define SO_DEBUG       1
-#define SO_REUSEADDR   2
-#define SO_TYPE                3
-#define SO_ERROR       4
-#define SO_DONTROUTE   5
-#define SO_BROADCAST   6
-#define SO_SNDBUF      7
-#define SO_RCVBUF      8
-#define SO_SNDBUFFORCE 32
-#define SO_RCVBUFFORCE 33
-#define SO_KEEPALIVE   9
-#define SO_OOBINLINE   10
-#define SO_NO_CHECK    11
-#define SO_PRIORITY    12
-#define SO_LINGER      13
-#define SO_BSDCOMPAT   14
-#define SO_REUSEPORT   15
-#define SO_PASSCRED    16
-#define SO_PEERCRED    17
-#define SO_RCVLOWAT    18
-#define SO_SNDLOWAT    19
-#define SO_RCVTIMEO    20
-#define SO_SNDTIMEO    21
-
-/* Security levels - as per NRL IPv6 - don't actually do anything */
-#define SO_SECURITY_AUTHENTICATION             22
-#define SO_SECURITY_ENCRYPTION_TRANSPORT       23
-#define SO_SECURITY_ENCRYPTION_NETWORK         24
-
-#define SO_BINDTODEVICE        25
-
-/* Socket filtering */
-#define SO_ATTACH_FILTER        26
-#define SO_DETACH_FILTER        27
-#define SO_GET_FILTER          SO_ATTACH_FILTER
-
-#define SO_PEERNAME            28
-#define SO_TIMESTAMP           29
-#define SCM_TIMESTAMP          SO_TIMESTAMP
-
-#define SO_ACCEPTCONN          30
-
-#define SO_PEERSEC             31
-#define SO_PASSSEC             34
-#define SO_TIMESTAMPNS         35
-#define SCM_TIMESTAMPNS                SO_TIMESTAMPNS
-
-#define SO_MARK                        36
-
-#define SO_TIMESTAMPING                37
-#define SCM_TIMESTAMPING       SO_TIMESTAMPING
-
-#define SO_PROTOCOL            38
-#define SO_DOMAIN              39
-
-#define SO_RXQ_OVFL             40
-
-#define SO_WIFI_STATUS         41
-#define SCM_WIFI_STATUS                SO_WIFI_STATUS
-#define SO_PEEK_OFF            42
-
-/* Instruct lower device to use last 4-bytes of skb data as FCS */
-#define SO_NOFCS               43
-
-#define SO_LOCK_FILTER         44
-
-#define SO_SELECT_ERR_QUEUE    45
-
-#define SO_BUSY_POLL           46
-
-#define SO_MAX_PACING_RATE     47
-
-#define SO_BPF_EXTENSIONS      48
-
-#define SO_INCOMING_CPU                49
-
-#define SO_ATTACH_BPF          50
-#define SO_DETACH_BPF          SO_DETACH_FILTER
-
-#endif /* _ASM_SOCKET_H */
-
-
diff --git a/arch/cris/include/uapi/asm/sockios.h b/arch/cris/include/uapi/asm/sockios.h
deleted file mode 100644 (file)
index cfe7bfe..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ARCH_CRIS_SOCKIOS__
-#define __ARCH_CRIS_SOCKIOS__
-
-/* Socket-level I/O control calls. */
-#define FIOSETOWN      0x8901
-#define SIOCSPGRP      0x8902
-#define FIOGETOWN      0x8903
-#define SIOCGPGRP      0x8904
-#define SIOCATMARK     0x8905
-#define SIOCGSTAMP     0x8906          /* Get stamp (timeval) */
-#define SIOCGSTAMPNS   0x8907          /* Get stamp (timespec) */
-
-#endif
diff --git a/arch/cris/include/uapi/asm/statfs.h b/arch/cris/include/uapi/asm/statfs.h
deleted file mode 100644 (file)
index fdaf921..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_STATFS_H
-#define _CRIS_STATFS_H
-
-#include <asm-generic/statfs.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/types.h b/arch/cris/include/uapi/asm/types.h
deleted file mode 100644 (file)
index 9ec9d4c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/int-ll64.h>
index f3287fa..062b648 100644 (file)
 #define __NR_process_vm_writev 349
 #define __NR_kcmp              350
 #define __NR_finit_module      351
+#define __NR_sched_setattr     352
+#define __NR_sched_getattr     353
+#define __NR_renameat2         354
+#define __NR_seccomp           355
+#define __NR_getrandom         356
+#define __NR_memfd_create      357
+#define __NR_bpf               358
+#define __NR_execveat          359
 
 #endif /* _UAPI_ASM_CRIS_UNISTD_H_ */
index edef71f..5fae398 100644 (file)
@@ -8,6 +8,7 @@ extra-y := vmlinux.lds
 
 obj-y   := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o
 obj-y += devicetree.o
+obj-y += stacktrace.o
 
 obj-$(CONFIG_MODULES)    += crisksyms.o
 obj-$(CONFIG_MODULES)   += module.o
index dd0be5d..694850e 100644 (file)
 asmlinkage void do_IRQ(int irq, struct pt_regs * regs)
 {
        unsigned long sp;
-       struct pt_regs *old_regs = set_irq_regs(regs);
+       struct pt_regs *old_regs;
+
+       trace_hardirqs_off();
+
+       old_regs = set_irq_regs(regs);
        irq_enter();
        sp = rdsp();
        if (unlikely((sp & (PAGE_SIZE - 1)) < (PAGE_SIZE/8))) {
diff --git a/arch/cris/kernel/stacktrace.c b/arch/cris/kernel/stacktrace.c
new file mode 100644 (file)
index 0000000..99838c7
--- /dev/null
@@ -0,0 +1,76 @@
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/stacktrace.h>
+#include <asm/stacktrace.h>
+
+void walk_stackframe(unsigned long sp,
+                    int (*fn)(unsigned long addr, void *data),
+                    void *data)
+{
+       unsigned long high = ALIGN(sp, THREAD_SIZE);
+
+       for (; sp <= high - 4; sp += 4) {
+               unsigned long addr = *(unsigned long *) sp;
+
+               if (!kernel_text_address(addr))
+                       continue;
+
+               if (fn(addr, data))
+                       break;
+       }
+}
+
+struct stack_trace_data {
+       struct stack_trace *trace;
+       unsigned int no_sched_functions;
+       unsigned int skip;
+};
+
+#ifdef CONFIG_STACKTRACE
+
+static int save_trace(unsigned long addr, void *d)
+{
+       struct stack_trace_data *data = d;
+       struct stack_trace *trace = data->trace;
+
+       if (data->no_sched_functions && in_sched_functions(addr))
+               return 0;
+
+       if (data->skip) {
+               data->skip--;
+               return 0;
+       }
+
+       trace->entries[trace->nr_entries++] = addr;
+
+       return trace->nr_entries >= trace->max_entries;
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+       struct stack_trace_data data;
+       unsigned long sp;
+
+       data.trace = trace;
+       data.skip = trace->skip;
+
+       if (tsk != current) {
+               data.no_sched_functions = 1;
+               sp = tsk->thread.ksp;
+       } else {
+               data.no_sched_functions = 0;
+               sp = rdsp();
+       }
+
+       walk_stackframe(sp, save_trace, &data);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+void save_stack_trace(struct stack_trace *trace)
+{
+       save_stack_trace_tsk(current, trace);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+#endif /* CONFIG_STACKTRACE */
index 7042741..c4f2cfc 100644 (file)
@@ -70,5 +70,5 @@ void decompress_kernel(void)
        free_mem_ptr = (unsigned long)&_end;
        free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
 
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
 }
index 6e67a90..d9b5b80 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _H8300_DMA_MAPPING_H
 #define _H8300_DMA_MAPPING_H
 
-#include <asm-generic/dma-coherent.h>
-
 extern struct dma_map_ops h8300_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -12,46 +10,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-       return memory;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
 #endif
index 1696542..268fde8 100644 (file)
 
 struct device;
 extern int bad_dma_address;
+#define DMA_ERROR_CODE bad_dma_address
 
 extern struct dma_map_ops *dma_ops;
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
        if (unlikely(dev == NULL))
@@ -45,8 +43,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *dev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                           enum dma_data_direction direction);
@@ -60,47 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == bad_dma_address);
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       ret = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
 #endif
index 98106e5..24b9988 100644 (file)
@@ -19,8 +19,6 @@
 #ifndef _ASM_SIGNAL_H
 #define _ASM_SIGNAL_H
 
-#include <uapi/asm/registers.h>
-
 extern unsigned long __rt_sigtramp_template[2];
 
 void do_signal(struct pt_regs *regs);
index b74f9ba..9e3ddf7 100644 (file)
@@ -44,17 +44,6 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 static struct gen_pool *coherent_pool;
 
 
index 17fbf45..a6a1d1f 100644 (file)
@@ -97,20 +97,6 @@ static int set_next_event(unsigned long delta, struct clock_event_device *evt)
        return 0;
 }
 
-/*
- * Sets the mode (periodic, shutdown, oneshot, etc) of a timer.
- */
-static void set_mode(enum clock_event_mode mode,
-       struct clock_event_device *evt)
-{
-       switch (mode) {
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               /* XXX implement me */
-       default:
-               break;
-       }
-}
-
 #ifdef CONFIG_SMP
 /*  Broadcast mechanism  */
 static void broadcast(const struct cpumask *mask)
@@ -119,13 +105,13 @@ static void broadcast(const struct cpumask *mask)
 }
 #endif
 
+/* XXX Implement set_state_shutdown() */
 static struct clock_event_device hexagon_clockevent_dev = {
        .name           = "clockevent",
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 400,
        .irq            = RTOS_TIMER_INT,
        .set_next_event = set_next_event,
-       .set_mode       = set_mode,
 #ifdef CONFIG_SMP
        .broadcast      = broadcast,
 #endif
@@ -146,7 +132,6 @@ void setup_percpu_clockdev(void)
 
        dummy_clock_dev->features = CLOCK_EVT_FEAT_DUMMY;
        dummy_clock_dev->cpumask = cpumask_of(cpu);
-       dummy_clock_dev->mode = CLOCK_EVT_MODE_UNUSED;
 
        clockevents_register_device(dummy_clock_dev);
 }
index 42a91a7..eb0249e 100644 (file)
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
 config KEXEC
        bool "kexec system call"
        depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index cf3ab7e..9beccf8 100644 (file)
@@ -23,60 +23,10 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
                                enum dma_data_direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *daddr, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       void *caddr;
-
-       caddr = ops->alloc(dev, size, daddr, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *daddr, caddr);
-       return caddr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *caddr, dma_addr_t daddr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_free_coherent(dev, size, caddr, daddr);
-       ops->free(dev, size, caddr, daddr, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define get_dma_ops(dev) platform_dma_get_ops(dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_mapping_error(dev, daddr);
-       return ops->mapping_error(dev, daddr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int
-dma_set_mask (struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
index 95c39b9..99c96a5 100644 (file)
@@ -11,7 +11,7 @@
 
 
 
-#define NR_syscalls                    319 /* length of syscall table */
+#define NR_syscalls                    321 /* length of syscall table */
 
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
index 4610795..98e94e1 100644 (file)
 #define __NR_memfd_create              1340
 #define __NR_bpf                       1341
 #define __NR_execveat                  1342
+#define __NR_userfaultfd               1343
+#define __NR_membarrier                        1344
 
 #endif /* _UAPI_ASM_IA64_UNISTD_H */
index ae0de7b..37cc7a6 100644 (file)
@@ -1768,5 +1768,7 @@ sys_call_table:
        data8 sys_memfd_create                  // 1340
        data8 sys_bpf
        data8 sys_execveat
+       data8 sys_userfaultfd
+       data8 sys_membarrier
 
        .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
index 28a0952..3a76927 100644 (file)
@@ -86,6 +86,7 @@ decompress_kernel(int mmu_on, unsigned char *zimage_data,
        free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE;
 
        puts("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output_data, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output_data, 0,
+                       NULL, error);
        puts("done.\nBooting the kernel.\n");
 }
index 2dd8f63..498b567 100644 (file)
@@ -95,6 +95,7 @@ config MMU_SUN3
 config KEXEC
        bool "kexec system call"
        depends on M68KCLASSIC
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index ab35372..24b1297 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
-#include <asm-generic/dma-coherent.h>
 #include <asm/cacheflush.h>
 
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
@@ -45,31 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &dma_direct_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(!ops))
-               return 0;
-       if (!ops->dma_supported)
-               return 1;
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(ops == NULL))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, dma_mask);
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-       *dev->dma_mask = dma_mask;
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline void __dma_sync(unsigned long paddr,
@@ -88,50 +62,6 @@ static inline void __dma_sync(unsigned long paddr,
        }
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       BUG_ON(!ops);
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                enum dma_data_direction direction)
 {
index be1731d..e9bcdb6 100644 (file)
 #ifndef _UAPI_ASM_MICROBLAZE_ELF_H
 #define _UAPI_ASM_MICROBLAZE_ELF_H
 
+#include <linux/elf-em.h>
+
 /*
  * Note there is no "official" ELF designation for Microblaze.
  * I've snaffled the value from the microblaze binutils source code
  * /binutils/microblaze/include/elf/microblaze.h
  */
-#define EM_MICROBLAZE          189
 #define EM_MICROBLAZE_OLD      0xbaab
 #define ELF_ARCH               EM_MICROBLAZE
 
index 752acca..e3aa5b0 100644 (file)
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
 
 config KEXEC
        bool "Kexec system call"
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 5483106..080cd53 100644 (file)
@@ -111,8 +111,8 @@ void decompress_kernel(unsigned long boot_heap_start)
        puts("\n");
 
        /* Decompress the kernel with according algorithm */
-       decompress((char *)zimage_start, zimage_size, 0, 0,
-                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, error);
+       __decompress((char *)zimage_start, zimage_size, 0, 0,
+                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, 0, error);
 
        /* FIXME: should we flush cache here? */
        puts("Now, booting the kernel...\n");
index d8960d4..2cd45f5 100644 (file)
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -194,11 +191,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 static void octeon_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
index 360b338..e604f76 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
-#include <asm-generic/dma-coherent.h>
 
 #ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */
 #include <dma-coherence.h>
@@ -32,73 +31,7 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, mask);
-       return ops->mapping_error(dev, mask);
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
               enum dma_data_direction direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                          dma_addr_t *dma_handle, gfp_t flag);
-
-void dma_free_noncoherent(struct device *dev, size_t size,
-                        void *vaddr, dma_addr_t dma_handle);
-
 #endif /* _ASM_DMA_MAPPING_H */
index 2c6b989..4ffa6fc 100644 (file)
@@ -14,9 +14,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -46,11 +43,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 static void loongson_dma_free_coherent(struct device *dev, size_t size,
                void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
@@ -93,6 +85,9 @@ static void loongson_dma_sync_sg_for_device(struct device *dev,
 
 static int loongson_dma_set_mask(struct device *dev, u64 mask)
 {
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+
        if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) {
                *dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits);
                return -EIO;
index 8f23cf0..a914dc1 100644 (file)
@@ -112,7 +112,7 @@ static gfp_t massage_gfp_flags(const struct device *dev, gfp_t gfp)
        return gfp | dma_flag;
 }
 
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
+static void *mips_dma_alloc_noncoherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp)
 {
        void *ret;
@@ -128,7 +128,6 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
 
        return ret;
 }
-EXPORT_SYMBOL(dma_alloc_noncoherent);
 
 static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs)
@@ -137,8 +136,12 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        struct page *page = NULL;
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
+       /*
+        * XXX: seems like the coherent and non-coherent implementations could
+        * be consolidated.
+        */
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs))
+               return mips_dma_alloc_noncoherent(dev, size, dma_handle, gfp);
 
        gfp = massage_gfp_flags(dev, gfp);
 
@@ -164,24 +167,24 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
 }
 
 
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-       dma_addr_t dma_handle)
+static void mips_dma_free_noncoherent(struct device *dev, size_t size,
+               void *vaddr, dma_addr_t dma_handle)
 {
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
        free_pages((unsigned long) vaddr, get_order(size));
 }
-EXPORT_SYMBOL(dma_free_noncoherent);
 
 static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
        dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
        unsigned long addr = (unsigned long) vaddr;
-       int order = get_order(size);
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
        struct page *page = NULL;
 
-       if (dma_release_from_coherent(dev, order, vaddr))
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) {
+               mips_dma_free_noncoherent(dev, size, vaddr, dma_handle);
                return;
+       }
 
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
 
index f3d4ae8..3758715 100644 (file)
@@ -47,11 +47,6 @@ static char *nlm_swiotlb;
 static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *ret;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -69,11 +64,6 @@ static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
 static void nlm_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
index fab8628..413bfcf 100644 (file)
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
@@ -36,75 +35,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &or1k_dma_map_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) 
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *dma_handle, gfp_t gfp)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_handle)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
-}
-
+#define HAVE_ARCH_DMA_SUPPORTED 1
 static inline int dma_supported(struct device *dev, u64 dma_mask)
 {
        /* Support 32 bit DMA mask exclusively */
        return dma_mask == DMA_BIT_MASK(32);
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
+#include <asm-generic/dma-mapping-common.h>
 
-       return 0;
-}
 #endif /* __ASM_OPENRISC_DMA_MAPPING_H */
index b447918..9a7057e 100644 (file)
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config KEXEC
        bool "kexec system call"
        depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 710f60e..7f522c0 100644 (file)
@@ -18,7 +18,9 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 
+#ifdef CONFIG_PPC64
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
+#endif
 
 /* Some dma direct funcs must be visible for use in other dma_ops */
 extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
@@ -120,71 +122,14 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 /* this will be removed soon */
 #define flush_write_buffers()
 
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
+#define HAVE_ARCH_DMA_SET_MASK 1
+extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
-       if (unlikely(dma_ops == NULL))
-               return 0;
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 extern u64 __dma_get_required_mask(struct device *dev);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!dma_ops);
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-#ifdef CONFIG_PPC64
-       return (dma_addr == DMA_ERROR_CODE);
-#else
-       return 0;
-#endif
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 #ifdef CONFIG_SWIOTLB
@@ -210,9 +155,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
        return daddr - get_dma_offset(dev);
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define ARCH_HAS_DMA_MMAP_COHERENT
 
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
index b91e74a..9fac01c 100644 (file)
@@ -158,6 +158,7 @@ extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
                        bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                        unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
@@ -225,12 +226,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index 5bdfb5d..72b6225 100644 (file)
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* Maximum number of threads per physical core */
+#define MAX_SMT_THREADS                8
+
+/* Maximum number of subcores per physical core */
+#define MAX_SUBCORES           4
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -65,6 +71,19 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+struct kvmppc_vcore;
+
+/* Struct used for coordinating micro-threading (split-core) mode changes */
+struct kvm_split_mode {
+       unsigned long   rpr;
+       unsigned long   pmmar;
+       unsigned long   ldbar;
+       u8              subcore_size;
+       u8              do_nap;
+       u8              napped[MAX_SMT_THREADS];
+       struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+};
+
 /*
  * This struct goes in the PACA on 64-bit processors.  It is used
  * to store host state that needs to be saved when we enter a guest
@@ -100,6 +119,7 @@ struct kvmppc_host_state {
        u64 host_spurr;
        u64 host_dscr;
        u64 dec_expires;
+       struct kvm_split_mode *kvm_split_mode;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
        u64 cfar;
@@ -112,7 +132,7 @@ struct kvmppc_book3s_shadow_vcpu {
        bool in_use;
        ulong gpr[14];
        u32 cr;
-       u32 xer;
+       ulong xer;
        ulong ctr;
        ulong lr;
        ulong pc;
index 3286f0d..bc6e29e 100644 (file)
@@ -54,12 +54,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index d91f65b..98eebbf 100644 (file)
@@ -205,8 +205,10 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT   63
 #define KVMPPC_RMAP_RC_SHIFT   32
+#define KVMPPC_RMAP_CHG_SHIFT  48
 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
 #define KVMPPC_RMAP_CHANGED    (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHG_ORDER  (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT    0x100000000ul
 #define KVMPPC_RMAP_INDEX      0xfffffffful
 
@@ -278,7 +280,9 @@ struct kvmppc_vcore {
        u16 last_cpu;
        u8 vcore_state;
        u8 in_guest;
+       struct kvmppc_vcore *master_vcore;
        struct list_head runnable_threads;
+       struct list_head preempt_list;
        spinlock_t lock;
        wait_queue_head_t wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
@@ -300,12 +304,21 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
-/* Values for vcore_state */
+/* This bit is used when a vcore exit is triggered from outside the vcore */
+#define VCORE_EXIT_REQ         0x10000
+
+/*
+ * Values for vcore_state.
+ * Note that these are arranged such that lower values
+ * (< VCORE_SLEEPING) don't require stolen time accounting
+ * on load/unload, and higher values do.
+ */
 #define VCORE_INACTIVE 0
-#define VCORE_SLEEPING 1
-#define VCORE_PREEMPT  2
-#define VCORE_RUNNING  3
-#define VCORE_EXITING  4
+#define VCORE_PREEMPT  1
+#define VCORE_PIGGYBACK        2
+#define VCORE_SLEEPING 3
+#define VCORE_RUNNING  4
+#define VCORE_EXITING  5
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -473,7 +486,7 @@ struct kvm_vcpu_arch {
        ulong ciabr;
        ulong cfar;
        ulong ppr;
-       ulong pspb;
+       u32 pspb;
        ulong fscr;
        ulong shadow_fscr;
        ulong ebbhr;
@@ -619,6 +632,7 @@ struct kvm_vcpu_arch {
        int trap;
        int state;
        int ptid;
+       int thread_cpu;
        bool timer_running;
        wait_queue_head_t cpu_run;
 
index 8452335..790f5d1 100644 (file)
 
 /* POWER8 Micro Partition Prefetch (MPP) parameters */
 /* Address mask is common for LOGMPP instruction and MPPR SPR */
-#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000
+#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000ULL
 
 /* Bits 60 and 61 of MPP SPR should be set to one of the following */
 /* Aborting the fetch is indeed setting 00 in the table size bits */
index 810f433..221d584 100644 (file)
@@ -511,6 +511,8 @@ int main(void)
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
        DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
        DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
+       DEFINE(VCPU_CPU, offsetof(struct kvm_vcpu, cpu));
+       DEFINE(VCPU_THREAD_CPU, offsetof(struct kvm_vcpu, arch.thread_cpu));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -673,7 +675,14 @@ int main(void)
        HSTATE_FIELD(HSTATE_DSCR, host_dscr);
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+       HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
        DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+       DEFINE(KVM_SPLIT_RPR, offsetof(struct kvm_split_mode, rpr));
+       DEFINE(KVM_SPLIT_PMMAR, offsetof(struct kvm_split_mode, pmmar));
+       DEFINE(KVM_SPLIT_LDBAR, offsetof(struct kvm_split_mode, ldbar));
+       DEFINE(KVM_SPLIT_SIZE, offsetof(struct kvm_split_mode, subcore_size));
+       DEFINE(KVM_SPLIT_DO_NAP, offsetof(struct kvm_split_mode, do_nap));
+       DEFINE(KVM_SPLIT_NAPPED, offsetof(struct kvm_split_mode, napped));
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
index 3caec2c..c2024ac 100644 (file)
@@ -74,14 +74,14 @@ config KVM_BOOK3S_64
          If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-       tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       tristate "KVM for POWER7 and later using hypervisor mode in host"
        depends on KVM_BOOK3S_64 && PPC_POWERNV
        select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
        ---help---
          Support running unmodified book3s_64 guest kernels in
-         virtual machines on POWER7 and PPC970 processors that have
+         virtual machines on POWER7 and newer processors that have
          hypervisor mode available to the host.
 
          If you say Y here, KVM will use the hardware virtualization
@@ -89,8 +89,8 @@ config KVM_BOOK3S_64_HV
          guest operating systems will run at full hardware speed
          using supervisor and user modes.  However, this also means
          that KVM is not usable under PowerVM (pHyp), is only usable
-         on POWER7 (or later) processors and PPC970-family processors,
-         and cannot emulate a different processor from the host processor.
+         on POWER7 or later processors, and cannot emulate a
+         different processor from the host processor.
 
          If unsure, say N.
 
index 6d6398f..d75bf32 100644 (file)
@@ -240,7 +240,8 @@ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 }
 
-int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
 {
        int deliver = 1;
        int vec = 0;
index 2035d16..d5c9bfe 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "book3s.h"
 
 /* #define DEBUG_MMU */
 /* #define DEBUG_SR */
index b982d92..79ad35a 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 #include "trace_pr.h"
+#include "book3s.h"
 
 #define PTE_SIZE 12
 
index dab68b7..1f9c0a1 100644 (file)
@@ -761,6 +761,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        /* Harvest R and C */
                        rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+                       if (rcbits & HPTE_R_C)
+                               kvmppc_update_rmap_change(rmapp, psize);
                        if (rcbits & ~rev[i].guest_rpte) {
                                rev[i].guest_rpte = ptel | rcbits;
                                note_hpte_modification(kvm, &rev[i]);
@@ -927,8 +929,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
  retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_CHANGED) {
-               *rmapp &= ~KVMPPC_RMAP_CHANGED;
+               long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
+                       >> KVMPPC_RMAP_CHG_SHIFT;
+               *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
                npages_dirty = 1;
+               if (change_order > PAGE_SHIFT)
+                       npages_dirty = 1ul << (change_order - PAGE_SHIFT);
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
index 5a2bc4b..2afdb9c 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/reg.h>
 #include <asm/switch_to.h>
 #include <asm/time.h>
+#include "book3s.h"
 
 #define OP_19_XOP_RFID         18
 #define OP_19_XOP_RFI          50
index a9f753f..9754e68 100644 (file)
@@ -81,6 +81,12 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 #define MPP_BUFFER_ORDER       3
 #endif
 
+static int dynamic_mt_modes = 6;
+module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
+static int target_smt_mode;
+module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -114,7 +120,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-       int cpu = vcpu->cpu;
+       int cpu;
        wait_queue_head_t *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
@@ -123,10 +129,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
                ++vcpu->stat.halt_wakeup;
        }
 
-       if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+       if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
                return;
 
        /* CPU points to the first thread of the core */
+       cpu = vcpu->cpu;
        if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
                smp_send_reschedule(cpu);
 }
@@ -164,6 +171,27 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  * they should never fail.)
  */
 
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       vc->preempt_tb = mftb();
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       if (vc->preempt_tb != TB_NIL) {
+               vc->stolen_tb += mftb() - vc->preempt_tb;
+               vc->preempt_tb = TB_NIL;
+       }
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -175,14 +203,9 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
         * vcpu, and once it is set to this vcpu, only this task
         * ever sets it to NULL.
         */
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               if (vc->preempt_tb != TB_NIL) {
-                       vc->stolen_tb += mftb() - vc->preempt_tb;
-                       vc->preempt_tb = TB_NIL;
-               }
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_end_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
            vcpu->arch.busy_preempt != TB_NIL) {
@@ -197,11 +220,9 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
        unsigned long flags;
 
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               vc->preempt_tb = mftb();
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_start_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
                vcpu->arch.busy_preempt = mftb();
@@ -214,12 +235,12 @@ static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
        kvmppc_end_cede(vcpu);
 }
 
-void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 {
        vcpu->arch.pvr = pvr;
 }
 
-int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
        unsigned long pcr = 0;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -259,7 +280,7 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
        return 0;
 }
 
-void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -292,7 +313,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
               vcpu->arch.last_inst);
 }
 
-struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 {
        int r;
        struct kvm_vcpu *v, *ret = NULL;
@@ -641,7 +662,8 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 
        spin_lock(&vcore->lock);
        if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
-           vcore->vcore_state != VCORE_INACTIVE)
+           vcore->vcore_state != VCORE_INACTIVE &&
+           vcore->runner)
                target = vcore->runner;
        spin_unlock(&vcore->lock);
 
@@ -1431,6 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
        vcore->kvm = kvm;
+       INIT_LIST_HEAD(&vcore->preempt_list);
 
        vcore->mpp_buffer_is_valid = false;
 
@@ -1655,6 +1678,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
        vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+       vcpu->arch.thread_cpu = -1;
 
        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);
@@ -1749,6 +1773,7 @@ static int kvmppc_grab_hwthread(int cpu)
 
        /* Ensure the thread won't go into the kernel if it wakes */
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
        tpaca->kvm_hstate.napping = 0;
        smp_wmb();
        tpaca->kvm_hstate.hwthread_req = 1;
@@ -1780,26 +1805,32 @@ static void kvmppc_release_hwthread(int cpu)
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.hwthread_req = 0;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
+       tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
-static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
        int cpu;
        struct paca_struct *tpaca;
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *mvc = vc->master_vcore;
 
-       if (vcpu->arch.timer_running) {
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-               vcpu->arch.timer_running = 0;
+       cpu = vc->pcpu;
+       if (vcpu) {
+               if (vcpu->arch.timer_running) {
+                       hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+                       vcpu->arch.timer_running = 0;
+               }
+               cpu += vcpu->arch.ptid;
+               vcpu->cpu = mvc->pcpu;
+               vcpu->arch.thread_cpu = cpu;
        }
-       cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
-       tpaca->kvm_hstate.kvm_vcore = vc;
-       tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
-       vcpu->cpu = vc->pcpu;
-       /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
-       smp_wmb();
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+       /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
+       smp_wmb();
+       tpaca->kvm_hstate.kvm_vcore = mvc;
        if (cpu != smp_processor_id())
                kvmppc_ipi_thread(cpu);
 }
@@ -1812,12 +1843,12 @@ static void kvmppc_wait_for_nap(void)
        for (loops = 0; loops < 1000000; ++loops) {
                /*
                 * Check if all threads are finished.
-                * We set the vcpu pointer when starting a thread
+                * We set the vcore pointer when starting a thread
                 * and the thread clears it when finished, so we look
-                * for any threads that still have a non-NULL vcpu ptr.
+                * for any threads that still have a non-NULL vcore ptr.
                 */
                for (i = 1; i < threads_per_subcore; ++i)
-                       if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                       if (paca[cpu + i].kvm_hstate.kvm_vcore)
                                break;
                if (i == threads_per_subcore) {
                        HMT_medium();
@@ -1827,7 +1858,7 @@ static void kvmppc_wait_for_nap(void)
        }
        HMT_medium();
        for (i = 1; i < threads_per_subcore; ++i)
-               if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+               if (paca[cpu + i].kvm_hstate.kvm_vcore)
                        pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
@@ -1890,6 +1921,278 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
        mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+/*
+ * A list of virtual cores for each physical CPU.
+ * These are vcores that could run but their runner VCPU tasks are
+ * (or may be) preempted.
+ */
+struct preempted_vcore_list {
+       struct list_head        list;
+       spinlock_t              lock;
+};
+
+static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
+
+static void init_vcore_lists(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
+               spin_lock_init(&lp->lock);
+               INIT_LIST_HEAD(&lp->list);
+       }
+}
+
+static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+
+       vc->vcore_state = VCORE_PREEMPT;
+       vc->pcpu = smp_processor_id();
+       if (vc->num_threads < threads_per_subcore) {
+               spin_lock(&lp->lock);
+               list_add_tail(&vc->preempt_list, &lp->list);
+               spin_unlock(&lp->lock);
+       }
+
+       /* Start accumulating stolen time */
+       kvmppc_core_start_stolen(vc);
+}
+
+static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp;
+
+       kvmppc_core_end_stolen(vc);
+       if (!list_empty(&vc->preempt_list)) {
+               lp = &per_cpu(preempted_vcores, vc->pcpu);
+               spin_lock(&lp->lock);
+               list_del_init(&vc->preempt_list);
+               spin_unlock(&lp->lock);
+       }
+       vc->vcore_state = VCORE_INACTIVE;
+}
+
+/*
+ * This stores information about the virtual cores currently
+ * assigned to a physical core.
+ */
+struct core_info {
+       int             n_subcores;
+       int             max_subcore_threads;
+       int             total_threads;
+       int             subcore_threads[MAX_SUBCORES];
+       struct kvm      *subcore_vm[MAX_SUBCORES];
+       struct list_head vcs[MAX_SUBCORES];
+};
+
+/*
+ * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
+ * respectively in 2-way micro-threading (split-core) mode.
+ */
+static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
+
+static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
+{
+       int sub;
+
+       memset(cip, 0, sizeof(*cip));
+       cip->n_subcores = 1;
+       cip->max_subcore_threads = vc->num_threads;
+       cip->total_threads = vc->num_threads;
+       cip->subcore_threads[0] = vc->num_threads;
+       cip->subcore_vm[0] = vc->kvm;
+       for (sub = 0; sub < MAX_SUBCORES; ++sub)
+               INIT_LIST_HEAD(&cip->vcs[sub]);
+       list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+}
+
+static bool subcore_config_ok(int n_subcores, int n_threads)
+{
+       /* Can only dynamically split if unsplit to begin with */
+       if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
+               return false;
+       if (n_subcores > MAX_SUBCORES)
+               return false;
+       if (n_subcores > 1) {
+               if (!(dynamic_mt_modes & 2))
+                       n_subcores = 4;
+               if (n_subcores > 2 && !(dynamic_mt_modes & 4))
+                       return false;
+       }
+
+       return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
+}
+
+static void init_master_vcore(struct kvmppc_vcore *vc)
+{
+       vc->master_vcore = vc;
+       vc->entry_exit_map = 0;
+       vc->in_guest = 0;
+       vc->napping_threads = 0;
+       vc->conferring_threads = 0;
+}
+
+/*
+ * See if the existing subcores can be split into 3 (or fewer) subcores
+ * of at most two threads each, so we can fit in another vcore.  This
+ * assumes there are at most two subcores and at most 6 threads in total.
+ */
+static bool can_split_piggybacked_subcores(struct core_info *cip)
+{
+       int sub, new_sub;
+       int large_sub = -1;
+       int thr;
+       int n_subcores = cip->n_subcores;
+       struct kvmppc_vcore *vc, *vcnext;
+       struct kvmppc_vcore *master_vc = NULL;
+
+       for (sub = 0; sub < cip->n_subcores; ++sub) {
+               if (cip->subcore_threads[sub] <= 2)
+                       continue;
+               if (large_sub >= 0)
+                       return false;
+               large_sub = sub;
+               vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                                     preempt_list);
+               if (vc->num_threads > 2)
+                       return false;
+               n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
+       }
+       if (n_subcores > 3 || large_sub < 0)
+               return false;
+
+       /*
+        * Seems feasible, so go through and move vcores to new subcores.
+        * Note that when we have two or more vcores in one subcore,
+        * all those vcores must have only one thread each.
+        */
+       new_sub = cip->n_subcores;
+       thr = 0;
+       sub = large_sub;
+       list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
+               if (thr >= 2) {
+                       list_del(&vc->preempt_list);
+                       list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
+                       /* vc->num_threads must be 1 */
+                       if (++cip->subcore_threads[new_sub] == 1) {
+                               cip->subcore_vm[new_sub] = vc->kvm;
+                               init_master_vcore(vc);
+                               master_vc = vc;
+                               ++cip->n_subcores;
+                       } else {
+                               vc->master_vcore = master_vc;
+                               ++new_sub;
+                       }
+               }
+               thr += vc->num_threads;
+       }
+       cip->subcore_threads[large_sub] = 2;
+       cip->max_subcore_threads = 2;
+
+       return true;
+}
+
+static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
+{
+       int n_threads = vc->num_threads;
+       int sub;
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return false;
+
+       if (n_threads < cip->max_subcore_threads)
+               n_threads = cip->max_subcore_threads;
+       if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
+               cip->max_subcore_threads = n_threads;
+       } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
+                  vc->num_threads <= 2) {
+               /*
+                * We may be able to fit another subcore in by
+                * splitting an existing subcore with 3 or 4
+                * threads into two 2-thread subcores, or one
+                * with 5 or 6 threads into three subcores.
+                * We can only do this if those subcores have
+                * piggybacked virtual cores.
+                */
+               if (!can_split_piggybacked_subcores(cip))
+                       return false;
+       } else {
+               return false;
+       }
+
+       sub = cip->n_subcores;
+       ++cip->n_subcores;
+       cip->total_threads += vc->num_threads;
+       cip->subcore_threads[sub] = vc->num_threads;
+       cip->subcore_vm[sub] = vc->kvm;
+       init_master_vcore(vc);
+       list_del(&vc->preempt_list);
+       list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
+                                 struct core_info *cip, int sub)
+{
+       struct kvmppc_vcore *vc;
+       int n_thr;
+
+       vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                             preempt_list);
+
+       /* require same VM and same per-core reg values */
+       if (pvc->kvm != vc->kvm ||
+           pvc->tb_offset != vc->tb_offset ||
+           pvc->pcr != vc->pcr ||
+           pvc->lpcr != vc->lpcr)
+               return false;
+
+       /* P8 guest with > 1 thread per core would see wrong TIR value */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           (vc->num_threads > 1 || pvc->num_threads > 1))
+               return false;
+
+       n_thr = cip->subcore_threads[sub] + pvc->num_threads;
+       if (n_thr > cip->max_subcore_threads) {
+               if (!subcore_config_ok(cip->n_subcores, n_thr))
+                       return false;
+               cip->max_subcore_threads = n_thr;
+       }
+
+       cip->total_threads += pvc->num_threads;
+       cip->subcore_threads[sub] = n_thr;
+       pvc->master_vcore = vc;
+       list_del(&pvc->preempt_list);
+       list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+/*
+ * Work out whether it is possible to piggyback the execution of
+ * vcore *pvc onto the execution of the other vcores described in *cip.
+ */
+static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
+                         int target_threads)
+{
+       int sub;
+
+       if (cip->total_threads + pvc->num_threads > target_threads)
+               return false;
+       for (sub = 0; sub < cip->n_subcores; ++sub)
+               if (cip->subcore_threads[sub] &&
+                   can_piggyback_subcore(pvc, cip, sub))
+                       return true;
+
+       if (can_dynamic_split(pvc, cip))
+               return true;
+
+       return false;
+}
+
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
        struct kvm_vcpu *vcpu, *vnext;
@@ -1909,12 +2212,45 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        }
 }
 
-static void post_guest_process(struct kvmppc_vcore *vc)
+static void collect_piggybacks(struct core_info *cip, int target_threads)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+       struct kvmppc_vcore *pvc, *vcnext;
+
+       spin_lock(&lp->lock);
+       list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
+               if (!spin_trylock(&pvc->lock))
+                       continue;
+               prepare_threads(pvc);
+               if (!pvc->n_runnable) {
+                       list_del_init(&pvc->preempt_list);
+                       if (pvc->runner == NULL) {
+                               pvc->vcore_state = VCORE_INACTIVE;
+                               kvmppc_core_end_stolen(pvc);
+                       }
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               if (!can_piggyback(pvc, cip, target_threads)) {
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               kvmppc_core_end_stolen(pvc);
+               pvc->vcore_state = VCORE_PIGGYBACK;
+               if (cip->total_threads >= target_threads)
+                       break;
+       }
+       spin_unlock(&lp->lock);
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
+       int still_running = 0;
        u64 now;
        long ret;
        struct kvm_vcpu *vcpu, *vnext;
 
+       spin_lock(&vc->lock);
        now = get_tb();
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
@@ -1933,17 +2269,36 @@ static void post_guest_process(struct kvmppc_vcore *vc)
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
 
-               if (vcpu->arch.ceded) {
-                       if (!is_kvmppc_resume_guest(ret))
-                               kvmppc_end_cede(vcpu);
-                       else
+               if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       if (vcpu->arch.pending_exceptions)
+                               kvmppc_core_prepare_to_enter(vcpu);
+                       if (vcpu->arch.ceded)
                                kvmppc_set_timer(vcpu);
-               }
-               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       else
+                               ++still_running;
+               } else {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
+       list_del_init(&vc->preempt_list);
+       if (!is_master) {
+               if (still_running > 0) {
+                       kvmppc_vcore_preempt(vc);
+               } else if (vc->runner) {
+                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_core_start_stolen(vc);
+               } else {
+                       vc->vcore_state = VCORE_INACTIVE;
+               }
+               if (vc->n_runnable > 0 && vc->runner == NULL) {
+                       /* make sure there's a candidate runner awake */
+                       vcpu = list_first_entry(&vc->runnable_threads,
+                                               struct kvm_vcpu, arch.run_list);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+       spin_unlock(&vc->lock);
 }
 
 /*
@@ -1955,6 +2310,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        struct kvm_vcpu *vcpu, *vnext;
        int i;
        int srcu_idx;
+       struct core_info core_info;
+       struct kvmppc_vcore *pvc, *vcnext;
+       struct kvm_split_mode split_info, *sip;
+       int split, subcore_size, active;
+       int sub;
+       bool thr0_done;
+       unsigned long cmd_bit, stat_bit;
+       int pcpu, thr;
+       int target_threads;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -1969,11 +2333,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        /*
         * Initialize *vc.
         */
-       vc->entry_exit_map = 0;
+       init_master_vcore(vc);
        vc->preempt_tb = TB_NIL;
-       vc->in_guest = 0;
-       vc->napping_threads = 0;
-       vc->conferring_threads = 0;
 
        /*
         * Make sure we are running on primary threads, and that secondary
@@ -1991,24 +2352,120 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                goto out;
        }
 
+       /*
+        * See if we could run any other vcores on the physical core
+        * along with this one.
+        */
+       init_core_info(&core_info, vc);
+       pcpu = smp_processor_id();
+       target_threads = threads_per_subcore;
+       if (target_smt_mode && target_smt_mode < target_threads)
+               target_threads = target_smt_mode;
+       if (vc->num_threads < target_threads)
+               collect_piggybacks(&core_info, target_threads);
+
+       /* Decide on micro-threading (split-core) mode */
+       subcore_size = threads_per_subcore;
+       cmd_bit = stat_bit = 0;
+       split = core_info.n_subcores;
+       sip = NULL;
+       if (split > 1) {
+               /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
+               if (split == 2 && (dynamic_mt_modes & 2)) {
+                       cmd_bit = HID0_POWER8_1TO2LPAR;
+                       stat_bit = HID0_POWER8_2LPARMODE;
+               } else {
+                       split = 4;
+                       cmd_bit = HID0_POWER8_1TO4LPAR;
+                       stat_bit = HID0_POWER8_4LPARMODE;
+               }
+               subcore_size = MAX_SMT_THREADS / split;
+               sip = &split_info;
+               memset(&split_info, 0, sizeof(split_info));
+               split_info.rpr = mfspr(SPRN_RPR);
+               split_info.pmmar = mfspr(SPRN_PMMAR);
+               split_info.ldbar = mfspr(SPRN_LDBAR);
+               split_info.subcore_size = subcore_size;
+               for (sub = 0; sub < core_info.n_subcores; ++sub)
+                       split_info.master_vcs[sub] =
+                               list_first_entry(&core_info.vcs[sub],
+                                       struct kvmppc_vcore, preempt_list);
+               /* order writes to split_info before kvm_split_mode pointer */
+               smp_wmb();
+       }
+       pcpu = smp_processor_id();
+       for (thr = 0; thr < threads_per_subcore; ++thr)
+               paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+
+       /* Initiate micro-threading (split-core) if required */
+       if (cmd_bit) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+
+               hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (hid0 & stat_bit)
+                               break;
+                       cpu_relax();
+               }
+       }
 
-       vc->pcpu = smp_processor_id();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               kvmppc_start_thread(vcpu);
-               kvmppc_create_dtl_entry(vcpu, vc);
-               trace_kvm_guest_enter(vcpu);
+       /* Start all the threads */
+       active = 0;
+       for (sub = 0; sub < core_info.n_subcores; ++sub) {
+               thr = subcore_thread_map[sub];
+               thr0_done = false;
+               active |= 1 << thr;
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
+                       pvc->pcpu = pcpu + thr;
+                       list_for_each_entry(vcpu, &pvc->runnable_threads,
+                                           arch.run_list) {
+                               kvmppc_start_thread(vcpu, pvc);
+                               kvmppc_create_dtl_entry(vcpu, pvc);
+                               trace_kvm_guest_enter(vcpu);
+                               if (!vcpu->arch.ptid)
+                                       thr0_done = true;
+                               active |= 1 << (thr + vcpu->arch.ptid);
+                       }
+                       /*
+                        * We need to start the first thread of each subcore
+                        * even if it doesn't have a vcpu.
+                        */
+                       if (pvc->master_vcore == pvc && !thr0_done)
+                               kvmppc_start_thread(NULL, pvc);
+                       thr += pvc->num_threads;
+               }
        }
 
-       /* Set this explicitly in case thread 0 doesn't have a vcpu */
-       get_paca()->kvm_hstate.kvm_vcore = vc;
-       get_paca()->kvm_hstate.ptid = 0;
+       /*
+        * Ensure that split_info.do_nap is set after setting
+        * the vcore pointer in the PACA of the secondaries.
+        */
+       smp_mb();
+       if (cmd_bit)
+               split_info.do_nap = 1;  /* ask secondaries to nap when done */
+
+       /*
+        * When doing micro-threading, poke the inactive threads as well.
+        * This gets them to the nap instruction after kvm_do_nap,
+        * which reduces the time taken to unsplit later.
+        */
+       if (split > 1)
+               for (thr = 1; thr < threads_per_subcore; ++thr)
+                       if (!(active & (1 << thr)))
+                               kvmppc_ipi_thread(pcpu + thr);
 
        vc->vcore_state = VCORE_RUNNING;
        preempt_disable();
 
        trace_kvmppc_run_core(vc, 0);
 
-       spin_unlock(&vc->lock);
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
+                       spin_unlock(&pvc->lock);
 
        kvm_guest_enter();
 
@@ -2019,32 +2476,58 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        __kvmppc_vcore_entry();
 
-       spin_lock(&vc->lock);
-
        if (vc->mpp_buffer)
                kvmppc_start_saving_l2_cache(vc);
 
-       /* disable sending of IPIs on virtual external irqs */
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-               vcpu->cpu = -1;
-       /* wait for secondary threads to finish writing their state to memory */
-       kvmppc_wait_for_nap();
-       for (i = 0; i < threads_per_subcore; ++i)
-               kvmppc_release_hwthread(vc->pcpu + i);
+       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+       spin_lock(&vc->lock);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
        vc->vcore_state = VCORE_EXITING;
-       spin_unlock(&vc->lock);
 
-       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+       /* wait for secondary threads to finish writing their state to memory */
+       kvmppc_wait_for_nap();
+
+       /* Return to whole-core mode if we split the core earlier */
+       if (split > 1) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+               unsigned long loops = 0;
+
+               hid0 &= ~HID0_POWER8_DYNLPARDIS;
+               stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (!(hid0 & stat_bit))
+                               break;
+                       cpu_relax();
+                       ++loops;
+               }
+               split_info.do_nap = 0;
+       }
+
+       /* Let secondaries go back to the offline loop */
+       for (i = 0; i < threads_per_subcore; ++i) {
+               kvmppc_release_hwthread(pcpu + i);
+               if (sip && sip->napped[i])
+                       kvmppc_ipi_thread(pcpu + i);
+       }
+
+       spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
        kvm_guest_exit();
 
-       preempt_enable();
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
+                                        preempt_list)
+                       post_guest_process(pvc, pvc == vc);
 
        spin_lock(&vc->lock);
-       post_guest_process(vc);
+       preempt_enable();
 
  out:
        vc->vcore_state = VCORE_INACTIVE;
@@ -2055,13 +2538,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  */
-static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
+static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
+                                struct kvm_vcpu *vcpu, int wait_state)
 {
        DEFINE_WAIT(wait);
 
        prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+               spin_unlock(&vc->lock);
                schedule();
+               spin_lock(&vc->lock);
+       }
        finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
@@ -2137,9 +2624,21 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         * this thread straight away and have it join in.
         */
        if (!signal_pending(current)) {
-               if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
+               if (vc->vcore_state == VCORE_PIGGYBACK) {
+                       struct kvmppc_vcore *mvc = vc->master_vcore;
+                       if (spin_trylock(&mvc->lock)) {
+                               if (mvc->vcore_state == VCORE_RUNNING &&
+                                   !VCORE_IS_EXITING(mvc)) {
+                                       kvmppc_create_dtl_entry(vcpu, vc);
+                                       kvmppc_start_thread(vcpu, vc);
+                                       trace_kvm_guest_enter(vcpu);
+                               }
+                               spin_unlock(&mvc->lock);
+                       }
+               } else if (vc->vcore_state == VCORE_RUNNING &&
+                          !VCORE_IS_EXITING(vc)) {
                        kvmppc_create_dtl_entry(vcpu, vc);
-                       kvmppc_start_thread(vcpu);
+                       kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
                        wake_up(&vc->wq);
@@ -2149,10 +2648,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               !signal_pending(current)) {
+               if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+                       kvmppc_vcore_end_preempt(vc);
+
                if (vc->vcore_state != VCORE_INACTIVE) {
-                       spin_unlock(&vc->lock);
-                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
-                       spin_lock(&vc->lock);
+                       kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
                        continue;
                }
                list_for_each_entry_safe(v, vn, &vc->runnable_threads,
@@ -2179,10 +2679,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
                } else if (need_resched()) {
-                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_vcore_preempt(vc);
                        /* Let something else run */
                        cond_resched_lock(&vc->lock);
-                       vc->vcore_state = VCORE_INACTIVE;
+                       if (vc->vcore_state == VCORE_PREEMPT)
+                               kvmppc_vcore_end_preempt(vc);
                } else {
                        kvmppc_run_core(vc);
                }
@@ -2191,11 +2692,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               (vc->vcore_state == VCORE_RUNNING ||
-               vc->vcore_state == VCORE_EXITING)) {
-               spin_unlock(&vc->lock);
-               kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
-               spin_lock(&vc->lock);
-       }
+               vc->vcore_state == VCORE_EXITING))
+               kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
 
        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
                kvmppc_remove_runnable(vc, vcpu);
@@ -2755,6 +3253,8 @@ static int kvmppc_book3s_init_hv(void)
 
        init_default_hcalls();
 
+       init_vcore_lists();
+
        r = kvmppc_mmu_hv_init();
        return r;
 }
index ed2589d..fd7006b 100644 (file)
@@ -110,14 +110,15 @@ void __init kvm_cma_reserve(void)
 long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                            unsigned int yield_count)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+       int ptid = local_paca->kvm_hstate.ptid;
        int threads_running;
        int threads_ceded;
        int threads_conferring;
        u64 stop = get_tb() + 10 * tb_ticks_per_usec;
        int rv = H_SUCCESS; /* => don't yield */
 
-       set_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       set_bit(ptid, &vc->conferring_threads);
        while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
                threads_running = VCORE_ENTRY_MAP(vc);
                threads_ceded = vc->napping_threads;
@@ -127,7 +128,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                        break;
                }
        }
-       clear_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       clear_bit(ptid, &vc->conferring_threads);
        return rv;
 }
 
@@ -238,7 +239,8 @@ void kvmhv_commence_exit(int trap)
 {
        struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
        int ptid = local_paca->kvm_hstate.ptid;
-       int me, ee;
+       struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
+       int me, ee, i;
 
        /* Set our bit in the threads-exiting-guest map in the 0xff00
           bits of vcore->entry_exit_map */
@@ -258,4 +260,26 @@ void kvmhv_commence_exit(int trap)
         */
        if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
                kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+
+       /*
+        * If we are doing dynamic micro-threading, interrupt the other
+        * subcores to pull them out of their guests too.
+        */
+       if (!sip)
+               return;
+
+       for (i = 0; i < MAX_SUBCORES; ++i) {
+               vc = sip->master_vcs[i];
+               if (!vc)
+                       break;
+               do {
+                       ee = vc->entry_exit_map;
+                       /* Already asked to exit? */
+                       if ((ee >> 8) != 0)
+                               break;
+               } while (cmpxchg(&vc->entry_exit_map, ee,
+                                ee | VCORE_EXIT_REQ) != ee);
+               if ((ee >> 8) == 0)
+                       kvmhv_interrupt_vcore(vc, ee);
+       }
 }
index b027a89..c1df9bb 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
+#include <linux/log2.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -97,25 +98,52 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/* Update the changed page order field of an rmap entry */
+void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+{
+       unsigned long order;
+
+       if (!psize)
+               return;
+       order = ilog2(psize);
+       order <<= KVMPPC_RMAP_CHG_SHIFT;
+       if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
+               *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
+
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+                                     unsigned long hpte_gr)
+{
+       struct kvm_memory_slot *memslot;
+       unsigned long *rmap;
+       unsigned long gfn;
+
+       gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+       if (!memslot)
+               return NULL;
+
+       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+       return rmap;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
 {
        struct revmap_entry *next, *prev;
-       unsigned long gfn, ptel, head;
-       struct kvm_memory_slot *memslot;
+       unsigned long ptel, head;
        unsigned long *rmap;
        unsigned long rcbits;
 
        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
-       gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
-       if (!memslot)
+       rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+       if (!rmap)
                return;
-
-       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);
 
        head = *rmap & KVMPPC_RMAP_INDEX;
@@ -131,6 +159,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+       if (rcbits & HPTE_R_C)
+               kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
        unlock_rmap(rmap);
 }
 
@@ -421,14 +451,20 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = pte & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
-               u64 pte1;
-
-               pte1 = be64_to_cpu(hpte[1]);
                hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-               rb = compute_tlbie_rb(v, pte1, pte_index);
+               rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
-               /* Read PTE low word after tlbie to get final R/C values */
-               remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+               /*
+                * The reference (R) and change (C) bits in a HPT
+                * entry can be set by hardware at any time up until
+                * the HPTE is invalidated and the TLB invalidation
+                * sequence has completed.  This means that when
+                * removing a HPTE, we need to re-read the HPTE after
+                * the invalidation sequence has completed in order to
+                * obtain reliable values of R and C.
+                */
+               remove_revmap_chain(kvm, pte_index, rev, v,
+                                   be64_to_cpu(hpte[1]));
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
@@ -655,6 +691,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
        return H_SUCCESS;
 }
 
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (rev->guest_rpte & HPTE_R_R) {
+               rev->guest_rpte &= ~HPTE_R_R;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_R) {
+                       kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_REFERENCED;
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (gr & HPTE_R_C) {
+               rev->guest_rpte &= ~HPTE_R_C;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               /* need to make it temporarily absent so C is stable */
+               hpte[0] |= cpu_to_be64(HPTE_V_ABSENT);
+               kvmppc_invalidate_hpte(kvm, hpte, pte_index);
+               r = be64_to_cpu(hpte[1]);
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_C) {
+                       unsigned long psize = hpte_page_size(v, r);
+                       hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
+                       eieio();
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_CHANGED;
+                               kvmppc_update_rmap_change(rmap, psize);
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
 void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index)
 {
index 00e45b6..24f5807 100644 (file)
@@ -67,14 +67,12 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
        }
 
        /* Check if the core is loaded, if not, too hard */
-       cpu = vcpu->cpu;
+       cpu = vcpu->arch.thread_cpu;
        if (cpu < 0 || cpu >= nr_cpu_ids) {
                this_icp->rm_action |= XICS_RM_KICK_VCPU;
                this_icp->rm_kick_target = vcpu;
                return;
        }
-       /* In SMT cpu will always point to thread 0, we adjust it */
-       cpu += vcpu->arch.ptid;
 
        smp_mb();
        kvmhv_rm_send_ipi(cpu);
index faa86e9..2273dca 100644 (file)
@@ -128,6 +128,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        subf    r4, r4, r3
        mtspr   SPRN_DEC, r4
 
+       /* hwthread_req may have got set by cede or no vcpu, so clear it */
+       li      r0, 0
+       stb     r0, HSTATE_HWTHREAD_REQ(r13)
+
        /*
         * For external and machine check interrupts, we need
         * to call the Linux handler to process the interrupt.
@@ -215,7 +219,6 @@ kvm_novcpu_wakeup:
        ld      r5, HSTATE_KVM_VCORE(r13)
        li      r0, 0
        stb     r0, HSTATE_NAPPING(r13)
-       stb     r0, HSTATE_HWTHREAD_REQ(r13)
 
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
@@ -315,10 +318,10 @@ kvm_start_guest:
        cmpdi   r3, 0
        bge     kvm_no_guest
 
-       /* get vcpu pointer, NULL if we have no vcpu to run */
-       ld      r4,HSTATE_KVM_VCPU(r13)
-       cmpdi   r4,0
-       /* if we have no vcpu to run, go back to sleep */
+       /* get vcore pointer, NULL if we have nothing to run */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       cmpdi   r5,0
+       /* if we have no vcore to run, go back to sleep */
        beq     kvm_no_guest
 
 kvm_secondary_got_guest:
@@ -327,21 +330,42 @@ kvm_secondary_got_guest:
        ld      r6, PACA_DSCR_DEFAULT(r13)
        std     r6, HSTATE_DSCR(r13)
 
-       /* Order load of vcore, ptid etc. after load of vcpu */
+       /* On thread 0 of a subcore, set HDEC to max */
+       lbz     r4, HSTATE_PTID(r13)
+       cmpwi   r4, 0
+       bne     63f
+       lis     r6, 0x7fff
+       ori     r6, r6, 0xffff
+       mtspr   SPRN_HDEC, r6
+       /* and set per-LPAR registers, if doing dynamic micro-threading */
+       ld      r6, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r6, 0
+       beq     63f
+       ld      r0, KVM_SPLIT_RPR(r6)
+       mtspr   SPRN_RPR, r0
+       ld      r0, KVM_SPLIT_PMMAR(r6)
+       mtspr   SPRN_PMMAR, r0
+       ld      r0, KVM_SPLIT_LDBAR(r6)
+       mtspr   SPRN_LDBAR, r0
+       isync
+63:
+       /* Order load of vcpu after load of vcore */
        lwsync
+       ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_hv_entry
 
        /* Back from the guest, go back to nap */
-       /* Clear our vcpu pointer so we don't come back in early */
+       /* Clear our vcpu and vcore pointers so we don't come back in early */
        li      r0, 0
+       std     r0, HSTATE_KVM_VCPU(r13)
        /*
-        * Once we clear HSTATE_KVM_VCPU(r13), the code in
+        * Once we clear HSTATE_KVM_VCORE(r13), the code in
         * kvmppc_run_core() is going to assume that all our vcpu
         * state is visible in memory.  This lwsync makes sure
         * that that is true.
         */
        lwsync
-       std     r0, HSTATE_KVM_VCPU(r13)
+       std     r0, HSTATE_KVM_VCORE(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -374,16 +398,71 @@ kvm_no_guest:
        b       power7_wakeup_loss
 
 53:    HMT_LOW
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       cmpdi   r4, 0
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       cmpdi   r5, 0
+       bne     60f
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r3, 0
+       beq     kvm_no_guest
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
        beq     kvm_no_guest
        HMT_MEDIUM
+       b       kvm_unsplit_nap
+60:    HMT_MEDIUM
        b       kvm_secondary_got_guest
 
 54:    li      r0, KVM_HWTHREAD_IN_KVM
        stb     r0, HSTATE_HWTHREAD_STATE(r13)
        b       kvm_no_guest
 
+/*
+ * Here the primary thread is trying to return the core to
+ * whole-core mode, so we need to nap.
+ */
+kvm_unsplit_nap:
+       /*
+        * Ensure that secondary doesn't nap when it has
+        * its vcore pointer set.
+        */
+       sync            /* matches smp_mb() before setting split_info.do_nap */
+       ld      r0, HSTATE_KVM_VCORE(r13)
+       cmpdi   r0, 0
+       bne     kvm_no_guest
+       /* clear any pending message */
+BEGIN_FTR_SECTION
+       lis     r6, (PPC_DBELL_SERVER << (63-36))@h
+       PPC_MSGCLR(6)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       /* Set kvm_split_mode.napped[tid] = 1 */
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       li      r0, 1
+       lhz     r4, PACAPACAINDEX(r13)
+       clrldi  r4, r4, 61      /* micro-threading => P8 => 8 threads/core */
+       addi    r4, r4, KVM_SPLIT_NAPPED
+       stbx    r0, r3, r4
+       /* Check the do_nap flag again after setting napped[] */
+       sync
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
+       beq     57f
+       li      r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
+       mfspr   r4, SPRN_LPCR
+       rlwimi  r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+       mtspr   SPRN_LPCR, r4
+       isync
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+57:    li      r0, 0
+       stbx    r0, r3, r4
+       b       kvm_no_guest
+
 /******************************************************************************
  *                                                                            *
  *                               Entry code                                   *
@@ -854,7 +933,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        cmpwi   r0, 0
        bne     21f
        HMT_LOW
-20:    lbz     r0, VCORE_IN_GUEST(r5)
+20:    lwz     r3, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r3, 0x100
+       bge     no_switch_exit
+       lbz     r0, VCORE_IN_GUEST(r5)
        cmpwi   r0, 0
        beq     20b
        HMT_MEDIUM
@@ -870,7 +952,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        blt     hdec_soon
 
        ld      r6, VCPU_CTR(r4)
-       lwz     r7, VCPU_XER(r4)
+       l     r7, VCPU_XER(r4)
 
        mtctr   r6
        mtxer   r7
@@ -985,9 +1067,13 @@ secondary_too_late:
 #endif
 11:    b       kvmhv_switch_to_host
 
+no_switch_exit:
+       HMT_MEDIUM
+       li      r12, 0
+       b       12f
 hdec_soon:
        li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
-       stw     r12, VCPU_TRAP(r4)
+12:    stw     r12, VCPU_TRAP(r4)
        mr      r9, r4
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        addi    r3, r4, VCPU_TB_RMEXIT
@@ -1103,7 +1189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        mfctr   r3
        mfxer   r4
        std     r3, VCPU_CTR(r9)
-       stw     r4, VCPU_XER(r9)
+       std     r4, VCPU_XER(r9)
 
        /* If this is a page table miss then see if it's theirs or ours */
        cmpwi   r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
@@ -1127,6 +1213,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
        bne     3f
        lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
        beq     4f
        b       guest_exit_cont
 3:
@@ -1176,6 +1263,11 @@ mc_cont:
        ld      r9, HSTATE_KVM_VCPU(r13)
        lwz     r12, VCPU_TRAP(r9)
 
+       /* Stop others sending VCPU interrupts to this physical CPU */
+       li      r0, -1
+       stw     r0, VCPU_CPU(r9)
+       stw     r0, VCPU_THREAD_CPU(r9)
+
        /* Save guest CTRL register, set runlatch to 1 */
        mfspr   r6,SPRN_CTRLF
        stw     r6,VCPU_CTRL(r9)
@@ -1540,12 +1632,17 @@ kvmhv_switch_to_host:
 
        /* Primary thread waits for all the secondaries to exit guest */
 15:    lwz     r3,VCORE_ENTRY_EXIT(r5)
-       srwi    r0,r3,8
+       rlwinm  r0,r3,32-8,0xff
        clrldi  r3,r3,56
        cmpw    r3,r0
        bne     15b
        isync
 
+       /* Did we actually switch to the guest at all? */
+       lbz     r6, VCORE_IN_GUEST(r5)
+       cmpwi   r6, 0
+       beq     19f
+
        /* Primary thread switches back to host partition */
        ld      r6,KVM_HOST_SDR1(r4)
        lwz     r7,KVM_HOST_LPID(r4)
@@ -1589,7 +1686,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 18:
        /* Signal secondary CPUs to continue */
        stb     r0,VCORE_IN_GUEST(r5)
-       lis     r8,0x7fff               /* MAX_INT@h */
+19:    lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
 
 16:    ld      r8,KVM_HOST_LPCR(r4)
@@ -1675,7 +1772,7 @@ kvmppc_hdsi:
        bl      kvmppc_msr_interrupt
 fast_interrupt_c_return:
 6:     ld      r7, VCPU_CTR(r9)
-       lwz     r8, VCPU_XER(r9)
+       l     r8, VCPU_XER(r9)
        mtctr   r7
        mtxer   r8
        mr      r4, r9
@@ -1816,8 +1913,8 @@ hcall_real_table:
        .long   DOTSYM(kvmppc_h_remove) - hcall_real_table
        .long   DOTSYM(kvmppc_h_enter) - hcall_real_table
        .long   DOTSYM(kvmppc_h_read) - hcall_real_table
-       .long   0               /* 0x10 - H_CLEAR_MOD */
-       .long   0               /* 0x14 - H_CLEAR_REF */
+       .long   DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
+       .long   DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
        .long   DOTSYM(kvmppc_h_protect) - hcall_real_table
        .long   DOTSYM(kvmppc_h_get_tce) - hcall_real_table
        .long   DOTSYM(kvmppc_h_put_tce) - hcall_real_table
index bd6ab16..a759d9a 100644 (file)
@@ -352,7 +352,7 @@ static inline u32 inst_get_field(u32 inst, int msb, int lsb)
        return kvmppc_get_field(inst, msb + 32, lsb + 32);
 }
 
-bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
 {
        if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
                return false;
index acee37c..ca8f174 100644 (file)
@@ -123,7 +123,7 @@ no_dcbz32_on:
        PPC_LL  r8, SVCPU_CTR(r3)
        PPC_LL  r9, SVCPU_LR(r3)
        lwz     r10, SVCPU_CR(r3)
-       lwz     r11, SVCPU_XER(r3)
+       PPC_LL  r11, SVCPU_XER(r3)
 
        mtctr   r8
        mtlr    r9
@@ -237,7 +237,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
        mfctr   r8
        mflr    r9
 
-       stw     r5, SVCPU_XER(r13)
+       PPC_STL r5, SVCPU_XER(r13)
        PPC_STL r6, SVCPU_FAULT_DAR(r13)
        stw     r7, SVCPU_FAULT_DSISR(r13)
        PPC_STL r8, SVCPU_CTR(r13)
index c6ca7db..905e94a 100644 (file)
@@ -41,7 +41,7 @@
  * =======
  *
  * Each ICS has a spin lock protecting the information about the IRQ
- * sources and avoiding simultaneous deliveries if the same interrupt.
+ * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
  * (most ICP state fits in the union kvmppc_icp_state)
index cc58426..ae458f0 100644 (file)
@@ -933,6 +933,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 #endif
                break;
        case BOOKE_INTERRUPT_CRITICAL:
+               kvmppc_fill_pt_regs(&regs);
                unknown_exception(&regs);
                break;
        case BOOKE_INTERRUPT_DEBUG:
index 50860e9..29911a0 100644 (file)
@@ -377,7 +377,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
                        | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
                vcpu->arch.shared->mas1 =
                          (vcpu->arch.shared->mas6 & MAS6_SPID0)
-                       | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+                       | ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0)
                        | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
                vcpu->arch.shared->mas2 &= MAS2_EPN;
                vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
index e5dde32..2e51289 100644 (file)
@@ -660,7 +660,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
        return kvmppc_core_pending_dec(vcpu);
 }
 
-enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
 {
        struct kvm_vcpu *vcpu;
 
index 4827870..1d57000 100644 (file)
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 
 config KEXEC
        def_bool y
+       select KEXEC_CORE
 
 config AUDIT_ARCH
        def_bool y
index 42506b3..4da604e 100644 (file)
@@ -167,7 +167,7 @@ unsigned long decompress_kernel(void)
 #endif
 
        puts("Uncompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        puts("Ok, booting the kernel.\n");
        return (unsigned long) output;
 }
index 9d39596..b3fd54d 100644 (file)
@@ -18,27 +18,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &s390_dma_ops;
 }
 
-extern int dma_set_mask(struct device *dev, u64 mask);
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction direction)
 {
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -46,45 +32,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 #endif /* _ASM_S390_DMA_MAPPING_H */
index 42b7658..37505b8 100644 (file)
@@ -262,16 +262,6 @@ out:
        spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
 }
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(dma_set_mask);
-
 static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
                                     unsigned long offset, size_t size,
                                     enum dma_data_direction direction,
index 50057fe..d514df7 100644 (file)
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
        depends on SUPERH32 && MMU
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 95470a4..208a975 100644 (file)
@@ -132,7 +132,7 @@ void decompress_kernel(void)
 
        puts("Uncompressing Linux... ");
        cache_control(CACHE_ENABLE);
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        cache_control(CACHE_DISABLE);
        puts("Ok, booting the kernel.\n");
 }
index b437f2c..a3745a3 100644 (file)
@@ -9,86 +9,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-coherent.h>
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (ops->dma_supported)
-               return ops->dma_supported(dev, mask);
-
-       return 1;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
+#define DMA_ERROR_CODE 0
 
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
+#include <asm-generic/dma-mapping-common.h>
 
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction dir);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return dma_addr == 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-       if (ops->free)
-               ops->free(dev, size, vaddr, dma_handle, attrs);
-}
-
 /* arch/sh/mm/consistent.c */
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
index 7e064c6..a21da59 100644 (file)
@@ -7,11 +7,9 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 int dma_supported(struct device *dev, u64 mask);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction dir)
 {
@@ -39,39 +37,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return (dma_addr == DMA_ERROR_CODE);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
@@ -86,4 +52,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
        return -EINVAL;
 }
 
+#include <asm-generic/dma-mapping-common.h>
+
 #endif
index 2ba12d7..106c21b 100644 (file)
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 1eae359..96ac6cc 100644 (file)
@@ -59,8 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-#include <asm-generic/dma-mapping-common.h>
-
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
        dev->archdata.dma_ops = ops;
@@ -74,18 +72,9 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
 static inline int
 dma_set_mask(struct device *dev, u64 mask)
@@ -116,36 +105,6 @@ dma_set_mask(struct device *dev, u64 mask)
        return 0;
 }
 
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-
 /*
  * dma_alloc_noncoherent() is #defined to return coherent memory,
  * so there's no need to do any flushing here.
index 176d5bd..5c65dfe 100644 (file)
@@ -119,8 +119,8 @@ unsigned long decompress_kernel(unsigned long output_start,
        output_ptr = get_unaligned_le32(tmp);
 
        arch_decomp_puts("Uncompressing Linux...");
-       decompress(input_data, input_data_end - input_data, NULL, NULL,
-                       output_data, NULL, error);
+       __decompress(input_data, input_data_end - input_data, NULL, NULL,
+                       output_data, 0, NULL, error);
        arch_decomp_puts(" done, booting the kernel.\n");
        return output_ptr;
 }
index 366460a..8140e05 100644 (file)
@@ -18,8 +18,6 @@
 #include <linux/scatterlist.h>
 #include <linux/swiotlb.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <asm/memory.h>
 #include <asm/cacheflush.h>
 
@@ -30,26 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &swiotlb_dma_map_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (unlikely(dma_ops == NULL))
-               return 0;
-
-       return dma_ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
@@ -72,41 +50,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
-
-       return 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       return dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr,
                size_t size, enum dma_data_direction direction)
 {
index cc0d73e..328c835 100644 (file)
@@ -1006,7 +1006,7 @@ config X86_THERMAL_VECTOR
        depends on X86_MCE_INTEL
 
 config X86_LEGACY_VM86
-       bool "Legacy VM86 support (obsolete)"
+       bool "Legacy VM86 support"
        default n
        depends on X86_32
        ---help---
@@ -1018,19 +1018,20 @@ config X86_LEGACY_VM86
          available to accelerate real mode DOS programs.  However, any
          recent version of DOSEMU, X, or vbetool should be fully
          functional even without kernel VM86 support, as they will all
-         fall back to (pretty well performing) software emulation.
+         fall back to software emulation. Nevertheless, if you are using
+         a 16-bit DOS program where 16-bit performance matters, vm86
+         mode might be faster than emulation and you might want to
+         enable this option.
 
-         Anything that works on a 64-bit kernel is unlikely to need
-         this option, as 64-bit kernels don't, and can't, support V8086
-         mode.  This option is also unrelated to 16-bit protected mode
-         and is not needed to run most 16-bit programs under Wine.
+         Note that any app that works on a 64-bit kernel is unlikely to
+         need this option, as 64-bit kernels don't, and can't, support
+         V8086 mode. This option is also unrelated to 16-bit protected
+         mode and is not needed to run most 16-bit programs under Wine.
 
-         Enabling this option adds considerable attack surface to the
-         kernel and slows down system calls and exception handling.
+         Enabling this option increases the complexity of the kernel
+         and slows down exception handling a tiny bit.
 
-         Unless you use very old userspace or need the last drop of
-         performance in your real mode DOS games and can't use KVM,
-         say N here.
+         If unsure, say N here.
 
 config VM86
        bool
@@ -1754,6 +1755,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
@@ -1770,8 +1772,8 @@ config KEXEC
 
 config KEXEC_FILE
        bool "kexec file based system call"
+       select KEXEC_CORE
        select BUILD_BIN2C
-       depends on KEXEC
        depends on X86_64
        depends on CRYPTO=y
        depends on CRYPTO_SHA256=y
index f637979..79dac17 100644 (file)
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 #endif
 
        debug_putstr("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, output_len,
+                       NULL, error);
        parse_elf(output);
        /*
         * 32-bit always performs relocations. 64-bit relocations are only
index 16ef025..2d6b309 100644 (file)
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
index 477bfa6..7663c45 100644 (file)
 372    i386    recvmsg                 sys_recvmsg                     compat_sys_recvmsg
 373    i386    shutdown                sys_shutdown
 374    i386    userfaultfd             sys_userfaultfd
+375    i386    membarrier              sys_membarrier
index 81c4906..278842f 100644 (file)
 321    common  bpf                     sys_bpf
 322    64      execveat                stub_execveat
 323    common  userfaultfd             sys_userfaultfd
+324    common  membarrier              sys_membarrier
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index 26a46f4..b160c0c 100644 (file)
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
 {
        return "[vsyscall]";
 }
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
 };
 static struct vm_area_struct gate_vma = {
index 477fc28..e6cf2ad 100644 (file)
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
 
 /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
 #define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
index 1f5b728..953b726 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
 
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-               gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs);
-
 #endif
index 32ce713..b130d59 100644 (file)
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
index ce029e4..31247b5 100644 (file)
@@ -97,7 +97,6 @@ struct pv_lazy_ops {
 struct pv_time_ops {
        unsigned long long (*sched_clock)(void);
        unsigned long long (*steal_clock)(int cpu);
-       unsigned long (*get_tsc_khz)(void);
 };
 
 struct pv_cpu_ops {
index 9d51fae..eaba080 100644 (file)
@@ -39,18 +39,27 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
 }
 #endif
 
-#define virt_queued_spin_lock virt_queued_spin_lock
-
-static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifdef CONFIG_PARAVIRT
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
 {
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
                return false;
 
-       while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
-               cpu_relax();
+       /*
+        * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+        * back to a Test-and-Set spinlock, because fair locks have
+        * horrible lock 'holder' preemption issues.
+        */
+
+       do {
+               while (atomic_read(&lock->val) != 0)
+                       cpu_relax();
+       } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
 
        return true;
 }
+#endif /* CONFIG_PARAVIRT */
 
 #include <asm-generic/qspinlock.h>
 
index a3804fb..0679e11 100644 (file)
@@ -101,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
        unsigned long mfn;
 
+       /*
+        * Some x86 code are still using pfn_to_mfn instead of
+        * pfn_to_mfn. This will have to be removed when we figured
+        * out which call.
+        */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return pfn;
 
@@ -147,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
        unsigned long pfn;
 
+       /*
+        * Some x86 code are still using mfn_to_pfn instead of
+        * gfn_to_pfn. This will have to be removed when we figure
+        * out which call.
+        */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return mfn;
 
@@ -176,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
        return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
 }
 
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return pfn;
+       else
+               return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return gfn;
+       else
+               return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn)                pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn)                gfn_to_pfn(bfn)
+
 /*
  * We detect special mappings in one of two ways:
  *  1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -196,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
  *      require. In all the cases we care about, the FOREIGN_FRAME bit is
  *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
  */
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
 {
        unsigned long pfn;
 
@@ -215,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 #define virt_to_mfn(v)         (pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v)         (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g)         (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
 static inline unsigned long pte_mfn(pte_t pte)
 {
        return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -262,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);
 
 static inline bool xen_arch_need_swiotlb(struct device *dev,
                                         unsigned long pfn,
-                                        unsigned long mfn)
+                                        unsigned long bfn)
 {
        return false;
 }
index 9ffdf25..b1b78ff 100644 (file)
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)               += livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)  += ftrace.o
 obj-$(CONFIG_X86_TSC)          += trace_clock.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)       += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)       += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)       += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
 obj-y                          += kprobes/
index c42827e..25f9093 100644 (file)
@@ -338,10 +338,15 @@ done:
 
 static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
 {
+       unsigned long flags;
+
        if (instr[0] != 0x90)
                return;
 
+       local_irq_save(flags);
        add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+       sync_core();
+       local_irq_restore(flags);
 
        DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
                   instr, a->instrlen - a->padlen, a->padlen);
index 3ca3e46..24e94ce 100644 (file)
@@ -336,6 +336,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
        apic_write(APIC_LVTT, lvtt_value);
 
        if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+               /*
+                * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+                * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+                * According to Intel, MFENCE can do the serialization here.
+                */
+               asm volatile("mfence" : : : "memory");
+
                printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
                return;
        }
index 38a76f8..5c60bb1 100644 (file)
@@ -2522,6 +2522,7 @@ void __init setup_ioapic_dest(void)
        int pin, ioapic, irq, irq_entry;
        const struct cpumask *mask;
        struct irq_data *idata;
+       struct irq_chip *chip;
 
        if (skip_ioapic_setup == 1)
                return;
@@ -2545,9 +2546,9 @@ void __init setup_ioapic_dest(void)
                else
                        mask = apic->target_cpus();
 
-               irq_set_affinity(irq, mask);
+               chip = irq_data_get_irq_chip(idata);
+               chip->irq_set_affinity(idata, mask, false);
        }
-
 }
 #endif
 
index 07ce52c..de22ea7 100644 (file)
@@ -1110,10 +1110,10 @@ void print_cpu_info(struct cpuinfo_x86 *c)
        else
                printk(KERN_CONT "%d86", c->x86);
 
-       printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+       printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
 
        if (c->x86_mask || c->cpuid_level >= 0)
-               printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
+               printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
        else
                printk(KERN_CONT ")\n");
 
index cd9b6d0..3fefebf 100644 (file)
@@ -2316,9 +2316,12 @@ static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                            struct perf_event *event)
 {
-       struct event_constraint *c1 = cpuc->event_constraint[idx];
+       struct event_constraint *c1 = NULL;
        struct event_constraint *c2;
 
+       if (idx >= 0) /* fake does < 0 */
+               c1 = cpuc->event_constraint[idx];
+
        /*
         * first time only
         * - static constraint: no change across incremental scheduling calls
index 54690e8..d1c0f25 100644 (file)
@@ -222,6 +222,7 @@ static void __bts_event_start(struct perf_event *event)
        if (!buf || bts_buffer_is_full(buf, bts))
                return;
 
+       event->hw.itrace_started = 1;
        event->hw.state = 0;
 
        if (!buf->snapshot)
index 49487b4..2c7aafa 100644 (file)
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
        machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
        kvm_get_preset_lpj();
index 2bcc052..6acc9dd 100644 (file)
@@ -58,7 +58,7 @@ static struct ldt_struct *alloc_ldt_struct(int size)
        if (alloc_size > PAGE_SIZE)
                new_ldt->entries = vzalloc(alloc_size);
        else
-               new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
 
        if (!new_ldt->entries) {
                kfree(new_ldt);
@@ -95,7 +95,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
                vfree(ldt->entries);
        else
-               kfree(ldt->entries);
+               free_page((unsigned long)ldt->entries);
        kfree(ldt);
 }
 
index 353972c..84b8ef8 100644 (file)
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
 /* Number of entries preallocated for DMA-API debugging */
 #define PREALLOC_DMA_DEBUG_ENTRIES       65536
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 void __init pci_iommu_alloc(void)
 {
        struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
                free_pages((unsigned long)vaddr, get_order(size));
 }
 
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-                     gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
 {
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-
-       if (!dev)
-               dev = &x86_dma_fallback_dev;
-
-       if (!is_device_dma_capable(dev))
-               return NULL;
-
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle,
-                           dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       WARN_ON(irqs_disabled());       /* for portability */
+       *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+       *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
+       if (!*dev)
+               *dev = &x86_dma_fallback_dev;
+       if (!is_device_dma_capable(*dev))
+               return false;
+       return true;
 
-       debug_dma_free_coherent(dev, size, vaddr, bus);
-       if (ops->free)
-               ops->free(dev, size, vaddr, bus, attrs);
 }
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
 
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
index 86db4bc..02693dd 100644 (file)
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
        .emergency_restart = native_machine_emergency_restart,
        .restart = native_machine_restart,
        .halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        .crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
        machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
        machine_ops.crash_shutdown(regs);
index baadbf9..fdb7f2a 100644 (file)
@@ -478,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
index c8d52cb..c3f7602 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/hypervisor.h>
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
+#include <asm/geode.h>
 
 unsigned int __read_mostly cpu_khz;    /* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -1013,15 +1014,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 static void __init check_system_tsc_reliable(void)
 {
-#ifdef CONFIG_MGEODE_LX
-       /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+       if (is_geode_lx()) {
+               /* RTSC counts during suspend */
 #define RTSC_SUSP 0x100
-       unsigned long res_low, res_high;
+               unsigned long res_low, res_high;
 
-       rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
-       /* Geode_LX - the OLPC CPU has a very reliable TSC */
-       if (res_low & RTSC_SUSP)
-               tsc_clocksource_reliable = 1;
+               rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+               /* Geode_LX - the OLPC CPU has a very reliable TSC */
+               if (res_low & RTSC_SUSP)
+                       tsc_clocksource_reliable = 1;
+       }
 #endif
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
                tsc_clocksource_reliable = 1;
index abd8b85..5246193 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/audit.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -232,6 +233,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
        struct pt_regs *regs = current_pt_regs();
        unsigned long err = 0;
 
+       err = security_mmap_addr(0);
+       if (err) {
+               /*
+                * vm86 cannot virtualize the address space, so vm86 users
+                * need to manage the low 1MB themselves using mmap.  Given
+                * that BIOS places important data in the first page, vm86
+                * is essentially useless if mmap_min_addr != 0.  DOSEMU,
+                * for example, won't even bother trying to use vm86 if it
+                * can't map a page at virtual address 0.
+                *
+                * To reduce the available kernel attack surface, simply
+                * disallow vm86(old) for users who cannot mmap at va 0.
+                *
+                * The implementation of security_mmap_addr will allow
+                * suitably privileged users to map va 0 even if
+                * vm.mmap_min_addr is set above 0, and we want this
+                * behavior for vm86 as well, as it ensures that legacy
+                * tools like vbetool will not fail just because of
+                * vm.mmap_min_addr.
+                */
+               pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d).  Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+                            current->comm, task_pid_nr(current),
+                            from_kuid_munged(&init_user_ns, current_uid()));
+               return -EPERM;
+       }
+
        if (!vm86) {
                if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
                        return -ENOMEM;
index 00bf300..74e4bf1 100644 (file)
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
index e7a4fde..b372a75 100644 (file)
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        u16 sel;
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
+       *linear = la;
        *max_size = 0;
        switch (mode) {
        case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        }
        if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
                return emulate_gp(ctxt, 0);
-       *linear = la;
        return X86EMUL_CONTINUE;
 bad:
        if (addr.seg == VCPU_SREG_SS)
index fb16a8e..69088a1 100644 (file)
@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 
        walk_shadow_page_lockless_begin(vcpu);
 
-       for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+       for (shadow_walk_init(&iterator, vcpu, addr),
+                leaf = root = iterator.level;
             shadow_walk_okay(&iterator);
             __shadow_walk_next(&iterator, spte)) {
-               leaf = iterator.level;
                spte = mmu_spte_get_lockless(iterator.sptep);
 
                sptes[leaf - 1] = spte;
+               leaf--;
 
                if (!is_shadow_present_pte(spte))
                        break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
        if (reserved) {
                pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
                       __func__, addr);
-               while (root >= leaf) {
+               while (root > leaf) {
                        pr_err("------ spte 0x%llx level %d.\n",
                               sptes[root - 1], root);
                        root--;
index 148ea20..d019868 100644 (file)
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
        if (r)
                return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();
 #endif
index 1e7e76e..a60bdbc 100644 (file)
@@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 }
 
+#ifdef CONFIG_X86_64
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
@@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset + 4, seg.limit);
        put_smstate(u64, buf, offset + 8, seg.base);
 }
+#endif
 
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
index db1b0bc..134948b 100644 (file)
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
  */
 static unsigned long mpx_mmap(unsigned long len)
 {
-       unsigned long ret;
-       unsigned long addr, pgoff;
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
-       struct vm_area_struct *vma;
+       unsigned long addr, populate;
 
        /* Only bounds table can be allocated here */
        if (len != mpx_bt_size_bytes(mm))
                return -EINVAL;
 
        down_write(&mm->mmap_sem);
-
-       /* Too many mappings? */
-       if (mm->map_count > sysctl_max_map_count) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /* Obtain the address to map to. we verify (or select) it and ensure
-        * that it represents a valid section of the address space.
-        */
-       addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
-       if (addr & ~PAGE_MASK) {
-               ret = addr;
-               goto out;
-       }
-
-       vm_flags = VM_READ | VM_WRITE | VM_MPX |
-                       mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-       /* Set pgoff according to addr for anon_vma */
-       pgoff = addr >> PAGE_SHIFT;
-
-       ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
-       if (IS_ERR_VALUE(ret))
-               goto out;
-
-       vma = find_vma(mm, ret);
-       if (!vma) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (vm_flags & VM_LOCKED) {
-               up_write(&mm->mmap_sem);
-               mm_populate(ret, len);
-               return ret;
-       }
-
-out:
+       addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
        up_write(&mm->mmap_sem);
-       return ret;
+       if (populate)
+               mm_populate(addr, populate);
+
+       return addr;
 }
 
 enum reg_type {
index 66338a6..c2aea63 100644 (file)
@@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
        node_set(node, numa_nodes_parsed);
 
-       pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+       pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
                node, pxm,
                (unsigned long long) start, (unsigned long long) end - 1,
-               hotpluggable ? " hotplug" : "");
+               hotpluggable ? " hotplug" : "",
+               ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
 
        /* Mark hotplug range in memblock. */
        if (hotpluggable && memblock_mark_hotplug(start, ma->length))
index e4308fe..1db84c0 100644 (file)
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *tmp, *p, *q = NULL;
        int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *p;
 
index 020c101..5c9f63f 100644 (file)
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
        touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
        uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
        if (master)
                pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
index 2c50b44..9c479fe 100644 (file)
@@ -2812,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
        return 0;
 }
 
-static int do_remap_mfn(struct vm_area_struct *vma,
+static int do_remap_gfn(struct vm_area_struct *vma,
                        unsigned long addr,
-                       xen_pfn_t *mfn, int nr,
+                       xen_pfn_t *gfn, int nr,
                        int *err_ptr, pgprot_t prot,
                        unsigned domid,
                        struct page **pages)
@@ -2830,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma,
        if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
                /* We need to update the local page tables and the xen HAP */
-               return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+               return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
                                                 prot, domid, pages);
 #else
                return -EINVAL;
 #endif
         }
 
-       rmd.mfn = mfn;
+       rmd.mfn = gfn;
        rmd.prot = prot;
        /* We use the err_ptr to indicate if there we are doing a contigious
         * mapping or a discontigious mapping. */
@@ -2865,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma,
                                                    batch_left, &done, domid);
 
                        /*
-                        * @err_ptr may be the same buffer as @mfn, so
-                        * only clear it after each chunk of @mfn is
+                        * @err_ptr may be the same buffer as @gfn, so
+                        * only clear it after each chunk of @gfn is
                         * used.
                         */
                        if (err_ptr) {
@@ -2896,19 +2896,19 @@ out:
        return err < 0 ? err : mapped;
 }
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t mfn, int nr,
+                              xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages)
 {
-       return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
+       return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
 
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t *mfn, int nr,
+                              xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid, struct page **pages)
 {
@@ -2917,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
         * cause of "wrong memory was mapped in".
         */
        BUG_ON(err_ptr == NULL);
-       return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+       return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 
 /* Returns: 0 success */
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int numpgs, struct page **pages)
 {
        if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
@@ -2935,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
        return -EINVAL;
 #endif
 }
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
index 2a9ff73..3f4ebf0 100644 (file)
@@ -453,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        }
 #endif
        ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
        if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
                BUG();
 
index f01cb30..4427f38 100644 (file)
@@ -32,66 +32,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       return ops->mapping_error(dev, dma_addr);
-}
-
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return 1;
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction direction);
 
index 515b543..ad3f276 100644 (file)
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
 
        get_io_context_active(ioc);
        bio->bi_ioc = ioc;
-       bio->bi_css = task_get_css(current, blkio_cgrp_id);
+       bio->bi_css = task_get_css(current, io_cgrp_id);
        return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
index d6283b3..ac8370c 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
                return;
 
        for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
+
+       blkg_rwstat_exit(&blkg->stat_ios);
+       blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
 }
 
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        if (!blkg)
                return NULL;
 
+       if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+           blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+               goto err_free;
+
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                        continue;
 
                /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;
 
@@ -129,26 +139,11 @@ err_free:
        return NULL;
 }
 
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
 {
        struct blkcg_gq *blkg;
 
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 
        return NULL;
 }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
-               ret = -EINVAL;
+               ret = -ENODEV;
                goto err_free_blkg;
        }
 
        wb_congested = wb_congested_get_create(&q->backing_dev_info,
-                                              blkcg->css.id, GFP_ATOMIC);
+                                              blkcg->css.id, GFP_NOWAIT);
        if (!wb_congested) {
                ret = -ENOMEM;
                goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* allocate */
        if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
-                       ret = -EINVAL;
+                       ret = -ENODEV;
                        goto err_put_congested;
                }
                blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
        }
 
        /* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                        struct blkcg_policy *pol = blkcg_policy[i];
 
                        if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
 
        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                        return blkg;
        }
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
        struct blkcg *blkcg = blkg->blkcg;
+       struct blkcg_gq *parent = blkg->parent;
        int i;
 
        lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
+       }
+
+       if (parent) {
+               blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+               blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
        }
+
        blkg->online = false;
 
        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q)
 void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
        struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
 
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               blkg_rwstat_reset(&blkg->stat_bytes);
+               blkg_rwstat_reset(&blkg->stat_ios);
+
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
 
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }
 
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+                                   struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+                                             struct blkg_policy_data *pd,
+                                             int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+                                                             NULL, off);
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
+
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)blkg + off;
 
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();
 
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat;
 
                if (!pos_blkg->online)
                        continue;
 
-               tmp = blkg_rwstat_read(rwstat);
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
 
                for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+                               percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+                               &sum.aux_cnt[i]);
        }
        rcu_read_unlock();
 
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx)
+                  char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
 {
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
-       unsigned long long v;
-       int part, ret;
+       int key_len, part, ret;
+       char *body;
 
-       if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+       if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;
 
+       body = input + key_len;
+       if (!isspace(*body))
+               return -EINVAL;
+       body = skip_spaces(body);
+
        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
-               return -EINVAL;
+               return -ENODEV;
        if (part) {
                put_disk(disk);
-               return -EINVAL;
+               return -ENODEV;
        }
 
        rcu_read_lock();
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
-               blkg = ERR_PTR(-EINVAL);
+               blkg = ERR_PTR(-EOPNOTSUPP);
 
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
        ctx->disk = disk;
        ctx->blkg = blkg;
-       ctx->v = v;
+       ctx->body = body;
        return 0;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
@@ -774,7 +854,54 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               const char *dname;
+               struct blkg_rwstat rwstat;
+               u64 rbytes, wbytes, rios, wios;
+
+               dname = blkg_dev_name(blkg);
+               if (!dname)
+                       continue;
+
+               spin_lock_irq(blkg->q->queue_lock);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+               rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_ios));
+               rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               spin_unlock_irq(blkg->q->queue_lock);
+
+               if (rbytes || wbytes || rios || wios)
+                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+                                  dname, rbytes, wbytes, rios, wios);
+       }
+
+       rcu_read_unlock();
+       return 0;
+}
+
 struct cftype blkcg_files[] = {
+       {
+               .name = "stat",
+               .seq_show = blkcg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
        struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
 
        mutex_lock(&blkcg_pol_mutex);
+
        list_del(&blkcg->all_blkcgs_node);
-       mutex_unlock(&blkcg_pol_mutex);
 
-       if (blkcg != &blkcg_root) {
-               int i;
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 
-               for (i = 0; i < BLKCG_MAX_POLS; i++)
-                       kfree(blkcg->pd[i]);
-               kfree(blkcg);
-       }
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
        if (!parent_css) {
                blkcg = &blkcg_root;
-               goto done;
-       }
-
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg) {
-               ret = ERR_PTR(-ENOMEM);
-               goto free_blkcg;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
        }
 
        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
-               if (!pol || !pol->cpd_size)
+               if (!pol || !pol->cpd_alloc_fn)
                        continue;
 
-               BUG_ON(blkcg->pd[i]);
-               cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd) {
                        ret = ERR_PTR(-ENOMEM);
                        goto free_pd_blkcg;
                }
-               blkcg->pd[i] = cpd;
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
                cpd->plid = i;
-               pol->cpd_init_fn(blkcg);
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
        }
 
-done:
        spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -894,7 +1021,8 @@ done:
 
 free_pd_blkcg:
        for (i--; i >= 0; i--)
-               kfree(blkcg->pd[i]);
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 free_blkcg:
        kfree(blkcg);
        mutex_unlock(&blkcg_pol_mutex);
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q)
                radix_tree_preload_end();
 
        if (IS_ERR(blkg)) {
-               kfree(new_blkg);
+               blkg_free(new_blkg);
                return PTR_ERR(blkg);
        }
 
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
        return ret;
 }
 
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg *blkcg;
+
+               if (!pol || !pol->cpd_bind_fn)
+                       continue;
+
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+                       if (blkcg->cpd[pol->plid])
+                               pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+       }
+       mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .legacy_cftypes = blkcg_files,
+       .bind = blkcg_bind,
+       .dfl_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_legacy_files,
+       .legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
        .depends_on = 1 << memory_cgrp_id,
 #endif
 };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
 {
-       LIST_HEAD(pds);
+       struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg;
-       struct blkg_policy_data *pd, *nd;
-       int cnt = 0, ret;
+       int ret;
 
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       spin_lock_irq(q->queue_lock);
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-       spin_unlock_irq(q->queue_lock);
-
-       /* allocate per-blkg policy data for all existing blkgs */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                        ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                }
-               list_add_tail(&pd->alloc_node, &pds);
        }
 
-       /*
-        * Install the allocated pds and cpds. With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
        spin_lock_irq(q->queue_lock);
 
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
 
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
+
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
+               }
 
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
        }
 
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
-out_unlock:
+
        spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
        blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-               kfree(pd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
 
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
 
                spin_unlock(&blkg->blkcg->lock);
        }
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        struct blkcg *blkcg;
        int i, ret;
 
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);
 
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        blkcg_policy[pol->plid] = pol;
 
        /* allocate and install cpd's */
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;
 
-                       cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd) {
                                mutex_unlock(&blkcg_pol_mutex);
                                goto err_free_cpds;
                        }
 
-                       blkcg->pd[pol->plid] = cpd;
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
-                       pol->cpd_init_fn(blkcg);
+                       pol->cpd_init_fn(cpd);
                }
        }
 
        mutex_unlock(&blkcg_pol_mutex);
 
        /* everything is in place, add intf files for the new policy */
-       if (pol->cftypes)
-               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
-                                                 pol->cftypes));
+       if (pol->dfl_cftypes)
+               WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+                                              pol->dfl_cftypes));
+       if (pol->legacy_cftypes)
+               WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+                                                 pol->legacy_cftypes));
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;
 
 err_free_cpds:
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
                goto out_unlock;
 
        /* kill the intf files first */
-       if (pol->cftypes)
-               cgroup_rm_cftypes(pol->cftypes);
+       if (pol->dfl_cftypes)
+               cgroup_rm_cftypes(pol->dfl_cftypes);
+       if (pol->legacy_cftypes)
+               cgroup_rm_cftypes(pol->legacy_cftypes);
 
        /* remove cpds and unregister */
        mutex_lock(&blkcg_pol_mutex);
 
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;
index 60912e9..2eb722d 100644 (file)
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
         */
        create_io_context(GFP_ATOMIC, q->node);
 
-       if (blk_throtl_bio(q, bio))
-               return false;   /* throttled, will be resubmitted later */
+       if (!blkcg_bio_issue_check(q, bio))
+               return false;
 
        trace_block_bio_queue(q, bio);
        return true;
index b231935..c75a263 100644 (file)
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
-};
-
 struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];
-
-       /* Per cpu stats pointer */
-       struct tg_stats_cpu __percpu *stats_cpu;
-
-       /* List of tgs waiting for per cpu stats memory to be allocated */
-       struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
        struct work_struct dispatch_work;
 };
 
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
        return pd_to_blkg(&tg->pd);
 }
 
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-       return blkg_to_tg(td->queue->root_blkg);
-}
-
 /**
  * sq_to_tg - return the throl_grp the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
        }                                                               \
 } while (0)
 
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
-       blkg_rwstat_init(&tg_stats->service_bytes);
-       blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
-       static struct tg_stats_cpu *stats_cpu;  /* this fn is non-reentrant */
-       struct delayed_work *dwork = to_delayed_work(work);
-       bool empty = false;
-
-alloc_stats:
-       if (!stats_cpu) {
-               int cpu;
-
-               stats_cpu = alloc_percpu(struct tg_stats_cpu);
-               if (!stats_cpu) {
-                       /* allocation failed, try again after some time */
-                       schedule_delayed_work(dwork, msecs_to_jiffies(10));
-                       return;
-               }
-               for_each_possible_cpu(cpu)
-                       tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
-       }
-
-       spin_lock_irq(&tg_stats_alloc_lock);
-
-       if (!list_empty(&tg_stats_alloc_list)) {
-               struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
-                                                        struct throtl_grp,
-                                                        stats_alloc_node);
-               swap(tg->stats_cpu, stats_cpu);
-               list_del_init(&tg->stats_alloc_node);
-       }
-
-       empty = list_empty(&tg_stats_alloc_list);
-       spin_unlock_irq(&tg_stats_alloc_lock);
-       if (!empty)
-               goto alloc_stats;
-}
-
 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 {
        INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-                                     struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
        INIT_LIST_HEAD(&sq->queued[0]);
        INIT_LIST_HEAD(&sq->queued[1]);
        sq->pending_tree = RB_ROOT;
-       sq->parent_sq = parent_sq;
        setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
                    (unsigned long)sq);
 }
 
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
-       del_timer_sync(&sq->pending_timer);
+       struct throtl_grp *tg;
+       int rw;
+
+       tg = kzalloc_node(sizeof(*tg), gfp, node);
+       if (!tg)
+               return NULL;
+
+       throtl_service_queue_init(&tg->service_queue);
+
+       for (rw = READ; rw <= WRITE; rw++) {
+               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+       }
+
+       RB_CLEAR_NODE(&tg->rb_node);
+       tg->bps[READ] = -1;
+       tg->bps[WRITE] = -1;
+       tg->iops[READ] = -1;
+       tg->iops[WRITE] = -1;
+
+       return &tg->pd;
 }
 
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
 {
-       struct throtl_grp *tg = blkg_to_tg(blkg);
+       struct throtl_grp *tg = pd_to_tg(pd);
+       struct blkcg_gq *blkg = tg_to_blkg(tg);
        struct throtl_data *td = blkg->q->td;
-       struct throtl_service_queue *parent_sq;
-       unsigned long flags;
-       int rw;
+       struct throtl_service_queue *sq = &tg->service_queue;
 
        /*
         * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
         * Limits of a group don't interact with limits of other groups
         * regardless of the position of the group in the hierarchy.
         */
-       parent_sq = &td->service_queue;
-
+       sq->parent_sq = &td->service_queue;
        if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-               parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-       throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-       for (rw = READ; rw <= WRITE; rw++) {
-               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-       }
-
-       RB_CLEAR_NODE(&tg->rb_node);
+               sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
        tg->td = td;
-
-       tg->bps[READ] = -1;
-       tg->bps[WRITE] = -1;
-       tg->iops[READ] = -1;
-       tg->iops[WRITE] = -1;
-
-       /*
-        * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
-        * but percpu allocator can't be called from IO path.  Queue tg on
-        * tg_stats_alloc_list and allocate from work item.
-        */
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
-       schedule_delayed_work(&tg_stats_alloc_work, 0);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
 /*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
                                    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
 }
 
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
 {
        /*
         * We don't want new groups to escape the limits of its ancestors.
         * Update has_rules[] after a new group is brought online.
         */
-       tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       unsigned long flags;
-
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_del_init(&tg->stats_alloc_node);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
-       free_percpu(tg->stats_cpu);
-
-       throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       int cpu;
-
-       if (tg->stats_cpu == NULL)
-               return;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               blkg_rwstat_reset(&sc->service_bytes);
-               blkg_rwstat_reset(&sc->serviced);
-       }
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-                                          struct blkcg *blkcg)
-{
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root)
-               return td_root_tg(td);
-
-       return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+       tg_update_has_rules(pd_to_tg(pd));
 }
 
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-                                                 struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
 {
-       struct request_queue *q = td->queue;
-       struct throtl_grp *tg = NULL;
-
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root) {
-               tg = td_root_tg(td);
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-
-               /* if %NULL and @q is alive, fall back to root_tg */
-               if (!IS_ERR(blkg))
-                       tg = blkg_to_tg(blkg);
-               else if (!blk_queue_dying(q))
-                       tg = td_root_tg(td);
-       }
+       struct throtl_grp *tg = pd_to_tg(pd);
 
-       return tg;
+       del_timer_sync(&tg->service_queue.pending_timer);
+       kfree(tg);
 }
 
 static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
        return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-                                        int rw)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       struct tg_stats_cpu *stats_cpu;
-       unsigned long flags;
-
-       /* If per cpu stats are not allocated yet, don't do any accounting. */
-       if (tg->stats_cpu == NULL)
-               return;
-
-       /*
-        * Disabling interrupts to provide mutual exclusion between two
-        * writes on same cpu. It probably is not needed for 64bit. Not
-        * optimizing that case yet.
-        */
-       local_irq_save(flags);
-
-       stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-       blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-       blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-       local_irq_restore(flags);
-}
-
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
        bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
         * more than once as a throttled bio will go through blk-throtl the
         * second time when it eventually gets issued.  Set it when a bio
         * is being charged to a tg.
-        *
-        * Dispatch stats aren't recursive and each @bio should only be
-        * accounted by the @tg it was originally associated with.  Let's
-        * update the stats when setting REQ_THROTTLED for the first time
-        * which is guaranteed to be for the @bio's original tg.
         */
-       if (!(bio->bi_rw & REQ_THROTTLED)) {
+       if (!(bio->bi_rw & REQ_THROTTLED))
                bio->bi_rw |= REQ_THROTTLED;
-               throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                            bio->bi_iter.bi_size, bio->bi_rw);
-       }
 }
 
 /**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
        }
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-                               struct blkg_policy_data *pd, int off)
-{
-       struct throtl_grp *tg = pd_to_tg(pd);
-       struct blkg_rwstat rwstat = { }, tmp;
-       int i, cpu;
-
-       if (tg->stats_cpu == NULL)
-               return 0;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               tmp = blkg_rwstat_read((void *)sc + off);
-               for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       rwstat.cnt[i] += tmp.cnt[i];
-       }
-
-       return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
-       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
-                         &blkcg_policy_throtl, seq_cft(sf)->private, true);
-       return 0;
-}
-
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
                              int off)
 {
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
        return 0;
 }
 
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
-                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
 {
-       struct blkcg *blkcg = css_to_blkcg(of_css(of));
-       struct blkg_conf_ctx ctx;
-       struct throtl_grp *tg;
-       struct throtl_service_queue *sq;
-       struct blkcg_gq *blkg;
+       struct throtl_service_queue *sq = &tg->service_queue;
        struct cgroup_subsys_state *pos_css;
-       int ret;
-
-       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
-       if (ret)
-               return ret;
-
-       tg = blkg_to_tg(ctx.blkg);
-       sq = &tg->service_queue;
-
-       if (!ctx.v)
-               ctx.v = -1;
-
-       if (is_u64)
-               *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
-       else
-               *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+       struct blkcg_gq *blkg;
 
        throtl_log(&tg->service_queue,
                   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
         * restrictions in the whole hierarchy and allows them to bypass
         * blk-throttle.
         */
-       blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+       blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
                tg_update_has_rules(blkg_to_tg(blkg));
 
        /*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
                tg_update_disptime(tg);
                throtl_schedule_next_dispatch(sq->parent_sq, true);
        }
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       int ret;
+       u64 v;
 
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) != 1)
+               goto out_finish;
+       if (!v)
+               v = -1;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       if (is_u64)
+               *(u64 *)((void *)tg + of_cft(of)->private) = v;
+       else
+               *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
        blkg_conf_finish(&ctx);
-       return nbytes;
+       return ret ?: nbytes;
 }
 
 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
        return tg_set_conf(of, buf, nbytes, off, false);
 }
 
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
        {
                .name = "throttle.read_bps_device",
                .private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
        },
        {
                .name = "throttle.io_service_bytes",
-               .private = offsetof(struct tg_stats_cpu, service_bytes),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_bytes,
        },
        {
                .name = "throttle.io_serviced",
-               .private = offsetof(struct tg_stats_cpu, serviced),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_ios,
+       },
+       { }     /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+                        int off)
+{
+       struct throtl_grp *tg = pd_to_tg(pd);
+       const char *dname = blkg_dev_name(pd->blkg);
+       char bufs[4][21] = { "max", "max", "max", "max" };
+
+       if (!dname)
+               return 0;
+       if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+           tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+               return 0;
+
+       if (tg->bps[READ] != -1)
+               snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+       if (tg->bps[WRITE] != -1)
+               snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+       if (tg->iops[READ] != -1)
+               snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+       if (tg->iops[WRITE] != -1)
+               snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+                  dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+       return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+                         &blkcg_policy_throtl, seq_cft(sf)->private, false);
+       return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+                         char *buf, size_t nbytes, loff_t off)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       u64 v[4];
+       int ret;
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       v[0] = tg->bps[READ];
+       v[1] = tg->bps[WRITE];
+       v[2] = tg->iops[READ];
+       v[3] = tg->iops[WRITE];
+
+       while (true) {
+               char tok[27];   /* wiops=18446744073709551616 */
+               char *p;
+               u64 val = -1;
+               int len;
+
+               if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+                       break;
+               if (tok[0] == '\0')
+                       break;
+               ctx.body += len;
+
+               ret = -EINVAL;
+               p = tok;
+               strsep(&p, "=");
+               if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+                       goto out_finish;
+
+               ret = -ERANGE;
+               if (!val)
+                       goto out_finish;
+
+               ret = -EINVAL;
+               if (!strcmp(tok, "rbps"))
+                       v[0] = val;
+               else if (!strcmp(tok, "wbps"))
+                       v[1] = val;
+               else if (!strcmp(tok, "riops"))
+                       v[2] = min_t(u64, val, UINT_MAX);
+               else if (!strcmp(tok, "wiops"))
+                       v[3] = min_t(u64, val, UINT_MAX);
+               else
+                       goto out_finish;
+       }
+
+       tg->bps[READ] = v[0];
+       tg->bps[WRITE] = v[1];
+       tg->iops[READ] = v[2];
+       tg->iops[WRITE] = v[3];
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
+       blkg_conf_finish(&ctx);
+       return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = tg_print_max,
+               .write = tg_set_max,
        },
        { }     /* terminate */
 };
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-       .pd_size                = sizeof(struct throtl_grp),
-       .cftypes                = throtl_files,
+       .dfl_cftypes            = throtl_files,
+       .legacy_cftypes         = throtl_legacy_files,
 
+       .pd_alloc_fn            = throtl_pd_alloc,
        .pd_init_fn             = throtl_pd_init,
        .pd_online_fn           = throtl_pd_online,
-       .pd_exit_fn             = throtl_pd_exit,
-       .pd_reset_stats_fn      = throtl_pd_reset_stats,
+       .pd_free_fn             = throtl_pd_free,
 };
 
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                   struct bio *bio)
 {
-       struct throtl_data *td = q->td;
        struct throtl_qnode *qn = NULL;
-       struct throtl_grp *tg;
+       struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
-       struct blkcg *blkcg;
        bool throttled = false;
 
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
        /* see throtl_charge_bio() */
-       if (bio->bi_rw & REQ_THROTTLED)
+       if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
                goto out;
 
-       /*
-        * A throtl_grp pointer retrieved under rcu can be used to access
-        * basic fields like stats and io rates. If a group has no rules,
-        * just update the dispatch stats in lockless manner and return.
-        */
-       rcu_read_lock();
-       blkcg = bio_blkcg(bio);
-       tg = throtl_lookup_tg(td, blkcg);
-       if (tg) {
-               if (!tg->has_rules[rw]) {
-                       throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                       bio->bi_iter.bi_size, bio->bi_rw);
-                       goto out_unlock_rcu;
-               }
-       }
-
-       /*
-        * Either group has not been allocated yet or it is not an unlimited
-        * IO group
-        */
        spin_lock_irq(q->queue_lock);
-       tg = throtl_lookup_create_tg(td, blkcg);
-       if (unlikely(!tg))
+
+       if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;
 
        sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 out_unlock:
        spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-       rcu_read_unlock();
 out:
        /*
         * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
                return -ENOMEM;
 
        INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-       throtl_service_queue_init(&td->service_queue, NULL);
+       throtl_service_queue_init(&td->service_queue);
 
        q->td = td;
        td->queue = q;
index 838188b..98614ad 100644 (file)
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
  * Internal throttling interface
  */
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
-       return false;
-}
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
index c62bb2e..04de884 100644 (file)
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
 #define rb_entry_cfqg(node)    rb_entry((node), struct cfq_group, rb_node)
 
 /* blkio-related constants */
-#define CFQ_WEIGHT_MIN          10
-#define CFQ_WEIGHT_MAX          1000
-#define CFQ_WEIGHT_DEFAULT      500
+#define CFQ_WEIGHT_LEGACY_MIN  10
+#define CFQ_WEIGHT_LEGACY_DFL  500
+#define CFQ_WEIGHT_LEGACY_MAX  1000
 
 struct cfq_ttime {
        unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
 
 struct cfqg_stats {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
        /* number of ios merged */
        struct blkg_rwstat              merged;
        /* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
        struct blkg_rwstat              wait_time;
        /* number of IOs queued up */
        struct blkg_rwstat              queued;
-       /* total sectors transferred */
-       struct blkg_stat                sectors;
        /* total disk time and nr sectors dispatched by this group */
        struct blkg_stat                time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
 /* Per-cgroup data */
 struct cfq_group_data {
        /* must be the first member */
-       struct blkcg_policy_data pd;
+       struct blkcg_policy_data cpd;
 
        unsigned int weight;
        unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
        int dispatched;
        struct cfq_ttime ttime;
        struct cfqg_stats stats;        /* stats for this cfqg */
-       struct cfqg_stats dead_stats;   /* stats pushed from dead children */
+
+       /* async queue for each priority case */
+       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+       struct cfq_queue *async_idle_cfqq;
+
 };
 
 struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
        struct cfq_queue *active_queue;
        struct cfq_io_cq *active_cic;
 
-       /*
-        * async queue for each priority case
-        */
-       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
-       struct cfq_queue *async_idle_cfqq;
-
        sector_t last_position;
 
        /*
@@ -401,6 +393,7 @@ struct cfq_data {
 };
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+static void cfq_put_queue(struct cfq_queue *cfqq);
 
 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
                                            enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 static struct cfq_group_data
 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 {
-       return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+       return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 }
 
 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
        blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
 }
 
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw)
-{
-       blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
-       blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
-       blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
-}
-
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                        uint64_t start_time, uint64_t io_start_time, int rw)
 {
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 static void cfqg_stats_reset(struct cfqg_stats *stats)
 {
        /* queued stats shouldn't be cleared */
-       blkg_rwstat_reset(&stats->service_bytes);
-       blkg_rwstat_reset(&stats->serviced);
        blkg_rwstat_reset(&stats->merged);
        blkg_rwstat_reset(&stats->service_time);
        blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
 }
 
 /* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 {
        /* queued stats shouldn't be cleared */
-       blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
-       blkg_rwstat_merge(&to->serviced, &from->serviced);
-       blkg_rwstat_merge(&to->merged, &from->merged);
-       blkg_rwstat_merge(&to->service_time, &from->service_time);
-       blkg_rwstat_merge(&to->wait_time, &from->wait_time);
-       blkg_stat_merge(&from->time, &from->time);
+       blkg_rwstat_add_aux(&to->merged, &from->merged);
+       blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+       blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+       blkg_stat_add_aux(&from->time, &from->time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
-       blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-       blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
-       blkg_stat_merge(&to->dequeue, &from->dequeue);
-       blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
-       blkg_stat_merge(&to->idle_time, &from->idle_time);
-       blkg_stat_merge(&to->empty_time, &from->empty_time);
+       blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+       blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+       blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+       blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+       blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+       blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+       blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 #endif
 }
 
 /*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
  * recursive stats can still account for the amount used by this cfqg after
  * it's gone.
  */
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
        if (unlikely(!parent))
                return;
 
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+       cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
        cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
 }
 
 #else  /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
                        unsigned long time, unsigned long unaccounted_time) { }
 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                        uint64_t start_time, uint64_t io_start_time, int rw) { }
 
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-                                      struct cfq_io_cq *cic, struct bio *bio,
-                                      gfp_t gfp_mask);
+                                      struct cfq_io_cq *cic, struct bio *bio);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
+static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
-       blkg_rwstat_init(&stats->service_bytes);
-       blkg_rwstat_init(&stats->serviced);
-       blkg_rwstat_init(&stats->merged);
-       blkg_rwstat_init(&stats->service_time);
-       blkg_rwstat_init(&stats->wait_time);
-       blkg_rwstat_init(&stats->queued);
+       blkg_rwstat_exit(&stats->merged);
+       blkg_rwstat_exit(&stats->service_time);
+       blkg_rwstat_exit(&stats->wait_time);
+       blkg_rwstat_exit(&stats->queued);
+       blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg_stat_exit(&stats->unaccounted_time);
+       blkg_stat_exit(&stats->avg_queue_size_sum);
+       blkg_stat_exit(&stats->avg_queue_size_samples);
+       blkg_stat_exit(&stats->dequeue);
+       blkg_stat_exit(&stats->group_wait_time);
+       blkg_stat_exit(&stats->idle_time);
+       blkg_stat_exit(&stats->empty_time);
+#endif
+}
 
-       blkg_stat_init(&stats->sectors);
-       blkg_stat_init(&stats->time);
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+       if (blkg_rwstat_init(&stats->merged, gfp) ||
+           blkg_rwstat_init(&stats->service_time, gfp) ||
+           blkg_rwstat_init(&stats->wait_time, gfp) ||
+           blkg_rwstat_init(&stats->queued, gfp) ||
+           blkg_stat_init(&stats->time, gfp))
+               goto err;
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_init(&stats->unaccounted_time);
-       blkg_stat_init(&stats->avg_queue_size_sum);
-       blkg_stat_init(&stats->avg_queue_size_samples);
-       blkg_stat_init(&stats->dequeue);
-       blkg_stat_init(&stats->group_wait_time);
-       blkg_stat_init(&stats->idle_time);
-       blkg_stat_init(&stats->empty_time);
+       if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+           blkg_stat_init(&stats->dequeue, gfp) ||
+           blkg_stat_init(&stats->group_wait_time, gfp) ||
+           blkg_stat_init(&stats->idle_time, gfp) ||
+           blkg_stat_init(&stats->empty_time, gfp))
+               goto err;
 #endif
+       return 0;
+err:
+       cfqg_stats_exit(stats);
+       return -ENOMEM;
 }
 
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 {
-       struct cfq_group_data *cgd =
-               cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+       struct cfq_group_data *cgd;
 
-       if (blkcg == &blkcg_root) {
-               cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
-       } else {
-               cgd->weight = CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
-       }
+       cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+       if (!cgd)
+               return NULL;
+       return &cgd->cpd;
+}
+
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+       struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+       unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+                             CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (cpd_to_blkcg(cpd) == &blkcg_root)
+               weight *= 2;
+
+       cgd->weight = weight;
+       cgd->leaf_weight = weight;
 }
 
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
 {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+       kfree(cpd_to_cfqgd(cpd));
+}
+
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+       struct blkcg *blkcg = cpd_to_blkcg(cpd);
+       bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+       unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (blkcg == &blkcg_root)
+               weight *= 2;
+
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+       struct cfq_group *cfqg;
+
+       cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+       if (!cfqg)
+               return NULL;
 
        cfq_init_cfqg_base(cfqg);
+       if (cfqg_stats_init(&cfqg->stats, gfp)) {
+               kfree(cfqg);
+               return NULL;
+       }
+
+       return &cfqg->pd;
+}
+
+static void cfq_pd_init(struct blkg_policy_data *pd)
+{
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
+
        cfqg->weight = cgd->weight;
        cfqg->leaf_weight = cgd->leaf_weight;
-       cfqg_stats_init(&cfqg->stats);
-       cfqg_stats_init(&cfqg->dead_stats);
 }
 
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
 {
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       int i;
+
+       for (i = 0; i < IOPRIO_BE_NR; i++) {
+               if (cfqg->async_cfqq[0][i])
+                       cfq_put_queue(cfqg->async_cfqq[0][i]);
+               if (cfqg->async_cfqq[1][i])
+                       cfq_put_queue(cfqg->async_cfqq[1][i]);
+       }
+
+       if (cfqg->async_idle_cfqq)
+               cfq_put_queue(cfqg->async_idle_cfqq);
+
        /*
         * @blkg is going offline and will be ignored by
         * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
         * that they don't get lost.  If IOs complete after this point, the
         * stats for them will be lost.  Oh well...
         */
-       cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+       cfqg_stats_xfer_dead(cfqg);
 }
 
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
-                                       offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void cfq_pd_free(struct blkg_policy_data *pd)
 {
-       u64 sum = 0;
-
-       sum += blkg_stat_recursive_sum(pd, off);
-       sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
-       return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
-                                                      int off)
-{
-       struct blkg_rwstat a, b;
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
 
-       a = blkg_rwstat_recursive_sum(pd, off);
-       b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
-       blkg_rwstat_merge(&a, &b);
-       return a;
+       cfqg_stats_exit(&cfqg->stats);
+       return kfree(cfqg);
 }
 
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
 
        cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
 }
 
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
 {
-       struct request_queue *q = cfqd->queue;
-       struct cfq_group *cfqg = NULL;
-
-       /* avoid lookup for the common case where there's no blkcg */
-       if (blkcg == &blkcg_root) {
-               cfqg = cfqd->root_group;
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-               if (!IS_ERR(blkg))
-                       cfqg = blkg_to_cfqg(blkg);
-       }
+       struct blkcg_gq *blkg;
 
-       return cfqg;
+       blkg = blkg_lookup(blkcg, cfqd->queue);
+       if (likely(blkg))
+               return blkg_to_cfqg(blkg);
+       return NULL;
 }
 
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
-       /* Currently, all async queues are mapped to root group */
-       if (!cfq_cfqq_sync(cfqq))
-               cfqg = cfqq->cfqd->root_group;
-
        cfqq->cfqg = cfqg;
        /* cfqq reference on cfqg */
        cfqg_get(cfqg);
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 
 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off,
-                                       bool is_leaf_weight)
+                                       bool on_dfl, bool is_leaf_weight)
 {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        struct cfq_group *cfqg;
        struct cfq_group_data *cfqgd;
        int ret;
+       u64 v;
 
        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
        if (ret)
                return ret;
 
-       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) == 1) {
+               /* require "default" on dfl */
+               ret = -ERANGE;
+               if (!v && on_dfl)
+                       goto out_finish;
+       } else if (!strcmp(strim(ctx.body), "default")) {
+               v = 0;
+       } else {
+               ret = -EINVAL;
+               goto out_finish;
+       }
+
        cfqg = blkg_to_cfqg(ctx.blkg);
        cfqgd = blkcg_to_cfqgd(blkcg);
-       if (!cfqg || !cfqgd)
-               goto err;
 
-       if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+       ret = -ERANGE;
+       if (!v || (v >= min && v <= max)) {
                if (!is_leaf_weight) {
-                       cfqg->dev_weight = ctx.v;
-                       cfqg->new_weight = ctx.v ?: cfqgd->weight;
+                       cfqg->dev_weight = v;
+                       cfqg->new_weight = v ?: cfqgd->weight;
                } else {
-                       cfqg->dev_leaf_weight = ctx.v;
-                       cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+                       cfqg->dev_leaf_weight = v;
+                       cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
                }
                ret = 0;
        }
-
-err:
+out_finish:
        blkg_conf_finish(&ctx);
        return ret ?: nbytes;
 }
@@ -1776,25 +1805,27 @@ err:
 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
 {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
 }
 
 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
 {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
 }
 
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-                           u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight)
 {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        struct cfq_group_data *cfqgd;
        int ret = 0;
 
-       if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
-               return -EINVAL;
+       if (val < min || val > max)
+               return -ERANGE;
 
        spin_lock_irq(&blkcg->lock);
        cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                        continue;
 
                if (!is_leaf_weight) {
+                       if (reset_dev)
+                               cfqg->dev_weight = 0;
                        if (!cfqg->dev_weight)
                                cfqg->new_weight = cfqgd->weight;
                } else {
+                       if (reset_dev)
+                               cfqg->dev_leaf_weight = 0;
                        if (!cfqg->dev_leaf_weight)
                                cfqg->new_leaf_weight = cfqgd->leaf_weight;
                }
@@ -1831,13 +1866,13 @@ out:
 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                          u64 val)
 {
-       return __cfq_set_weight(css, cft, val, false);
+       return __cfq_set_weight(css, val, false, false, false);
 }
 
 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
                               struct cftype *cft, u64 val)
 {
-       return __cfq_set_weight(css, cft, val, true);
+       return __cfq_set_weight(css, val, false, false, true);
 }
 
 static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
                                      struct blkg_policy_data *pd, int off)
 {
-       u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
-
+       u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+                                         &blkcg_policy_cfq, off);
        return __blkg_prfill_u64(sf, pd, sum);
 }
 
 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
                                        struct blkg_policy_data *pd, int off)
 {
-       struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
-
+       struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+                                                       &blkcg_policy_cfq, off);
        return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
        return 0;
 }
 
+static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+                              int off)
+{
+       u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
+                                        struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+       u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
+                         false);
+       return 0;
+}
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
                                      struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 }
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
 
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
        /* on root, weight is mapped to leaf_weight */
        {
                .name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
        },
        {
                .name = "sectors",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat,
+               .seq_show = cfqg_print_stat_sectors,
        },
        {
                .name = "io_service_bytes",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes,
        },
        {
                .name = "io_serviced",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios,
        },
        {
                .name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
        },
        {
                .name = "sectors_recursive",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat_recursive,
+               .seq_show = cfqg_print_stat_sectors_recursive,
        },
        {
                .name = "io_service_bytes_recursive",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes_recursive,
        },
        {
                .name = "io_serviced_recursive",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios_recursive,
        },
        {
                .name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
        { }     /* terminate */
 };
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+       seq_printf(sf, "default %u\n", cgd->weight);
+       blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+                         &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes, loff_t off)
+{
+       char *endp;
+       int ret;
+       u64 v;
+
+       buf = strim(buf);
+
+       /* "WEIGHT" or "default WEIGHT" sets the default weight */
+       v = simple_strtoull(buf, &endp, 0);
+       if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+               ret = __cfq_set_weight(of_css(of), v, true, false, false);
+               return ret ?: nbytes;
+       }
+
+       /* "MAJ:MIN WEIGHT" */
+       return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+       {
+               .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cfq_print_weight_on_dfl,
+               .write = cfq_set_weight_on_dfl,
+       },
+       { }     /* terminate */
+};
+
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
 {
        return cfqd->root_group;
 }
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
        cfqq->nr_sectors += blk_rq_sectors(rq);
-       cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
 }
 
 /*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
        struct cfq_io_cq *cic = icq_to_cic(icq);
        struct cfq_data *cfqd = cic_to_cfqd(cic);
 
-       if (cic->cfqq[BLK_RW_ASYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
-               cic->cfqq[BLK_RW_ASYNC] = NULL;
+       if (cic_to_cfqq(cic, false)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
+               cic_set_cfqq(cic, NULL, false);
        }
 
-       if (cic->cfqq[BLK_RW_SYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
-               cic->cfqq[BLK_RW_SYNC] = NULL;
+       if (cic_to_cfqq(cic, true)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
+               cic_set_cfqq(cic, NULL, true);
        }
 }
 
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
        if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
                return;
 
-       cfqq = cic->cfqq[BLK_RW_ASYNC];
+       cfqq = cic_to_cfqq(cic, false);
        if (cfqq) {
-               struct cfq_queue *new_cfqq;
-               new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
-                                        GFP_ATOMIC);
-               if (new_cfqq) {
-                       cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
-                       cfq_put_queue(cfqq);
-               }
+               cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
+               cic_set_cfqq(cic, cfqq, false);
        }
 
-       cfqq = cic->cfqq[BLK_RW_SYNC];
+       cfqq = cic_to_cfqq(cic, true);
        if (cfqq)
                cfq_mark_cfqq_prio_changed(cfqq);
 
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
        struct cfq_data *cfqd = cic_to_cfqd(cic);
-       struct cfq_queue *sync_cfqq;
+       struct cfq_queue *cfqq;
        uint64_t serial_nr;
 
        rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
                return;
 
-       sync_cfqq = cic_to_cfqq(cic, 1);
-       if (sync_cfqq) {
-               /*
-                * Drop reference to sync queue. A new sync queue will be
-                * assigned in new group upon arrival of a fresh request.
-                */
-               cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
-               cic_set_cfqq(cic, NULL, 1);
-               cfq_put_queue(sync_cfqq);
+       /*
+        * Drop reference to queues.  New queues will be assigned in new
+        * group upon arrival of fresh requests.
+        */
+       cfqq = cic_to_cfqq(cic, false);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, false);
+               cfq_put_queue(cfqq);
+       }
+
+       cfqq = cic_to_cfqq(cic, true);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, true);
+               cfq_put_queue(cfqq);
        }
 
        cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
-static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-                    struct bio *bio, gfp_t gfp_mask)
-{
-       struct blkcg *blkcg;
-       struct cfq_queue *cfqq, *new_cfqq = NULL;
-       struct cfq_group *cfqg;
-
-retry:
-       rcu_read_lock();
-
-       blkcg = bio_blkcg(bio);
-       cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
-       if (!cfqg) {
-               cfqq = &cfqd->oom_cfqq;
-               goto out;
-       }
-
-       cfqq = cic_to_cfqq(cic, is_sync);
-
-       /*
-        * Always try a new alloc if we fell back to the OOM cfqq
-        * originally, since it should just be a temporary situation.
-        */
-       if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = NULL;
-               if (new_cfqq) {
-                       cfqq = new_cfqq;
-                       new_cfqq = NULL;
-               } else if (gfp_mask & __GFP_WAIT) {
-                       rcu_read_unlock();
-                       spin_unlock_irq(cfqd->queue->queue_lock);
-                       new_cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-                       spin_lock_irq(cfqd->queue->queue_lock);
-                       if (new_cfqq)
-                               goto retry;
-                       else
-                               return &cfqd->oom_cfqq;
-               } else {
-                       cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-               }
-
-               if (cfqq) {
-                       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-                       cfq_init_prio_data(cfqq, cic);
-                       cfq_link_cfqq_cfqg(cfqq, cfqg);
-                       cfq_log_cfqq(cfqd, cfqq, "alloced");
-               } else
-                       cfqq = &cfqd->oom_cfqq;
-       }
-out:
-       if (new_cfqq)
-               kmem_cache_free(cfq_pool, new_cfqq);
-
-       rcu_read_unlock();
-       return cfqq;
-}
-
 static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
 {
        switch (ioprio_class) {
        case IOPRIO_CLASS_RT:
-               return &cfqd->async_cfqq[0][ioprio];
+               return &cfqg->async_cfqq[0][ioprio];
        case IOPRIO_CLASS_NONE:
                ioprio = IOPRIO_NORM;
                /* fall through */
        case IOPRIO_CLASS_BE:
-               return &cfqd->async_cfqq[1][ioprio];
+               return &cfqg->async_cfqq[1][ioprio];
        case IOPRIO_CLASS_IDLE:
-               return &cfqd->async_idle_cfqq;
+               return &cfqg->async_idle_cfqq;
        default:
                BUG();
        }
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-             struct bio *bio, gfp_t gfp_mask)
+             struct bio *bio)
 {
        int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
        int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
        struct cfq_queue **async_cfqq = NULL;
-       struct cfq_queue *cfqq = NULL;
+       struct cfq_queue *cfqq;
+       struct cfq_group *cfqg;
+
+       rcu_read_lock();
+       cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+       if (!cfqg) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
 
        if (!is_sync) {
                if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
                        ioprio = task_nice_ioprio(tsk);
                        ioprio_class = task_nice_ioclass(tsk);
                }
-               async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+               async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
                cfqq = *async_cfqq;
+               if (cfqq)
+                       goto out;
        }
 
-       if (!cfqq)
-               cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+       cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+                                    cfqd->queue->node);
+       if (!cfqq) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
 
-       /*
-        * pin the queue now that it's allocated, scheduler exit will prune it
-        */
-       if (!is_sync && !(*async_cfqq)) {
+       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+       cfq_init_prio_data(cfqq, cic);
+       cfq_link_cfqq_cfqg(cfqq, cfqg);
+       cfq_log_cfqq(cfqd, cfqq, "alloced");
+
+       if (async_cfqq) {
+               /* a new async queue is created, pin and remember */
                cfqq->ref++;
                *async_cfqq = cfqq;
        }
-
+out:
        cfqq->ref++;
+       rcu_read_unlock();
        return cfqq;
 }
 
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
        const bool is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
 
-       might_sleep_if(gfp_mask & __GFP_WAIT);
-
        spin_lock_irq(q->queue_lock);
 
        check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
        cfqq = cic_to_cfqq(cic, is_sync);
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+               if (cfqq)
+                       cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
                cic_set_cfqq(cic, cfqq, is_sync);
        } else {
                /*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
        cancel_work_sync(&cfqd->unplug_work);
 }
 
-static void cfq_put_async_queues(struct cfq_data *cfqd)
-{
-       int i;
-
-       for (i = 0; i < IOPRIO_BE_NR; i++) {
-               if (cfqd->async_cfqq[0][i])
-                       cfq_put_queue(cfqd->async_cfqq[0][i]);
-               if (cfqd->async_cfqq[1][i])
-                       cfq_put_queue(cfqd->async_cfqq[1][i]);
-       }
-
-       if (cfqd->async_idle_cfqq)
-               cfq_put_queue(cfqd->async_idle_cfqq);
-}
-
 static void cfq_exit_queue(struct elevator_queue *e)
 {
        struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
        if (cfqd->active_queue)
                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
-       cfq_put_async_queues(cfqd);
-
        spin_unlock_irq(q->queue_lock);
 
        cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                goto out_free;
 
        cfq_init_cfqg_base(cfqd->root_group);
+       cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
 #endif
-       cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
-       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 
        /*
         * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                cfqd->prio_trees[i] = RB_ROOT;
 
        /*
-        * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
+        * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
         * Grab a permanent reference to it, so that the normal code flow
         * will not attempt to free it.  oom_cfqq is linked to root_group
         * but shouldn't hold a reference as it'll never be unlinked.  Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-       .pd_size                = sizeof(struct cfq_group),
-       .cpd_size               = sizeof(struct cfq_group_data),
-       .cftypes                = cfq_blkcg_files,
+       .dfl_cftypes            = cfq_blkcg_files,
+       .legacy_cftypes         = cfq_blkcg_legacy_files,
 
+       .cpd_alloc_fn           = cfq_cpd_alloc,
        .cpd_init_fn            = cfq_cpd_init,
+       .cpd_free_fn            = cfq_cpd_free,
+       .cpd_bind_fn            = cfq_cpd_bind,
+
+       .pd_alloc_fn            = cfq_pd_alloc,
        .pd_init_fn             = cfq_pd_init,
        .pd_offline_fn          = cfq_pd_offline,
+       .pd_free_fn             = cfq_pd_free,
        .pd_reset_stats_fn      = cfq_pd_reset_stats,
 };
 #endif
index 35c2de1..fa18753 100644 (file)
@@ -940,6 +940,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
        char *xbuf[XBUFSIZE];
        char *xoutbuf[XBUFSIZE];
        int ret = -ENOMEM;
+       unsigned int ivsize = crypto_skcipher_ivsize(tfm);
 
        if (testmgr_alloc_buf(xbuf))
                goto out_nobuf;
@@ -975,7 +976,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
                        continue;
 
                if (template[i].iv)
-                       memcpy(iv, template[i].iv, MAX_IVLEN);
+                       memcpy(iv, template[i].iv, ivsize);
                else
                        memset(iv, 0, MAX_IVLEN);
 
@@ -1051,7 +1052,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
                        continue;
 
                if (template[i].iv)
-                       memcpy(iv, template[i].iv, MAX_IVLEN);
+                       memcpy(iv, template[i].iv, ivsize);
                else
                        memset(iv, 0, MAX_IVLEN);
 
index fc28b9f..30d8518 100644 (file)
@@ -525,8 +525,7 @@ static void acpi_thermal_check(void *data)
 
 /* sys I/F for generic thermal sysfs support */
 
-static int thermal_get_temp(struct thermal_zone_device *thermal,
-                           unsigned long *temp)
+static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
        struct acpi_thermal *tz = thermal->devdata;
        int result;
@@ -633,7 +632,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
-                                int trip, unsigned long *temp)
+                                int trip, int *temp)
 {
        struct acpi_thermal *tz = thermal->devdata;
        int i;
@@ -686,7 +685,8 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_crit_temp(struct thermal_zone_device *thermal,
-                               unsigned long *temperature) {
+                               int *temperature)
+{
        struct acpi_thermal *tz = thermal->devdata;
 
        if (tz->trips.critical.flags.valid) {
@@ -709,8 +709,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal,
                return -EINVAL;
 
        if (type == THERMAL_TRIP_ACTIVE) {
-               unsigned long trip_temp;
-               unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
+               int trip_temp;
+               int temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
                                        tz->temperature, tz->kelvin_offset);
                if (thermal_get_trip_temp(thermal, trip, &trip_temp))
                        return -EINVAL;
index 6607f3c..a39e85f 100644 (file)
@@ -2834,7 +2834,7 @@ static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct binder_vm_ops = {
+static const struct vm_operations_struct binder_vm_ops = {
        .open = binder_vma_open,
        .close = binder_vma_close,
        .fault = binder_vm_fault,
index 4167201..16550c6 100644 (file)
@@ -212,6 +212,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool timed)
        return ret;
 }
 
+/**
+ * genpd_queue_power_off_work - Queue up the execution of pm_genpd_poweroff().
+ * @genpd: PM domait to power off.
+ *
+ * Queue up the execution of pm_genpd_poweroff() unless it's already been done
+ * before.
+ */
+static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
+{
+       queue_work(pm_wq, &genpd->power_off_work);
+}
+
 /**
  * __pm_genpd_poweron - Restore power to a given PM domain and its masters.
  * @genpd: PM domain to power up.
@@ -259,8 +271,12 @@ static int __pm_genpd_poweron(struct generic_pm_domain *genpd)
        return 0;
 
  err:
-       list_for_each_entry_continue_reverse(link, &genpd->slave_links, slave_node)
+       list_for_each_entry_continue_reverse(link,
+                                       &genpd->slave_links,
+                                       slave_node) {
                genpd_sd_counter_dec(link->master);
+               genpd_queue_power_off_work(link->master);
+       }
 
        return ret;
 }
@@ -348,18 +364,6 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
        return NOTIFY_DONE;
 }
 
-/**
- * genpd_queue_power_off_work - Queue up the execution of pm_genpd_poweroff().
- * @genpd: PM domait to power off.
- *
- * Queue up the execution of pm_genpd_poweroff() unless it's already been done
- * before.
- */
-static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
-{
-       queue_work(pm_wq, &genpd->power_off_work);
-}
-
 /**
  * pm_genpd_poweroff - Remove power from a given PM domain.
  * @genpd: PM domain to power down.
@@ -1469,6 +1473,13 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 
        mutex_lock(&genpd->lock);
 
+       if (!list_empty(&subdomain->slave_links) || subdomain->device_count) {
+               pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
+                       subdomain->name);
+               ret = -EBUSY;
+               goto out;
+       }
+
        list_for_each_entry(link, &genpd->master_links, master_node) {
                if (link->slave != subdomain)
                        continue;
@@ -1487,6 +1498,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
                break;
        }
 
+out:
        mutex_unlock(&genpd->lock);
 
        return ret;
index eb25449..28cd75c 100644 (file)
@@ -340,6 +340,34 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
 
+/**
+ * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * @dev:       device for which we do this operation
+ *
+ * Return: This function returns pointer to the suspend opp if it is
+ * defined and available, otherwise it returns NULL.
+ *
+ * Locking: This function must be called under rcu_read_lock(). opp is a rcu
+ * protected pointer. The reason for the same is that the opp pointer which is
+ * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * under the locked area. The pointer returned must be used prior to unlocking
+ * with rcu_read_unlock() to maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+       struct device_opp *dev_opp;
+
+       opp_rcu_lockdep_assert();
+
+       dev_opp = _find_device_opp(dev);
+       if (IS_ERR(dev_opp) || !dev_opp->suspend_opp ||
+           !dev_opp->suspend_opp->available)
+               return NULL;
+
+       return dev_opp->suspend_opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
+
 /**
  * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
  * @dev:       device for which we do this operation
index ff03f23..2d75366 100644 (file)
@@ -611,13 +611,15 @@ static void *device_get_mac_addr(struct device *dev,
 */
 void *device_get_mac_address(struct device *dev, char *addr, int alen)
 {
-       addr = device_get_mac_addr(dev, "mac-address", addr, alen);
-       if (addr)
-               return addr;
+       char *res;
 
-       addr = device_get_mac_addr(dev, "local-mac-address", addr, alen);
-       if (addr)
-               return addr;
+       res = device_get_mac_addr(dev, "mac-address", addr, alen);
+       if (res)
+               return res;
+
+       res = device_get_mac_addr(dev, "local-mac-address", addr, alen);
+       if (res)
+               return res;
 
        return device_get_mac_addr(dev, "address", addr, alen);
 }
index 698f761..d93a037 100644 (file)
@@ -4673,7 +4673,10 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
        }
 
        ret = rbd_dev_v2_snap_context(rbd_dev);
-       dout("rbd_dev_v2_snap_context returned %d\n", ret);
+       if (ret && first_time) {
+               kfree(rbd_dev->header.object_prefix);
+               rbd_dev->header.object_prefix = NULL;
+       }
 
        return ret;
 }
@@ -5154,7 +5157,6 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
 out_err:
        if (parent) {
                rbd_dev_unparent(rbd_dev);
-               kfree(rbd_dev->header_name);
                rbd_dev_destroy(parent);
        } else {
                rbd_put_client(rbdc);
index 1508353..0823a96 100644 (file)
@@ -249,7 +249,7 @@ static struct grant *get_grant(grant_ref_t *gref_head,
                                struct blkfront_info *info)
 {
        struct grant *gnt_list_entry;
-       unsigned long buffer_mfn;
+       unsigned long buffer_gfn;
 
        BUG_ON(list_empty(&info->grants));
        gnt_list_entry = list_first_entry(&info->grants, struct grant,
@@ -268,10 +268,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
                BUG_ON(!pfn);
                gnt_list_entry->pfn = pfn;
        }
-       buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
+       buffer_gfn = pfn_to_gfn(gnt_list_entry->pfn);
        gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
                                        info->xbdev->otherend_id,
-                                       buffer_mfn, 0);
+                                       buffer_gfn, 0);
        return gnt_list_entry;
 }
 
index 2a38eb4..6cf38dc 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 #include <linux/device.h>
 #include <linux/of_address.h>
+#include <linux/slab.h>
 
 static DEFINE_SPINLOCK(clklock);
 
index 2c16807..e434854 100644 (file)
@@ -1,6 +1,12 @@
 config COMMON_CLK_HI6220
        bool "Hi6220 Clock Driver"
-       depends on (ARCH_HISI || COMPILE_TEST) && MAILBOX
+       depends on ARCH_HISI || COMPILE_TEST
        default ARCH_HISI
        help
          Build the Hisilicon Hi6220 clock driver based on the common clock framework.
+
+config STUB_CLK_HI6220
+       bool "Hi6220 Stub Clock Driver"
+       depends on COMMON_CLK_HI6220 && MAILBOX
+       help
+         Build the Hisilicon Hi6220 stub clock driver.
index 4a1001a..74dba31 100644 (file)
@@ -7,4 +7,5 @@ obj-y   += clk.o clkgate-separated.o clkdivider-hi6220.o
 obj-$(CONFIG_ARCH_HI3xxx)      += clk-hi3620.o
 obj-$(CONFIG_ARCH_HIP04)       += clk-hip04.o
 obj-$(CONFIG_ARCH_HIX5HD2)     += clk-hix5hd2.o
-obj-$(CONFIG_COMMON_CLK_HI6220)        += clk-hi6220.o clk-hi6220-stub.o
+obj-$(CONFIG_COMMON_CLK_HI6220)        += clk-hi6220.o
+obj-$(CONFIG_STUB_CLK_HI6220)  += clk-hi6220-stub.o
index ed02bbc..abb4760 100644 (file)
@@ -716,6 +716,8 @@ static const char *const rk3188_critical_clocks[] __initconst = {
        "aclk_cpu",
        "aclk_peri",
        "hclk_peri",
+       "pclk_cpu",
+       "pclk_peri",
 };
 
 static void __init rk3188_common_clk_init(struct device_node *np)
@@ -744,8 +746,6 @@ static void __init rk3188_common_clk_init(struct device_node *np)
 
        rockchip_clk_register_branches(common_clk_branches,
                                  ARRAY_SIZE(common_clk_branches));
-       rockchip_clk_protect_critical(rk3188_critical_clocks,
-                                     ARRAY_SIZE(rk3188_critical_clocks));
 
        rockchip_register_softrst(np, 9, reg_base + RK2928_SOFTRST_CON(0),
                                  ROCKCHIP_SOFTRST_HIWORD_MASK);
@@ -765,6 +765,8 @@ static void __init rk3066a_clk_init(struct device_node *np)
                        mux_armclk_p, ARRAY_SIZE(mux_armclk_p),
                        &rk3066_cpuclk_data, rk3066_cpuclk_rates,
                        ARRAY_SIZE(rk3066_cpuclk_rates));
+       rockchip_clk_protect_critical(rk3188_critical_clocks,
+                                     ARRAY_SIZE(rk3188_critical_clocks));
 }
 CLK_OF_DECLARE(rk3066a_cru, "rockchip,rk3066a-cru", rk3066a_clk_init);
 
@@ -801,6 +803,9 @@ static void __init rk3188a_clk_init(struct device_node *np)
                pr_warn("%s: missing clocks to reparent aclk_cpu_pre to gpll\n",
                        __func__);
        }
+
+       rockchip_clk_protect_critical(rk3188_critical_clocks,
+                                     ARRAY_SIZE(rk3188_critical_clocks));
 }
 CLK_OF_DECLARE(rk3188a_cru, "rockchip,rk3188a-cru", rk3188a_clk_init);
 
index 251f48d..7f370d3 100644 (file)
@@ -1398,6 +1398,45 @@ static const struct exynos_cpuclk_cfg_data e4210_armclk_d[] __initconst = {
        {  0 },
 };
 
+static const struct exynos_cpuclk_cfg_data e4212_armclk_d[] __initconst = {
+       { 1500000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4210_CPU_DIV1(2, 6), },
+       { 1400000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4210_CPU_DIV1(2, 6), },
+       { 1300000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4210_CPU_DIV1(2, 5), },
+       { 1200000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4210_CPU_DIV1(2, 5), },
+       { 1100000, E4210_CPU_DIV0(2, 1, 4, 0, 6, 3), E4210_CPU_DIV1(2, 4), },
+       { 1000000, E4210_CPU_DIV0(1, 1, 4, 0, 5, 2), E4210_CPU_DIV1(2, 4), },
+       {  900000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4210_CPU_DIV1(2, 3), },
+       {  800000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4210_CPU_DIV1(2, 3), },
+       {  700000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  600000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  500000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  400000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  300000, E4210_CPU_DIV0(1, 1, 2, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  200000, E4210_CPU_DIV0(1, 1, 1, 0, 3, 1), E4210_CPU_DIV1(2, 3), },
+       {  0 },
+};
+
+#define E4412_CPU_DIV1(cores, hpm, copy)                               \
+               (((cores) << 8) | ((hpm) << 4) | ((copy) << 0))
+
+static const struct exynos_cpuclk_cfg_data e4412_armclk_d[] __initconst = {
+       { 1500000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4412_CPU_DIV1(7, 0, 6), },
+       { 1400000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4412_CPU_DIV1(6, 0, 6), },
+       { 1300000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4412_CPU_DIV1(6, 0, 5), },
+       { 1200000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4412_CPU_DIV1(5, 0, 5), },
+       { 1100000, E4210_CPU_DIV0(2, 1, 4, 0, 6, 3), E4412_CPU_DIV1(5, 0, 4), },
+       { 1000000, E4210_CPU_DIV0(1, 1, 4, 0, 5, 2), E4412_CPU_DIV1(4, 0, 4), },
+       {  900000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4412_CPU_DIV1(4, 0, 3), },
+       {  800000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4412_CPU_DIV1(3, 0, 3), },
+       {  700000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(3, 0, 3), },
+       {  600000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(2, 0, 3), },
+       {  500000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(2, 0, 3), },
+       {  400000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(1, 0, 3), },
+       {  300000, E4210_CPU_DIV0(1, 1, 2, 0, 4, 2), E4412_CPU_DIV1(1, 0, 3), },
+       {  200000, E4210_CPU_DIV0(1, 1, 1, 0, 3, 1), E4412_CPU_DIV1(0, 0, 3), },
+       {  0 },
+};
+
 /* register exynos4 clocks */
 static void __init exynos4_clk_init(struct device_node *np,
                                    enum exynos4_soc soc)
@@ -1491,6 +1530,17 @@ static void __init exynos4_clk_init(struct device_node *np,
                samsung_clk_register_fixed_factor(ctx,
                        exynos4x12_fixed_factor_clks,
                        ARRAY_SIZE(exynos4x12_fixed_factor_clks));
+               if (of_machine_is_compatible("samsung,exynos4412")) {
+                       exynos_register_cpu_clock(ctx, CLK_ARM_CLK, "armclk",
+                               mout_core_p4x12[0], mout_core_p4x12[1], 0x14200,
+                               e4412_armclk_d, ARRAY_SIZE(e4412_armclk_d),
+                               CLK_CPU_NEEDS_DEBUG_ALT_DIV | CLK_CPU_HAS_DIV1);
+               } else {
+                       exynos_register_cpu_clock(ctx, CLK_ARM_CLK, "armclk",
+                               mout_core_p4x12[0], mout_core_p4x12[1], 0x14200,
+                               e4212_armclk_d, ARRAY_SIZE(e4212_armclk_d),
+                               CLK_CPU_NEEDS_DEBUG_ALT_DIV | CLK_CPU_HAS_DIV1);
+               }
        }
 
        samsung_clk_register_alias(ctx, exynos4_aliases,
index 77aa34e..cd0391e 100644 (file)
@@ -24,55 +24,6 @@ config ARM_VEXPRESS_SPC_CPUFREQ
           This add the CPUfreq driver support for Versatile Express
          big.LITTLE platforms using SPC for power management.
 
-
-config ARM_EXYNOS_CPUFREQ
-       tristate "SAMSUNG EXYNOS CPUfreq Driver"
-       depends on CPU_EXYNOS4210 || SOC_EXYNOS4212 || SOC_EXYNOS4412 || SOC_EXYNOS5250
-       depends on THERMAL
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS platforms.
-         Supported SoC versions are:
-            Exynos4210, Exynos4212, Exynos4412, and Exynos5250.
-
-         If in doubt, say N.
-
-config ARM_EXYNOS4X12_CPUFREQ
-       bool "SAMSUNG EXYNOS4x12"
-       depends on SOC_EXYNOS4212 || SOC_EXYNOS4412
-       depends on ARM_EXYNOS_CPUFREQ
-       default y
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS4X12
-         SoC (EXYNOS4212 or EXYNOS4412).
-
-         If in doubt, say N.
-
-config ARM_EXYNOS5250_CPUFREQ
-       bool "SAMSUNG EXYNOS5250"
-       depends on SOC_EXYNOS5250
-       depends on ARM_EXYNOS_CPUFREQ
-       default y
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS5250
-         SoC.
-
-         If in doubt, say N.
-
-config ARM_EXYNOS_CPU_FREQ_BOOST_SW
-       bool "EXYNOS Frequency Overclocking - Software"
-       depends on ARM_EXYNOS_CPUFREQ && THERMAL
-       select CPU_FREQ_BOOST_SW
-       select EXYNOS_THERMAL
-       help
-         This driver supports software managed overclocking (BOOST).
-         It allows usage of special frequencies for Samsung Exynos
-         processors if thermal conditions are appropriate.
-
-         It requires, for safe operation, thermal framework with properly
-         defined trip points.
-
-         If in doubt, say N.
-
 config ARM_EXYNOS5440_CPUFREQ
        tristate "SAMSUNG EXYNOS5440"
        depends on SOC_EXYNOS5440
@@ -133,6 +84,7 @@ config ARM_KIRKWOOD_CPUFREQ
 config ARM_MT8173_CPUFREQ
        bool "Mediatek MT8173 CPUFreq support"
        depends on ARCH_MEDIATEK && REGULATOR
+       depends on !CPU_THERMAL || THERMAL=y
        select PM_OPP
        help
          This adds the CPUFreq driver support for Mediatek MT8173 SoC.
index 60a57ca..4134038 100644 (file)
@@ -52,10 +52,6 @@ obj-$(CONFIG_ARM_DT_BL_CPUFREQ)              += arm_big_little_dt.o
 
 obj-$(CONFIG_ARCH_DAVINCI)             += davinci-cpufreq.o
 obj-$(CONFIG_UX500_SOC_DB8500)         += dbx500-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS_CPUFREQ)       += arm-exynos-cpufreq.o
-arm-exynos-cpufreq-y                                   := exynos-cpufreq.o
-arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ)    += exynos4x12-cpufreq.o
-arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS5250_CPUFREQ)    += exynos5250-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ)   += exynos5440-cpufreq.o
 obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)     += highbank-cpufreq.o
 obj-$(CONFIG_ARM_HISI_ACPU_CPUFREQ)    += hisi-acpu-cpufreq.o
index c3583cd..7c0d70e 100644 (file)
@@ -196,6 +196,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
        struct device *cpu_dev;
        struct regulator *cpu_reg;
        struct clk *cpu_clk;
+       struct dev_pm_opp *suspend_opp;
        unsigned long min_uV = ~0, max_uV = 0;
        unsigned int transition_latency;
        bool need_update = false;
@@ -239,6 +240,17 @@ static int cpufreq_init(struct cpufreq_policy *policy)
         */
        of_cpumask_init_opp_table(policy->cpus);
 
+       /*
+        * But we need OPP table to function so if it is not there let's
+        * give platform code chance to provide it for us.
+        */
+       ret = dev_pm_opp_get_opp_count(cpu_dev);
+       if (ret <= 0) {
+               pr_debug("OPP table is not ready, deferring probe\n");
+               ret = -EPROBE_DEFER;
+               goto out_free_opp;
+       }
+
        if (need_update) {
                struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
 
@@ -249,24 +261,16 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                 * OPP tables are initialized only for policy->cpu, do it for
                 * others as well.
                 */
-               set_cpus_sharing_opps(cpu_dev, policy->cpus);
+               ret = set_cpus_sharing_opps(cpu_dev, policy->cpus);
+               if (ret)
+                       dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
+                               __func__, ret);
 
                of_property_read_u32(np, "clock-latency", &transition_latency);
        } else {
                transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev);
        }
 
-       /*
-        * But we need OPP table to function so if it is not there let's
-        * give platform code chance to provide it for us.
-        */
-       ret = dev_pm_opp_get_opp_count(cpu_dev);
-       if (ret <= 0) {
-               pr_debug("OPP table is not ready, deferring probe\n");
-               ret = -EPROBE_DEFER;
-               goto out_free_opp;
-       }
-
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv) {
                ret = -ENOMEM;
@@ -300,7 +304,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                        rcu_read_unlock();
 
                        tol_uV = opp_uV * priv->voltage_tolerance / 100;
-                       if (regulator_is_supported_voltage(cpu_reg, opp_uV,
+                       if (regulator_is_supported_voltage(cpu_reg,
+                                                          opp_uV - tol_uV,
                                                           opp_uV + tol_uV)) {
                                if (opp_uV < min_uV)
                                        min_uV = opp_uV;
@@ -329,6 +334,13 @@ static int cpufreq_init(struct cpufreq_policy *policy)
        policy->driver_data = priv;
 
        policy->clk = cpu_clk;
+
+       rcu_read_lock();
+       suspend_opp = dev_pm_opp_get_suspend_opp(cpu_dev);
+       if (suspend_opp)
+               policy->suspend_freq = dev_pm_opp_get_freq(suspend_opp) / 1000;
+       rcu_read_unlock();
+
        ret = cpufreq_table_validate_and_show(policy, freq_table);
        if (ret) {
                dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
@@ -419,6 +431,7 @@ static struct cpufreq_driver dt_cpufreq_driver = {
        .ready = cpufreq_ready,
        .name = "cpufreq-dt",
        .attr = cpufreq_dt_attr,
+       .suspend = cpufreq_generic_suspend,
 };
 
 static int dt_cpufreq_probe(struct platform_device *pdev)
index b3d9368..6633b3f 100644 (file)
@@ -239,7 +239,7 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
 EXPORT_SYMBOL_GPL(cpufreq_generic_init);
 
 /* Only for cpufreq core internal use */
-struct cpufreq_policy *cpufreq_cpu_get_raw(unsigned int cpu)
+static struct cpufreq_policy *cpufreq_cpu_get_raw(unsigned int cpu)
 {
        struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
 
@@ -1626,8 +1626,8 @@ int cpufreq_generic_suspend(struct cpufreq_policy *policy)
        int ret;
 
        if (!policy->suspend_freq) {
-               pr_err("%s: suspend_freq can't be zero\n", __func__);
-               return -EINVAL;
+               pr_debug("%s: suspend_freq not defined\n", __func__);
+               return 0;
        }
 
        pr_debug("%s: Setting suspend-freq: %u\n", __func__,
@@ -2031,8 +2031,7 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
                if (!try_module_get(policy->governor->owner))
                        return -EINVAL;
 
-       pr_debug("__cpufreq_governor for CPU %u, event %u\n",
-                policy->cpu, event);
+       pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event);
 
        mutex_lock(&cpufreq_governor_lock);
        if ((policy->governor_enabled && event == CPUFREQ_GOV_START)
diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c
deleted file mode 100644 (file)
index fa3dd84..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 2010-2011 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS - CPU frequency scaling support for EXYNOS series
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/regulator/consumer.h>
-#include <linux/cpufreq.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/cpu_cooling.h>
-#include <linux/cpu.h>
-
-#include "exynos-cpufreq.h"
-
-static struct exynos_dvfs_info *exynos_info;
-static struct thermal_cooling_device *cdev;
-static struct regulator *arm_regulator;
-static unsigned int locking_frequency;
-
-static int exynos_cpufreq_get_index(unsigned int freq)
-{
-       struct cpufreq_frequency_table *freq_table = exynos_info->freq_table;
-       struct cpufreq_frequency_table *pos;
-
-       cpufreq_for_each_entry(pos, freq_table)
-               if (pos->frequency == freq)
-                       break;
-
-       if (pos->frequency == CPUFREQ_TABLE_END)
-               return -EINVAL;
-
-       return pos - freq_table;
-}
-
-static int exynos_cpufreq_scale(unsigned int target_freq)
-{
-       struct cpufreq_frequency_table *freq_table = exynos_info->freq_table;
-       unsigned int *volt_table = exynos_info->volt_table;
-       struct cpufreq_policy *policy = cpufreq_cpu_get(0);
-       unsigned int arm_volt, safe_arm_volt = 0;
-       unsigned int mpll_freq_khz = exynos_info->mpll_freq_khz;
-       struct device *dev = exynos_info->dev;
-       unsigned int old_freq;
-       int index, old_index;
-       int ret = 0;
-
-       old_freq = policy->cur;
-
-       /*
-        * The policy max have been changed so that we cannot get proper
-        * old_index with cpufreq_frequency_table_target(). Thus, ignore
-        * policy and get the index from the raw frequency table.
-        */
-       old_index = exynos_cpufreq_get_index(old_freq);
-       if (old_index < 0) {
-               ret = old_index;
-               goto out;
-       }
-
-       index = exynos_cpufreq_get_index(target_freq);
-       if (index < 0) {
-               ret = index;
-               goto out;
-       }
-
-       /*
-        * ARM clock source will be changed APLL to MPLL temporary
-        * To support this level, need to control regulator for
-        * required voltage level
-        */
-       if (exynos_info->need_apll_change != NULL) {
-               if (exynos_info->need_apll_change(old_index, index) &&
-                  (freq_table[index].frequency < mpll_freq_khz) &&
-                  (freq_table[old_index].frequency < mpll_freq_khz))
-                       safe_arm_volt = volt_table[exynos_info->pll_safe_idx];
-       }
-       arm_volt = volt_table[index];
-
-       /* When the new frequency is higher than current frequency */
-       if ((target_freq > old_freq) && !safe_arm_volt) {
-               /* Firstly, voltage up to increase frequency */
-               ret = regulator_set_voltage(arm_regulator, arm_volt, arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               arm_volt);
-                       return ret;
-               }
-       }
-
-       if (safe_arm_volt) {
-               ret = regulator_set_voltage(arm_regulator, safe_arm_volt,
-                                     safe_arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               safe_arm_volt);
-                       return ret;
-               }
-       }
-
-       exynos_info->set_freq(old_index, index);
-
-       /* When the new frequency is lower than current frequency */
-       if ((target_freq < old_freq) ||
-          ((target_freq > old_freq) && safe_arm_volt)) {
-               /* down the voltage after frequency change */
-               ret = regulator_set_voltage(arm_regulator, arm_volt,
-                               arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               arm_volt);
-                       goto out;
-               }
-       }
-
-out:
-       cpufreq_cpu_put(policy);
-
-       return ret;
-}
-
-static int exynos_target(struct cpufreq_policy *policy, unsigned int index)
-{
-       return exynos_cpufreq_scale(exynos_info->freq_table[index].frequency);
-}
-
-static int exynos_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
-       policy->clk = exynos_info->cpu_clk;
-       policy->suspend_freq = locking_frequency;
-       return cpufreq_generic_init(policy, exynos_info->freq_table, 100000);
-}
-
-static struct cpufreq_driver exynos_driver = {
-       .flags          = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
-       .verify         = cpufreq_generic_frequency_table_verify,
-       .target_index   = exynos_target,
-       .get            = cpufreq_generic_get,
-       .init           = exynos_cpufreq_cpu_init,
-       .name           = "exynos_cpufreq",
-       .attr           = cpufreq_generic_attr,
-#ifdef CONFIG_ARM_EXYNOS_CPU_FREQ_BOOST_SW
-       .boost_supported = true,
-#endif
-#ifdef CONFIG_PM
-       .suspend        = cpufreq_generic_suspend,
-#endif
-};
-
-static int exynos_cpufreq_probe(struct platform_device *pdev)
-{
-       struct device_node *cpu0;
-       int ret = -EINVAL;
-
-       exynos_info = kzalloc(sizeof(*exynos_info), GFP_KERNEL);
-       if (!exynos_info)
-               return -ENOMEM;
-
-       exynos_info->dev = &pdev->dev;
-
-       if (of_machine_is_compatible("samsung,exynos4212")) {
-               exynos_info->type = EXYNOS_SOC_4212;
-               ret = exynos4x12_cpufreq_init(exynos_info);
-       } else if (of_machine_is_compatible("samsung,exynos4412")) {
-               exynos_info->type = EXYNOS_SOC_4412;
-               ret = exynos4x12_cpufreq_init(exynos_info);
-       } else if (of_machine_is_compatible("samsung,exynos5250")) {
-               exynos_info->type = EXYNOS_SOC_5250;
-               ret = exynos5250_cpufreq_init(exynos_info);
-       } else {
-               pr_err("%s: Unknown SoC type\n", __func__);
-               ret = -ENODEV;
-       }
-
-       if (ret)
-               goto err_vdd_arm;
-
-       if (exynos_info->set_freq == NULL) {
-               dev_err(&pdev->dev, "No set_freq function (ERR)\n");
-               ret = -EINVAL;
-               goto err_vdd_arm;
-       }
-
-       arm_regulator = regulator_get(NULL, "vdd_arm");
-       if (IS_ERR(arm_regulator)) {
-               dev_err(&pdev->dev, "failed to get resource vdd_arm\n");
-               ret = -EINVAL;
-               goto err_vdd_arm;
-       }
-
-       /* Done here as we want to capture boot frequency */
-       locking_frequency = clk_get_rate(exynos_info->cpu_clk) / 1000;
-
-       ret = cpufreq_register_driver(&exynos_driver);
-       if (ret)
-               goto err_cpufreq_reg;
-
-       cpu0 = of_get_cpu_node(0, NULL);
-       if (!cpu0) {
-               pr_err("failed to find cpu0 node\n");
-               return 0;
-       }
-
-       if (of_find_property(cpu0, "#cooling-cells", NULL)) {
-               cdev = of_cpufreq_cooling_register(cpu0,
-                                                  cpu_present_mask);
-               if (IS_ERR(cdev))
-                       pr_err("running cpufreq without cooling device: %ld\n",
-                              PTR_ERR(cdev));
-       }
-
-       return 0;
-
-err_cpufreq_reg:
-       dev_err(&pdev->dev, "failed to register cpufreq driver\n");
-       regulator_put(arm_regulator);
-err_vdd_arm:
-       kfree(exynos_info);
-       return ret;
-}
-
-static struct platform_driver exynos_cpufreq_platdrv = {
-       .driver = {
-               .name   = "exynos-cpufreq",
-       },
-       .probe = exynos_cpufreq_probe,
-};
-module_platform_driver(exynos_cpufreq_platdrv);
diff --git a/drivers/cpufreq/exynos-cpufreq.h b/drivers/cpufreq/exynos-cpufreq.h
deleted file mode 100644 (file)
index a3855e4..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2010 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS - CPUFreq support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-enum cpufreq_level_index {
-       L0, L1, L2, L3, L4,
-       L5, L6, L7, L8, L9,
-       L10, L11, L12, L13, L14,
-       L15, L16, L17, L18, L19,
-       L20,
-};
-
-enum exynos_soc_type {
-       EXYNOS_SOC_4212,
-       EXYNOS_SOC_4412,
-       EXYNOS_SOC_5250,
-};
-
-#define APLL_FREQ(f, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, m, p, s) \
-       { \
-               .freq = (f) * 1000, \
-               .clk_div_cpu0 = ((a0) | (a1) << 4 | (a2) << 8 | (a3) << 12 | \
-                       (a4) << 16 | (a5) << 20 | (a6) << 24 | (a7) << 28), \
-               .clk_div_cpu1 = (b0 << 0 | b1 << 4 | b2 << 8), \
-               .mps = ((m) << 16 | (p) << 8 | (s)), \
-       }
-
-struct apll_freq {
-       unsigned int freq;
-       u32 clk_div_cpu0;
-       u32 clk_div_cpu1;
-       u32 mps;
-};
-
-struct exynos_dvfs_info {
-       enum exynos_soc_type type;
-       struct device   *dev;
-       unsigned long   mpll_freq_khz;
-       unsigned int    pll_safe_idx;
-       struct clk      *cpu_clk;
-       unsigned int    *volt_table;
-       struct cpufreq_frequency_table  *freq_table;
-       void (*set_freq)(unsigned int, unsigned int);
-       bool (*need_apll_change)(unsigned int, unsigned int);
-       void __iomem    *cmu_regs;
-};
-
-#ifdef CONFIG_ARM_EXYNOS4X12_CPUFREQ
-extern int exynos4x12_cpufreq_init(struct exynos_dvfs_info *);
-#else
-static inline int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       return -EOPNOTSUPP;
-}
-#endif
-#ifdef CONFIG_ARM_EXYNOS5250_CPUFREQ
-extern int exynos5250_cpufreq_init(struct exynos_dvfs_info *);
-#else
-static inline int exynos5250_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       return -EOPNOTSUPP;
-}
-#endif
-
-#define EXYNOS4_CLKSRC_CPU                     0x14200
-#define EXYNOS4_CLKMUX_STATCPU                 0x14400
-
-#define EXYNOS4_CLKDIV_CPU                     0x14500
-#define EXYNOS4_CLKDIV_CPU1                    0x14504
-#define EXYNOS4_CLKDIV_STATCPU                 0x14600
-#define EXYNOS4_CLKDIV_STATCPU1                        0x14604
-
-#define EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT       (16)
-#define EXYNOS4_CLKMUX_STATCPU_MUXCORE_MASK    (0x7 << EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT)
-
-#define EXYNOS5_APLL_LOCK                      0x00000
-#define EXYNOS5_APLL_CON0                      0x00100
-#define EXYNOS5_CLKMUX_STATCPU                 0x00400
-#define EXYNOS5_CLKDIV_CPU0                    0x00500
-#define EXYNOS5_CLKDIV_CPU1                    0x00504
-#define EXYNOS5_CLKDIV_STATCPU0                        0x00600
-#define EXYNOS5_CLKDIV_STATCPU1                        0x00604
diff --git a/drivers/cpufreq/exynos4x12-cpufreq.c b/drivers/cpufreq/exynos4x12-cpufreq.c
deleted file mode 100644 (file)
index 9e78a85..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (c) 2010-2012 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS4X12 - CPU frequency scaling support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/cpufreq.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include "exynos-cpufreq.h"
-
-static struct clk *cpu_clk;
-static struct clk *moutcore;
-static struct clk *mout_mpll;
-static struct clk *mout_apll;
-static struct exynos_dvfs_info *cpufreq;
-
-static unsigned int exynos4x12_volt_table[] = {
-       1350000, 1287500, 1250000, 1187500, 1137500, 1087500, 1037500,
-       1000000,  987500,  975000,  950000,  925000,  900000,  900000
-};
-
-static struct cpufreq_frequency_table exynos4x12_freq_table[] = {
-       {CPUFREQ_BOOST_FREQ, L0, 1500 * 1000},
-       {0, L1, 1400 * 1000},
-       {0, L2, 1300 * 1000},
-       {0, L3, 1200 * 1000},
-       {0, L4, 1100 * 1000},
-       {0, L5, 1000 * 1000},
-       {0, L6,  900 * 1000},
-       {0, L7,  800 * 1000},
-       {0, L8,  700 * 1000},
-       {0, L9,  600 * 1000},
-       {0, L10, 500 * 1000},
-       {0, L11, 400 * 1000},
-       {0, L12, 300 * 1000},
-       {0, L13, 200 * 1000},
-       {0, 0, CPUFREQ_TABLE_END},
-};
-
-static struct apll_freq *apll_freq_4x12;
-
-static struct apll_freq apll_freq_4212[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for CORE, COREM0, COREM1, PERIPH, ATB, PCLK_DBG, APLL, CORE2
-        * clock divider for COPY, HPM, RESERVED
-        * PLL M, P, S
-        */
-       APLL_FREQ(1500, 0, 3, 7, 0, 6, 1, 2, 0, 6, 2, 0, 250, 4, 0),
-       APLL_FREQ(1400, 0, 3, 7, 0, 6, 1, 2, 0, 6, 2, 0, 175, 3, 0),
-       APLL_FREQ(1300, 0, 3, 7, 0, 5, 1, 2, 0, 5, 2, 0, 325, 6, 0),
-       APLL_FREQ(1200, 0, 3, 7, 0, 5, 1, 2, 0, 5, 2, 0, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 6, 0, 4, 1, 2, 0, 4, 2, 0, 275, 6, 0),
-       APLL_FREQ(1000, 0, 2, 5, 0, 4, 1, 1, 0, 4, 2, 0, 125, 3, 0),
-       APLL_FREQ(900,  0, 2, 5, 0, 3, 1, 1, 0, 3, 2, 0, 150, 4, 0),
-       APLL_FREQ(800,  0, 2, 5, 0, 3, 1, 1, 0, 3, 2, 0, 100, 3, 0),
-       APLL_FREQ(700,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 175, 3, 1),
-       APLL_FREQ(600,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 200, 4, 1),
-       APLL_FREQ(500,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 125, 3, 1),
-       APLL_FREQ(400,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 100, 3, 1),
-       APLL_FREQ(300,  0, 2, 4, 0, 2, 1, 1, 0, 3, 2, 0, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 3, 0, 1, 1, 1, 0, 3, 2, 0, 100, 3, 2),
-};
-
-static struct apll_freq apll_freq_4412[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for CORE, COREM0, COREM1, PERIPH, ATB, PCLK_DBG, APLL, CORE2
-        * clock divider for COPY, HPM, CORES
-        * PLL M, P, S
-        */
-       APLL_FREQ(1500, 0, 3, 7, 0, 6, 1, 2, 0, 6, 0, 7, 250, 4, 0),
-       APLL_FREQ(1400, 0, 3, 7, 0, 6, 1, 2, 0, 6, 0, 6, 175, 3, 0),
-       APLL_FREQ(1300, 0, 3, 7, 0, 5, 1, 2, 0, 5, 0, 6, 325, 6, 0),
-       APLL_FREQ(1200, 0, 3, 7, 0, 5, 1, 2, 0, 5, 0, 5, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 6, 0, 4, 1, 2, 0, 4, 0, 5, 275, 6, 0),
-       APLL_FREQ(1000, 0, 2, 5, 0, 4, 1, 1, 0, 4, 0, 4, 125, 3, 0),
-       APLL_FREQ(900,  0, 2, 5, 0, 3, 1, 1, 0, 3, 0, 4, 150, 4, 0),
-       APLL_FREQ(800,  0, 2, 5, 0, 3, 1, 1, 0, 3, 0, 3, 100, 3, 0),
-       APLL_FREQ(700,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 3, 175, 3, 1),
-       APLL_FREQ(600,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 2, 200, 4, 1),
-       APLL_FREQ(500,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 2, 125, 3, 1),
-       APLL_FREQ(400,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 1, 100, 3, 1),
-       APLL_FREQ(300,  0, 2, 4, 0, 2, 1, 1, 0, 3, 0, 1, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 3, 0, 1, 1, 1, 0, 3, 0, 0, 100, 3, 2),
-};
-
-static void exynos4x12_set_clkdiv(unsigned int div_index)
-{
-       unsigned int tmp;
-
-       /* Change Divider - CPU0 */
-
-       tmp = apll_freq_4x12[div_index].clk_div_cpu0;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS4_CLKDIV_CPU);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKDIV_STATCPU)
-              & 0x11111111)
-               cpu_relax();
-
-       /* Change Divider - CPU1 */
-       tmp = apll_freq_4x12[div_index].clk_div_cpu1;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS4_CLKDIV_CPU1);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKDIV_STATCPU1);
-       } while (tmp != 0x0);
-}
-
-static void exynos4x12_set_apll(unsigned int index)
-{
-       unsigned int tmp, freq = apll_freq_4x12[index].freq;
-
-       /* MUX_CORE_SEL = MPLL, ARMCLK uses MPLL for lock time */
-       clk_set_parent(moutcore, mout_mpll);
-
-       do {
-               cpu_relax();
-               tmp = (__raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKMUX_STATCPU)
-                       >> EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT);
-               tmp &= 0x7;
-       } while (tmp != 0x2);
-
-       clk_set_rate(mout_apll, freq * 1000);
-
-       /* MUX_CORE_SEL = APLL */
-       clk_set_parent(moutcore, mout_apll);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKMUX_STATCPU);
-               tmp &= EXYNOS4_CLKMUX_STATCPU_MUXCORE_MASK;
-       } while (tmp != (0x1 << EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT));
-}
-
-static void exynos4x12_set_frequency(unsigned int old_index,
-                                 unsigned int new_index)
-{
-       if (old_index > new_index) {
-               exynos4x12_set_clkdiv(new_index);
-               exynos4x12_set_apll(new_index);
-       } else if (old_index < new_index) {
-               exynos4x12_set_apll(new_index);
-               exynos4x12_set_clkdiv(new_index);
-       }
-}
-
-int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       struct device_node *np;
-       unsigned long rate;
-
-       /*
-        * HACK: This is a temporary workaround to get access to clock
-        * controller registers directly and remove static mappings and
-        * dependencies on platform headers. It is necessary to enable
-        * Exynos multi-platform support and will be removed together with
-        * this whole driver as soon as Exynos gets migrated to use
-        * cpufreq-dt driver.
-        */
-       np = of_find_compatible_node(NULL, NULL, "samsung,exynos4412-clock");
-       if (!np) {
-               pr_err("%s: failed to find clock controller DT node\n",
-                       __func__);
-               return -ENODEV;
-       }
-
-       info->cmu_regs = of_iomap(np, 0);
-       if (!info->cmu_regs) {
-               pr_err("%s: failed to map CMU registers\n", __func__);
-               return -EFAULT;
-       }
-
-       cpu_clk = clk_get(NULL, "armclk");
-       if (IS_ERR(cpu_clk))
-               return PTR_ERR(cpu_clk);
-
-       moutcore = clk_get(NULL, "moutcore");
-       if (IS_ERR(moutcore))
-               goto err_moutcore;
-
-       mout_mpll = clk_get(NULL, "mout_mpll");
-       if (IS_ERR(mout_mpll))
-               goto err_mout_mpll;
-
-       rate = clk_get_rate(mout_mpll) / 1000;
-
-       mout_apll = clk_get(NULL, "mout_apll");
-       if (IS_ERR(mout_apll))
-               goto err_mout_apll;
-
-       if (info->type == EXYNOS_SOC_4212)
-               apll_freq_4x12 = apll_freq_4212;
-       else
-               apll_freq_4x12 = apll_freq_4412;
-
-       info->mpll_freq_khz = rate;
-       /* 800Mhz */
-       info->pll_safe_idx = L7;
-       info->cpu_clk = cpu_clk;
-       info->volt_table = exynos4x12_volt_table;
-       info->freq_table = exynos4x12_freq_table;
-       info->set_freq = exynos4x12_set_frequency;
-
-       cpufreq = info;
-
-       return 0;
-
-err_mout_apll:
-       clk_put(mout_mpll);
-err_mout_mpll:
-       clk_put(moutcore);
-err_moutcore:
-       clk_put(cpu_clk);
-
-       pr_debug("%s: failed initialization\n", __func__);
-       return -EINVAL;
-}
diff --git a/drivers/cpufreq/exynos5250-cpufreq.c b/drivers/cpufreq/exynos5250-cpufreq.c
deleted file mode 100644 (file)
index 3eafdc7..0000000
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2010-20122Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS5250 - CPU frequency scaling support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/cpufreq.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include "exynos-cpufreq.h"
-
-static struct clk *cpu_clk;
-static struct clk *moutcore;
-static struct clk *mout_mpll;
-static struct clk *mout_apll;
-static struct exynos_dvfs_info *cpufreq;
-
-static unsigned int exynos5250_volt_table[] = {
-       1300000, 1250000, 1225000, 1200000, 1150000,
-       1125000, 1100000, 1075000, 1050000, 1025000,
-       1012500, 1000000,  975000,  950000,  937500,
-       925000
-};
-
-static struct cpufreq_frequency_table exynos5250_freq_table[] = {
-       {0, L0, 1700 * 1000},
-       {0, L1, 1600 * 1000},
-       {0, L2, 1500 * 1000},
-       {0, L3, 1400 * 1000},
-       {0, L4, 1300 * 1000},
-       {0, L5, 1200 * 1000},
-       {0, L6, 1100 * 1000},
-       {0, L7, 1000 * 1000},
-       {0, L8,  900 * 1000},
-       {0, L9,  800 * 1000},
-       {0, L10, 700 * 1000},
-       {0, L11, 600 * 1000},
-       {0, L12, 500 * 1000},
-       {0, L13, 400 * 1000},
-       {0, L14, 300 * 1000},
-       {0, L15, 200 * 1000},
-       {0, 0, CPUFREQ_TABLE_END},
-};
-
-static struct apll_freq apll_freq_5250[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for ARM, CPUD, ACP, PERIPH, ATB, PCLK_DBG, APLL, ARM2
-        * clock divider for COPY, HPM, RESERVED
-        * PLL M, P, S
-        */
-       APLL_FREQ(1700, 0, 3, 7, 7, 7, 3, 5, 0, 0, 2, 0, 425, 6, 0),
-       APLL_FREQ(1600, 0, 3, 7, 7, 7, 1, 4, 0, 0, 2, 0, 200, 3, 0),
-       APLL_FREQ(1500, 0, 2, 7, 7, 7, 1, 4, 0, 0, 2, 0, 250, 4, 0),
-       APLL_FREQ(1400, 0, 2, 7, 7, 6, 1, 4, 0, 0, 2, 0, 175, 3, 0),
-       APLL_FREQ(1300, 0, 2, 7, 7, 6, 1, 3, 0, 0, 2, 0, 325, 6, 0),
-       APLL_FREQ(1200, 0, 2, 7, 7, 5, 1, 3, 0, 0, 2, 0, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 7, 7, 5, 1, 3, 0, 0, 2, 0, 275, 6, 0),
-       APLL_FREQ(1000, 0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 125, 3, 0),
-       APLL_FREQ(900,  0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 150, 4, 0),
-       APLL_FREQ(800,  0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 100, 3, 0),
-       APLL_FREQ(700,  0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 175, 3, 1),
-       APLL_FREQ(600,  0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 200, 4, 1),
-       APLL_FREQ(500,  0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 125, 3, 1),
-       APLL_FREQ(400,  0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 100, 3, 1),
-       APLL_FREQ(300,  0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 100, 3, 2),
-};
-
-static void set_clkdiv(unsigned int div_index)
-{
-       unsigned int tmp;
-
-       /* Change Divider - CPU0 */
-
-       tmp = apll_freq_5250[div_index].clk_div_cpu0;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU0);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU0)
-              & 0x11111111)
-               cpu_relax();
-
-       /* Change Divider - CPU1 */
-       tmp = apll_freq_5250[div_index].clk_div_cpu1;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU1);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU1) & 0x11)
-               cpu_relax();
-}
-
-static void set_apll(unsigned int index)
-{
-       unsigned int tmp;
-       unsigned int freq = apll_freq_5250[index].freq;
-
-       /* MUX_CORE_SEL = MPLL, ARMCLK uses MPLL for lock time */
-       clk_set_parent(moutcore, mout_mpll);
-
-       do {
-               cpu_relax();
-               tmp = (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU)
-                       >> 16);
-               tmp &= 0x7;
-       } while (tmp != 0x2);
-
-       clk_set_rate(mout_apll, freq * 1000);
-
-       /* MUX_CORE_SEL = APLL */
-       clk_set_parent(moutcore, mout_apll);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU);
-               tmp &= (0x7 << 16);
-       } while (tmp != (0x1 << 16));
-}
-
-static void exynos5250_set_frequency(unsigned int old_index,
-                                 unsigned int new_index)
-{
-       if (old_index > new_index) {
-               set_clkdiv(new_index);
-               set_apll(new_index);
-       } else if (old_index < new_index) {
-               set_apll(new_index);
-               set_clkdiv(new_index);
-       }
-}
-
-int exynos5250_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       struct device_node *np;
-       unsigned long rate;
-
-       /*
-        * HACK: This is a temporary workaround to get access to clock
-        * controller registers directly and remove static mappings and
-        * dependencies on platform headers. It is necessary to enable
-        * Exynos multi-platform support and will be removed together with
-        * this whole driver as soon as Exynos gets migrated to use
-        * cpufreq-dt driver.
-        */
-       np = of_find_compatible_node(NULL, NULL, "samsung,exynos5250-clock");
-       if (!np) {
-               pr_err("%s: failed to find clock controller DT node\n",
-                       __func__);
-               return -ENODEV;
-       }
-
-       info->cmu_regs = of_iomap(np, 0);
-       if (!info->cmu_regs) {
-               pr_err("%s: failed to map CMU registers\n", __func__);
-               return -EFAULT;
-       }
-
-       cpu_clk = clk_get(NULL, "armclk");
-       if (IS_ERR(cpu_clk))
-               return PTR_ERR(cpu_clk);
-
-       moutcore = clk_get(NULL, "mout_cpu");
-       if (IS_ERR(moutcore))
-               goto err_moutcore;
-
-       mout_mpll = clk_get(NULL, "mout_mpll");
-       if (IS_ERR(mout_mpll))
-               goto err_mout_mpll;
-
-       rate = clk_get_rate(mout_mpll) / 1000;
-
-       mout_apll = clk_get(NULL, "mout_apll");
-       if (IS_ERR(mout_apll))
-               goto err_mout_apll;
-
-       info->mpll_freq_khz = rate;
-       /* 800Mhz */
-       info->pll_safe_idx = L9;
-       info->cpu_clk = cpu_clk;
-       info->volt_table = exynos5250_volt_table;
-       info->freq_table = exynos5250_freq_table;
-       info->set_freq = exynos5250_set_frequency;
-
-       cpufreq = info;
-
-       return 0;
-
-err_mout_apll:
-       clk_put(mout_mpll);
-err_mout_mpll:
-       clk_put(moutcore);
-err_moutcore:
-       clk_put(cpu_clk);
-
-       pr_err("%s: failed initialization\n", __func__);
-       return -EINVAL;
-}
index cddc619..3af9dd7 100644 (file)
@@ -260,24 +260,31 @@ static inline void update_turbo_state(void)
                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 }
 
-#define PCT_TO_HWP(x) (x * 255 / 100)
 static void intel_pstate_hwp_set(void)
 {
-       int min, max, cpu;
-       u64 value, freq;
+       int min, hw_min, max, hw_max, cpu, range, adj_range;
+       u64 value, cap;
+
+       rdmsrl(MSR_HWP_CAPABILITIES, cap);
+       hw_min = HWP_LOWEST_PERF(cap);
+       hw_max = HWP_HIGHEST_PERF(cap);
+       range = hw_max - hw_min;
 
        get_online_cpus();
 
        for_each_online_cpu(cpu) {
                rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
-               min = PCT_TO_HWP(limits.min_perf_pct);
+               adj_range = limits.min_perf_pct * range / 100;
+               min = hw_min + adj_range;
                value &= ~HWP_MIN_PERF(~0L);
                value |= HWP_MIN_PERF(min);
 
-               max = PCT_TO_HWP(limits.max_perf_pct);
+               adj_range = limits.max_perf_pct * range / 100;
+               max = hw_min + adj_range;
                if (limits.no_turbo) {
-                       rdmsrl( MSR_HWP_CAPABILITIES, freq);
-                       max = HWP_GUARANTEED_PERF(freq);
+                       hw_max = HWP_GUARANTEED_PERF(cap);
+                       if (hw_max < max)
+                               max = hw_max;
                }
 
                value &= ~HWP_MAX_PERF(~0L);
@@ -423,6 +430,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 
        limits.max_sysfs_pct = clamp_t(int, input, 0 , 100);
        limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
+       limits.max_perf_pct = max(limits.min_policy_pct, limits.max_perf_pct);
+       limits.max_perf_pct = max(limits.min_perf_pct, limits.max_perf_pct);
        limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
 
        if (hwp_active)
@@ -442,6 +451,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 
        limits.min_sysfs_pct = clamp_t(int, input, 0 , 100);
        limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
+       limits.min_perf_pct = min(limits.max_policy_pct, limits.min_perf_pct);
+       limits.min_perf_pct = min(limits.max_perf_pct, limits.min_perf_pct);
        limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
 
        if (hwp_active)
@@ -989,12 +1000,19 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 
        limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
        limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0 , 100);
-       limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
-       limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
-
        limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq;
        limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0 , 100);
+
+       /* Normalize user input to [min_policy_pct, max_policy_pct] */
+       limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
+       limits.min_perf_pct = min(limits.max_policy_pct, limits.min_perf_pct);
        limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
+       limits.max_perf_pct = max(limits.min_policy_pct, limits.max_perf_pct);
+
+       /* Make sure min_perf_pct <= max_perf_pct */
+       limits.min_perf_pct = min(limits.max_perf_pct, limits.min_perf_pct);
+
+       limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
        limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
 
        if (hwp_active)
index 1523e2d..344058f 100644 (file)
@@ -186,6 +186,28 @@ bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
        return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
 }
 
+/**
+ * cpuidle_coupled_state_verify - check if the coupled states are correctly set.
+ * @drv: struct cpuidle_driver for the platform
+ *
+ * Returns 0 for valid state values, a negative error code otherwise:
+ *  * -EINVAL if any coupled state(safe_state_index) is wrongly set.
+ */
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+       int i;
+
+       for (i = drv->state_count - 1; i >= 0; i--) {
+               if (cpuidle_state_is_coupled(drv, i) &&
+                   (drv->safe_state_index == i ||
+                    drv->safe_state_index < 0 ||
+                    drv->safe_state_index >= drv->state_count))
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
 /**
  * cpuidle_coupled_set_ready - mark a cpu as ready
  * @coupled: the struct coupled that contains the current cpu
index 178c5ad..f87f399 100644 (file)
@@ -35,6 +35,7 @@ extern void cpuidle_remove_sysfs(struct cpuidle_device *dev);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state);
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv);
 int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
                struct cpuidle_driver *drv, int next_state);
 int cpuidle_coupled_register_device(struct cpuidle_device *dev);
@@ -46,6 +47,11 @@ bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
        return false;
 }
 
+static inline int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+       return 0;
+}
+
 static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
                struct cpuidle_driver *drv, int next_state)
 {
index 5db1478..389ade4 100644 (file)
@@ -227,6 +227,10 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
        if (!drv || !drv->state_count)
                return -EINVAL;
 
+       ret = cpuidle_coupled_state_verify(drv);
+       if (ret)
+               return ret;
+
        if (cpuidle_disabled())
                return -ENODEV;
 
index 07bc7aa..d234719 100644 (file)
@@ -461,7 +461,7 @@ config CRYPTO_DEV_QCE
 
 config CRYPTO_DEV_VMX
        bool "Support for VMX cryptographic acceleration instructions"
-       depends on PPC64
+       depends on PPC64 && VSX
        help
          Support for VMX cryptographic acceleration instructions.
 
index e419869..52340b9 100644 (file)
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
 {
        struct adf_etr_ring_data *ring = sfile->private;
        struct adf_etr_bank_data *bank = ring->bank;
-       uint32_t *msg = v;
        void __iomem *csr = ring->bank->csr_addr;
-       int i, x;
 
        if (v == SEQ_START_TOKEN) {
                int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
                seq_puts(sfile, "----------- Ring data ------------\n");
                return 0;
        }
-       seq_printf(sfile, "%p:", msg);
-       x = 0;
-       i = 0;
-       for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
-               seq_printf(sfile, " %08X", *(msg + i));
-               if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
-                   (++x == 8)) {
-                       seq_printf(sfile, "\n%p:", msg + i + 1);
-                       x = 0;
-               }
-       }
-       seq_puts(sfile, "\n");
+       seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
+                    v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
        return 0;
 }
 
index e070c31..a19ee12 100644 (file)
@@ -104,7 +104,7 @@ static int sun4i_ss_opti_poll(struct ablkcipher_request *areq)
                        sg_miter_next(&mo);
                        oo = 0;
                }
-       } while (mo.length > 0);
+       } while (oleft > 0);
 
        if (areq->info) {
                for (i = 0; i < 4 && i < ivsize / 4; i++) {
index ca78311..cf1268d 100644 (file)
@@ -280,6 +280,7 @@ struct sbridge_info {
        u8              max_interleave;
        u8              (*get_node_id)(struct sbridge_pvt *pvt);
        enum mem_type   (*get_memory_type)(struct sbridge_pvt *pvt);
+       enum dev_type   (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
        struct pci_dev  *pci_vtd;
 };
 
@@ -471,6 +472,9 @@ static const struct pci_id_table pci_dev_descr_ibridge_table[] = {
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2 0x2f6c
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3 0x2f6d
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0 0x2fbd
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1 0x2fbf
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2 0x2fb9
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3 0x2fbb
 static const struct pci_id_descr pci_dev_descr_haswell[] = {
        /* first item must be the HA */
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0, 0)             },
@@ -488,6 +492,9 @@ static const struct pci_id_descr pci_dev_descr_haswell[] = {
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3, 1)        },
 
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3, 1)          },
 
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA, 1)          },
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_THERMAL, 1)     },
@@ -762,6 +769,49 @@ out:
        return mtype;
 }
 
+static enum dev_type sbridge_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /* there's no way to figure out */
+       return DEV_UNKNOWN;
+}
+
+static enum dev_type __ibridge_get_width(u32 mtr)
+{
+       enum dev_type type;
+
+       switch (mtr) {
+       case 3:
+               type = DEV_UNKNOWN;
+               break;
+       case 2:
+               type = DEV_X16;
+               break;
+       case 1:
+               type = DEV_X8;
+               break;
+       case 0:
+               type = DEV_X4;
+               break;
+       }
+
+       return type;
+}
+
+static enum dev_type ibridge_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /*
+        * ddr3_width on the documentation but also valid for DDR4 on
+        * Haswell
+        */
+       return __ibridge_get_width(GET_BITFIELD(mtr, 7, 8));
+}
+
+static enum dev_type broadwell_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /* ddr3_width on the documentation but also valid for DDR4 */
+       return __ibridge_get_width(GET_BITFIELD(mtr, 8, 9));
+}
+
 static u8 get_node_id(struct sbridge_pvt *pvt)
 {
        u32 reg;
@@ -966,17 +1016,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
 
                                dimm->nr_pages = npages;
                                dimm->grain = 32;
-                               switch (banks) {
-                               case 16:
-                                       dimm->dtype = DEV_X16;
-                                       break;
-                               case 8:
-                                       dimm->dtype = DEV_X8;
-                                       break;
-                               case 4:
-                                       dimm->dtype = DEV_X4;
-                                       break;
-                               }
+                               dimm->dtype = pvt->info.get_width(pvt, mtr);
                                dimm->mtype = mtype;
                                dimm->edac_mode = mode;
                                snprintf(dimm->label, sizeof(dimm->label),
@@ -1869,7 +1909,11 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
                }
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0:
-                       pvt->pci_ddrio = pdev;
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3:
+                       if (!pvt->pci_ddrio)
+                               pvt->pci_ddrio = pdev;
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1:
                        pvt->pci_ha1 = pdev;
@@ -2361,6 +2405,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = ibridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Ivy Bridge Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2380,6 +2425,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = sbridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(sbridge_interleave_list);
                pvt->info.interleave_pkg = sbridge_interleave_pkg;
+               pvt->info.get_width = sbridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Sandy Bridge Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2399,6 +2445,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = ibridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Haswell Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2418,6 +2465,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = broadwell_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Broadwell Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
index 54071c1..84533e0 100644 (file)
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
 
 config EFI_RUNTIME_MAP
        bool "Export efi runtime maps to sysfs"
-       depends on X86 && EFI && KEXEC
+       depends on X86 && EFI && KEXEC_CORE
        default y
        help
          Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
index b4fc9e4..8949b3f 100644 (file)
@@ -356,7 +356,7 @@ config GPIO_PXA
 
 config GPIO_RCAR
        tristate "Renesas R-Car GPIO"
-       depends on ARM && (ARCH_SHMOBILE || COMPILE_TEST)
+       depends on ARCH_SHMOBILE || COMPILE_TEST
        select GPIOLIB_IRQCHIP
        help
          Say yes here to support GPIO on Renesas R-Car SoCs.
index b752b56..8813aba 100644 (file)
@@ -339,13 +339,15 @@ static int gpio_set_wake_irq(struct irq_data *d, u32 enable)
        return 0;
 }
 
-static void mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
+static int mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
 {
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;
 
        gc = irq_alloc_generic_chip("gpio-mxc", 1, irq_base,
                                    port->base, handle_level_irq);
+       if (!gc)
+               return -ENOMEM;
        gc->private = port;
 
        ct = gc->chip_types;
@@ -360,6 +362,8 @@ static void mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
 
        irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_NESTED_LOCK,
                               IRQ_NOREQUEST, 0);
+
+       return 0;
 }
 
 static void mxc_gpio_get_hw(struct platform_device *pdev)
@@ -477,12 +481,16 @@ static int mxc_gpio_probe(struct platform_device *pdev)
        }
 
        /* gpio-mxc can be a generic irq chip */
-       mxc_gpio_init_gc(port, irq_base);
+       err = mxc_gpio_init_gc(port, irq_base);
+       if (err < 0)
+               goto out_irqdomain_remove;
 
        list_add_tail(&port->node, &mxc_gpio_ports);
 
        return 0;
 
+out_irqdomain_remove:
+       irq_domain_remove(port->domain);
 out_irqdesc_free:
        irq_free_descs(irq_base, 32);
 out_gpiochip_remove:
index b7f383e..1387385 100644 (file)
@@ -196,13 +196,16 @@ static int mxs_gpio_set_wake_irq(struct irq_data *d, unsigned int enable)
        return 0;
 }
 
-static void __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
+static int __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
 {
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;
 
        gc = irq_alloc_generic_chip("gpio-mxs", 1, irq_base,
                                    port->base, handle_level_irq);
+       if (!gc)
+               return -ENOMEM;
+
        gc->private = port;
 
        ct = gc->chip_types;
@@ -216,6 +219,8 @@ static void __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
 
        irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_NESTED_LOCK,
                               IRQ_NOREQUEST, 0);
+
+       return 0;
 }
 
 static int mxs_gpio_to_irq(struct gpio_chip *gc, unsigned offset)
@@ -317,7 +322,9 @@ static int mxs_gpio_probe(struct platform_device *pdev)
        }
 
        /* gpio-mxs can be a generic irq chip */
-       mxs_gpio_init_gc(port, irq_base);
+       err = mxs_gpio_init_gc(port, irq_base);
+       if (err < 0)
+               goto out_irqdomain_remove;
 
        /* setup one handler for each entry */
        irq_set_chained_handler_and_data(port->irq, mxs_gpio_irq_handler,
@@ -343,6 +350,8 @@ static int mxs_gpio_probe(struct platform_device *pdev)
 
 out_bgpio_remove:
        bgpio_remove(&port->bgc);
+out_irqdomain_remove:
+       irq_domain_remove(port->domain);
 out_irqdesc_free:
        irq_free_descs(irq_base, 32);
        return err;
index 2ae0d47..072af52 100644 (file)
@@ -1098,7 +1098,6 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
        } else {
                bank->chip.label = "gpio";
                bank->chip.base = gpio;
-               gpio += bank->width;
        }
        bank->chip.ngpio = bank->width;
 
@@ -1108,6 +1107,9 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
                return ret;
        }
 
+       if (!bank->is_mpuio)
+               gpio += bank->width;
+
 #ifdef CONFIG_ARCH_OMAP1
        /*
         * REVISIT: Once we have OMAP1 supporting SPARSE_IRQ, we can drop
@@ -1253,8 +1255,11 @@ static int omap_gpio_probe(struct platform_device *pdev)
        omap_gpio_mod_init(bank);
 
        ret = omap_gpio_chip_init(bank, irqc);
-       if (ret)
+       if (ret) {
+               pm_runtime_put_sync(bank->dev);
+               pm_runtime_disable(bank->dev);
                return ret;
+       }
 
        omap_gpio_show_rev(bank);
 
index 458d9d7..9c6b967 100644 (file)
@@ -706,4 +706,3 @@ module_exit(sx150x_exit);
 MODULE_AUTHOR("Gregory Bean <gbean@codeaurora.org>");
 MODULE_DESCRIPTION("Driver for Semtech SX150X I2C GPIO Expanders");
 MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("i2c:sx150x");
index 980c1f8..5db3445 100644 (file)
@@ -1174,15 +1174,16 @@ EXPORT_SYMBOL_GPL(gpiod_is_active_low);
  * that the GPIO was actually requested.
  */
 
-static bool _gpiod_get_raw_value(const struct gpio_desc *desc)
+static int _gpiod_get_raw_value(const struct gpio_desc *desc)
 {
        struct gpio_chip        *chip;
-       bool value;
        int offset;
+       int value;
 
        chip = desc->chip;
        offset = gpio_chip_hwgpio(desc);
-       value = chip->get ? chip->get(chip, offset) : false;
+       value = chip->get ? chip->get(chip, offset) : -EIO;
+       value = value < 0 ? value : !!value;
        trace_gpio_value(desc_to_gpio(desc), 1, value);
        return value;
 }
@@ -1192,7 +1193,7 @@ static bool _gpiod_get_raw_value(const struct gpio_desc *desc)
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's raw value, i.e. the value of the physical line disregarding
- * its ACTIVE_LOW status.
+ * its ACTIVE_LOW status, or negative errno on failure.
  *
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
@@ -1212,7 +1213,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_value);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into
- * account.
+ * account, or negative errno on failure.
  *
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
@@ -1226,6 +1227,9 @@ int gpiod_get_value(const struct gpio_desc *desc)
        WARN_ON(desc->chip->can_sleep);
 
        value = _gpiod_get_raw_value(desc);
+       if (value < 0)
+               return value;
+
        if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
                value = !value;
 
@@ -1548,7 +1552,7 @@ EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's raw value, i.e. the value of the physical line disregarding
- * its ACTIVE_LOW status.
+ * its ACTIVE_LOW status, or negative errno on failure.
  *
  * This function is to be called from contexts that can sleep.
  */
@@ -1566,7 +1570,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_value_cansleep);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into
- * account.
+ * account, or negative errno on failure.
  *
  * This function is to be called from contexts that can sleep.
  */
@@ -1579,6 +1583,9 @@ int gpiod_get_value_cansleep(const struct gpio_desc *desc)
                return 0;
 
        value = _gpiod_get_raw_value(desc);
+       if (value < 0)
+               return value;
+
        if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
                value = !value;
 
index 4349154..f7d5166 100644 (file)
@@ -1515,7 +1515,8 @@ retry:
                        copied_props++;
                }
 
-               if (obj->type == DRM_MODE_OBJECT_PLANE && count_props) {
+               if (obj->type == DRM_MODE_OBJECT_PLANE && count_props &&
+                   !(arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)) {
                        plane = obj_to_plane(obj);
                        plane_mask |= (1 << drm_plane_index(plane));
                        plane->old_fb = plane->fb;
@@ -1537,10 +1538,11 @@ retry:
        }
 
        if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) {
+               /*
+                * Unlike commit, check_only does not clean up state.
+                * Below we call drm_atomic_state_free for it.
+                */
                ret = drm_atomic_check_only(state);
-               /* _check_only() does not free state, unlike _commit() */
-               if (!ret)
-                       drm_atomic_state_free(state);
        } else if (arg->flags & DRM_MODE_ATOMIC_NONBLOCK) {
                ret = drm_atomic_async_commit(state);
        } else {
@@ -1567,25 +1569,30 @@ out:
                plane->old_fb = NULL;
        }
 
+       if (ret && arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
+               /*
+                * TEST_ONLY and PAGE_FLIP_EVENT are mutually exclusive,
+                * if they weren't, this code should be called on success
+                * for TEST_ONLY too.
+                */
+
+               for_each_crtc_in_state(state, crtc, crtc_state, i) {
+                       if (!crtc_state->event)
+                               continue;
+
+                       destroy_vblank_event(dev, file_priv,
+                                            crtc_state->event);
+               }
+       }
+
        if (ret == -EDEADLK) {
                drm_atomic_state_clear(state);
                drm_modeset_backoff(&ctx);
                goto retry;
        }
 
-       if (ret) {
-               if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
-                       for_each_crtc_in_state(state, crtc, crtc_state, i) {
-                               if (!crtc_state->event)
-                                       continue;
-
-                               destroy_vblank_event(dev, file_priv,
-                                                    crtc_state->event);
-                       }
-               }
-
+       if (ret || arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)
                drm_atomic_state_free(state);
-       }
 
        drm_modeset_drop_locks(&ctx);
        drm_modeset_acquire_fini(&ctx);
index 80a02a4..291734e 100644 (file)
@@ -159,6 +159,8 @@ int drm_dp_bw_code_to_link_rate(u8 link_bw)
 }
 EXPORT_SYMBOL(drm_dp_bw_code_to_link_rate);
 
+#define AUX_RETRY_INTERVAL 500 /* us */
+
 /**
  * DOC: dp helpers
  *
@@ -213,7 +215,7 @@ static int drm_dp_dpcd_access(struct drm_dp_aux *aux, u8 request,
                        return -EIO;
 
                case DP_AUX_NATIVE_REPLY_DEFER:
-                       usleep_range(400, 500);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        break;
                }
        }
@@ -422,6 +424,90 @@ static u32 drm_dp_i2c_functionality(struct i2c_adapter *adapter)
               I2C_FUNC_10BIT_ADDR;
 }
 
+#define AUX_PRECHARGE_LEN 10 /* 10 to 16 */
+#define AUX_SYNC_LEN (16 + 4) /* preamble + AUX_SYNC_END */
+#define AUX_STOP_LEN 4
+#define AUX_CMD_LEN 4
+#define AUX_ADDRESS_LEN 20
+#define AUX_REPLY_PAD_LEN 4
+#define AUX_LENGTH_LEN 8
+
+/*
+ * Calculate the duration of the AUX request/reply in usec. Gives the
+ * "best" case estimate, ie. successful while as short as possible.
+ */
+static int drm_dp_aux_req_duration(const struct drm_dp_aux_msg *msg)
+{
+       int len = AUX_PRECHARGE_LEN + AUX_SYNC_LEN + AUX_STOP_LEN +
+               AUX_CMD_LEN + AUX_ADDRESS_LEN + AUX_LENGTH_LEN;
+
+       if ((msg->request & DP_AUX_I2C_READ) == 0)
+               len += msg->size * 8;
+
+       return len;
+}
+
+static int drm_dp_aux_reply_duration(const struct drm_dp_aux_msg *msg)
+{
+       int len = AUX_PRECHARGE_LEN + AUX_SYNC_LEN + AUX_STOP_LEN +
+               AUX_CMD_LEN + AUX_REPLY_PAD_LEN;
+
+       /*
+        * For read we expect what was asked. For writes there will
+        * be 0 or 1 data bytes. Assume 0 for the "best" case.
+        */
+       if (msg->request & DP_AUX_I2C_READ)
+               len += msg->size * 8;
+
+       return len;
+}
+
+#define I2C_START_LEN 1
+#define I2C_STOP_LEN 1
+#define I2C_ADDR_LEN 9 /* ADDRESS + R/W + ACK/NACK */
+#define I2C_DATA_LEN 9 /* DATA + ACK/NACK */
+
+/*
+ * Calculate the length of the i2c transfer in usec, assuming
+ * the i2c bus speed is as specified. Gives the the "worst"
+ * case estimate, ie. successful while as long as possible.
+ * Doesn't account the the "MOT" bit, and instead assumes each
+ * message includes a START, ADDRESS and STOP. Neither does it
+ * account for additional random variables such as clock stretching.
+ */
+static int drm_dp_i2c_msg_duration(const struct drm_dp_aux_msg *msg,
+                                  int i2c_speed_khz)
+{
+       /* AUX bitrate is 1MHz, i2c bitrate as specified */
+       return DIV_ROUND_UP((I2C_START_LEN + I2C_ADDR_LEN +
+                            msg->size * I2C_DATA_LEN +
+                            I2C_STOP_LEN) * 1000, i2c_speed_khz);
+}
+
+/*
+ * Deterine how many retries should be attempted to successfully transfer
+ * the specified message, based on the estimated durations of the
+ * i2c and AUX transfers.
+ */
+static int drm_dp_i2c_retry_count(const struct drm_dp_aux_msg *msg,
+                             int i2c_speed_khz)
+{
+       int aux_time_us = drm_dp_aux_req_duration(msg) +
+               drm_dp_aux_reply_duration(msg);
+       int i2c_time_us = drm_dp_i2c_msg_duration(msg, i2c_speed_khz);
+
+       return DIV_ROUND_UP(i2c_time_us, aux_time_us + AUX_RETRY_INTERVAL);
+}
+
+/*
+ * FIXME currently assumes 10 kHz as some real world devices seem
+ * to require it. We should query/set the speed via DPCD if supported.
+ */
+static int dp_aux_i2c_speed_khz __read_mostly = 10;
+module_param_unsafe(dp_aux_i2c_speed_khz, int, 0644);
+MODULE_PARM_DESC(dp_aux_i2c_speed_khz,
+                "Assumed speed of the i2c bus in kHz, (1-400, default 10)");
+
 /*
  * Transfer a single I2C-over-AUX message and handle various error conditions,
  * retrying the transaction as appropriate.  It is assumed that the
@@ -434,13 +520,16 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
 {
        unsigned int retry, defer_i2c;
        int ret;
-
        /*
         * DP1.2 sections 2.7.7.1.5.6.1 and 2.7.7.1.6.6.1: A DP Source device
         * is required to retry at least seven times upon receiving AUX_DEFER
         * before giving up the AUX transaction.
+        *
+        * We also try to account for the i2c bus speed.
         */
-       for (retry = 0, defer_i2c = 0; retry < (7 + defer_i2c); retry++) {
+       int max_retries = max(7, drm_dp_i2c_retry_count(msg, dp_aux_i2c_speed_khz));
+
+       for (retry = 0, defer_i2c = 0; retry < (max_retries + defer_i2c); retry++) {
                mutex_lock(&aux->hw_mutex);
                ret = aux->transfer(aux, msg);
                mutex_unlock(&aux->hw_mutex);
@@ -476,7 +565,7 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
                         * For now just defer for long enough to hopefully be
                         * safe for all use-cases.
                         */
-                       usleep_range(500, 600);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        continue;
 
                default:
@@ -506,7 +595,7 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
                        aux->i2c_defer_count++;
                        if (defer_i2c < 7)
                                defer_i2c++;
-                       usleep_range(400, 500);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        continue;
 
                default:
index df0b61a..bd1a415 100644 (file)
@@ -77,6 +77,7 @@ config DRM_EXYNOS_VIDI
 config DRM_EXYNOS_G2D
        bool "Exynos DRM G2D"
        depends on DRM_EXYNOS && !VIDEO_SAMSUNG_S5P_G2D
+       select FRAME_VECTOR
        help
          Choose this option if you want to use Exynos G2D for DRM.
 
index 535b4ad..3734c34 100644 (file)
@@ -194,10 +194,8 @@ struct g2d_cmdlist_userptr {
        dma_addr_t              dma_addr;
        unsigned long           userptr;
        unsigned long           size;
-       struct page             **pages;
-       unsigned int            npages;
+       struct frame_vector     *vec;
        struct sg_table         *sgt;
-       struct vm_area_struct   *vma;
        atomic_t                refcount;
        bool                    in_pool;
        bool                    out_of_list;
@@ -367,6 +365,7 @@ static void g2d_userptr_put_dma_addr(struct drm_device *drm_dev,
 {
        struct g2d_cmdlist_userptr *g2d_userptr =
                                        (struct g2d_cmdlist_userptr *)obj;
+       struct page **pages;
 
        if (!obj)
                return;
@@ -386,19 +385,21 @@ out:
        exynos_gem_unmap_sgt_from_dma(drm_dev, g2d_userptr->sgt,
                                        DMA_BIDIRECTIONAL);
 
-       exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
-                                       g2d_userptr->npages,
-                                       g2d_userptr->vma);
+       pages = frame_vector_pages(g2d_userptr->vec);
+       if (!IS_ERR(pages)) {
+               int i;
 
-       exynos_gem_put_vma(g2d_userptr->vma);
+               for (i = 0; i < frame_vector_count(g2d_userptr->vec); i++)
+                       set_page_dirty_lock(pages[i]);
+       }
+       put_vaddr_frames(g2d_userptr->vec);
+       frame_vector_destroy(g2d_userptr->vec);
 
        if (!g2d_userptr->out_of_list)
                list_del_init(&g2d_userptr->list);
 
        sg_free_table(g2d_userptr->sgt);
        kfree(g2d_userptr->sgt);
-
-       drm_free_large(g2d_userptr->pages);
        kfree(g2d_userptr);
 }
 
@@ -412,9 +413,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
        struct exynos_drm_g2d_private *g2d_priv = file_priv->g2d_priv;
        struct g2d_cmdlist_userptr *g2d_userptr;
        struct g2d_data *g2d;
-       struct page **pages;
        struct sg_table *sgt;
-       struct vm_area_struct *vma;
        unsigned long start, end;
        unsigned int npages, offset;
        int ret;
@@ -460,65 +459,40 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
                return ERR_PTR(-ENOMEM);
 
        atomic_set(&g2d_userptr->refcount, 1);
+       g2d_userptr->size = size;
 
        start = userptr & PAGE_MASK;
        offset = userptr & ~PAGE_MASK;
        end = PAGE_ALIGN(userptr + size);
        npages = (end - start) >> PAGE_SHIFT;
-       g2d_userptr->npages = npages;
-
-       pages = drm_calloc_large(npages, sizeof(struct page *));
-       if (!pages) {
-               DRM_ERROR("failed to allocate pages.\n");
+       g2d_userptr->vec = frame_vector_create(npages);
+       if (!g2d_userptr->vec) {
                ret = -ENOMEM;
                goto err_free;
        }
 
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(current->mm, userptr);
-       if (!vma) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to get vm region.\n");
+       ret = get_vaddr_frames(start, npages, true, true, g2d_userptr->vec);
+       if (ret != npages) {
+               DRM_ERROR("failed to get user pages from userptr.\n");
+               if (ret < 0)
+                       goto err_destroy_framevec;
                ret = -EFAULT;
-               goto err_free_pages;
+               goto err_put_framevec;
        }
-
-       if (vma->vm_end < userptr + size) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("vma is too small.\n");
+       if (frame_vector_to_pages(g2d_userptr->vec) < 0) {
                ret = -EFAULT;
-               goto err_free_pages;
-       }
-
-       g2d_userptr->vma = exynos_gem_get_vma(vma);
-       if (!g2d_userptr->vma) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to copy vma.\n");
-               ret = -ENOMEM;
-               goto err_free_pages;
-       }
-
-       g2d_userptr->size = size;
-
-       ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK,
-                                               npages, pages, vma);
-       if (ret < 0) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to get user pages from userptr.\n");
-               goto err_put_vma;
+               goto err_put_framevec;
        }
 
-       up_read(&current->mm->mmap_sem);
-       g2d_userptr->pages = pages;
-
        sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
        if (!sgt) {
                ret = -ENOMEM;
-               goto err_free_userptr;
+               goto err_put_framevec;
        }
 
-       ret = sg_alloc_table_from_pages(sgt, pages, npages, offset,
-                                       size, GFP_KERNEL);
+       ret = sg_alloc_table_from_pages(sgt,
+                                       frame_vector_pages(g2d_userptr->vec),
+                                       npages, offset, size, GFP_KERNEL);
        if (ret < 0) {
                DRM_ERROR("failed to get sgt from pages.\n");
                goto err_free_sgt;
@@ -553,16 +527,11 @@ err_sg_free_table:
 err_free_sgt:
        kfree(sgt);
 
-err_free_userptr:
-       exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
-                                       g2d_userptr->npages,
-                                       g2d_userptr->vma);
-
-err_put_vma:
-       exynos_gem_put_vma(g2d_userptr->vma);
+err_put_framevec:
+       put_vaddr_frames(g2d_userptr->vec);
 
-err_free_pages:
-       drm_free_large(pages);
+err_destroy_framevec:
+       frame_vector_destroy(g2d_userptr->vec);
 
 err_free:
        kfree(g2d_userptr);
index 62b9ea1..f12fbc3 100644 (file)
@@ -366,103 +366,6 @@ int exynos_drm_gem_get_ioctl(struct drm_device *dev, void *data,
        return 0;
 }
 
-struct vm_area_struct *exynos_gem_get_vma(struct vm_area_struct *vma)
-{
-       struct vm_area_struct *vma_copy;
-
-       vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
-       if (!vma_copy)
-               return NULL;
-
-       if (vma->vm_ops && vma->vm_ops->open)
-               vma->vm_ops->open(vma);
-
-       if (vma->vm_file)
-               get_file(vma->vm_file);
-
-       memcpy(vma_copy, vma, sizeof(*vma));
-
-       vma_copy->vm_mm = NULL;
-       vma_copy->vm_next = NULL;
-       vma_copy->vm_prev = NULL;
-
-       return vma_copy;
-}
-
-void exynos_gem_put_vma(struct vm_area_struct *vma)
-{
-       if (!vma)
-               return;
-
-       if (vma->vm_ops && vma->vm_ops->close)
-               vma->vm_ops->close(vma);
-
-       if (vma->vm_file)
-               fput(vma->vm_file);
-
-       kfree(vma);
-}
-
-int exynos_gem_get_pages_from_userptr(unsigned long start,
-                                               unsigned int npages,
-                                               struct page **pages,
-                                               struct vm_area_struct *vma)
-{
-       int get_npages;
-
-       /* the memory region mmaped with VM_PFNMAP. */
-       if (vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < npages; ++i, start += PAGE_SIZE) {
-                       unsigned long pfn;
-                       int ret = follow_pfn(vma, start, &pfn);
-                       if (ret)
-                               return ret;
-
-                       pages[i] = pfn_to_page(pfn);
-               }
-
-               if (i != npages) {
-                       DRM_ERROR("failed to get user_pages.\n");
-                       return -EINVAL;
-               }
-
-               return 0;
-       }
-
-       get_npages = get_user_pages(current, current->mm, start,
-                                       npages, 1, 1, pages, NULL);
-       get_npages = max(get_npages, 0);
-       if (get_npages != npages) {
-               DRM_ERROR("failed to get user_pages.\n");
-               while (get_npages)
-                       put_page(pages[--get_npages]);
-               return -EFAULT;
-       }
-
-       return 0;
-}
-
-void exynos_gem_put_pages_to_userptr(struct page **pages,
-                                       unsigned int npages,
-                                       struct vm_area_struct *vma)
-{
-       if (!vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < npages; i++) {
-                       set_page_dirty_lock(pages[i]);
-
-                       /*
-                        * undo the reference we took when populating
-                        * the table.
-                        */
-                       put_page(pages[i]);
-               }
-       }
-}
-
 int exynos_gem_map_sgt_with_dma(struct drm_device *drm_dev,
                                struct sg_table *sgt,
                                enum dma_data_direction dir)
index 81adf89..e1db8de 100644 (file)
@@ -1929,6 +1929,8 @@ struct drm_i915_private {
                        struct skl_wm_values skl_hw;
                        struct vlv_wm_values vlv;
                };
+
+               uint8_t max_level;
        } wm;
 
        struct i915_runtime_pm pm;
@@ -3384,13 +3386,13 @@ int intel_freq_opcode(struct drm_i915_private *dev_priv, int val);
 #define I915_READ64(reg)       dev_priv->uncore.funcs.mmio_readq(dev_priv, (reg), true)
 
 #define I915_READ64_2x32(lower_reg, upper_reg) ({                      \
-       u32 upper, lower, tmp;                                          \
-       tmp = I915_READ(upper_reg);                                     \
+       u32 upper, lower, old_upper, loop = 0;                          \
+       upper = I915_READ(upper_reg);                                   \
        do {                                                            \
-               upper = tmp;                                            \
+               old_upper = upper;                                      \
                lower = I915_READ(lower_reg);                           \
-               tmp = I915_READ(upper_reg);                             \
-       } while (upper != tmp);                                         \
+               upper = I915_READ(upper_reg);                           \
+       } while (upper != old_upper && loop++ < 2);                     \
        (u64)upper << 32 | lower; })
 
 #define POSTING_READ(reg)      (void)I915_READ_NOTRACE(reg)
index 923a3c4..a953d49 100644 (file)
@@ -1032,6 +1032,7 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
                u32 old_read = obj->base.read_domains;
                u32 old_write = obj->base.write_domain;
 
+               obj->dirty = 1; /* be paranoid  */
                obj->base.write_domain = obj->base.pending_write_domain;
                if (obj->base.write_domain == 0)
                        obj->base.pending_read_domains |= obj->base.read_domains;
@@ -1039,7 +1040,6 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 
                i915_vma_move_to_active(vma, req);
                if (obj->base.write_domain) {
-                       obj->dirty = 1;
                        i915_gem_request_assign(&obj->last_write_req, req);
 
                        intel_fb_obj_invalidate(obj, ORIGIN_CS);
index b5fb143..5a244ab 100644 (file)
@@ -1558,7 +1558,7 @@ static void i9xx_hpd_irq_handler(struct drm_device *dev)
                u32 hotplug_trigger = hotplug_status & HOTPLUG_INT_STATUS_I915;
 
                intel_get_hpd_pins(&pin_mask, &long_mask, hotplug_trigger,
-                                  hotplug_trigger, hpd_status_g4x,
+                                  hotplug_trigger, hpd_status_i915,
                                   i9xx_port_hotplug_long_detect);
                intel_hpd_irq_handler(dev, pin_mask, long_mask);
        }
index ba1ae03..d0f1b8d 100644 (file)
@@ -350,7 +350,7 @@ static void finish_csr_load(const struct firmware *fw, void *context)
        }
        csr->mmio_count = dmc_header->mmio_count;
        for (i = 0; i < dmc_header->mmio_count; i++) {
-               if (dmc_header->mmioaddr[i] < CSR_MMIO_START_RANGE &&
+               if (dmc_header->mmioaddr[i] < CSR_MMIO_START_RANGE ||
                        dmc_header->mmioaddr[i] > CSR_MMIO_END_RANGE) {
                        DRM_ERROR(" Firmware has wrong mmio address 0x%x\n",
                                                dmc_header->mmioaddr[i]);
index ca9278b..8cc9264 100644 (file)
@@ -6305,7 +6305,7 @@ static void intel_connector_check_state(struct intel_connector *connector)
                      connector->base.name);
 
        if (connector->get_hw_state(connector)) {
-               struct drm_encoder *encoder = &connector->encoder->base;
+               struct intel_encoder *encoder = connector->encoder;
                struct drm_connector_state *conn_state = connector->base.state;
 
                I915_STATE_WARN(!crtc,
@@ -6317,13 +6317,13 @@ static void intel_connector_check_state(struct intel_connector *connector)
                I915_STATE_WARN(!crtc->state->active,
                      "connector is active, but attached crtc isn't\n");
 
-               if (!encoder)
+               if (!encoder || encoder->type == INTEL_OUTPUT_DP_MST)
                        return;
 
-               I915_STATE_WARN(conn_state->best_encoder != encoder,
+               I915_STATE_WARN(conn_state->best_encoder != &encoder->base,
                        "atomic encoder doesn't match attached encoder\n");
 
-               I915_STATE_WARN(conn_state->crtc != encoder->crtc,
+               I915_STATE_WARN(conn_state->crtc != encoder->base.crtc,
                        "attached encoder crtc differs from connector crtc\n");
        } else {
                I915_STATE_WARN(crtc && crtc->state->active,
index 983553c..3e4be5a 100644 (file)
@@ -173,6 +173,11 @@ static void intel_mst_pre_enable_dp(struct intel_encoder *encoder)
                return;
        }
 
+       /* MST encoders are bound to a crtc, not to a connector,
+        * force the mapping here for get_hw_state.
+        */
+       found->encoder = encoder;
+
        DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links);
        intel_mst->port = found->port;
 
@@ -400,7 +405,7 @@ static const struct drm_encoder_funcs intel_dp_mst_enc_funcs = {
 
 static bool intel_dp_mst_get_hw_state(struct intel_connector *connector)
 {
-       if (connector->encoder) {
+       if (connector->encoder && connector->base.state->crtc) {
                enum pipe pipe;
                if (!connector->encoder->get_hw_state(connector->encoder, &pipe))
                        return false;
index 4a601cf..32a6c71 100644 (file)
@@ -1048,11 +1048,7 @@ void intel_dsi_init(struct drm_device *dev)
        intel_connector->unregister = intel_connector_unregister;
 
        /* Pipe A maps to MIPI DSI port A, pipe B maps to MIPI DSI port C */
-       if (dev_priv->vbt.dsi.config->dual_link) {
-               /* XXX: does dual link work on either pipe? */
-               intel_encoder->crtc_mask = (1 << PIPE_A);
-               intel_dsi->ports = ((1 << PORT_A) | (1 << PORT_C));
-       } else if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIA) {
+       if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIA) {
                intel_encoder->crtc_mask = (1 << PIPE_A);
                intel_dsi->ports = (1 << PORT_A);
        } else if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIC) {
@@ -1060,6 +1056,9 @@ void intel_dsi_init(struct drm_device *dev)
                intel_dsi->ports = (1 << PORT_C);
        }
 
+       if (dev_priv->vbt.dsi.config->dual_link)
+               intel_dsi->ports = ((1 << PORT_A) | (1 << PORT_C));
+
        /* Create a DSI host (and a device) for each port. */
        for_each_dsi_port(port, intel_dsi->ports) {
                struct intel_dsi_host *host;
index fff0c22..ddbb7ed 100644 (file)
@@ -955,8 +955,6 @@ enum vlv_wm_level {
        VLV_WM_LEVEL_PM2,
        VLV_WM_LEVEL_PM5,
        VLV_WM_LEVEL_DDR_DVFS,
-       CHV_WM_NUM_LEVELS,
-       VLV_WM_NUM_LEVELS = 1,
 };
 
 /* latency must be in 0.1us units. */
@@ -982,9 +980,13 @@ static void vlv_setup_wm_latency(struct drm_device *dev)
        /* all latencies in usec */
        dev_priv->wm.pri_latency[VLV_WM_LEVEL_PM2] = 3;
 
+       dev_priv->wm.max_level = VLV_WM_LEVEL_PM2;
+
        if (IS_CHERRYVIEW(dev_priv)) {
                dev_priv->wm.pri_latency[VLV_WM_LEVEL_PM5] = 12;
                dev_priv->wm.pri_latency[VLV_WM_LEVEL_DDR_DVFS] = 33;
+
+               dev_priv->wm.max_level = VLV_WM_LEVEL_DDR_DVFS;
        }
 }
 
@@ -1137,10 +1139,7 @@ static void vlv_compute_wm(struct intel_crtc *crtc)
        memset(wm_state, 0, sizeof(*wm_state));
 
        wm_state->cxsr = crtc->pipe != PIPE_C && crtc->wm.cxsr_allowed;
-       if (IS_CHERRYVIEW(dev))
-               wm_state->num_levels = CHV_WM_NUM_LEVELS;
-       else
-               wm_state->num_levels = VLV_WM_NUM_LEVELS;
+       wm_state->num_levels = to_i915(dev)->wm.max_level + 1;
 
        wm_state->num_active_planes = 0;
 
@@ -1220,7 +1219,7 @@ static void vlv_compute_wm(struct intel_crtc *crtc)
        }
 
        /* clear any (partially) filled invalid levels */
-       for (level = wm_state->num_levels; level < CHV_WM_NUM_LEVELS; level++) {
+       for (level = wm_state->num_levels; level < to_i915(dev)->wm.max_level + 1; level++) {
                memset(&wm_state->wm[level], 0, sizeof(wm_state->wm[level]));
                memset(&wm_state->sr[level], 0, sizeof(wm_state->sr[level]));
        }
@@ -1324,10 +1323,7 @@ static void vlv_merge_wm(struct drm_device *dev,
        struct intel_crtc *crtc;
        int num_active_crtcs = 0;
 
-       if (IS_CHERRYVIEW(dev))
-               wm->level = VLV_WM_LEVEL_DDR_DVFS;
-       else
-               wm->level = VLV_WM_LEVEL_PM2;
+       wm->level = to_i915(dev)->wm.max_level;
        wm->cxsr = true;
 
        for_each_intel_crtc(dev, crtc) {
@@ -4083,9 +4079,29 @@ void vlv_wm_get_hw_state(struct drm_device *dev)
                if (val & DSP_MAXFIFO_PM5_ENABLE)
                        wm->level = VLV_WM_LEVEL_PM5;
 
+               /*
+                * If DDR DVFS is disabled in the BIOS, Punit
+                * will never ack the request. So if that happens
+                * assume we don't have to enable/disable DDR DVFS
+                * dynamically. To test that just set the REQ_ACK
+                * bit to poke the Punit, but don't change the
+                * HIGH/LOW bits so that we don't actually change
+                * the current state.
+                */
                val = vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2);
-               if ((val & FORCE_DDR_HIGH_FREQ) == 0)
-                       wm->level = VLV_WM_LEVEL_DDR_DVFS;
+               val |= FORCE_DDR_FREQ_REQ_ACK;
+               vlv_punit_write(dev_priv, PUNIT_REG_DDR_SETUP2, val);
+
+               if (wait_for((vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2) &
+                             FORCE_DDR_FREQ_REQ_ACK) == 0, 3)) {
+                       DRM_DEBUG_KMS("Punit not acking DDR DVFS request, "
+                                     "assuming DDR DVFS is disabled\n");
+                       dev_priv->wm.max_level = VLV_WM_LEVEL_PM5;
+               } else {
+                       val = vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2);
+                       if ((val & FORCE_DDR_HIGH_FREQ) == 0)
+                               wm->level = VLV_WM_LEVEL_DDR_DVFS;
+               }
 
                mutex_unlock(&dev_priv->rps.hw_lock);
        }
index 9dd1cac..e8eb14e 100644 (file)
@@ -689,6 +689,7 @@ nvkm_device_pci_10de_11e3[] = {
 
 static const struct nvkm_device_pci_vendor
 nvkm_device_pci_10de_11fc[] = {
+       { 0x1179, 0x0001, NULL, { .War00C800_0 = true } }, /* Toshiba Tecra W50 */
        { 0x17aa, 0x2211, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */
        { 0x17aa, 0x221e, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */
        {}
index 426ba00..85c5b7f 100644 (file)
@@ -1048,11 +1048,11 @@ nv04_gr_object_bind(struct nvkm_object *object, struct nvkm_gpuobj *parent,
        if (ret == 0) {
                nvkm_kmap(*pgpuobj);
                nvkm_wo32(*pgpuobj, 0x00, object->oclass);
-               nvkm_wo32(*pgpuobj, 0x04, 0x00000000);
-               nvkm_wo32(*pgpuobj, 0x08, 0x00000000);
 #ifdef __BIG_ENDIAN
-               nvkm_mo32(*pgpuobj, 0x08, 0x00080000, 0x00080000);
+               nvkm_mo32(*pgpuobj, 0x00, 0x00080000, 0x00080000);
 #endif
+               nvkm_wo32(*pgpuobj, 0x04, 0x00000000);
+               nvkm_wo32(*pgpuobj, 0x08, 0x00000000);
                nvkm_wo32(*pgpuobj, 0x0c, 0x00000000);
                nvkm_done(*pgpuobj);
        }
index 07feae6..c233e3f 100644 (file)
@@ -326,7 +326,7 @@ gt215_clk_pre(struct nvkm_clk *clk, unsigned long *flags)
                return -EIO;
 
        if (nvkm_msec(device, 2000,
-               u32 tmp = nvkm_rd32(device, 0x002504) & 0x0000003f;
+               u32 tmp = nvkm_rd32(device, 0x00251c) & 0x0000003f;
                if (tmp == 0x0000003f)
                        break;
        ) < 0)
index a8dbb3e..7c6225c 100644 (file)
@@ -160,9 +160,35 @@ static int qxl_add_monitors_config_modes(struct drm_connector *connector,
        *pwidth = head->width;
        *pheight = head->height;
        drm_mode_probed_add(connector, mode);
+       /* remember the last custom size for mode validation */
+       qdev->monitors_config_width = mode->hdisplay;
+       qdev->monitors_config_height = mode->vdisplay;
        return 1;
 }
 
+static struct mode_size {
+       int w;
+       int h;
+} common_modes[] = {
+       { 640,  480},
+       { 720,  480},
+       { 800,  600},
+       { 848,  480},
+       {1024,  768},
+       {1152,  768},
+       {1280,  720},
+       {1280,  800},
+       {1280,  854},
+       {1280,  960},
+       {1280, 1024},
+       {1440,  900},
+       {1400, 1050},
+       {1680, 1050},
+       {1600, 1200},
+       {1920, 1080},
+       {1920, 1200}
+};
+
 static int qxl_add_common_modes(struct drm_connector *connector,
                                 unsigned pwidth,
                                 unsigned pheight)
@@ -170,29 +196,6 @@ static int qxl_add_common_modes(struct drm_connector *connector,
        struct drm_device *dev = connector->dev;
        struct drm_display_mode *mode = NULL;
        int i;
-       struct mode_size {
-               int w;
-               int h;
-       } common_modes[] = {
-               { 640,  480},
-               { 720,  480},
-               { 800,  600},
-               { 848,  480},
-               {1024,  768},
-               {1152,  768},
-               {1280,  720},
-               {1280,  800},
-               {1280,  854},
-               {1280,  960},
-               {1280, 1024},
-               {1440,  900},
-               {1400, 1050},
-               {1680, 1050},
-               {1600, 1200},
-               {1920, 1080},
-               {1920, 1200}
-       };
-
        for (i = 0; i < ARRAY_SIZE(common_modes); i++) {
                mode = drm_cvt_mode(dev, common_modes[i].w, common_modes[i].h,
                                    60, false, false, false);
@@ -823,11 +826,22 @@ static int qxl_conn_get_modes(struct drm_connector *connector)
 static int qxl_conn_mode_valid(struct drm_connector *connector,
                               struct drm_display_mode *mode)
 {
+       struct drm_device *ddev = connector->dev;
+       struct qxl_device *qdev = ddev->dev_private;
+       int i;
+
        /* TODO: is this called for user defined modes? (xrandr --add-mode)
         * TODO: check that the mode fits in the framebuffer */
-       DRM_DEBUG("%s: %dx%d status=%d\n", mode->name, mode->hdisplay,
-                 mode->vdisplay, mode->status);
-       return MODE_OK;
+
+       if(qdev->monitors_config_width == mode->hdisplay &&
+          qdev->monitors_config_height == mode->vdisplay)
+               return MODE_OK;
+
+       for (i = 0; i < ARRAY_SIZE(common_modes); i++) {
+               if (common_modes[i].w == mode->hdisplay && common_modes[i].h == mode->vdisplay)
+                       return MODE_OK;
+       }
+       return MODE_BAD;
 }
 
 static struct drm_encoder *qxl_best_encoder(struct drm_connector *connector)
index d854969..01a8694 100644 (file)
@@ -325,6 +325,8 @@ struct qxl_device {
        struct work_struct fb_work;
 
        struct drm_property *hotplug_mode_update_property;
+       int monitors_config_width;
+       int monitors_config_height;
 };
 
 /* forward declaration for QXL_INFO_IO */
index 6394547..860062e 100644 (file)
@@ -125,7 +125,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
 }
 
-static struct vm_operations_struct vgem_gem_vm_ops = {
+static const struct vm_operations_struct vgem_gem_vm_ops = {
        .fault = vgem_gem_fault,
        .open = drm_gem_vm_open,
        .close = drm_gem_vm_close,
index d04643f..95638df 100644 (file)
@@ -1110,7 +1110,7 @@ static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct cs_char_vm_ops = {
+static const struct vm_operations_struct cs_char_vm_ops = {
        .fault  = cs_char_vma_fault,
 };
 
index 500b262..e13c902 100644 (file)
@@ -1140,8 +1140,8 @@ config SENSORS_NCT6775
        help
          If you say yes here you get support for the hardware monitoring
          functionality of the Nuvoton NCT6106D, NCT6775F, NCT6776F, NCT6779D,
-         NCT6791D, NCT6792D and compatible Super-I/O chips. This driver
-         replaces the w83627ehf driver for NCT6775F and NCT6776F.
+         NCT6791D, NCT6792D, NCT6793D, and compatible Super-I/O chips. This
+         driver replaces the w83627ehf driver for NCT6775F and NCT6776F.
 
          This driver can also be built as a module.  If so, the module
          will be called nct6775.
index fe41d5a..e4e57bb 100644 (file)
@@ -104,7 +104,7 @@ static inline long lm75_reg_to_mc(s16 temp, u8 resolution)
 
 /* sysfs attributes for hwmon */
 
-static int lm75_read_temp(void *dev, long *temp)
+static int lm75_read_temp(void *dev, int *temp)
 {
        struct lm75_data *data = lm75_update_device(dev);
 
index bd1c99d..8b4fa55 100644 (file)
@@ -39,6 +39,7 @@
  * nct6779d    15      5       5       2+6    0xc560 0xc1    0x5ca3
  * nct6791d    15      6       6       2+6    0xc800 0xc1    0x5ca3
  * nct6792d    15      6       6       2+6    0xc910 0xc1    0x5ca3
+ * nct6793d    15      6       6       2+6    0xd120 0xc1    0x5ca3
  *
  * #temp lists the number of monitored temperature sources (first value) plus
  * the number of directly connectable temperature sensors (second value).
@@ -63,7 +64,7 @@
 
 #define USE_ALTERNATE
 
-enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792 };
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793 };
 
 /* used to set data->name = nct6775_device_names[data->sio_kind] */
 static const char * const nct6775_device_names[] = {
@@ -73,6 +74,17 @@ static const char * const nct6775_device_names[] = {
        "nct6779",
        "nct6791",
        "nct6792",
+       "nct6793",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+       "NCT6106D",
+       "NCT6775F",
+       "NCT6776D/F",
+       "NCT6779D",
+       "NCT6791D",
+       "NCT6792D",
+       "NCT6793D",
 };
 
 static unsigned short force_id;
@@ -104,6 +116,7 @@ MODULE_PARM_DESC(fan_debounce, "Enable debouncing for fan RPM signal");
 #define SIO_NCT6779_ID         0xc560
 #define SIO_NCT6791_ID         0xc800
 #define SIO_NCT6792_ID         0xc910
+#define SIO_NCT6793_ID         0xd120
 #define SIO_ID_MASK            0xFFF0
 
 enum pwm_enable { off, manual, thermal_cruise, speed_cruise, sf3, sf4 };
@@ -354,6 +367,10 @@ static const u16 NCT6775_REG_TEMP_CRIT[ARRAY_SIZE(nct6775_temp_label) - 1]
 
 /* NCT6776 specific data */
 
+/* STEP_UP_TIME and STEP_DOWN_TIME regs are swapped for all chips but NCT6775 */
+#define NCT6776_REG_FAN_STEP_UP_TIME NCT6775_REG_FAN_STEP_DOWN_TIME
+#define NCT6776_REG_FAN_STEP_DOWN_TIME NCT6775_REG_FAN_STEP_UP_TIME
+
 static const s8 NCT6776_ALARM_BITS[] = {
        0, 1, 2, 3, 8, 21, 20, 16,      /* in0.. in7 */
        17, -1, -1, -1, -1, -1, -1,     /* in8..in14 */
@@ -533,7 +550,7 @@ static const s8 NCT6791_ALARM_BITS[] = {
        4, 5, 13, -1, -1, -1,           /* temp1..temp6 */
        12, 9 };                        /* intrusion0, intrusion1 */
 
-/* NCT6792 specific data */
+/* NCT6792/NCT6793 specific data */
 
 static const u16 NCT6792_REG_TEMP_MON[] = {
        0x73, 0x75, 0x77, 0x79, 0x7b, 0x7d };
@@ -1056,6 +1073,7 @@ static bool is_word_sized(struct nct6775_data *data, u16 reg)
        case nct6779:
        case nct6791:
        case nct6792:
+       case nct6793:
                return reg == 0x150 || reg == 0x153 || reg == 0x155 ||
                  ((reg & 0xfff0) == 0x4b0 && (reg & 0x000f) < 0x0b) ||
                  reg == 0x402 ||
@@ -1407,6 +1425,7 @@ static void nct6775_update_pwm_limits(struct device *dev)
                case nct6779:
                case nct6791:
                case nct6792:
+               case nct6793:
                        reg = nct6775_read_value(data,
                                        data->REG_CRITICAL_PWM_ENABLE[i]);
                        if (reg & data->CRITICAL_PWM_ENABLE_MASK)
@@ -2822,6 +2841,7 @@ store_auto_pwm(struct device *dev, struct device_attribute *attr,
                case nct6779:
                case nct6791:
                case nct6792:
+               case nct6793:
                        nct6775_write_value(data, data->REG_CRITICAL_PWM[nr],
                                            val);
                        reg = nct6775_read_value(data,
@@ -3256,7 +3276,7 @@ nct6775_check_fan_inputs(struct nct6775_data *data)
                pwm4pin = false;
                pwm5pin = false;
                pwm6pin = false;
-       } else {        /* NCT6779D, NCT6791D, or NCT6792D */
+       } else {        /* NCT6779D, NCT6791D, NCT6792D, or NCT6793D */
                regval = superio_inb(sioreg, 0x1c);
 
                fan3pin = !(regval & (1 << 5));
@@ -3269,7 +3289,8 @@ nct6775_check_fan_inputs(struct nct6775_data *data)
 
                fan4min = fan4pin;
 
-               if (data->kind == nct6791 || data->kind == nct6792) {
+               if (data->kind == nct6791 || data->kind == nct6792 ||
+                   data->kind == nct6793) {
                        regval = superio_inb(sioreg, 0x2d);
                        fan6pin = (regval & (1 << 1));
                        pwm6pin = (regval & (1 << 0));
@@ -3528,8 +3549,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6776_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3600,8 +3621,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6779_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3643,6 +3664,7 @@ static int nct6775_probe(struct platform_device *pdev)
                break;
        case nct6791:
        case nct6792:
+       case nct6793:
                data->in_num = 15;
                data->pwm_num = 6;
                data->auto_pwm_num = 4;
@@ -3677,8 +3699,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6779_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3918,6 +3940,7 @@ static int nct6775_probe(struct platform_device *pdev)
        case nct6779:
        case nct6791:
        case nct6792:
+       case nct6793:
                break;
        }
 
@@ -3950,6 +3973,7 @@ static int nct6775_probe(struct platform_device *pdev)
                        break;
                case nct6791:
                case nct6792:
+               case nct6793:
                        tmp |= 0x7e;
                        break;
                }
@@ -4047,7 +4071,8 @@ static int __maybe_unused nct6775_resume(struct device *dev)
        if (reg != data->sio_reg_enable)
                superio_outb(sioreg, SIO_REG_ENABLE, data->sio_reg_enable);
 
-       if (data->kind == nct6791 || data->kind == nct6792)
+       if (data->kind == nct6791 || data->kind == nct6792 ||
+           data->kind == nct6793)
                nct6791_enable_io_mapping(sioreg);
 
        superio_exit(sioreg);
@@ -4106,15 +4131,6 @@ static struct platform_driver nct6775_driver = {
        .probe          = nct6775_probe,
 };
 
-static const char * const nct6775_sio_names[] __initconst = {
-       "NCT6106D",
-       "NCT6775F",
-       "NCT6776D/F",
-       "NCT6779D",
-       "NCT6791D",
-       "NCT6792D",
-};
-
 /* nct6775_find() looks for a '627 in the Super-I/O config space */
 static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
 {
@@ -4150,6 +4166,9 @@ static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
        case SIO_NCT6792_ID:
                sio_data->kind = nct6792;
                break;
+       case SIO_NCT6793_ID:
+               sio_data->kind = nct6793;
+               break;
        default:
                if (val != 0xffff)
                        pr_debug("unsupported chip ID: 0x%04x\n", val);
@@ -4175,7 +4194,8 @@ static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
                superio_outb(sioaddr, SIO_REG_ENABLE, val | 0x01);
        }
 
-       if (sio_data->kind == nct6791 || sio_data->kind == nct6792)
+       if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+           sio_data->kind == nct6793)
                nct6791_enable_io_mapping(sioaddr);
 
        superio_exit(sioaddr);
@@ -4285,7 +4305,7 @@ static void __exit sensors_nct6775_exit(void)
 }
 
 MODULE_AUTHOR("Guenter Roeck <linux@roeck-us.net>");
-MODULE_DESCRIPTION("NCT6106D/NCT6775F/NCT6776F/NCT6779D/NCT6791D/NCT6792D driver");
+MODULE_DESCRIPTION("Driver for NCT6775F and compatible chips");
 MODULE_LICENSE("GPL");
 
 module_init(sensors_nct6775_init);
index dc0b76c..feed306 100644 (file)
@@ -477,7 +477,7 @@ static int ntc_thermistor_get_ohm(struct ntc_data *data)
        return -EINVAL;
 }
 
-static int ntc_read_temp(void *dev, long *temp)
+static int ntc_read_temp(void *dev, int *temp)
 {
        struct ntc_data *data = dev_get_drvdata(dev);
        int ohm;
index 9da2735..6548262 100644 (file)
@@ -98,7 +98,7 @@ static struct tmp102 *tmp102_update_device(struct device *dev)
        return tmp102;
 }
 
-static int tmp102_read_temp(void *dev, long *temp)
+static int tmp102_read_temp(void *dev, int *temp)
 {
        struct tmp102 *tmp102 = tmp102_update_device(dev);
 
index da4c697..aa26f3c 100644 (file)
@@ -56,7 +56,6 @@ config INFINIBAND_ADDR_TRANS
 
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
-source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
index 1bdb999..aded2a5 100644 (file)
@@ -1,6 +1,5 @@
 obj-$(CONFIG_INFINIBAND_MTHCA)         += mthca/
 obj-$(CONFIG_INFINIBAND_QIB)           += qib/
-obj-$(CONFIG_INFINIBAND_EHCA)          += ehca/
 obj-$(CONFIG_INFINIBAND_CXGB3)         += cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)         += cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)          += mlx4/
diff --git a/drivers/infiniband/hw/ehca/Kconfig b/drivers/infiniband/hw/ehca/Kconfig
deleted file mode 100644 (file)
index 59f807d..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-config INFINIBAND_EHCA
-       tristate "eHCA support"
-       depends on IBMEBUS
-       ---help---
-       This driver supports the IBM pSeries eHCA InfiniBand adapter.
-
-       To compile the driver as a module, choose M here. The module
-       will be called ib_ehca.
-
diff --git a/drivers/infiniband/hw/ehca/Makefile b/drivers/infiniband/hw/ehca/Makefile
deleted file mode 100644 (file)
index 74d284e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#  Authors: Heiko J Schick <schickhj@de.ibm.com>
-#           Christoph Raisch <raisch@de.ibm.com>
-#           Joachim Fenkes <fenkes@de.ibm.com>
-#
-#  Copyright (c) 2005 IBM Corporation
-#
-#  All rights reserved.
-#
-#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
-
-obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
-
-ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
-               ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
-               ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
-
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
deleted file mode 100644 (file)
index 4659263..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  address vector functions
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *av_cache;
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd)
-{
-       int path = ib_rate_to_mult(path_rate);
-       int link, ret;
-       struct ib_port_attr pa;
-
-       if (path_rate == IB_RATE_PORT_CURRENT) {
-               *ipd = 0;
-               return 0;
-       }
-
-       if (unlikely(path < 0)) {
-               ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
-                        path_rate);
-               return -EINVAL;
-       }
-
-       ret = ehca_query_port(&shca->ib_device, port, &pa);
-       if (unlikely(ret < 0)) {
-               ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
-               return ret;
-       }
-
-       link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
-
-       if (path >= link)
-               /* no need to throttle if path faster than link */
-               *ipd = 0;
-       else
-               /* IPD = round((link / path) - 1) */
-               *ipd = ((link + (path >> 1)) / path) - 1;
-
-       return 0;
-}
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-       int ret;
-       struct ehca_av *av;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-
-       av = kmem_cache_alloc(av_cache, GFP_KERNEL);
-       if (!av) {
-               ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
-                        pd, ah_attr);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       av->av.sl = ah_attr->sl;
-       av->av.dlid = ah_attr->dlid;
-       av->av.slid_path_bits = ah_attr->src_path_bits;
-
-       if (ehca_static_rate < 0) {
-               u32 ipd;
-               if (ehca_calc_ipd(shca, ah_attr->port_num,
-                                 ah_attr->static_rate, &ipd)) {
-                       ret = -EINVAL;
-                       goto create_ah_exit1;
-               }
-               av->av.ipd = ipd;
-       } else
-               av->av.ipd = ehca_static_rate;
-
-       av->av.lnh = ah_attr->ah_flags;
-       av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                           ah_attr->grh.traffic_class);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                           ah_attr->grh.flow_label);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                           ah_attr->grh.hop_limit);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(pd->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(pd->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
-       }
-       av->av.pmtu = shca->max_mtu;
-
-       /* dgid comes in grh.word_3 */
-       memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       return &av->ib_ah;
-
-create_ah_exit1:
-       kmem_cache_free(av_cache, av);
-
-       return ERR_PTR(ret);
-}
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av;
-       struct ehca_ud_av new_ehca_av;
-       struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
-                                             ib_device);
-
-       memset(&new_ehca_av, 0, sizeof(new_ehca_av));
-       new_ehca_av.sl = ah_attr->sl;
-       new_ehca_av.dlid = ah_attr->dlid;
-       new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
-       new_ehca_av.ipd = ah_attr->static_rate;
-       new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
-                                        (ah_attr->ah_flags & IB_AH_GRH) > 0);
-       new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                               ah_attr->grh.traffic_class);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                                ah_attr->grh.flow_label);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                                ah_attr->grh.hop_limit);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
-
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(ah->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ehca_err(ah->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x",
-                                rc, ah, ah_attr, ah_attr->port_num);
-                       return -EINVAL;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(ah->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ehca_err(ah->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x "
-                                "sgid_index=%x",
-                                rc, ah, ah_attr, ah_attr->port_num,
-                                ah_attr->grh.sgid_index);
-                       return -EINVAL;
-               }
-               memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
-       }
-
-       new_ehca_av.pmtu = shca->max_mtu;
-
-       memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       av = container_of(ah, struct ehca_av, ib_ah);
-       av->av = new_ehca_av;
-
-       return 0;
-}
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
-
-       memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
-              sizeof(ah_attr->grh.dgid));
-       ah_attr->sl = av->av.sl;
-
-       ah_attr->dlid = av->av.dlid;
-
-       ah_attr->src_path_bits = av->av.slid_path_bits;
-       ah_attr->static_rate = av->av.ipd;
-       ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
-       ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
-                                                   av->av.grh.word_0);
-       ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
-                                               av->av.grh.word_0);
-       ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
-                                                av->av.grh.word_0);
-
-       return 0;
-}
-
-int ehca_destroy_ah(struct ib_ah *ah)
-{
-       kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
-
-       return 0;
-}
-
-int ehca_init_av_cache(void)
-{
-       av_cache = kmem_cache_create("ehca_cache_av",
-                                  sizeof(struct ehca_av), 0,
-                                  SLAB_HWCACHE_ALIGN,
-                                  NULL);
-       if (!av_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_av_cache(void)
-{
-       if (av_cache)
-               kmem_cache_destroy(av_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
deleted file mode 100644 (file)
index bd45e0f..0000000
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Struct definition for eHCA internal structures
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_H__
-#define __EHCA_CLASSES_H__
-
-struct ehca_module;
-struct ehca_qp;
-struct ehca_cq;
-struct ehca_eq;
-struct ehca_mr;
-struct ehca_mw;
-struct ehca_pd;
-struct ehca_av;
-
-#include <linux/wait.h>
-#include <linux/mutex.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_user_verbs.h>
-
-#ifdef CONFIG_PPC64
-#include "ehca_classes_pSeries.h"
-#endif
-#include "ipz_pt_fn.h"
-#include "ehca_qes.h"
-#include "ehca_irq.h"
-
-#define EHCA_EQE_CACHE_SIZE 20
-#define EHCA_MAX_NUM_QUEUES 0xffff
-
-struct ehca_eqe_cache_entry {
-       struct ehca_eqe *eqe;
-       struct ehca_cq *cq;
-};
-
-struct ehca_eq {
-       u32 length;
-       struct ipz_queue ipz_queue;
-       struct ipz_eq_handle ipz_eq_handle;
-       struct work_struct work;
-       struct h_galpas galpas;
-       int is_initialized;
-       struct ehca_pfeq pf;
-       spinlock_t spinlock;
-       struct tasklet_struct interrupt_task;
-       u32 ist;
-       spinlock_t irq_spinlock;
-       struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
-};
-
-struct ehca_sma_attr {
-       u16 lid, lmc, sm_sl, sm_lid;
-       u16 pkey_tbl_len, pkeys[16];
-};
-
-struct ehca_sport {
-       struct ib_cq *ibcq_aqp1;
-       struct ib_qp *ibqp_sqp[2];
-       /* lock to serialze modify_qp() calls for sqp in normal
-        * and irq path (when event PORT_ACTIVE is received first time)
-        */
-       spinlock_t mod_sqp_lock;
-       enum ib_port_state port_state;
-       struct ehca_sma_attr saved_attr;
-       u32 pma_qp_nr;
-};
-
-#define HCA_CAP_MR_PGSIZE_4K  0x80000000
-#define HCA_CAP_MR_PGSIZE_64K 0x40000000
-#define HCA_CAP_MR_PGSIZE_1M  0x20000000
-#define HCA_CAP_MR_PGSIZE_16M 0x10000000
-
-struct ehca_shca {
-       struct ib_device ib_device;
-       struct platform_device *ofdev;
-       u8 num_ports;
-       int hw_level;
-       struct list_head shca_list;
-       struct ipz_adapter_handle ipz_hca_handle;
-       struct ehca_sport sport[2];
-       struct ehca_eq eq;
-       struct ehca_eq neq;
-       struct ehca_mr *maxmr;
-       struct ehca_pd *pd;
-       struct h_galpas galpas;
-       struct mutex modify_mutex;
-       u64 hca_cap;
-       /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
-       u32 hca_cap_mr_pgsize;
-       int max_mtu;
-       int max_num_qps;
-       int max_num_cqs;
-       atomic_t num_cqs;
-       atomic_t num_qps;
-};
-
-struct ehca_pd {
-       struct ib_pd ib_pd;
-       struct ipz_pd fw_pd;
-       /* small queue mgmt */
-       struct mutex lock;
-       struct list_head free[2];
-       struct list_head full[2];
-};
-
-enum ehca_ext_qp_type {
-       EQPT_NORMAL    = 0,
-       EQPT_LLQP      = 1,
-       EQPT_SRQBASE   = 2,
-       EQPT_SRQ       = 3,
-};
-
-/* struct to cache modify_qp()'s parms for GSI/SMI qp */
-struct ehca_mod_qp_parm {
-       int mask;
-       struct ib_qp_attr attr;
-};
-
-#define EHCA_MOD_QP_PARM_MAX 4
-
-#define QMAP_IDX_MASK 0xFFFFULL
-
-/* struct for tracking if cqes have been reported to the application */
-struct ehca_qmap_entry {
-       u16 app_wr_id;
-       u8 reported;
-       u8 cqe_req;
-};
-
-struct ehca_queue_map {
-       struct ehca_qmap_entry *map;
-       unsigned int entries;
-       unsigned int tail;
-       unsigned int left_to_poll;
-       unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
-};
-
-/* function to calculate the next index for the qmap */
-static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
-{
-       unsigned int temp = cur_index + 1;
-       return (temp == limit) ? 0 : temp;
-}
-
-struct ehca_qp {
-       union {
-               struct ib_qp ib_qp;
-               struct ib_srq ib_srq;
-       };
-       u32 qp_type;
-       enum ehca_ext_qp_type ext_type;
-       enum ib_qp_state state;
-       struct ipz_queue ipz_squeue;
-       struct ehca_queue_map sq_map;
-       struct ipz_queue ipz_rqueue;
-       struct ehca_queue_map rq_map;
-       struct h_galpas galpas;
-       u32 qkey;
-       u32 real_qp_num;
-       u32 token;
-       spinlock_t spinlock_s;
-       spinlock_t spinlock_r;
-       u32 sq_max_inline_data_size;
-       struct ipz_qp_handle ipz_qp_handle;
-       struct ehca_pfqp pf;
-       struct ib_qp_init_attr init_attr;
-       struct ehca_cq *send_cq;
-       struct ehca_cq *recv_cq;
-       unsigned int sqerr_purgeflag;
-       struct hlist_node list_entries;
-       /* array to cache modify_qp()'s parms for GSI/SMI qp */
-       struct ehca_mod_qp_parm *mod_qp_parm;
-       int mod_qp_parm_idx;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_squeue;
-       u32 mm_count_rqueue;
-       u32 mm_count_galpa;
-       /* unsolicited ack circumvention */
-       int unsol_ack_circ;
-       int mtu_shift;
-       u32 message_count;
-       u32 packet_count;
-       atomic_t nr_events; /* events seen */
-       wait_queue_head_t wait_completion;
-       int mig_armed;
-       struct list_head sq_err_node;
-       struct list_head rq_err_node;
-};
-
-#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
-#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
-#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
-
-/* must be power of 2 */
-#define QP_HASHTAB_LEN 8
-
-struct ehca_cq {
-       struct ib_cq ib_cq;
-       struct ipz_queue ipz_queue;
-       struct h_galpas galpas;
-       spinlock_t spinlock;
-       u32 cq_number;
-       u32 token;
-       u32 nr_of_entries;
-       struct ipz_cq_handle ipz_cq_handle;
-       struct ehca_pfcq pf;
-       spinlock_t cb_lock;
-       struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
-       struct list_head entry;
-       u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
-       atomic_t nr_events; /* #events seen */
-       wait_queue_head_t wait_completion;
-       spinlock_t task_lock;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_queue;
-       u32 mm_count_galpa;
-       struct list_head sqp_err_list;
-       struct list_head rqp_err_list;
-};
-
-enum ehca_mr_flag {
-       EHCA_MR_FLAG_FMR = 0x80000000,   /* FMR, created with ehca_alloc_fmr */
-       EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
-};
-
-struct ehca_mr {
-       union {
-               struct ib_mr ib_mr;     /* must always be first in ehca_mr */
-               struct ib_fmr ib_fmr;   /* must always be first in ehca_mr */
-       } ib;
-       struct ib_umem *umem;
-       spinlock_t mrlock;
-
-       enum ehca_mr_flag flags;
-       u32 num_kpages;         /* number of kernel pages */
-       u32 num_hwpages;        /* number of hw pages to form MR */
-       u64 hwpage_size;        /* hw page size used for this MR */
-       int acl;                /* ACL (stored here for usage in reregister) */
-       u64 *start;             /* virtual start address (stored here for */
-                               /* usage in reregister) */
-       u64 size;               /* size (stored here for usage in reregister) */
-       u32 fmr_page_size;      /* page size for FMR */
-       u32 fmr_max_pages;      /* max pages for FMR */
-       u32 fmr_max_maps;       /* max outstanding maps for FMR */
-       u32 fmr_map_cnt;        /* map counter for FMR */
-       /* fw specific data */
-       struct ipz_mrmw_handle ipz_mr_handle;   /* MR handle for h-calls */
-       struct h_galpas galpas;
-};
-
-struct ehca_mw {
-       struct ib_mw ib_mw;     /* gen2 mw, must always be first in ehca_mw */
-       spinlock_t mwlock;
-
-       u8 never_bound;         /* indication MW was never bound */
-       struct ipz_mrmw_handle ipz_mw_handle;   /* MW handle for h-calls */
-       struct h_galpas galpas;
-};
-
-enum ehca_mr_pgi_type {
-       EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
-                                 * ehca_rereg_phys_mr,
-                                 * ehca_reg_internal_maxmr */
-       EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
-       EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
-};
-
-struct ehca_mr_pginfo {
-       enum ehca_mr_pgi_type type;
-       u64 num_kpages;
-       u64 kpage_cnt;
-       u64 hwpage_size;     /* hw page size used for this MR */
-       u64 num_hwpages;     /* number of hw pages */
-       u64 hwpage_cnt;      /* counter for hw pages */
-       u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
-
-       union {
-               struct { /* type EHCA_MR_PGI_PHYS section */
-                       int num_phys_buf;
-                       struct ib_phys_buf *phys_buf_array;
-                       u64 next_buf;
-               } phy;
-               struct { /* type EHCA_MR_PGI_USER section */
-                       struct ib_umem *region;
-                       struct scatterlist *next_sg;
-                       u64 next_nmap;
-               } usr;
-               struct { /* type EHCA_MR_PGI_FMR section */
-                       u64 fmr_pgsize;
-                       u64 *page_list;
-                       u64 next_listelem;
-               } fmr;
-       } u;
-};
-
-/* output parameters for MR/FMR hipz calls */
-struct ehca_mr_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 lkey;
-       u32 rkey;
-       u64 len;
-       u64 vaddr;
-       u32 acl;
-};
-
-/* output parameters for MW hipz calls */
-struct ehca_mw_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 rkey;
-};
-
-struct ehca_av {
-       struct ib_ah ib_ah;
-       struct ehca_ud_av av;
-};
-
-struct ehca_ucontext {
-       struct ib_ucontext ib_ucontext;
-};
-
-int ehca_init_pd_cache(void);
-void ehca_cleanup_pd_cache(void);
-int ehca_init_cq_cache(void);
-void ehca_cleanup_cq_cache(void);
-int ehca_init_qp_cache(void);
-void ehca_cleanup_qp_cache(void);
-int ehca_init_av_cache(void);
-void ehca_cleanup_av_cache(void);
-int ehca_init_mrmw_cache(void);
-void ehca_cleanup_mrmw_cache(void);
-int ehca_init_small_qp_cache(void);
-void ehca_cleanup_small_qp_cache(void);
-
-extern rwlock_t ehca_qp_idr_lock;
-extern rwlock_t ehca_cq_idr_lock;
-extern struct idr ehca_qp_idr;
-extern struct idr ehca_cq_idr;
-extern spinlock_t shca_list_lock;
-
-extern int ehca_static_rate;
-extern int ehca_port_act_time;
-extern bool ehca_use_hp_mr;
-extern bool ehca_scaling_code;
-extern int ehca_lock_hcalls;
-extern int ehca_nr_ports;
-extern int ehca_max_cq;
-extern int ehca_max_qp;
-
-struct ipzu_queue_resp {
-       u32 qe_size;      /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length; /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;
-       u32 offset; /* save offset within a page for small_qp */
-};
-
-struct ehca_create_cq_resp {
-       u32 cq_number;
-       u32 token;
-       struct ipzu_queue_resp ipz_queue;
-       u32 fw_handle_ofs;
-       u32 dummy;
-};
-
-struct ehca_create_qp_resp {
-       u32 qp_num;
-       u32 token;
-       u32 qp_type;
-       u32 ext_type;
-       u32 qkey;
-       /* qp_num assigned by ehca: sqp0/1 may have got different numbers */
-       u32 real_qp_num;
-       u32 fw_handle_ofs;
-       u32 dummy;
-       struct ipzu_queue_resp ipz_squeue;
-       struct ipzu_queue_resp ipz_rqueue;
-};
-
-struct ehca_alloc_cq_parms {
-       u32 nr_cqe;
-       u32 act_nr_of_entries;
-       u32 act_pages;
-       struct ipz_eq_handle eq_handle;
-};
-
-enum ehca_service_type {
-       ST_RC  = 0,
-       ST_UC  = 1,
-       ST_RD  = 2,
-       ST_UD  = 3,
-};
-
-enum ehca_ll_comp_flags {
-       LLQP_SEND_COMP = 0x20,
-       LLQP_RECV_COMP = 0x40,
-       LLQP_COMP_MASK = 0x60,
-};
-
-struct ehca_alloc_queue_parms {
-       /* input parameters */
-       int max_wr;
-       int max_sge;
-       int page_size;
-       int is_small;
-
-       /* output parameters */
-       u16 act_nr_wqes;
-       u8  act_nr_sges;
-       u32 queue_size; /* bytes for small queues, pages otherwise */
-};
-
-struct ehca_alloc_qp_parms {
-       struct ehca_alloc_queue_parms squeue;
-       struct ehca_alloc_queue_parms rqueue;
-
-       /* input parameters */
-       enum ehca_service_type servicetype;
-       int qp_storage;
-       int sigtype;
-       enum ehca_ext_qp_type ext_type;
-       enum ehca_ll_comp_flags ll_comp_flags;
-       int ud_av_l_key_ctl;
-
-       u32 token;
-       struct ipz_eq_handle eq_handle;
-       struct ipz_pd pd;
-       struct ipz_cq_handle send_cq_handle, recv_cq_handle;
-
-       u32 srq_qpn, srq_token, srq_limit;
-
-       /* output parameters */
-       u32 real_qp_num;
-       struct ipz_qp_handle qp_handle;
-       struct h_galpas galpas;
-};
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
deleted file mode 100644 (file)
index 689c357..0000000
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  pSeries interface definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_PSERIES_H__
-#define __EHCA_CLASSES_PSERIES_H__
-
-#include "hcp_phyp.h"
-#include "ipz_pt_fn.h"
-
-
-struct ehca_pfqp {
-       struct ipz_qpt sqpt;
-       struct ipz_qpt rqpt;
-};
-
-struct ehca_pfcq {
-       struct ipz_qpt qpt;
-       u32 cqnr;
-};
-
-struct ehca_pfeq {
-       struct ipz_qpt qpt;
-       struct h_galpa galpa;
-       u32 eqnr;
-};
-
-struct ipz_adapter_handle {
-       u64 handle;
-};
-
-struct ipz_cq_handle {
-       u64 handle;
-};
-
-struct ipz_eq_handle {
-       u64 handle;
-};
-
-struct ipz_qp_handle {
-       u64 handle;
-};
-struct ipz_mrmw_handle {
-       u64 handle;
-};
-
-struct ipz_pd {
-       u32 value;
-};
-
-struct hcp_modify_qp_control_block {
-       u32 qkey;                      /* 00 */
-       u32 rdd;                       /* reliable datagram domain */
-       u32 send_psn;                  /* 02 */
-       u32 receive_psn;               /* 03 */
-       u32 prim_phys_port;            /* 04 */
-       u32 alt_phys_port;             /* 05 */
-       u32 prim_p_key_idx;            /* 06 */
-       u32 alt_p_key_idx;             /* 07 */
-       u32 rdma_atomic_ctrl;          /* 08 */
-       u32 qp_state;                  /* 09 */
-       u32 reserved_10;               /* 10 */
-       u32 rdma_nr_atomic_resp_res;   /* 11 */
-       u32 path_migration_state;      /* 12 */
-       u32 rdma_atomic_outst_dest_qp; /* 13 */
-       u32 dest_qp_nr;                /* 14 */
-       u32 min_rnr_nak_timer_field;   /* 15 */
-       u32 service_level;             /* 16 */
-       u32 send_grh_flag;             /* 17 */
-       u32 retry_count;               /* 18 */
-       u32 timeout;                   /* 19 */
-       u32 path_mtu;                  /* 20 */
-       u32 max_static_rate;           /* 21 */
-       u32 dlid;                      /* 22 */
-       u32 rnr_retry_count;           /* 23 */
-       u32 source_path_bits;          /* 24 */
-       u32 traffic_class;             /* 25 */
-       u32 hop_limit;                 /* 26 */
-       u32 source_gid_idx;            /* 27 */
-       u32 flow_label;                /* 28 */
-       u32 reserved_29;               /* 29 */
-       union {                        /* 30 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid;
-       u32 service_level_al;          /* 34 */
-       u32 send_grh_flag_al;          /* 35 */
-       u32 retry_count_al;            /* 36 */
-       u32 timeout_al;                /* 37 */
-       u32 max_static_rate_al;        /* 38 */
-       u32 dlid_al;                   /* 39 */
-       u32 rnr_retry_count_al;        /* 40 */
-       u32 source_path_bits_al;       /* 41 */
-       u32 traffic_class_al;          /* 42 */
-       u32 hop_limit_al;              /* 43 */
-       u32 source_gid_idx_al;         /* 44 */
-       u32 flow_label_al;             /* 45 */
-       u32 reserved_46;               /* 46 */
-       u32 reserved_47;               /* 47 */
-       union {                        /* 48 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid_al;
-       u32 max_nr_outst_send_wr;      /* 52 */
-       u32 max_nr_outst_recv_wr;      /* 53 */
-       u32 disable_ete_credit_check;  /* 54 */
-       u32 qp_number;                 /* 55 */
-       u64 send_queue_handle;         /* 56 */
-       u64 recv_queue_handle;         /* 58 */
-       u32 actual_nr_sges_in_sq_wqe;  /* 60 */
-       u32 actual_nr_sges_in_rq_wqe;  /* 61 */
-       u32 qp_enable;                 /* 62 */
-       u32 curr_srq_limit;            /* 63 */
-       u64 qp_aff_asyn_ev_log_reg;    /* 64 */
-       u64 shared_rq_hndl;            /* 66 */
-       u64 trigg_doorbell_qp_hndl;    /* 68 */
-       u32 reserved_70_127[58];       /* 70 */
-};
-
-#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
-#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
-#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
-#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
-#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
-#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
-#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
-#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
-#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
-#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
-#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
-#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
-#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
-#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
-#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
-#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
-#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
-#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
-#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
-#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
-#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
-#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
-#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
-#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
-#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
-#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
-#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
-#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
-#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
-#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
-#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
-#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
-#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
-#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
-#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
-#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
-#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
-#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
-#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
-#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
-#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
-#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
-#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
-#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
-#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
-#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
-#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
-#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
-
-#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
deleted file mode 100644 (file)
index 9b68b17..0000000
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Completion queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *cq_cache;
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
-{
-       unsigned int qp_num = qp->real_qp_num;
-       unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-
-       ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
-                cq->cq_number, qp_num);
-
-       return 0;
-}
-
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
-{
-       int ret = -EINVAL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       hlist_del(iter);
-                       ehca_dbg(cq->ib_cq.device,
-                                "removed qp from cq .cq_num=%x real_qp_num=%x",
-                                cq->cq_number, real_qp_num);
-                       ret = 0;
-                       break;
-               }
-       }
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-       if (ret)
-               ehca_err(cq->ib_cq.device,
-                        "qp not found cq_num=%x real_qp_num=%x",
-                        cq->cq_number, real_qp_num);
-
-       return ret;
-}
-
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
-{
-       struct ehca_qp *ret = NULL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       ret = qp;
-                       break;
-               }
-       }
-       return ret;
-}
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata)
-{
-       int cqe = attr->cqe;
-       static const u32 additional_cqe = 20;
-       struct ib_cq *cq;
-       struct ehca_cq *my_cq;
-       struct ehca_shca *shca =
-               container_of(device, struct ehca_shca, ib_device);
-       struct ipz_adapter_handle adapter_handle;
-       struct ehca_alloc_cq_parms param; /* h_call's out parameters */
-       struct h_galpa gal;
-       void *vpage;
-       u32 counter;
-       u64 rpage, cqx_fec, h_ret;
-       int ipz_rc, i;
-       unsigned long flags;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
-               return ERR_PTR(-EINVAL);
-
-       if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
-               ehca_err(device, "Unable to create CQ, max number of %i "
-                       "CQs reached.", shca->max_num_cqs);
-               ehca_err(device, "To increase the maximum number of CQs "
-                       "use the number_of_cqs module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
-       if (!my_cq) {
-               ehca_err(device, "Out of memory for ehca_cq struct device=%p",
-                        device);
-               atomic_dec(&shca->num_cqs);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
-
-       spin_lock_init(&my_cq->spinlock);
-       spin_lock_init(&my_cq->cb_lock);
-       spin_lock_init(&my_cq->task_lock);
-       atomic_set(&my_cq->nr_events, 0);
-       init_waitqueue_head(&my_cq->wait_completion);
-
-       cq = &my_cq->ib_cq;
-
-       adapter_handle = shca->ipz_hca_handle;
-       param.eq_handle = shca->eq.ipz_eq_handle;
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-       idr_preload_end();
-
-       if (my_cq->token < 0) {
-               cq = ERR_PTR(-ENOMEM);
-               ehca_err(device, "Can't allocate new idr entry. device=%p",
-                        device);
-               goto create_cq_exit1;
-       }
-
-       /*
-        * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
-        * for receiving errors CQEs.
-        */
-       param.nr_cqe = cqe + additional_cqe;
-       h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_alloc_resource_cq() failed "
-                        "h_ret=%lli device=%p", h_ret, device);
-               cq = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto create_cq_exit2;
-       }
-
-       ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
-                               EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
-       if (!ipz_rc) {
-               ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
-                        ipz_rc, device);
-               cq = ERR_PTR(-EINVAL);
-               goto create_cq_exit3;
-       }
-
-       for (counter = 0; counter < param.act_pages; counter++) {
-               vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-               if (!vpage) {
-                       ehca_err(device, "ipz_qpageit_get_inc() "
-                                "returns NULL device=%p", device);
-                       cq = ERR_PTR(-EAGAIN);
-                       goto create_cq_exit4;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_cq(adapter_handle,
-                                                my_cq->ipz_cq_handle,
-                                                &my_cq->pf,
-                                                0,
-                                                0,
-                                                rpage,
-                                                1,
-                                                my_cq->galpas.
-                                                kernel);
-
-               if (h_ret < H_SUCCESS) {
-                       ehca_err(device, "hipz_h_register_rpage_cq() failed "
-                                "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
-                                "act_pages=%i", my_cq, my_cq->cq_number,
-                                h_ret, counter, param.act_pages);
-                       cq = ERR_PTR(-EINVAL);
-                       goto create_cq_exit4;
-               }
-
-               if (counter == (param.act_pages - 1)) {
-                       vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-                       if ((h_ret != H_SUCCESS) || vpage) {
-                               ehca_err(device, "Registration of pages not "
-                                        "complete ehca_cq=%p cq_num=%x "
-                                        "h_ret=%lli", my_cq, my_cq->cq_number,
-                                        h_ret);
-                               cq = ERR_PTR(-EAGAIN);
-                               goto create_cq_exit4;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(device, "Registration of page failed "
-                                        "ehca_cq=%p cq_num=%x h_ret=%lli "
-                                        "counter=%i act_pages=%i",
-                                        my_cq, my_cq->cq_number,
-                                        h_ret, counter, param.act_pages);
-                               cq = ERR_PTR(-ENOMEM);
-                               goto create_cq_exit4;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(&my_cq->ipz_queue);
-
-       gal = my_cq->galpas.kernel;
-       cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
-       ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
-                my_cq, my_cq->cq_number, cqx_fec);
-
-       my_cq->ib_cq.cqe = my_cq->nr_of_entries =
-               param.act_nr_of_entries - additional_cqe;
-       my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
-
-       for (i = 0; i < QP_HASHTAB_LEN; i++)
-               INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
-
-       INIT_LIST_HEAD(&my_cq->sqp_err_list);
-       INIT_LIST_HEAD(&my_cq->rqp_err_list);
-
-       if (context) {
-               struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
-               struct ehca_create_cq_resp resp;
-               memset(&resp, 0, sizeof(resp));
-               resp.cq_number = my_cq->cq_number;
-               resp.token = my_cq->token;
-               resp.ipz_queue.qe_size = ipz_queue->qe_size;
-               resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
-               resp.ipz_queue.queue_length = ipz_queue->queue_length;
-               resp.ipz_queue.pagesize = ipz_queue->pagesize;
-               resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
-               resp.fw_handle_ofs = (u32)
-                       (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
-               if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-                       ehca_err(device, "Copy to udata failed.");
-                       cq = ERR_PTR(-EFAULT);
-                       goto create_cq_exit4;
-               }
-       }
-
-       return cq;
-
-create_cq_exit4:
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-
-create_cq_exit3:
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-       if (h_ret != H_SUCCESS)
-               ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
-                        "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
-
-create_cq_exit2:
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-create_cq_exit1:
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return cq;
-}
-
-int ehca_destroy_cq(struct ib_cq *cq)
-{
-       u64 h_ret;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int cq_num = my_cq->cq_number;
-       struct ib_device *device = cq->device;
-       struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       unsigned long flags;
-
-       if (cq->uobject) {
-               if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
-                       ehca_err(device, "Resources still referenced in "
-                                "user space cq_num=%x", my_cq->cq_number);
-                       return -EINVAL;
-               }
-       }
-
-       /*
-        * remove the CQ from the idr first to make sure
-        * no more interrupt tasklets will touch this CQ
-        */
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
-
-       /* nobody's using our CQ any longer -- we can destroy it */
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
-       if (h_ret == H_R_STATE) {
-               /* cq in err: read err data and destroy it forcibly */
-               ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
-                        "state. Try to delete it forcibly.",
-                        my_cq, cq_num, my_cq->ipz_cq_handle.handle);
-               ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
-               h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-               if (h_ret == H_SUCCESS)
-                       ehca_dbg(device, "cq_num=%x deleted successfully.",
-                                cq_num);
-       }
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
-                        "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
-               return ehca2ib_return_code(h_ret);
-       }
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return 0;
-}
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
-{
-       /* TODO: proper resize needs to be done */
-       ehca_err(cq->device, "not implemented yet");
-
-       return -EFAULT;
-}
-
-int ehca_init_cq_cache(void)
-{
-       cq_cache = kmem_cache_create("ehca_cache_cq",
-                                    sizeof(struct ehca_cq), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!cq_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_cq_cache(void)
-{
-       if (cq_cache)
-               kmem_cache_destroy(cq_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c
deleted file mode 100644 (file)
index 90da674..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Event queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_qes.h"
-#include "hcp_if.h"
-#include "ipz_pt_fn.h"
-
-int ehca_create_eq(struct ehca_shca *shca,
-                  struct ehca_eq *eq,
-                  const enum ehca_eq_type type, const u32 length)
-{
-       int ret;
-       u64 h_ret;
-       u32 nr_pages;
-       u32 i;
-       void *vpage;
-       struct ib_device *ib_dev = &shca->ib_device;
-
-       spin_lock_init(&eq->spinlock);
-       spin_lock_init(&eq->irq_spinlock);
-       eq->is_initialized = 0;
-
-       if (type != EHCA_EQ && type != EHCA_NEQ) {
-               ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
-               return -EINVAL;
-       }
-       if (!length) {
-               ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
-                                        &eq->pf,
-                                        type,
-                                        length,
-                                        &eq->ipz_eq_handle,
-                                        &eq->length,
-                                        &nr_pages, &eq->ist);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
-                            EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
-       if (!ret) {
-               ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
-               goto create_eq_exit1;
-       }
-
-       for (i = 0; i < nr_pages; i++) {
-               u64 rpage;
-
-               vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-               if (!vpage)
-                       goto create_eq_exit2;
-
-               rpage = __pa(vpage);
-               h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
-                                                eq->ipz_eq_handle,
-                                                &eq->pf,
-                                                0, 0, rpage, 1);
-
-               if (i == (nr_pages - 1)) {
-                       /* last page */
-                       vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-                       if (h_ret != H_SUCCESS || vpage)
-                               goto create_eq_exit2;
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED)
-                               goto create_eq_exit2;
-               }
-       }
-
-       ipz_qeit_reset(&eq->ipz_queue);
-
-       /* register interrupt handlers and initialize work queues */
-       if (type == EHCA_EQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
-                                         0, "ehca_eq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       } else if (type == EHCA_NEQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
-                                         0, "ehca_neq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       }
-
-       eq->is_initialized = 1;
-
-       return 0;
-
-create_eq_exit2:
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-create_eq_exit1:
-       hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       return -EINVAL;
-}
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       void *eqe;
-
-       spin_lock_irqsave(&eq->spinlock, flags);
-       eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
-       spin_unlock_irqrestore(&eq->spinlock, flags);
-
-       return eqe;
-}
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       u64 h_ret;
-
-       ibmebus_free_irq(eq->ist, (void *)shca);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       eq->is_initialized = 0;
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       tasklet_kill(&eq->interrupt_task);
-
-       h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't free EQ resources.");
-               return -EINVAL;
-       }
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
deleted file mode 100644 (file)
index e8b1bb6..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HCA query functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/gfp.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static unsigned int limit_uint(unsigned int value)
-{
-       return min_t(unsigned int, value, INT_MAX);
-}
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw)
-{
-       int i, ret = 0;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_hca *rblock;
-
-       static const u32 cap_mapping[] = {
-               IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
-               IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
-               IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
-               IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
-               IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
-               IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
-               IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
-               IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
-               IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
-               IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
-               IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
-       };
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto query_device1;
-       }
-
-       memset(props, 0, sizeof(struct ib_device_attr));
-       props->page_size_cap   = shca->hca_cap_mr_pgsize;
-       props->fw_ver          = rblock->hw_ver;
-       props->max_mr_size     = rblock->max_mr_size;
-       props->vendor_id       = rblock->vendor_id >> 8;
-       props->vendor_part_id  = rblock->vendor_part_id >> 16;
-       props->hw_ver          = rblock->hw_ver;
-       props->max_qp          = limit_uint(rblock->max_qp);
-       props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
-       props->max_sge         = limit_uint(rblock->max_sge);
-       props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
-       props->max_cq          = limit_uint(rblock->max_cq);
-       props->max_cqe         = limit_uint(rblock->max_cqe);
-       props->max_mr          = limit_uint(rblock->max_mr);
-       props->max_mw          = limit_uint(rblock->max_mw);
-       props->max_pd          = limit_uint(rblock->max_pd);
-       props->max_ah          = limit_uint(rblock->max_ah);
-       props->max_ee          = limit_uint(rblock->max_rd_ee_context);
-       props->max_rdd         = limit_uint(rblock->max_rd_domain);
-       props->max_fmr         = limit_uint(rblock->max_mr);
-       props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
-       props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
-       props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
-       props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
-       props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               props->max_srq         = limit_uint(props->max_qp);
-               props->max_srq_wr      = limit_uint(props->max_qp_wr);
-               props->max_srq_sge     = 3;
-       }
-
-       props->max_pkeys           = 16;
-       /* Some FW versions say 0 here; insert sensible value in that case */
-       props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
-               min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
-       props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
-       props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
-       props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
-       props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
-       props->max_total_mcast_qp_attach
-               = limit_uint(rblock->max_total_mcast_qp_attach);
-
-       /* translate device capabilities */
-       props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
-               IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
-       for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
-               if (rblock->hca_cap_indicators & cap_mapping[i + 1])
-                       props->device_cap_flags |= cap_mapping[i];
-
-query_device1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
-{
-       switch (fw_mtu) {
-       case 0x1:
-               return IB_MTU_256;
-       case 0x2:
-               return IB_MTU_512;
-       case 0x3:
-               return IB_MTU_1024;
-       case 0x4:
-               return IB_MTU_2048;
-       case 0x5:
-               return IB_MTU_4096;
-       default:
-               ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
-                        fw_mtu);
-               return 0;
-       }
-}
-
-static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
-{
-       switch (vl_cap) {
-       case 0x1:
-               return 1;
-       case 0x2:
-               return 2;
-       case 0x3:
-               return 4;
-       case 0x4:
-               return 8;
-       case 0x5:
-               return 15;
-       default:
-               ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
-                        vl_cap);
-               return 0;
-       }
-}
-
-int ehca_query_port(struct ib_device *ibdev,
-                   u8 port, struct ib_port_attr *props)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_port1;
-       }
-
-       memset(props, 0, sizeof(struct ib_port_attr));
-
-       props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
-       props->port_cap_flags  = rblock->capability_mask;
-       props->gid_tbl_len     = rblock->gid_tbl_len;
-       if (rblock->max_msg_sz)
-               props->max_msg_sz      = rblock->max_msg_sz;
-       else
-               props->max_msg_sz      = 0x1 << 31;
-       props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
-       props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
-       props->pkey_tbl_len    = rblock->pkey_tbl_len;
-       props->lid             = rblock->lid;
-       props->sm_lid          = rblock->sm_lid;
-       props->lmc             = rblock->lmc;
-       props->sm_sl           = rblock->sm_sl;
-       props->subnet_timeout  = rblock->subnet_timeout;
-       props->init_type_reply = rblock->init_type_reply;
-       props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
-
-       if (rblock->state && rblock->phys_width) {
-               props->phys_state      = rblock->phys_pstate;
-               props->state           = rblock->phys_state;
-               props->active_width    = rblock->phys_width;
-               props->active_speed    = rblock->phys_speed;
-       } else {
-               /* old firmware releases don't report physical
-                * port info, so use default values
-                */
-               props->phys_state      = 5;
-               props->state           = rblock->state;
-               props->active_width    = IB_WIDTH_12X;
-               props->active_speed    = IB_SPEED_SDR;
-       }
-
-query_port1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_sma_attr(struct ehca_shca *shca,
-                       u8 port, struct ehca_sma_attr *attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_sma_attr1;
-       }
-
-       memset(attr, 0, sizeof(struct ehca_sma_attr));
-
-       attr->lid    = rblock->lid;
-       attr->lmc    = rblock->lmc;
-       attr->sm_sl  = rblock->sm_sl;
-       attr->sm_lid = rblock->sm_lid;
-
-       attr->pkey_tbl_len = rblock->pkey_tbl_len;
-       memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
-
-query_sma_attr1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if (index > 16) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_pkey1;
-       }
-
-       memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
-
-query_pkey1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port,
-                  int index, union ib_gid *gid)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       if (index < 0 || index > 255) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_gid1;
-       }
-
-       memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
-       memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
-
-query_gid1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static const u32 allowed_port_caps = (
-       IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
-       IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
-       IB_PORT_VENDOR_CLASS_SUP);
-
-int ehca_modify_port(struct ib_device *ibdev,
-                    u8 port, int port_modify_mask,
-                    struct ib_port_modify *props)
-{
-       int ret = 0;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-       u32 cap;
-       u64 hret;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if ((props->set_port_cap_mask | props->clr_port_cap_mask)
-           & ~allowed_port_caps) {
-               ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
-                        "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
-                        props->clr_port_cap_mask, allowed_port_caps);
-               return -EINVAL;
-       }
-
-       if (mutex_lock_interruptible(&shca->modify_mutex))
-               return -ERESTARTSYS;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               ret = -ENOMEM;
-               goto modify_port1;
-       }
-
-       hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto modify_port2;
-       }
-
-       cap = (rblock->capability_mask | props->set_port_cap_mask)
-               & ~props->clr_port_cap_mask;
-
-       hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
-                                 cap, props->init_type, port_modify_mask);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
-                        hret);
-               ret = -EINVAL;
-       }
-
-modify_port2:
-       ehca_free_fw_ctrlblock(rblock);
-
-modify_port1:
-       mutex_unlock(&shca->modify_mutex);
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
deleted file mode 100644 (file)
index 8615d7c..0000000
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Functions for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <linux/smpboot.h>
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
-#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
-#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
-#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
-#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
-
-#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
-#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
-#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
-#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
-#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
-#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
-
-#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
-#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
-
-static void queue_comp_task(struct ehca_cq *__cq);
-
-static struct ehca_comp_pool *pool;
-
-static inline void comp_event_callback(struct ehca_cq *cq)
-{
-       if (!cq->ib_cq.comp_handler)
-               return;
-
-       spin_lock(&cq->cb_lock);
-       cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
-       spin_unlock(&cq->cb_lock);
-
-       return;
-}
-
-static void print_error_data(struct ehca_shca *shca, void *data,
-                            u64 *rblock, int length)
-{
-       u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
-       u64 resource = rblock[1];
-
-       switch (type) {
-       case 0x1: /* Queue Pair */
-       {
-               struct ehca_qp *qp = (struct ehca_qp *)data;
-
-               /* only print error data if AER is set */
-               if (rblock[6] == 0)
-                       return;
-
-               ehca_err(&shca->ib_device,
-                        "QP 0x%x (resource=%llx) has errors.",
-                        qp->ib_qp.qp_num, resource);
-               break;
-       }
-       case 0x4: /* Completion Queue */
-       {
-               struct ehca_cq *cq = (struct ehca_cq *)data;
-
-               ehca_err(&shca->ib_device,
-                        "CQ 0x%x (resource=%llx) has errors.",
-                        cq->cq_number, resource);
-               break;
-       }
-       default:
-               ehca_err(&shca->ib_device,
-                        "Unknown error type: %llx on %s.",
-                        type, shca->ib_device.name);
-               break;
-       }
-
-       ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data begin "
-                "---------------------------------------------------");
-       ehca_dmp(rblock, length, "resource=%llx", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data end "
-                "----------------------------------------------------");
-
-       return;
-}
-
-int ehca_error_data(struct ehca_shca *shca, void *data,
-                   u64 resource)
-{
-
-       unsigned long ret;
-       u64 *rblock;
-       unsigned long block_count;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
-               ret = -ENOMEM;
-               goto error_data1;
-       }
-
-       /* rblock must be 4K aligned and should be 4K large */
-       ret = hipz_h_error_data(shca->ipz_hca_handle,
-                               resource,
-                               rblock,
-                               &block_count);
-
-       if (ret == H_R_STATE)
-               ehca_err(&shca->ib_device,
-                        "No error data is available: %llx.", resource);
-       else if (ret == H_SUCCESS) {
-               int length;
-
-               length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
-
-               if (length > EHCA_PAGESIZE)
-                       length = EHCA_PAGESIZE;
-
-               print_error_data(shca, data, rblock, length);
-       } else
-               ehca_err(&shca->ib_device,
-                        "Error data could not be fetched: %llx", resource);
-
-       ehca_free_fw_ctrlblock(rblock);
-
-error_data1:
-       return ret;
-
-}
-
-static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
-                             enum ib_event_type event_type)
-{
-       struct ib_event event;
-
-       /* PATH_MIG without the QP ever having been armed is false alarm */
-       if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
-               return;
-
-       event.device = &shca->ib_device;
-       event.event = event_type;
-
-       if (qp->ext_type == EQPT_SRQ) {
-               if (!qp->ib_srq.event_handler)
-                       return;
-
-               event.element.srq = &qp->ib_srq;
-               qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
-       } else {
-               if (!qp->ib_qp.event_handler)
-                       return;
-
-               event.element.qp = &qp->ib_qp;
-               qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
-       }
-}
-
-static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
-                             enum ib_event_type event_type, int fatal)
-{
-       struct ehca_qp *qp;
-       u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
-
-       read_lock(&ehca_qp_idr_lock);
-       qp = idr_find(&ehca_qp_idr, token);
-       if (qp)
-               atomic_inc(&qp->nr_events);
-       read_unlock(&ehca_qp_idr_lock);
-
-       if (!qp)
-               return;
-
-       if (fatal)
-               ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
-
-       dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
-                         IB_EVENT_SRQ_ERR : event_type);
-
-       /*
-        * eHCA only processes one WQE at a time for SRQ base QPs,
-        * so the last WQE has been processed as soon as the QP enters
-        * error state.
-        */
-       if (fatal && qp->ext_type == EQPT_SRQBASE)
-               dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
-
-       if (atomic_dec_and_test(&qp->nr_events))
-               wake_up(&qp->wait_completion);
-       return;
-}
-
-static void cq_event_callback(struct ehca_shca *shca,
-                             u64 eqe)
-{
-       struct ehca_cq *cq;
-       u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
-
-       read_lock(&ehca_cq_idr_lock);
-       cq = idr_find(&ehca_cq_idr, token);
-       if (cq)
-               atomic_inc(&cq->nr_events);
-       read_unlock(&ehca_cq_idr_lock);
-
-       if (!cq)
-               return;
-
-       ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
-
-       if (atomic_dec_and_test(&cq->nr_events))
-               wake_up(&cq->wait_completion);
-
-       return;
-}
-
-static void parse_identifier(struct ehca_shca *shca, u64 eqe)
-{
-       u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
-
-       switch (identifier) {
-       case 0x02: /* path migrated */
-               qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
-               break;
-       case 0x03: /* communication established */
-               qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
-               break;
-       case 0x04: /* send queue drained */
-               qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
-               break;
-       case 0x05: /* QP error */
-       case 0x06: /* QP error */
-               qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
-               break;
-       case 0x07: /* CQ error */
-       case 0x08: /* CQ error */
-               cq_event_callback(shca, eqe);
-               break;
-       case 0x09: /* MRMWPTE error */
-               ehca_err(&shca->ib_device, "MRMWPTE error.");
-               break;
-       case 0x0A: /* port event */
-               ehca_err(&shca->ib_device, "Port event.");
-               break;
-       case 0x0B: /* MR access error */
-               ehca_err(&shca->ib_device, "MR access error.");
-               break;
-       case 0x0C: /* EQ error */
-               ehca_err(&shca->ib_device, "EQ error.");
-               break;
-       case 0x0D: /* P/Q_Key mismatch */
-               ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
-               break;
-       case 0x10: /* sampling complete */
-               ehca_err(&shca->ib_device, "Sampling complete.");
-               break;
-       case 0x11: /* unaffiliated access error */
-               ehca_err(&shca->ib_device, "Unaffiliated access error.");
-               break;
-       case 0x12: /* path migrating */
-               ehca_err(&shca->ib_device, "Path migrating.");
-               break;
-       case 0x13: /* interface trace stopped */
-               ehca_err(&shca->ib_device, "Interface trace stopped.");
-               break;
-       case 0x14: /* first error capture info available */
-               ehca_info(&shca->ib_device, "First error capture available");
-               break;
-       case 0x15: /* SRQ limit reached */
-               qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
-                        identifier, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static void dispatch_port_event(struct ehca_shca *shca, int port_num,
-                               enum ib_event_type type, const char *msg)
-{
-       struct ib_event event;
-
-       ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
-       event.device = &shca->ib_device;
-       event.event = type;
-       event.element.port_num = port_num;
-       ib_dispatch_event(&event);
-}
-
-static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
-{
-       struct ehca_sma_attr  new_attr;
-       struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
-
-       ehca_query_sma_attr(shca, port_num, &new_attr);
-
-       if (new_attr.sm_sl  != old_attr->sm_sl ||
-           new_attr.sm_lid != old_attr->sm_lid)
-               dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
-                                   "SM changed");
-
-       if (new_attr.lid != old_attr->lid ||
-           new_attr.lmc != old_attr->lmc)
-               dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
-                                   "LID changed");
-
-       if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
-           memcmp(new_attr.pkeys, old_attr->pkeys,
-                  sizeof(u16) * new_attr.pkey_tbl_len))
-               dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
-                                   "P_Key changed");
-
-       *old_attr = new_attr;
-}
-
-/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
-static int replay_modify_qp(struct ehca_sport *sport)
-{
-       int aqp1_destroyed;
-       unsigned long flags;
-
-       spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-
-       aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
-
-       if (sport->ibqp_sqp[IB_QPT_SMI])
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
-       if (!aqp1_destroyed)
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
-
-       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-
-       return aqp1_destroyed;
-}
-
-static void parse_ec(struct ehca_shca *shca, u64 eqe)
-{
-       u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
-       u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
-       u8 spec_event;
-       struct ehca_sport *sport = &shca->sport[port - 1];
-
-       switch (ec) {
-       case 0x30: /* port availability change */
-               if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
-                       /* only replay modify_qp calls in autodetect mode;
-                        * if AQP1 was destroyed, the port is already down
-                        * again and we can drop the event.
-                        */
-                       if (ehca_nr_ports < 0)
-                               if (replay_modify_qp(sport))
-                                       break;
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port, &sport->saved_attr);
-               } else {
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-               }
-               break;
-       case 0x31:
-               /* port configuration change
-                * disruptive change is caused by
-                * LID, PKEY or SM change
-                */
-               if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
-                       ehca_warn(&shca->ib_device, "disruptive port "
-                                 "%d configuration change", port);
-
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port,
-                                           &sport->saved_attr);
-               } else
-                       notify_port_conf_change(shca, port);
-               break;
-       case 0x32: /* adapter malfunction */
-               ehca_err(&shca->ib_device, "Adapter malfunction.");
-               break;
-       case 0x33:  /* trace stopped */
-               ehca_err(&shca->ib_device, "Traced stopped.");
-               break;
-       case 0x34: /* util async event */
-               spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
-               if (spec_event == 0x80) /* client reregister required */
-                       dispatch_port_event(shca, port,
-                                           IB_EVENT_CLIENT_REREGISTER,
-                                           "client reregister req.");
-               else
-                       ehca_warn(&shca->ib_device, "Unknown util async "
-                                 "event %x on port %x", spec_event, port);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
-                        ec, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static inline void reset_eq_pending(struct ehca_cq *cq)
-{
-       u64 CQx_EP;
-       struct h_galpa gal = cq->galpas.kernel;
-
-       hipz_galpa_store_cq(gal, cqx_ep, 0x0);
-       CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->neq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-void ehca_tasklet_neq(unsigned long data)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)data;
-       struct ehca_eqe *eqe;
-       u64 ret;
-
-       eqe = ehca_poll_eq(shca, &shca->neq);
-
-       while (eqe) {
-               if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
-                       parse_ec(shca, eqe->entry);
-
-               eqe = ehca_poll_eq(shca, &shca->neq);
-       }
-
-       ret = hipz_h_reset_event(shca->ipz_hca_handle,
-                                shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
-
-       if (ret != H_SUCCESS)
-               ehca_err(&shca->ib_device, "Can't clear notification events.");
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->eq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-
-static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
-{
-       u64 eqe_value;
-       u32 token;
-       struct ehca_cq *cq;
-
-       eqe_value = eqe->entry;
-       ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
-       if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-               ehca_dbg(&shca->ib_device, "Got completion event");
-               token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, token);
-               if (cq)
-                       atomic_inc(&cq->nr_events);
-               read_unlock(&ehca_cq_idr_lock);
-               if (cq == NULL) {
-                       ehca_err(&shca->ib_device,
-                                "Invalid eqe for non-existing cq token=%x",
-                                token);
-                       return;
-               }
-               reset_eq_pending(cq);
-               if (ehca_scaling_code)
-                       queue_comp_task(cq);
-               else {
-                       comp_event_callback(cq);
-                       if (atomic_dec_and_test(&cq->nr_events))
-                               wake_up(&cq->wait_completion);
-               }
-       } else {
-               ehca_dbg(&shca->ib_device, "Got non completion event");
-               parse_identifier(shca, eqe_value);
-       }
-}
-
-void ehca_process_eq(struct ehca_shca *shca, int is_irq)
-{
-       struct ehca_eq *eq = &shca->eq;
-       struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-       u64 eqe_value, ret;
-       int eqe_cnt, i;
-       int eq_empty = 0;
-
-       spin_lock(&eq->irq_spinlock);
-       if (is_irq) {
-               const int max_query_cnt = 100;
-               int query_cnt = 0;
-               int int_state = 1;
-               do {
-                       int_state = hipz_h_query_int_state(
-                               shca->ipz_hca_handle, eq->ist);
-                       query_cnt++;
-                       iosync();
-               } while (int_state && query_cnt < max_query_cnt);
-               if (unlikely((query_cnt == max_query_cnt)))
-                       ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
-                                int_state, query_cnt);
-       }
-
-       /* read out all eqes */
-       eqe_cnt = 0;
-       do {
-               u32 token;
-               eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
-               if (!eqe_cache[eqe_cnt].eqe)
-                       break;
-               eqe_value = eqe_cache[eqe_cnt].eqe->entry;
-               if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-                       token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-                       read_lock(&ehca_cq_idr_lock);
-                       eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
-                       if (eqe_cache[eqe_cnt].cq)
-                               atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
-                       read_unlock(&ehca_cq_idr_lock);
-                       if (!eqe_cache[eqe_cnt].cq) {
-                               ehca_err(&shca->ib_device,
-                                        "Invalid eqe for non-existing cq "
-                                        "token=%x", token);
-                               continue;
-                       }
-               } else
-                       eqe_cache[eqe_cnt].cq = NULL;
-               eqe_cnt++;
-       } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
-       if (!eqe_cnt) {
-               if (is_irq)
-                       ehca_dbg(&shca->ib_device,
-                                "No eqe found for irq event");
-               goto unlock_irq_spinlock;
-       } else if (!is_irq) {
-               ret = hipz_h_eoi(eq->ist);
-               if (ret != H_SUCCESS)
-                       ehca_err(&shca->ib_device,
-                                "bad return code EOI -rc = %lld\n", ret);
-               ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
-       }
-       if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
-               ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
-       /* enable irq for new packets */
-       for (i = 0; i < eqe_cnt; i++) {
-               if (eq->eqe_cache[i].cq)
-                       reset_eq_pending(eq->eqe_cache[i].cq);
-       }
-       /* check eq */
-       spin_lock(&eq->spinlock);
-       eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
-       spin_unlock(&eq->spinlock);
-       /* call completion handler for cached eqes */
-       for (i = 0; i < eqe_cnt; i++)
-               if (eq->eqe_cache[i].cq) {
-                       if (ehca_scaling_code)
-                               queue_comp_task(eq->eqe_cache[i].cq);
-                       else {
-                               struct ehca_cq *cq = eq->eqe_cache[i].cq;
-                               comp_event_callback(cq);
-                               if (atomic_dec_and_test(&cq->nr_events))
-                                       wake_up(&cq->wait_completion);
-                       }
-               } else {
-                       ehca_dbg(&shca->ib_device, "Got non completion event");
-                       parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
-               }
-       /* poll eq if not empty */
-       if (eq_empty)
-               goto unlock_irq_spinlock;
-       do {
-               struct ehca_eqe *eqe;
-               eqe = ehca_poll_eq(shca, &shca->eq);
-               if (!eqe)
-                       break;
-               process_eqe(shca, eqe);
-       } while (1);
-
-unlock_irq_spinlock:
-       spin_unlock(&eq->irq_spinlock);
-}
-
-void ehca_tasklet_eq(unsigned long data)
-{
-       ehca_process_eq((struct ehca_shca*)data, 1);
-}
-
-static int find_next_online_cpu(struct ehca_comp_pool *pool)
-{
-       int cpu;
-       unsigned long flags;
-
-       WARN_ON_ONCE(!in_interrupt());
-       if (ehca_debug_level >= 3)
-               ehca_dmp(cpu_online_mask, cpumask_size(), "");
-
-       spin_lock_irqsave(&pool->last_cpu_lock, flags);
-       do {
-               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
-               if (cpu >= nr_cpu_ids)
-                       cpu = cpumask_first(cpu_online_mask);
-               pool->last_cpu = cpu;
-       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
-       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
-
-       return cpu;
-}
-
-static void __queue_comp_task(struct ehca_cq *__cq,
-                             struct ehca_cpu_comp_task *cct,
-                             struct task_struct *thread)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       spin_lock(&__cq->task_lock);
-
-       if (__cq->nr_callbacks == 0) {
-               __cq->nr_callbacks++;
-               list_add_tail(&__cq->entry, &cct->cq_list);
-               cct->cq_jobs++;
-               wake_up_process(thread);
-       } else
-               __cq->nr_callbacks++;
-
-       spin_unlock(&__cq->task_lock);
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-}
-
-static void queue_comp_task(struct ehca_cq *__cq)
-{
-       int cpu_id;
-       struct ehca_cpu_comp_task *cct;
-       struct task_struct *thread;
-       int cq_jobs;
-       unsigned long flags;
-
-       cpu_id = find_next_online_cpu(pool);
-       BUG_ON(!cpu_online(cpu_id));
-
-       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-       BUG_ON(!cct || !thread);
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       cq_jobs = cct->cq_jobs;
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-       if (cq_jobs > 0) {
-               cpu_id = find_next_online_cpu(pool);
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-               BUG_ON(!cct || !thread);
-       }
-       __queue_comp_task(__cq, cct, thread);
-}
-
-static void run_comp_task(struct ehca_cpu_comp_task *cct)
-{
-       struct ehca_cq *cq;
-
-       while (!list_empty(&cct->cq_list)) {
-               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-               spin_unlock_irq(&cct->task_lock);
-
-               comp_event_callback(cq);
-               if (atomic_dec_and_test(&cq->nr_events))
-                       wake_up(&cq->wait_completion);
-
-               spin_lock_irq(&cct->task_lock);
-               spin_lock(&cq->task_lock);
-               cq->nr_callbacks--;
-               if (!cq->nr_callbacks) {
-                       list_del_init(cct->cq_list.next);
-                       cct->cq_jobs--;
-               }
-               spin_unlock(&cq->task_lock);
-       }
-}
-
-static void comp_task_park(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       struct ehca_cpu_comp_task *target;
-       struct task_struct *thread;
-       struct ehca_cq *cq, *tmp;
-       LIST_HEAD(list);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       list_splice_init(&cct->cq_list, &list);
-       spin_unlock_irq(&cct->task_lock);
-
-       cpu = find_next_online_cpu(pool);
-       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
-       spin_lock_irq(&target->task_lock);
-       list_for_each_entry_safe(cq, tmp, &list, entry) {
-               list_del(&cq->entry);
-               __queue_comp_task(cq, target, thread);
-       }
-       spin_unlock_irq(&target->task_lock);
-}
-
-static void comp_task_stop(unsigned int cpu, bool online)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       WARN_ON(!list_empty(&cct->cq_list));
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static int comp_task_should_run(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       return cct->cq_jobs;
-}
-
-static void comp_task(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
-       int cql_empty;
-
-       spin_lock_irq(&cct->task_lock);
-       cql_empty = list_empty(&cct->cq_list);
-       if (!cql_empty) {
-               __set_current_state(TASK_RUNNING);
-               run_comp_task(cct);
-       }
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static struct smp_hotplug_thread comp_pool_threads = {
-       .thread_should_run      = comp_task_should_run,
-       .thread_fn              = comp_task,
-       .thread_comm            = "ehca_comp/%u",
-       .cleanup                = comp_task_stop,
-       .park                   = comp_task_park,
-};
-
-int ehca_create_comp_pool(void)
-{
-       int cpu, ret = -ENOMEM;
-
-       if (!ehca_scaling_code)
-               return 0;
-
-       pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
-       if (pool == NULL)
-               return -ENOMEM;
-
-       spin_lock_init(&pool->last_cpu_lock);
-       pool->last_cpu = cpumask_any(cpu_online_mask);
-
-       pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
-       if (!pool->cpu_comp_tasks)
-               goto out_pool;
-
-       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
-       if (!pool->cpu_comp_threads)
-               goto out_tasks;
-
-       for_each_present_cpu(cpu) {
-               struct ehca_cpu_comp_task *cct;
-
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-               spin_lock_init(&cct->task_lock);
-               INIT_LIST_HEAD(&cct->cq_list);
-       }
-
-       comp_pool_threads.store = pool->cpu_comp_threads;
-       ret = smpboot_register_percpu_thread(&comp_pool_threads);
-       if (ret)
-               goto out_threads;
-
-       pr_info("eHCA scaling code enabled\n");
-       return ret;
-
-out_threads:
-       free_percpu(pool->cpu_comp_threads);
-out_tasks:
-       free_percpu(pool->cpu_comp_tasks);
-out_pool:
-       kfree(pool);
-       return ret;
-}
-
-void ehca_destroy_comp_pool(void)
-{
-       if (!ehca_scaling_code)
-               return;
-
-       smpboot_unregister_percpu_thread(&comp_pool_threads);
-
-       free_percpu(pool->cpu_comp_threads);
-       free_percpu(pool->cpu_comp_tasks);
-       kfree(pool);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
deleted file mode 100644 (file)
index 5370199..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions and structs for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IRQ_H
-#define __EHCA_IRQ_H
-
-
-struct ehca_shca;
-
-#include <linux/interrupt.h>
-#include <linux/types.h>
-
-int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
-void ehca_tasklet_neq(unsigned long data);
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
-void ehca_tasklet_eq(unsigned long data);
-void ehca_process_eq(struct ehca_shca *shca, int is_irq);
-
-struct ehca_cpu_comp_task {
-       struct list_head cq_list;
-       spinlock_t task_lock;
-       int cq_jobs;
-       int active;
-};
-
-struct ehca_comp_pool {
-       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
-       struct task_struct * __percpu *cpu_comp_threads;
-       int last_cpu;
-       spinlock_t last_cpu_lock;
-};
-
-int ehca_create_comp_pool(void);
-void ehca_destroy_comp_pool(void);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
deleted file mode 100644 (file)
index 80e6a3d..0000000
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions for internal functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Dietmar Decker <ddecker@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IVERBS_H__
-#define __EHCA_IVERBS_H__
-
-#include "ehca_classes.h"
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw);
-
-int ehca_query_port(struct ib_device *ibdev, u8 port,
-                   struct ib_port_attr *props);
-
-enum rdma_protocol_type
-ehca_query_protocol(struct ib_device *device, u8 port_num);
-
-int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
-                       struct ehca_sma_attr *attr);
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
-                  union ib_gid *gid);
-
-int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
-                    struct ib_port_modify *props);
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context,
-                           struct ib_udata *udata);
-
-int ehca_dealloc_pd(struct ib_pd *pd);
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_destroy_ah(struct ib_ah *ah);
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
-
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags, u64 *iova_start);
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata);
-
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf, int mr_access_flags, u64 *iova_start);
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
-int ehca_dereg_mr(struct ib_mr *mr);
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind);
-
-int ehca_dealloc_mw(struct ib_mw *mw);
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr);
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list, int list_len, u64 iova);
-
-int ehca_unmap_fmr(struct list_head *fmr_list);
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr);
-
-enum ehca_eq_type {
-       EHCA_EQ = 0, /* Event Queue              */
-       EHCA_NEQ     /* Notification Event Queue */
-};
-
-int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
-                  enum ehca_eq_type type, const u32 length);
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata);
-
-int ehca_destroy_cq(struct ib_cq *cq);
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
-
-int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *init_attr,
-                            struct ib_udata *udata);
-
-int ehca_destroy_qp(struct ib_qp *qp);
-
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata);
-
-int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr);
-
-int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr);
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *init_attr,
-                              struct ib_udata *udata);
-
-int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-
-int ehca_destroy_srq(struct ib_srq *srq);
-
-u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
-                   struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata);
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context);
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index);
-
-void ehca_poll_eqs(unsigned long data);
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd);
-
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
-
-#ifdef CONFIG_PPC_64K_PAGES
-void *ehca_alloc_fw_ctrlblock(gfp_t flags);
-void ehca_free_fw_ctrlblock(void *ptr);
-#else
-#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
-#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
-#endif
-
-void ehca_recover_sqp(struct ib_qp *sqp);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
deleted file mode 100644 (file)
index 8246418..0000000
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  module start stop, hca detection
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef CONFIG_PPC_64K_PAGES
-#include <linux/slab.h>
-#endif
-
-#include <linux/notifier.h>
-#include <linux/memory.h>
-#include <rdma/ib_mad.h>
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-#define HCAD_VERSION "0029"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
-MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
-MODULE_VERSION(HCAD_VERSION);
-
-static bool ehca_open_aqp1    = 0;
-static int ehca_hw_level      = 0;
-static bool ehca_poll_all_eqs = 1;
-
-int ehca_debug_level   = 0;
-int ehca_nr_ports      = -1;
-bool ehca_use_hp_mr    = 0;
-int ehca_port_act_time = 30;
-int ehca_static_rate   = -1;
-bool ehca_scaling_code = 0;
-int ehca_lock_hcalls   = -1;
-int ehca_max_cq        = -1;
-int ehca_max_qp        = -1;
-
-module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
-module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
-module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
-module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
-module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
-module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
-module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
-module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
-module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
-module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
-module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
-module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
-
-MODULE_PARM_DESC(open_aqp1,
-                "Open AQP1 on startup (default: no)");
-MODULE_PARM_DESC(debug_level,
-                "Amount of debug output (0: none (default), 1: traces, "
-                "2: some dumps, 3: lots)");
-MODULE_PARM_DESC(hw_level,
-                "Hardware level (0: autosensing (default), "
-                "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
-MODULE_PARM_DESC(nr_ports,
-                "number of connected ports (-1: autodetect (default), "
-                "1: port one only, 2: two ports)");
-MODULE_PARM_DESC(use_hp_mr,
-                "Use high performance MRs (default: no)");
-MODULE_PARM_DESC(port_act_time,
-                "Time to wait for port activation (default: 30 sec)");
-MODULE_PARM_DESC(poll_all_eqs,
-                "Poll all event queues periodically (default: yes)");
-MODULE_PARM_DESC(static_rate,
-                "Set permanent static rate (default: no static rate)");
-MODULE_PARM_DESC(scaling_code,
-                "Enable scaling code (default: no)");
-MODULE_PARM_DESC(lock_hcalls,
-                "Serialize all hCalls made by the driver "
-                "(default: autodetect)");
-MODULE_PARM_DESC(number_of_cqs,
-               "Max number of CQs which can be allocated "
-               "(default: autodetect)");
-MODULE_PARM_DESC(number_of_qps,
-               "Max number of QPs which can be allocated "
-               "(default: autodetect)");
-
-DEFINE_RWLOCK(ehca_qp_idr_lock);
-DEFINE_RWLOCK(ehca_cq_idr_lock);
-DEFINE_IDR(ehca_qp_idr);
-DEFINE_IDR(ehca_cq_idr);
-
-static LIST_HEAD(shca_list); /* list of all registered ehcas */
-DEFINE_SPINLOCK(shca_list_lock);
-
-static struct timer_list poll_eqs_timer;
-
-#ifdef CONFIG_PPC_64K_PAGES
-static struct kmem_cache *ctblk_cache;
-
-void *ehca_alloc_fw_ctrlblock(gfp_t flags)
-{
-       void *ret = kmem_cache_zalloc(ctblk_cache, flags);
-       if (!ret)
-               ehca_gen_err("Out of memory for ctblk");
-       return ret;
-}
-
-void ehca_free_fw_ctrlblock(void *ptr)
-{
-       if (ptr)
-               kmem_cache_free(ctblk_cache, ptr);
-
-}
-#endif
-
-int ehca2ib_return_code(u64 ehca_rc)
-{
-       switch (ehca_rc) {
-       case H_SUCCESS:
-               return 0;
-       case H_RESOURCE:             /* Resource in use */
-       case H_BUSY:
-               return -EBUSY;
-       case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
-       case H_CONSTRAINED:          /* resource constraint */
-       case H_NO_MEM:
-               return -ENOMEM;
-       default:
-               return -EINVAL;
-       }
-}
-
-static int ehca_create_slab_caches(void)
-{
-       int ret;
-
-       ret = ehca_init_pd_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create PD SLAB cache.");
-               return ret;
-       }
-
-       ret = ehca_init_cq_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create CQ SLAB cache.");
-               goto create_slab_caches2;
-       }
-
-       ret = ehca_init_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create QP SLAB cache.");
-               goto create_slab_caches3;
-       }
-
-       ret = ehca_init_av_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create AV SLAB cache.");
-               goto create_slab_caches4;
-       }
-
-       ret = ehca_init_mrmw_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create MR&MW SLAB cache.");
-               goto create_slab_caches5;
-       }
-
-       ret = ehca_init_small_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create small queue SLAB cache.");
-               goto create_slab_caches6;
-       }
-
-#ifdef CONFIG_PPC_64K_PAGES
-       ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
-                                       EHCA_PAGESIZE, H_CB_ALIGNMENT,
-                                       SLAB_HWCACHE_ALIGN,
-                                       NULL);
-       if (!ctblk_cache) {
-               ehca_gen_err("Cannot create ctblk SLAB cache.");
-               ehca_cleanup_small_qp_cache();
-               ret = -ENOMEM;
-               goto create_slab_caches6;
-       }
-#endif
-       return 0;
-
-create_slab_caches6:
-       ehca_cleanup_mrmw_cache();
-
-create_slab_caches5:
-       ehca_cleanup_av_cache();
-
-create_slab_caches4:
-       ehca_cleanup_qp_cache();
-
-create_slab_caches3:
-       ehca_cleanup_cq_cache();
-
-create_slab_caches2:
-       ehca_cleanup_pd_cache();
-
-       return ret;
-}
-
-static void ehca_destroy_slab_caches(void)
-{
-       ehca_cleanup_small_qp_cache();
-       ehca_cleanup_mrmw_cache();
-       ehca_cleanup_av_cache();
-       ehca_cleanup_qp_cache();
-       ehca_cleanup_cq_cache();
-       ehca_cleanup_pd_cache();
-#ifdef CONFIG_PPC_64K_PAGES
-       if (ctblk_cache)
-               kmem_cache_destroy(ctblk_cache);
-#endif
-}
-
-#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
-#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
-
-static struct cap_descr {
-       u64 mask;
-       char *descr;
-} hca_cap_descr[] = {
-       { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
-       { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
-       { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
-       { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
-       { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
-       { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
-       { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
-       { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
-       { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
-       { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
-       { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
-       { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
-       { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
-       { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
-       { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
-       { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
-       { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
-       { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
-};
-
-static int ehca_sense_attributes(struct ehca_shca *shca)
-{
-       int i, ret = 0;
-       u64 h_ret;
-       struct hipz_query_hca *rblock;
-       struct hipz_query_port *port;
-       const char *loc_code;
-
-       static const u32 pgsize_map[] = {
-               HCA_CAP_MR_PGSIZE_4K,  0x1000,
-               HCA_CAP_MR_PGSIZE_64K, 0x10000,
-               HCA_CAP_MR_PGSIZE_1M,  0x100000,
-               HCA_CAP_MR_PGSIZE_16M, 0x1000000,
-       };
-
-       ehca_gen_dbg("Probing adapter %s...",
-                    shca->ofdev->dev.of_node->full_name);
-       loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
-                                  NULL);
-       if (loc_code)
-               ehca_gen_dbg(" ... location lode=%s", loc_code);
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_gen_err("Cannot allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query device properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       if (ehca_nr_ports == 1)
-               shca->num_ports = 1;
-       else
-               shca->num_ports = (u8)rblock->num_ports;
-
-       ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
-
-       if (ehca_hw_level == 0) {
-               u32 hcaaver;
-               u32 revid;
-
-               hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
-               revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
-
-               ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
-
-               if (hcaaver == 1) {
-                       if (revid <= 3)
-                               shca->hw_level = 0x10 | (revid + 1);
-                       else
-                               shca->hw_level = 0x14;
-               } else if (hcaaver == 2) {
-                       if (revid == 0)
-                               shca->hw_level = 0x21;
-                       else if (revid == 0x10)
-                               shca->hw_level = 0x22;
-                       else if (revid == 0x20 || revid == 0x21)
-                               shca->hw_level = 0x23;
-               }
-
-               if (!shca->hw_level) {
-                       ehca_gen_warn("unknown hardware version"
-                                     " - assuming default level");
-                       shca->hw_level = 0x22;
-               }
-       } else
-               shca->hw_level = ehca_hw_level;
-       ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
-
-       shca->hca_cap = rblock->hca_cap_indicators;
-       ehca_gen_dbg(" ... HCA capabilities:");
-       for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
-               if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
-                       ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
-
-       /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
-        * a firmware property, so it's valid across all adapters
-        */
-       if (ehca_lock_hcalls == -1)
-               ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
-                                       shca->hca_cap);
-
-       /* translate supported MR page sizes; always support 4K */
-       shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
-       for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
-               if (rblock->memory_page_size_supported & pgsize_map[i])
-                       shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
-
-       /* Set maximum number of CQs and QPs to calculate EQ size */
-       if (shca->max_num_qps == -1)
-               shca->max_num_qps = min_t(int, rblock->max_qp,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
-               ehca_gen_warn("The requested number of QPs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_qp, rblock->max_qp);
-               shca->max_num_qps = rblock->max_qp;
-       }
-
-       if (shca->max_num_cqs == -1)
-               shca->max_num_cqs = min_t(int, rblock->max_cq,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
-               ehca_gen_warn("The requested number of CQs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_cq, rblock->max_cq);
-       }
-
-       /* query max MTU from first port -- it's the same for all ports */
-       port = (struct hipz_query_port *)rblock;
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query port properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       shca->max_mtu = port->max_mtu;
-
-sense_attributes1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int init_node_guid(struct ehca_shca *shca)
-{
-       int ret = 0;
-       struct hipz_query_hca *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto init_node_guid1;
-       }
-
-       memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
-
-init_node_guid1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
-                              struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = ehca_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-       return 0;
-}
-
-static int ehca_init_device(struct ehca_shca *shca)
-{
-       int ret;
-
-       ret = init_node_guid(shca);
-       if (ret)
-               return ret;
-
-       strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
-       shca->ib_device.owner               = THIS_MODULE;
-
-       shca->ib_device.uverbs_abi_ver      = 8;
-       shca->ib_device.uverbs_cmd_mask     =
-               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
-               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
-               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
-               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
-               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
-               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
-               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
-               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
-               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
-               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
-               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
-
-       shca->ib_device.node_type           = RDMA_NODE_IB_CA;
-       shca->ib_device.phys_port_cnt       = shca->num_ports;
-       shca->ib_device.num_comp_vectors    = 1;
-       shca->ib_device.dma_device          = &shca->ofdev->dev;
-       shca->ib_device.query_device        = ehca_query_device;
-       shca->ib_device.query_port          = ehca_query_port;
-       shca->ib_device.query_gid           = ehca_query_gid;
-       shca->ib_device.query_pkey          = ehca_query_pkey;
-       /* shca->in_device.modify_device    = ehca_modify_device    */
-       shca->ib_device.modify_port         = ehca_modify_port;
-       shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
-       shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
-       shca->ib_device.alloc_pd            = ehca_alloc_pd;
-       shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
-       shca->ib_device.create_ah           = ehca_create_ah;
-       /* shca->ib_device.modify_ah        = ehca_modify_ah;       */
-       shca->ib_device.query_ah            = ehca_query_ah;
-       shca->ib_device.destroy_ah          = ehca_destroy_ah;
-       shca->ib_device.create_qp           = ehca_create_qp;
-       shca->ib_device.modify_qp           = ehca_modify_qp;
-       shca->ib_device.query_qp            = ehca_query_qp;
-       shca->ib_device.destroy_qp          = ehca_destroy_qp;
-       shca->ib_device.post_send           = ehca_post_send;
-       shca->ib_device.post_recv           = ehca_post_recv;
-       shca->ib_device.create_cq           = ehca_create_cq;
-       shca->ib_device.destroy_cq          = ehca_destroy_cq;
-       shca->ib_device.resize_cq           = ehca_resize_cq;
-       shca->ib_device.poll_cq             = ehca_poll_cq;
-       /* shca->ib_device.peek_cq          = ehca_peek_cq;         */
-       shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
-       /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
-       shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
-       shca->ib_device.reg_phys_mr         = ehca_reg_phys_mr;
-       shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
-       shca->ib_device.query_mr            = ehca_query_mr;
-       shca->ib_device.dereg_mr            = ehca_dereg_mr;
-       shca->ib_device.rereg_phys_mr       = ehca_rereg_phys_mr;
-       shca->ib_device.alloc_mw            = ehca_alloc_mw;
-       shca->ib_device.bind_mw             = ehca_bind_mw;
-       shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
-       shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
-       shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
-       shca->ib_device.unmap_fmr           = ehca_unmap_fmr;
-       shca->ib_device.dealloc_fmr         = ehca_dealloc_fmr;
-       shca->ib_device.attach_mcast        = ehca_attach_mcast;
-       shca->ib_device.detach_mcast        = ehca_detach_mcast;
-       shca->ib_device.process_mad         = ehca_process_mad;
-       shca->ib_device.mmap                = ehca_mmap;
-       shca->ib_device.dma_ops             = &ehca_dma_mapping_ops;
-       shca->ib_device.get_port_immutable  = ehca_port_immutable;
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               shca->ib_device.uverbs_cmd_mask |=
-                       (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
-
-               shca->ib_device.create_srq          = ehca_create_srq;
-               shca->ib_device.modify_srq          = ehca_modify_srq;
-               shca->ib_device.query_srq           = ehca_query_srq;
-               shca->ib_device.destroy_srq         = ehca_destroy_srq;
-               shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
-       }
-
-       return ret;
-}
-
-static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
-{
-       struct ehca_sport *sport = &shca->sport[port - 1];
-       struct ib_cq *ibcq;
-       struct ib_qp *ibqp;
-       struct ib_qp_init_attr qp_init_attr;
-       struct ib_cq_init_attr cq_attr = {};
-       int ret;
-
-       if (sport->ibcq_aqp1) {
-               ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
-               return -EPERM;
-       }
-
-       cq_attr.cqe = 10;
-       ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
-                           &cq_attr);
-       if (IS_ERR(ibcq)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
-               return PTR_ERR(ibcq);
-       }
-       sport->ibcq_aqp1 = ibcq;
-
-       if (sport->ibqp_sqp[IB_QPT_GSI]) {
-               ehca_err(&shca->ib_device, "AQP1 QP is already created.");
-               ret = -EPERM;
-               goto create_aqp1;
-       }
-
-       memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
-       qp_init_attr.send_cq          = ibcq;
-       qp_init_attr.recv_cq          = ibcq;
-       qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
-       qp_init_attr.cap.max_send_wr  = 100;
-       qp_init_attr.cap.max_recv_wr  = 100;
-       qp_init_attr.cap.max_send_sge = 2;
-       qp_init_attr.cap.max_recv_sge = 1;
-       qp_init_attr.qp_type          = IB_QPT_GSI;
-       qp_init_attr.port_num         = port;
-       qp_init_attr.qp_context       = NULL;
-       qp_init_attr.event_handler    = NULL;
-       qp_init_attr.srq              = NULL;
-
-       ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
-       if (IS_ERR(ibqp)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
-               ret = PTR_ERR(ibqp);
-               goto create_aqp1;
-       }
-       sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
-
-       return 0;
-
-create_aqp1:
-       ib_destroy_cq(sport->ibcq_aqp1);
-       return ret;
-}
-
-static int ehca_destroy_aqp1(struct ehca_sport *sport)
-{
-       int ret;
-
-       ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
-       if (ret) {
-               ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
-               return ret;
-       }
-
-       ret = ib_destroy_cq(sport->ibcq_aqp1);
-       if (ret)
-               ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
-
-       return ret;
-}
-
-static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
-{
-       return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
-}
-
-static ssize_t ehca_store_debug_level(struct device_driver *ddp,
-                                     const char *buf, size_t count)
-{
-       int value = (*buf) - '0';
-       if (value >= 0 && value <= 9)
-               ehca_debug_level = value;
-       return 1;
-}
-
-static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
-                  ehca_show_debug_level, ehca_store_debug_level);
-
-static struct attribute *ehca_drv_attrs[] = {
-       &driver_attr_debug_level.attr,
-       NULL
-};
-
-static struct attribute_group ehca_drv_attr_grp = {
-       .attrs = ehca_drv_attrs
-};
-
-static const struct attribute_group *ehca_drv_attr_groups[] = {
-       &ehca_drv_attr_grp,
-       NULL,
-};
-
-#define EHCA_RESOURCE_ATTR(name)                                           \
-static ssize_t  ehca_show_##name(struct device *dev,                       \
-                                struct device_attribute *attr,            \
-                                char *buf)                                \
-{                                                                         \
-       struct ehca_shca *shca;                                            \
-       struct hipz_query_hca *rblock;                                     \
-       int data;                                                          \
-                                                                          \
-       shca = dev_get_drvdata(dev);                                       \
-                                                                          \
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);                      \
-       if (!rblock) {                                                     \
-               dev_err(dev, "Can't allocate rblock memory.\n");           \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
-               dev_err(dev, "Can't query device properties\n");           \
-               ehca_free_fw_ctrlblock(rblock);                            \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       data = rblock->name;                                               \
-       ehca_free_fw_ctrlblock(rblock);                                    \
-                                                                          \
-       if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))     \
-               return snprintf(buf, 256, "1\n");                          \
-       else                                                               \
-               return snprintf(buf, 256, "%d\n", data);                   \
-                                                                          \
-}                                                                         \
-static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
-
-EHCA_RESOURCE_ATTR(num_ports);
-EHCA_RESOURCE_ATTR(hw_ver);
-EHCA_RESOURCE_ATTR(max_eq);
-EHCA_RESOURCE_ATTR(cur_eq);
-EHCA_RESOURCE_ATTR(max_cq);
-EHCA_RESOURCE_ATTR(cur_cq);
-EHCA_RESOURCE_ATTR(max_qp);
-EHCA_RESOURCE_ATTR(cur_qp);
-EHCA_RESOURCE_ATTR(max_mr);
-EHCA_RESOURCE_ATTR(cur_mr);
-EHCA_RESOURCE_ATTR(max_mw);
-EHCA_RESOURCE_ATTR(cur_mw);
-EHCA_RESOURCE_ATTR(max_pd);
-EHCA_RESOURCE_ATTR(max_ah);
-
-static ssize_t ehca_show_adapter_handle(struct device *dev,
-                                       struct device_attribute *attr,
-                                       char *buf)
-{
-       struct ehca_shca *shca = dev_get_drvdata(dev);
-
-       return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
-
-}
-static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
-
-static struct attribute *ehca_dev_attrs[] = {
-       &dev_attr_adapter_handle.attr,
-       &dev_attr_num_ports.attr,
-       &dev_attr_hw_ver.attr,
-       &dev_attr_max_eq.attr,
-       &dev_attr_cur_eq.attr,
-       &dev_attr_max_cq.attr,
-       &dev_attr_cur_cq.attr,
-       &dev_attr_max_qp.attr,
-       &dev_attr_cur_qp.attr,
-       &dev_attr_max_mr.attr,
-       &dev_attr_cur_mr.attr,
-       &dev_attr_max_mw.attr,
-       &dev_attr_cur_mw.attr,
-       &dev_attr_max_pd.attr,
-       &dev_attr_max_ah.attr,
-       NULL
-};
-
-static struct attribute_group ehca_dev_attr_grp = {
-       .attrs = ehca_dev_attrs
-};
-
-static int ehca_probe(struct platform_device *dev)
-{
-       struct ehca_shca *shca;
-       const u64 *handle;
-       struct ib_pd *ibpd;
-       int ret, i, eq_size;
-       unsigned long flags;
-
-       handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
-       if (!handle) {
-               ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       if (!(*handle)) {
-               ehca_gen_err("Wrong eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
-       if (!shca) {
-               ehca_gen_err("Cannot allocate shca memory.");
-               return -ENOMEM;
-       }
-
-       mutex_init(&shca->modify_mutex);
-       atomic_set(&shca->num_cqs, 0);
-       atomic_set(&shca->num_qps, 0);
-       shca->max_num_qps = ehca_max_qp;
-       shca->max_num_cqs = ehca_max_cq;
-
-       for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
-               spin_lock_init(&shca->sport[i].mod_sqp_lock);
-
-       shca->ofdev = dev;
-       shca->ipz_hca_handle.handle = *handle;
-       dev_set_drvdata(&dev->dev, shca);
-
-       ret = ehca_sense_attributes(shca);
-       if (ret < 0) {
-               ehca_gen_err("Cannot sense eHCA attributes.");
-               goto probe1;
-       }
-
-       ret = ehca_init_device(shca);
-       if (ret) {
-               ehca_gen_err("Cannot init ehca  device struct");
-               goto probe1;
-       }
-
-       eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
-       /* create event queues */
-       ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create EQ.");
-               goto probe1;
-       }
-
-       ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create NEQ.");
-               goto probe3;
-       }
-
-       /* create internal protection domain */
-       ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
-       if (IS_ERR(ibpd)) {
-               ehca_err(&shca->ib_device, "Cannot create internal PD.");
-               ret = PTR_ERR(ibpd);
-               goto probe4;
-       }
-
-       shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
-       shca->pd->ib_pd.device = &shca->ib_device;
-
-       /* create internal max MR */
-       ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
-
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
-                        ret);
-               goto probe5;
-       }
-
-       ret = ib_register_device(&shca->ib_device, NULL);
-       if (ret) {
-               ehca_err(&shca->ib_device,
-                        "ib_register_device() failed ret=%i", ret);
-               goto probe6;
-       }
-
-       /* create AQP1 for port 1 */
-       if (ehca_open_aqp1 == 1) {
-               shca->sport[0].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 1);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 1.");
-                       goto probe7;
-               }
-       }
-
-       /* create AQP1 for port 2 */
-       if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
-               shca->sport[1].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 2);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 2.");
-                       goto probe8;
-               }
-       }
-
-       ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-       if (ret) /* only complain; we can live without attributes */
-               ehca_err(&shca->ib_device,
-                        "Cannot create device attributes  ret=%d", ret);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_add(&shca->shca_list, &shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return 0;
-
-probe8:
-       ret = ehca_destroy_aqp1(&shca->sport[0]);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy AQP1 for port 1. ret=%i", ret);
-
-probe7:
-       ib_unregister_device(&shca->ib_device);
-
-probe6:
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%x", ret);
-
-probe5:
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%x", ret);
-
-probe4:
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy NEQ. ret=%x", ret);
-
-probe3:
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy EQ. ret=%x", ret);
-
-probe1:
-       ib_dealloc_device(&shca->ib_device);
-
-       return -EINVAL;
-}
-
-static int ehca_remove(struct platform_device *dev)
-{
-       struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
-       unsigned long flags;
-       int ret;
-
-       sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-
-       if (ehca_open_aqp1 == 1) {
-               int i;
-               for (i = 0; i < shca->num_ports; i++) {
-                       ret = ehca_destroy_aqp1(&shca->sport[i]);
-                       if (ret)
-                               ehca_err(&shca->ib_device,
-                                        "Cannot destroy AQP1 for port %x "
-                                        "ret=%i", ret, i);
-               }
-       }
-
-       ib_unregister_device(&shca->ib_device);
-
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%i", ret);
-
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
-
-       ib_dealloc_device(&shca->ib_device);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_del(&shca->shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return ret;
-}
-
-static struct of_device_id ehca_device_table[] =
-{
-       {
-               .name       = "lhca",
-               .compatible = "IBM,lhca",
-       },
-       {},
-};
-MODULE_DEVICE_TABLE(of, ehca_device_table);
-
-static struct platform_driver ehca_driver = {
-       .probe       = ehca_probe,
-       .remove      = ehca_remove,
-       .driver = {
-               .name = "ehca",
-               .owner = THIS_MODULE,
-               .groups = ehca_drv_attr_groups,
-               .of_match_table = ehca_device_table,
-       },
-};
-
-void ehca_poll_eqs(unsigned long data)
-{
-       struct ehca_shca *shca;
-
-       spin_lock(&shca_list_lock);
-       list_for_each_entry(shca, &shca_list, shca_list) {
-               if (shca->eq.is_initialized) {
-                       /* call deadman proc only if eq ptr does not change */
-                       struct ehca_eq *eq = &shca->eq;
-                       int max = 3;
-                       volatile u64 q_ofs, q_ofs2;
-                       unsigned long flags;
-                       spin_lock_irqsave(&eq->spinlock, flags);
-                       q_ofs = eq->ipz_queue.current_q_offset;
-                       spin_unlock_irqrestore(&eq->spinlock, flags);
-                       do {
-                               spin_lock_irqsave(&eq->spinlock, flags);
-                               q_ofs2 = eq->ipz_queue.current_q_offset;
-                               spin_unlock_irqrestore(&eq->spinlock, flags);
-                               max--;
-                       } while (q_ofs == q_ofs2 && max > 0);
-                       if (q_ofs == q_ofs2)
-                               ehca_process_eq(shca, 0);
-               }
-       }
-       mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
-       spin_unlock(&shca_list_lock);
-}
-
-static int ehca_mem_notifier(struct notifier_block *nb,
-                            unsigned long action, void *data)
-{
-       static unsigned long ehca_dmem_warn_time;
-       unsigned long flags;
-
-       switch (action) {
-       case MEM_CANCEL_OFFLINE:
-       case MEM_CANCEL_ONLINE:
-       case MEM_ONLINE:
-       case MEM_OFFLINE:
-               return NOTIFY_OK;
-       case MEM_GOING_ONLINE:
-       case MEM_GOING_OFFLINE:
-               /* only ok if no hca is attached to the lpar */
-               spin_lock_irqsave(&shca_list_lock, flags);
-               if (list_empty(&shca_list)) {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       return NOTIFY_OK;
-               } else {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       if (printk_timed_ratelimit(&ehca_dmem_warn_time,
-                                                  30 * 1000))
-                               ehca_gen_err("DMEM operations are not allowed"
-                                            "in conjunction with eHCA");
-                       return NOTIFY_BAD;
-               }
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block ehca_mem_nb = {
-       .notifier_call = ehca_mem_notifier,
-};
-
-static int __init ehca_module_init(void)
-{
-       int ret;
-
-       printk(KERN_INFO "eHCA Infiniband Device Driver "
-              "(Version " HCAD_VERSION ")\n");
-
-       ret = ehca_create_comp_pool();
-       if (ret) {
-               ehca_gen_err("Cannot create comp pool.");
-               return ret;
-       }
-
-       ret = ehca_create_slab_caches();
-       if (ret) {
-               ehca_gen_err("Cannot create SLAB caches");
-               ret = -ENOMEM;
-               goto module_init1;
-       }
-
-       ret = ehca_create_busmap();
-       if (ret) {
-               ehca_gen_err("Cannot create busmap.");
-               goto module_init2;
-       }
-
-       ret = ibmebus_register_driver(&ehca_driver);
-       if (ret) {
-               ehca_gen_err("Cannot register eHCA device driver");
-               ret = -EINVAL;
-               goto module_init3;
-       }
-
-       ret = register_memory_notifier(&ehca_mem_nb);
-       if (ret) {
-               ehca_gen_err("Failed registering memory add/remove notifier");
-               goto module_init4;
-       }
-
-       if (ehca_poll_all_eqs != 1) {
-               ehca_gen_err("WARNING!!!");
-               ehca_gen_err("It is possible to lose interrupts.");
-       } else {
-               init_timer(&poll_eqs_timer);
-               poll_eqs_timer.function = ehca_poll_eqs;
-               poll_eqs_timer.expires = jiffies + HZ;
-               add_timer(&poll_eqs_timer);
-       }
-
-       return 0;
-
-module_init4:
-       ibmebus_unregister_driver(&ehca_driver);
-
-module_init3:
-       ehca_destroy_busmap();
-
-module_init2:
-       ehca_destroy_slab_caches();
-
-module_init1:
-       ehca_destroy_comp_pool();
-       return ret;
-};
-
-static void __exit ehca_module_exit(void)
-{
-       if (ehca_poll_all_eqs == 1)
-               del_timer_sync(&poll_eqs_timer);
-
-       ibmebus_unregister_driver(&ehca_driver);
-
-       unregister_memory_notifier(&ehca_mem_nb);
-
-       ehca_destroy_busmap();
-
-       ehca_destroy_slab_caches();
-
-       ehca_destroy_comp_pool();
-
-       idr_destroy(&ehca_cq_idr);
-       idr_destroy(&ehca_qp_idr);
-};
-
-module_init(ehca_module_init);
-module_exit(ehca_module_exit);
diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c
deleted file mode 100644 (file)
index cec1815..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  mcast  functions
- *
- *  Authors: Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define MAX_MC_LID 0xFFFE
-#define MIN_MC_LID 0xC000      /* Multicast limits */
-#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
-#define EHCA_VALID_MULTICAST_LID(lid) \
-       (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
-
-int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid mulitcast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
-
-int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->pd->device,
-                                             struct ehca_shca, ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid mulitcast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
deleted file mode 100644 (file)
index f914b30..0000000
+++ /dev/null
@@ -1,2593 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <rdma/ib_umem.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "hcp_if.h"
-#include "hipz_hw.h"
-
-#define NUM_CHUNKS(length, chunk_size) \
-       (((length) + (chunk_size - 1)) / (chunk_size))
-
-/* max number of rpages (per hcall register_rpages) */
-#define MAX_RPAGES 512
-
-/* DMEM toleration management */
-#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
-#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
-#define EHCA_HUGEPAGESHIFT     34
-#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
-#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
-#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
-#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
-#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
-#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
-#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
-#define EHCA_DIR_MAP_SIZE (0x10000)
-#define EHCA_ENT_MAP_SIZE (0x10000)
-#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
-
-static unsigned long ehca_mr_len;
-
-/*
- * Memory map data structures
- */
-struct ehca_dir_bmap {
-       u64 ent[EHCA_MAP_ENTRIES];
-};
-struct ehca_top_bmap {
-       struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
-};
-struct ehca_bmap {
-       struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
-};
-
-static struct ehca_bmap *ehca_bmap;
-
-static struct kmem_cache *mr_cache;
-static struct kmem_cache *mw_cache;
-
-enum ehca_mr_pgsize {
-       EHCA_MR_PGSIZE4K  = 0x1000L,
-       EHCA_MR_PGSIZE64K = 0x10000L,
-       EHCA_MR_PGSIZE1M  = 0x100000L,
-       EHCA_MR_PGSIZE16M = 0x1000000L
-};
-
-#define EHCA_MR_PGSHIFT4K  12
-#define EHCA_MR_PGSHIFT64K 16
-#define EHCA_MR_PGSHIFT1M  20
-#define EHCA_MR_PGSHIFT16M 24
-
-static u64 ehca_map_vaddr(void *caddr);
-
-static u32 ehca_encode_hwpage_size(u32 pgsize)
-{
-       int log = ilog2(pgsize);
-       WARN_ON(log < 12 || log > 24 || log & 3);
-       return (log - 12) / 4;
-}
-
-static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
-{
-       return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
-}
-
-static struct ehca_mr *ehca_mr_new(void)
-{
-       struct ehca_mr *me;
-
-       me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mrlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mr_delete(struct ehca_mr *me)
-{
-       kmem_cache_free(mr_cache, me);
-}
-
-static struct ehca_mw *ehca_mw_new(void)
-{
-       struct ehca_mw *me;
-
-       me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mwlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mw_delete(struct ehca_mw *me)
-{
-       kmem_cache_free(mw_cache, me);
-}
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-
-       if (shca->maxmr) {
-               e_maxmr = ehca_mr_new();
-               if (!e_maxmr) {
-                       ehca_err(&shca->ib_device, "out of memory");
-                       ib_mr = ERR_PTR(-ENOMEM);
-                       goto get_dma_mr_exit0;
-               }
-
-               ret = ehca_reg_maxmr(shca, e_maxmr,
-                                    (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
-                                    mr_access_flags, e_pd,
-                                    &e_maxmr->ib.ib_mr.lkey,
-                                    &e_maxmr->ib.ib_mr.rkey);
-               if (ret) {
-                       ehca_mr_delete(e_maxmr);
-                       ib_mr = ERR_PTR(ret);
-                       goto get_dma_mr_exit0;
-               }
-               ib_mr = &e_maxmr->ib.ib_mr;
-       } else {
-               ehca_err(&shca->ib_device, "no internal max-MR exist!");
-               ib_mr = ERR_PTR(-EINVAL);
-               goto get_dma_mr_exit0;
-       }
-
-get_dma_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
-                        PTR_ERR(ib_mr), pd, mr_access_flags);
-       return ib_mr;
-} /* end ehca_get_dma_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags,
-                              u64 *iova_start)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       u64 size;
-
-       if ((num_phys_buf <= 0) || !phys_buf_array) {
-               ehca_err(pd->device, "bad input values: num_phys_buf=%x "
-                        "phys_buf_array=%p", num_phys_buf, phys_buf_array);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* check physical buffer list and calculate size */
-       ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
-                                           iova_start, &size);
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_phys_mr_exit0;
-       }
-       if ((size == 0) ||
-           (((u64)iova_start + size) < (u64)iova_start)) {
-               ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
-                        size, iova_start);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* register MR on HCA */
-       if (ehca_mr_is_maxmr(size, iova_start)) {
-               e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-               ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
-                                    e_pd, &e_mr->ib.ib_mr.lkey,
-                                    &e_mr->ib.ib_mr.rkey);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       } else {
-               struct ehca_mr_pginfo pginfo;
-               u32 num_kpages;
-               u32 num_hwpages;
-               u64 hw_pgsize;
-
-               num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
-                                       PAGE_SIZE);
-               /* for kernel space we try most possible pgsize */
-               hw_pgsize = ehca_get_max_hwpage_size(shca);
-               num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
-                                        hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
-                                 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                                 &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_phys_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_phys_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
-                        "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
-                        PTR_ERR(ib_mr), pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ib_mr;
-} /* end ehca_reg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata)
-{
-       struct ib_mr *ib_mr;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       int ret, page_shift;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hwpage_size;
-
-       if (!pd) {
-               ehca_gen_err("bad pd=%p", pd);
-               return ERR_PTR(-EFAULT);
-       }
-
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       if (length == 0 || virt + length < virt) {
-               ehca_err(pd->device, "bad input values: length=%llx "
-                        "virt_base=%llx", length, virt);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
-                                mr_access_flags, 0);
-       if (IS_ERR(e_mr->umem)) {
-               ib_mr = (void *)e_mr->umem;
-               goto reg_user_mr_exit1;
-       }
-
-       if (e_mr->umem->page_size != PAGE_SIZE) {
-               ehca_err(pd->device, "page size not supported, "
-                        "e_mr->umem->page_size=%x", e_mr->umem->page_size);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit2;
-       }
-
-       /* determine number of MR pages */
-       num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
-       /* select proper hw_pgsize */
-       page_shift = PAGE_SHIFT;
-       if (e_mr->umem->hugetlb) {
-               /* determine page_shift, clamp between 4K and 16M */
-               page_shift = (fls64(length - 1) + 3) & ~3;
-               page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
-                                EHCA_MR_PGSHIFT16M);
-       }
-       hwpage_size = 1UL << page_shift;
-
-       /* now that we have the desired page size, shift until it's
-        * supported, too. 4K is always supported, so this terminates.
-        */
-       while (!(hwpage_size & shca->hca_cap_mr_pgsize))
-               hwpage_size >>= 4;
-
-reg_user_mr_fallback:
-       num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_USER;
-       pginfo.hwpage_size = hwpage_size;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.u.usr.region = e_mr->umem;
-       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
-       pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
-       ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
-                         e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-       if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
-               ehca_warn(pd->device, "failed to register mr "
-                         "with hwpage_size=%llx", hwpage_size);
-               ehca_info(pd->device, "try to register mr with "
-                         "kpage_size=%lx", PAGE_SIZE);
-               /*
-                * this means kpages are not contiguous for a hw page
-                * try kernel page size as fallback solution
-                */
-               hwpage_size = PAGE_SIZE;
-               goto reg_user_mr_fallback;
-       }
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_user_mr_exit2;
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_user_mr_exit2:
-       ib_umem_release(e_mr->umem);
-reg_user_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_user_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
-                        PTR_ERR(ib_mr), pd, mr_access_flags, udata);
-       return ib_mr;
-} /* end ehca_reg_user_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf,
-                      int mr_access_flags,
-                      u64 *iova_start)
-{
-       int ret;
-
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       u64 new_size;
-       u64 *new_start;
-       u32 new_acl;
-       struct ehca_pd *new_pd;
-       u32 tmp_lkey, tmp_rkey;
-       unsigned long sl_flags;
-       u32 num_kpages = 0;
-       u32 num_hwpages = 0;
-       struct ehca_mr_pginfo pginfo;
-
-       if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
-               /* TODO not supported, because PHYP rereg hCall needs pages */
-               ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
-                        "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       if (mr_rereg_mask & IB_MR_REREG_PD) {
-               if (!pd) {
-                       ehca_err(mr->device, "rereg with bad pd, pd=%p "
-                                "mr_rereg_mask=%x", pd, mr_rereg_mask);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-
-       if ((mr_rereg_mask &
-            ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
-           (mr_rereg_mask == 0)) {
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* check other parameters */
-       if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
-               if (e_mr->flags & EHCA_MR_FLAG_FMR) {
-                       ehca_err(mr->device, "not supported for FMR, mr=%p "
-                                "flags=%x", mr, e_mr->flags);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-               if (!phys_buf_array || num_phys_buf <= 0) {
-                       ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
-                                " phys_buf_array=%p num_phys_buf=%x",
-                                mr_rereg_mask, phys_buf_array, num_phys_buf);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-       if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&     /* change ACL */
-           (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-            ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
-                        "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* set requested values dependent on rereg request */
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-       new_start = e_mr->start;
-       new_size = e_mr->size;
-       new_acl = e_mr->acl;
-       new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
-
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
-
-               new_start = iova_start; /* change address */
-               /* check physical buffer list and calculate size */
-               ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
-                                                   num_phys_buf, iova_start,
-                                                   &new_size);
-               if (ret)
-                       goto rereg_phys_mr_exit1;
-               if ((new_size == 0) ||
-                   (((u64)iova_start + new_size) < (u64)iova_start)) {
-                       ehca_err(mr->device, "bad input values: new_size=%llx "
-                                "iova_start=%p", new_size, iova_start);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit1;
-               }
-               num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
-                                       new_size, PAGE_SIZE);
-               num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
-                                        new_size, hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               new_acl = mr_access_flags;
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               new_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
-                           new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto rereg_phys_mr_exit1;
-
-       /* successful reregistration */
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               mr->pd = pd;
-       mr->lkey = tmp_lkey;
-       mr->rkey = tmp_rkey;
-
-rereg_phys_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-rereg_phys_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
-                        "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
-                        "iova_start=%p",
-                        ret, mr, mr_rereg_mask, pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ret;
-} /* end ehca_rereg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       unsigned long sl_flags;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto query_mr_exit0;
-       }
-
-       memset(mr_attr, 0, sizeof(struct ib_mr_attr));
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-
-       h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
-                        "hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto query_mr_exit1;
-       }
-       mr_attr->pd = mr->pd;
-       mr_attr->device_virt_addr = hipzout.vaddr;
-       mr_attr->size = hipzout.len;
-       mr_attr->lkey = hipzout.lkey;
-       mr_attr->rkey = hipzout.rkey;
-       ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
-
-query_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-query_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
-                        ret, mr, mr_attr);
-       return ret;
-} /* end ehca_query_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_mr(struct ib_mr *mr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       } else if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       }
-
-       /* TODO: BUSY: MR still has bound window(s) */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
-                        "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
-                        h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto dereg_mr_exit0;
-       }
-
-       if (e_mr->umem)
-               ib_umem_release(e_mr->umem);
-
-       /* successful deregistration */
-       ehca_mr_delete(e_mr);
-
-dereg_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
-       return ret;
-} /* end ehca_dereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
-       struct ib_mw *ib_mw;
-       u64 h_ret;
-       struct ehca_mw *e_mw;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_mw_hipzout_parms hipzout;
-
-       if (type != IB_MW_TYPE_1)
-               return ERR_PTR(-EINVAL);
-
-       e_mw = ehca_mw_new();
-       if (!e_mw) {
-               ib_mw = ERR_PTR(-ENOMEM);
-               goto alloc_mw_exit0;
-       }
-
-       h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
-                        "shca=%p hca_hndl=%llx mw=%p",
-                        h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
-               ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto alloc_mw_exit1;
-       }
-       /* successful MW allocation */
-       e_mw->ipz_mw_handle = hipzout.handle;
-       e_mw->ib_mw.rkey    = hipzout.rkey;
-       return &e_mw->ib_mw;
-
-alloc_mw_exit1:
-       ehca_mw_delete(e_mw);
-alloc_mw_exit0:
-       if (IS_ERR(ib_mw))
-               ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
-       return ib_mw;
-} /* end ehca_alloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_bind_mw(struct ib_qp *qp,
-                struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind)
-{
-       /* TODO: not supported up to now */
-       ehca_gen_err("bind MW currently not supported by HCAD");
-
-       return -EPERM;
-} /* end ehca_bind_mw() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_mw(struct ib_mw *mw)
-{
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mw->device, struct ehca_shca, ib_device);
-       struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
-
-       h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
-                        "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
-                        h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
-                        e_mw->ipz_mw_handle.handle);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful deallocation */
-       ehca_mw_delete(e_mw);
-       return 0;
-} /* end ehca_dealloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr)
-{
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr *e_fmr;
-       int ret;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       u64 hw_pgsize;
-
-       /* check other parameters */
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if (mr_access_flags & IB_ACCESS_MW_BIND) {
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
-               ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
-                        "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
-                        fmr_attr->max_pages, fmr_attr->max_maps,
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       hw_pgsize = 1 << fmr_attr->page_shift;
-       if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
-               ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       e_fmr = ehca_mr_new();
-       if (!e_fmr) {
-               ib_fmr = ERR_PTR(-ENOMEM);
-               goto alloc_fmr_exit0;
-       }
-       e_fmr->flags |= EHCA_MR_FLAG_FMR;
-
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.hwpage_size = hw_pgsize;
-       /*
-        * pginfo.num_hwpages==0, ie register_rpages() will not be called
-        * but deferred to map_phys_fmr()
-        */
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         fmr_attr->max_pages * (1 << fmr_attr->page_shift),
-                         mr_access_flags, e_pd, &pginfo,
-                         &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               ib_fmr = ERR_PTR(ret);
-               goto alloc_fmr_exit1;
-       }
-
-       /* successful */
-       e_fmr->hwpage_size = hw_pgsize;
-       e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
-       e_fmr->fmr_max_pages = fmr_attr->max_pages;
-       e_fmr->fmr_max_maps = fmr_attr->max_maps;
-       e_fmr->fmr_map_cnt = 0;
-       return &e_fmr->ib.ib_fmr;
-
-alloc_fmr_exit1:
-       ehca_mr_delete(e_fmr);
-alloc_fmr_exit0:
-       return ib_fmr;
-} /* end ehca_alloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list,
-                     int list_len,
-                     u64 iova)
-{
-       int ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-       struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       u32 tmp_lkey, tmp_rkey;
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
-       if (ret)
-               goto map_phys_fmr_exit0;
-       if (iova % e_fmr->fmr_page_size) {
-               /* only whole-numbered pages */
-               ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
-                        iova, e_fmr->fmr_page_size);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
-               /* HCAD does not limit the maps, however trace this anyway */
-               ehca_info(fmr->device, "map limit exceeded, fmr=%p "
-                         "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
-                         fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
-       }
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       pginfo.num_kpages = list_len;
-       pginfo.hwpage_size = e_fmr->hwpage_size;
-       pginfo.num_hwpages =
-               list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
-       pginfo.u.fmr.page_list = page_list;
-       pginfo.next_hwpage =
-               (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
-       pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
-
-       ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
-                           list_len * e_fmr->fmr_page_size,
-                           e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto map_phys_fmr_exit0;
-
-       /* successful reregistration */
-       e_fmr->fmr_map_cnt++;
-       e_fmr->ib.ib_fmr.lkey = tmp_lkey;
-       e_fmr->ib.ib_fmr.rkey = tmp_rkey;
-       return 0;
-
-map_phys_fmr_exit0:
-       if (ret)
-               ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
-                        "iova=%llx", ret, fmr, page_list, list_len, iova);
-       return ret;
-} /* end ehca_map_phys_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_fmr(struct list_head *fmr_list)
-{
-       int ret = 0;
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca = NULL;
-       struct ehca_shca *prev_shca;
-       struct ehca_mr *e_fmr;
-       u32 num_fmr = 0;
-       u32 unmap_fmr_cnt = 0;
-
-       /* check all FMR belong to same SHCA, and check internal flag */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               prev_shca = shca;
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               if ((shca != prev_shca) && prev_shca) {
-                       ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
-                                "prev_shca=%p e_fmr=%p",
-                                shca, prev_shca, e_fmr);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-                       ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
-                                "e_fmr->flags=%x", e_fmr, e_fmr->flags);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               num_fmr++;
-       }
-
-       /* loop over all FMRs to unmap */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               unmap_fmr_cnt++;
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               ret = ehca_unmap_one_fmr(shca, e_fmr);
-               if (ret) {
-                       /* unmap failed, stop unmapping of rest of FMRs */
-                       ehca_err(&shca->ib_device, "unmap of one FMR failed, "
-                                "stop rest, e_fmr=%p num_fmr=%x "
-                                "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
-                                unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
-                       goto unmap_fmr_exit0;
-               }
-       }
-
-unmap_fmr_exit0:
-       if (ret)
-               ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
-                            ret, fmr_list, num_fmr, unmap_fmr_cnt);
-       return ret;
-} /* end ehca_unmap_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr)
-{
-       int ret;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto free_fmr_exit0;
-       }
-
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
-                        "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle, fmr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto free_fmr_exit0;
-       }
-       /* successful deregistration */
-       ehca_mr_delete(e_fmr);
-       return 0;
-
-free_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
-       return ret;
-} /* end ehca_dealloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo);
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey, /*OUT*/
-               u32 *rkey, /*OUT*/
-               enum ehca_reg_type reg_type)
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-       if (ehca_use_hp_mr == 1)
-               hipz_acl |= 0x00000001;
-
-       h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
-                                        (u64)iova_start, size, hipz_acl,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
-                        "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_mr_exit0;
-       }
-
-       e_mr->ipz_mr_handle = hipzout.handle;
-
-       if (reg_type == EHCA_REG_BUSMAP_MR)
-               ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
-       else if (reg_type == EHCA_REG_MR)
-               ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
-       else
-               ret = -EINVAL;
-
-       if (ret)
-               goto ehca_reg_mr_exit1;
-
-       /* successful registration */
-       e_mr->num_kpages = pginfo->num_kpages;
-       e_mr->num_hwpages = pginfo->num_hwpages;
-       e_mr->hwpage_size = pginfo->hwpage_size;
-       e_mr->start = iova_start;
-       e_mr->size = size;
-       e_mr->acl = acl;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_mr_exit1:
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
-                        h_ret, shca, e_mr, iova_start, size, acl, e_pd,
-                        hipzout.lkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages, ret);
-               ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
-                        "not recoverable");
-       }
-ehca_reg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx",
-                        ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
-                        pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo)
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 rnum;
-       u64 rpage;
-       u32 i;
-       u64 *kpage;
-
-       if (!pginfo->num_hwpages) /* in case of fmr */
-               return 0;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_reg_mr_rpages_exit0;
-       }
-
-       /* max MAX_RPAGES ehca mr pages per register call */
-       for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
-                       if (rnum == 0)
-                               rnum = MAX_RPAGES;      /* last shot is full */
-               } else
-                       rnum = MAX_RPAGES;
-
-               ret = ehca_set_pagebuf(pginfo, rnum, kpage);
-               if (ret) {
-                       ehca_err(&shca->ib_device, "ehca_set_pagebuf "
-                                "bad rc, ret=%i rnum=%x kpage=%p",
-                                ret, rnum, kpage);
-                       goto ehca_reg_mr_rpages_exit1;
-               }
-
-               if (rnum > 1) {
-                       rpage = __pa(kpage);
-                       if (!rpage) {
-                               ehca_err(&shca->ib_device, "kpage=%p i=%x",
-                                        kpage, i);
-                               ret = -EFAULT;
-                               goto ehca_reg_mr_rpages_exit1;
-                       }
-               } else
-                       rpage = *kpage;
-
-               h_ret = hipz_h_register_rpage_mr(
-                       shca->ipz_hca_handle, e_mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       /*
-                        * check for 'registration complete'==H_SUCCESS
-                        * and for 'page registered'==H_PAGE_REGISTERED
-                        */
-                       if (h_ret != H_SUCCESS) {
-                               ehca_err(&shca->ib_device, "last "
-                                        "hipz_reg_rpage_mr failed, h_ret=%lli "
-                                        "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
-                                        " lkey=%x", h_ret, e_mr, i,
-                                        shca->ipz_hca_handle.handle,
-                                        e_mr->ipz_mr_handle.handle,
-                                        e_mr->ib.ib_mr.lkey);
-                               ret = ehca2ib_return_code(h_ret);
-                               break;
-                       } else
-                               ret = 0;
-               } else if (h_ret != H_PAGE_REGISTERED) {
-                       ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
-                                "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
-                                "mr_hndl=%llx", h_ret, e_mr, i,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-                       ret = ehca2ib_return_code(h_ret);
-                       break;
-               } else
-                       ret = 0;
-       } /* end for(i) */
-
-
-ehca_reg_mr_rpages_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_reg_mr_rpages_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
-                        pginfo, pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr_rpages() */
-
-/*----------------------------------------------------------------------*/
-
-inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
-                               struct ehca_mr *e_mr,
-                               u64 *iova_start,
-                               u64 size,
-                               u32 acl,
-                               struct ehca_pd *e_pd,
-                               struct ehca_mr_pginfo *pginfo,
-                               u32 *lkey, /*OUT*/
-                               u32 *rkey) /*OUT*/
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       u64 *kpage;
-       u64 rpage;
-       struct ehca_mr_pginfo pginfo_save;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_rereg_mr_rereg1_exit0;
-       }
-
-       pginfo_save = *pginfo;
-       ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
-       if (ret) {
-               ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
-                        "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
-                        "kpage=%p", e_mr, pginfo, pginfo->type,
-                        pginfo->num_kpages, pginfo->num_hwpages, kpage);
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       rpage = __pa(kpage);
-       if (!rpage) {
-               ehca_err(&shca->ib_device, "kpage=%p", kpage);
-               ret = -EFAULT;
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
-                                     (u64)iova_start, size, hipz_acl,
-                                     e_pd->fw_pd, rpage, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               /*
-                * reregistration unsuccessful, try it again with the 3 hCalls,
-                * e.g. this is required in case H_MR_CONDITION
-                * (MW bound or MR is shared)
-                */
-               ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
-                         "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
-               *pginfo = pginfo_save;
-               ret = -EAGAIN;
-       } else if ((u64 *)hipzout.vaddr != iova_start) {
-               ehca_err(&shca->ib_device, "PHYP changed iova_start in "
-                        "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
-                        "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
-                        hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
-                        e_mr->ib.ib_mr.lkey, hipzout.lkey);
-               ret = -EFAULT;
-       } else {
-               /*
-                * successful reregistration
-                * note: start and start_out are identical for eServer HCAs
-                */
-               e_mr->num_kpages = pginfo->num_kpages;
-               e_mr->num_hwpages = pginfo->num_hwpages;
-               e_mr->hwpage_size = pginfo->hwpage_size;
-               e_mr->start = iova_start;
-               e_mr->size = size;
-               e_mr->acl = acl;
-               *lkey = hipzout.lkey;
-               *rkey = hipzout.rkey;
-       }
-
-ehca_rereg_mr_rereg1_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_rereg_mr_rereg1_exit0:
-       if ( ret && (ret != -EAGAIN) )
-               ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx",
-                        ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages);
-       return ret;
-} /* end ehca_rereg_mr_rereg1() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int acl,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
-       int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
-
-       /* first determine reregistration hCall(s) */
-       if ((pginfo->num_hwpages > MAX_RPAGES) ||
-           (e_mr->num_hwpages > MAX_RPAGES) ||
-           (pginfo->num_hwpages > e_mr->num_hwpages)) {
-               ehca_dbg(&shca->ib_device, "Rereg3 case, "
-                        "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
-                        pginfo->num_hwpages, e_mr->num_hwpages);
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-       }
-
-       if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-               e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
-               ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
-                        e_mr);
-       }
-
-       if (rereg_1_hcall) {
-               ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
-                                          acl, e_pd, pginfo, lkey, rkey);
-               if (ret) {
-                       if (ret == -EAGAIN)
-                               rereg_3_hcall = 1;
-                       else
-                               goto ehca_rereg_mr_exit0;
-               }
-       }
-
-       if (rereg_3_hcall) {
-               struct ehca_mr save_mr;
-
-               /* first deregister old MR */
-               h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                                "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
-                                "mr->lkey=%x",
-                                h_ret, e_mr, shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle,
-                                e_mr->ib.ib_mr.lkey);
-                       ret = ehca2ib_return_code(h_ret);
-                       goto ehca_rereg_mr_exit0;
-               }
-               /* clean ehca_mr_t, without changing struct ib_mr and lock */
-               save_mr = *e_mr;
-               ehca_mr_deletenew(e_mr);
-
-               /* set some MR values */
-               e_mr->flags = save_mr.flags;
-               e_mr->hwpage_size = save_mr.hwpage_size;
-               e_mr->fmr_page_size = save_mr.fmr_page_size;
-               e_mr->fmr_max_pages = save_mr.fmr_max_pages;
-               e_mr->fmr_max_maps = save_mr.fmr_max_maps;
-               e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
-                                 e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
-               if (ret) {
-                       u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
-                       memcpy(&e_mr->flags, &(save_mr.flags),
-                              sizeof(struct ehca_mr) - offset);
-                       goto ehca_rereg_mr_exit0;
-               }
-       }
-
-ehca_rereg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
-                        "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
-                        acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
-                        rereg_1_hcall, rereg_3_hcall);
-       return ret;
-} /* end ehca_rereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_pd *e_pd =
-               container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
-       struct ehca_mr save_fmr;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       struct ehca_mr_hipzout_parms hipzout;
-       struct ehca_mr save_mr;
-
-       if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
-               /*
-                * note: after using rereg hcall with len=0,
-                * rereg hcall must be used again for registering pages
-                */
-               h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
-                                             0, 0, e_pd->fw_pd, 0, &hipzout);
-               if (h_ret == H_SUCCESS) {
-                       /* successful reregistration */
-                       e_fmr->start = NULL;
-                       e_fmr->size = 0;
-                       tmp_lkey = hipzout.lkey;
-                       tmp_rkey = hipzout.rkey;
-                       return 0;
-               }
-               /*
-                * should not happen, because length checked above,
-                * FMRs are not shared and no MW bound to FMRs
-                */
-               ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
-                        "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
-                        "mr_hndl=%llx lkey=%x lkey_out=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
-               /* try free and rereg */
-       }
-
-       /* first free old FMR */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                        "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
-                        "lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_unmap_one_fmr_exit0;
-       }
-       /* clean ehca_mr_t, without changing lock */
-       save_fmr = *e_fmr;
-       ehca_mr_deletenew(e_fmr);
-
-       /* set some MR values */
-       e_fmr->flags = save_fmr.flags;
-       e_fmr->hwpage_size = save_fmr.hwpage_size;
-       e_fmr->fmr_page_size = save_fmr.fmr_page_size;
-       e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
-       e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
-       e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
-       e_fmr->acl = save_fmr.acl;
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
-                         e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
-                         &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
-               memcpy(&e_fmr->flags, &(save_mr.flags),
-                      sizeof(struct ehca_mr) - offset);
-       }
-
-ehca_unmap_one_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
-                        "fmr_max_pages=%x",
-                        ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
-       return ret;
-} /* end ehca_unmap_one_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey, /*OUT*/
-                u32 *rkey) /*OUT*/
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
-                        "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
-                        shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_smr_exit0;
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size   = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_smr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
-                        "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
-                        ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
-       return ret;
-} /* end ehca_reg_smr() */
-
-/*----------------------------------------------------------------------*/
-static inline void *ehca_calc_sectbase(int top, int dir, int idx)
-{
-       unsigned long ret = idx;
-       ret |= dir << EHCA_DIR_INDEX_SHIFT;
-       ret |= top << EHCA_TOP_INDEX_SHIFT;
-       return __va(ret << SECTION_SIZE_BITS);
-}
-
-#define ehca_bmap_valid(entry) \
-       ((u64)entry != (u64)EHCA_INVAL_ADDR)
-
-static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
-                              struct ehca_shca *shca, struct ehca_mr *mr,
-                              struct ehca_mr_pginfo *pginfo)
-{
-       u64 h_ret = 0;
-       unsigned long page = 0;
-       u64 rpage = __pa(kpage);
-       int page_count;
-
-       void *sectbase = ehca_calc_sectbase(top, dir, idx);
-       if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
-               ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
-                                          "hwpage_size does not fit to "
-                                          "section start address");
-       }
-       page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
-
-       while (page < page_count) {
-               u64 rnum;
-               for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
-                    rnum++) {
-                       void *pg = sectbase + ((page++) * pginfo->hwpage_size);
-                       kpage[rnum] = __pa(pg);
-               }
-
-               h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
-                       ehca_err(&shca->ib_device, "register_rpage_mr failed");
-                       return h_ret;
-               }
-       }
-       return h_ret;
-}
-
-static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
-                               struct ehca_shca *shca, struct ehca_mr *mr,
-                               struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int idx;
-
-       for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
-                       continue;
-
-               hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
-                                          pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
-
-static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
-                                   struct ehca_mr *mr,
-                                   struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int dir;
-
-       for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                       continue;
-
-               hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
-
-/* register internal max-MR to internal SHCA */
-int ehca_reg_internal_maxmr(
-       struct ehca_shca *shca,
-       struct ehca_pd *e_pd,
-       struct ehca_mr **e_maxmr)  /*OUT*/
-{
-       int ret;
-       struct ehca_mr *e_mr;
-       u64 *iova_start;
-       u64 size_maxmr;
-       struct ehca_mr_pginfo pginfo;
-       struct ib_phys_buf ib_pbuf;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hw_pgsize;
-
-       if (!ehca_bmap) {
-               ret = -EFAULT;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(&shca->ib_device, "out of memory");
-               ret = -ENOMEM;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-       e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-
-       /* register internal max-MR on HCA */
-       size_maxmr = ehca_mr_len;
-       iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
-       ib_pbuf.addr = 0;
-       ib_pbuf.size = size_maxmr;
-       num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
-                               PAGE_SIZE);
-       hw_pgsize = ehca_get_max_hwpage_size(shca);
-       num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
-                                hw_pgsize);
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_PHYS;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.hwpage_size = hw_pgsize;
-       pginfo.u.phy.num_phys_buf = 1;
-       pginfo.u.phy.phys_buf_array = &ib_pbuf;
-
-       ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
-                         &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
-       if (ret) {
-               ehca_err(&shca->ib_device, "reg of internal max MR failed, "
-                        "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
-                        "num_hwpages=%x", e_mr, iova_start, size_maxmr,
-                        num_kpages, num_hwpages);
-               goto ehca_reg_internal_maxmr_exit1;
-       }
-
-       /* successful registration of all pages */
-       e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
-       e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
-       e_mr->ib.ib_mr.uobject = NULL;
-       atomic_inc(&(e_pd->ib_pd.usecnt));
-       atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
-       *e_maxmr = e_mr;
-       return 0;
-
-ehca_reg_internal_maxmr_exit1:
-       ehca_mr_delete(e_mr);
-ehca_reg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
-                        ret, shca, e_pd, e_maxmr);
-       return ret;
-} /* end ehca_reg_internal_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey)
-{
-       u64 h_ret;
-       struct ehca_mr *e_origmr = shca->maxmr;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, e_origmr, shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-} /* end ehca_reg_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
-{
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ib_pd *ib_pd;
-
-       if (!shca->maxmr) {
-               ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
-               ret = -EINVAL;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       e_maxmr = shca->maxmr;
-       ib_pd = e_maxmr->ib.ib_mr.pd;
-       shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
-
-       ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
-       if (ret) {
-               ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
-                        "ret=%i e_maxmr=%p shca=%p lkey=%x",
-                        ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
-               shca->maxmr = e_maxmr;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       atomic_dec(&ib_pd->usecnt);
-
-ehca_dereg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
-                        ret, shca, shca->maxmr);
-       return ret;
-} /* end ehca_dereg_internal_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * check physical buffer array of MR verbs for validness and
- * calculates MR size
- */
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size)
-{
-       struct ib_phys_buf *pbuf = phys_buf_array;
-       u64 size_count = 0;
-       u32 i;
-
-       if (num_phys_buf == 0) {
-               ehca_gen_err("bad phys buf array len, num_phys_buf=0");
-               return -EINVAL;
-       }
-       /* check first buffer */
-       if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
-               ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
-                            "pbuf->addr=%llx pbuf->size=%llx",
-                            iova_start, pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-       if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
-           (num_phys_buf > 1)) {
-               ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
-                            "pbuf->size=%llx", pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-
-       for (i = 0; i < num_phys_buf; i++) {
-               if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
-                       ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
-                                    "pbuf->size=%llx",
-                                    i, pbuf->addr, pbuf->size);
-                       return -EINVAL;
-               }
-               if (((i > 0) && /* not 1st */
-                    (i < (num_phys_buf - 1)) &&        /* not last */
-                    (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
-                       ehca_gen_err("bad size, i=%x pbuf->size=%llx",
-                                    i, pbuf->size);
-                       return -EINVAL;
-               }
-               size_count += pbuf->size;
-               pbuf++;
-       }
-
-       *size = size_count;
-       return 0;
-} /* end ehca_mr_chk_buf_and_calc_size() */
-
-/*----------------------------------------------------------------------*/
-
-/* check page list of map FMR verb for validness */
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len)
-{
-       u32 i;
-       u64 *page;
-
-       if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
-               ehca_gen_err("bad list_len, list_len=%x "
-                            "e_fmr->fmr_max_pages=%x fmr=%p",
-                            list_len, e_fmr->fmr_max_pages, e_fmr);
-               return -EINVAL;
-       }
-
-       /* each page must be aligned */
-       page = page_list;
-       for (i = 0; i < list_len; i++) {
-               if (*page % e_fmr->fmr_page_size) {
-                       ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
-                                    "fmr_page_size=%x", i, *page, page, e_fmr,
-                                    e_fmr->fmr_page_size);
-                       return -EINVAL;
-               }
-               page++;
-       }
-
-       return 0;
-} /* end ehca_fmr_check_page_list() */
-
-/*----------------------------------------------------------------------*/
-
-/* PAGE_SIZE >= pginfo->hwpage_size */
-static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr;
-       u32 j = 0;
-       int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-               pgaddr = page_to_pfn(sg_page(*sg))
-                       << PAGE_SHIFT;
-               *kpage = pgaddr + (pginfo->next_hwpage *
-                                  pginfo->hwpage_size);
-               if (!(*kpage)) {
-                       ehca_gen_err("pgaddr=%llx "
-                                    "sg_dma_address=%llx "
-                                    "entry=%llx next_hwpage=%llx",
-                                    pgaddr, (u64)sg_dma_address(*sg),
-                                    pginfo->u.usr.next_nmap,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               (pginfo->next_hwpage)++;
-               kpage++;
-               if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
-                       (pginfo->kpage_cnt)++;
-                       (pginfo->u.usr.next_nmap)++;
-                       pginfo->next_hwpage = 0;
-                       *sg = sg_next(*sg);
-               }
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
-
-/*
- * check given pages for contiguous layout
- * last page addr is returned in prev_pgaddr for further check
- */
-static int ehca_check_kpages_per_ate(struct scatterlist **sg,
-                                    int num_pages,
-                                    u64 *prev_pgaddr)
-{
-       for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
-               u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
-               if (ehca_debug_level >= 3)
-                       ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
-                                    *(u64 *)__va(pgaddr));
-               if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
-                       ehca_gen_err("uncontiguous page found pgaddr=%llx "
-                                    "prev_pgaddr=%llx entries_left_in_hwpage=%x",
-                                    pgaddr, *prev_pgaddr, num_pages);
-                       return -EINVAL;
-               }
-               *prev_pgaddr = pgaddr;
-       }
-       return 0;
-}
-
-/* PAGE_SIZE < pginfo->hwpage_size */
-static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr, prev_pgaddr;
-       u32 j = 0;
-       int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
-       int nr_kpages = kpages_per_hwpage;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-
-               if (nr_kpages == kpages_per_hwpage) {
-                       pgaddr = (page_to_pfn(sg_page(*sg))
-                                  << PAGE_SHIFT);
-                       *kpage = pgaddr;
-                       if (!(*kpage)) {
-                               ehca_gen_err("pgaddr=%llx entry=%llx",
-                                            pgaddr, pginfo->u.usr.next_nmap);
-                               ret = -EFAULT;
-                               return ret;
-                       }
-                       /*
-                        * The first page in a hwpage must be aligned;
-                        * the first MR page is exempt from this rule.
-                        */
-                       if (pgaddr & (pginfo->hwpage_size - 1)) {
-                               if (pginfo->hwpage_cnt) {
-                                       ehca_gen_err(
-                                               "invalid alignment "
-                                               "pgaddr=%llx entry=%llx "
-                                               "mr_pgsize=%llx",
-                                               pgaddr, pginfo->u.usr.next_nmap,
-                                               pginfo->hwpage_size);
-                                       ret = -EFAULT;
-                                       return ret;
-                               }
-                               /* first MR page */
-                               pginfo->kpage_cnt =
-                                       (pgaddr &
-                                        (pginfo->hwpage_size - 1)) >>
-                                       PAGE_SHIFT;
-                               nr_kpages -= pginfo->kpage_cnt;
-                               *kpage = pgaddr &
-                                        ~(pginfo->hwpage_size - 1);
-                       }
-                       if (ehca_debug_level >= 3) {
-                               u64 val = *(u64 *)__va(pgaddr);
-                               ehca_gen_dbg("kpage=%llx page=%llx "
-                                            "value=%016llx",
-                                            *kpage, pgaddr, val);
-                       }
-                       prev_pgaddr = pgaddr;
-                       *sg = sg_next(*sg);
-                       pginfo->kpage_cnt++;
-                       pginfo->u.usr.next_nmap++;
-                       nr_kpages--;
-                       if (!nr_kpages)
-                               goto next_kpage;
-                       continue;
-               }
-
-               ret = ehca_check_kpages_per_ate(sg, nr_kpages,
-                                               &prev_pgaddr);
-               if (ret)
-                       return ret;
-               pginfo->kpage_cnt += nr_kpages;
-               pginfo->u.usr.next_nmap += nr_kpages;
-
-next_kpage:
-               nr_kpages = kpages_per_hwpage;
-               (pginfo->hwpage_cnt)++;
-               kpage++;
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
-
-static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
-                                u32 number, u64 *kpage)
-{
-       int ret = 0;
-       struct ib_phys_buf *pbuf;
-       u64 num_hw, offs_hw;
-       u32 i = 0;
-
-       /* loop over desired phys_buf_array entries */
-       while (i < number) {
-               pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
-               num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
-                                    pbuf->size, pginfo->hwpage_size);
-               offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
-                       pginfo->hwpage_size;
-               while (pginfo->next_hwpage < offs_hw + num_hw) {
-                       /* sanity check */
-                       if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
-                           (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
-                               ehca_gen_err("kpage_cnt >= num_kpages, "
-                                            "kpage_cnt=%llx num_kpages=%llx "
-                                            "hwpage_cnt=%llx "
-                                            "num_hwpages=%llx i=%x",
-                                            pginfo->kpage_cnt,
-                                            pginfo->num_kpages,
-                                            pginfo->hwpage_cnt,
-                                            pginfo->num_hwpages, i);
-                               return -EFAULT;
-                       }
-                       *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
-                                (pginfo->next_hwpage * pginfo->hwpage_size);
-                       if ( !(*kpage) && pbuf->addr ) {
-                               ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
-                                            "next_hwpage=%llx", pbuf->addr,
-                                            pbuf->size, pginfo->next_hwpage);
-                               return -EFAULT;
-                       }
-                       (pginfo->hwpage_cnt)++;
-                       (pginfo->next_hwpage)++;
-                       if (PAGE_SIZE >= pginfo->hwpage_size) {
-                               if (pginfo->next_hwpage %
-                                   (PAGE_SIZE / pginfo->hwpage_size) == 0)
-                                       (pginfo->kpage_cnt)++;
-                       } else
-                               pginfo->kpage_cnt += pginfo->hwpage_size /
-                                       PAGE_SIZE;
-                       kpage++;
-                       i++;
-                       if (i >= number) break;
-               }
-               if (pginfo->next_hwpage >= offs_hw + num_hw) {
-                       (pginfo->u.phy.next_buf)++;
-                       pginfo->next_hwpage = 0;
-               }
-       }
-       return ret;
-}
-
-static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
-                               u32 number, u64 *kpage)
-{
-       int ret = 0;
-       u64 *fmrlist;
-       u32 i;
-
-       /* loop over desired page_list entries */
-       fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
-       for (i = 0; i < number; i++) {
-               *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
-                          pginfo->next_hwpage * pginfo->hwpage_size;
-               if ( !(*kpage) ) {
-                       ehca_gen_err("*fmrlist=%llx fmrlist=%p "
-                                    "next_listelem=%llx next_hwpage=%llx",
-                                    *fmrlist, fmrlist,
-                                    pginfo->u.fmr.next_listelem,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
-                       if (pginfo->next_hwpage %
-                           (pginfo->u.fmr.fmr_pgsize /
-                            pginfo->hwpage_size) == 0) {
-                               (pginfo->kpage_cnt)++;
-                               (pginfo->u.fmr.next_listelem)++;
-                               fmrlist++;
-                               pginfo->next_hwpage = 0;
-                       } else
-                               (pginfo->next_hwpage)++;
-               } else {
-                       unsigned int cnt_per_hwpage = pginfo->hwpage_size /
-                               pginfo->u.fmr.fmr_pgsize;
-                       unsigned int j;
-                       u64 prev = *kpage;
-                       /* check if adrs are contiguous */
-                       for (j = 1; j < cnt_per_hwpage; j++) {
-                               u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
-                               if (prev + pginfo->u.fmr.fmr_pgsize != p) {
-                                       ehca_gen_err("uncontiguous fmr pages "
-                                                    "found prev=%llx p=%llx "
-                                                    "idx=%x", prev, p, i + j);
-                                       return -EINVAL;
-                               }
-                               prev = p;
-                       }
-                       pginfo->kpage_cnt += cnt_per_hwpage;
-                       pginfo->u.fmr.next_listelem += cnt_per_hwpage;
-                       fmrlist += cnt_per_hwpage;
-               }
-               kpage++;
-       }
-       return ret;
-}
-
-/* setup page buffer from page info */
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage)
-{
-       int ret;
-
-       switch (pginfo->type) {
-       case EHCA_MR_PGI_PHYS:
-               ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_USER:
-               ret = PAGE_SIZE >= pginfo->hwpage_size ?
-                       ehca_set_pagebuf_user1(pginfo, number, kpage) :
-                       ehca_set_pagebuf_user2(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_FMR:
-               ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
-               break;
-       default:
-               ehca_gen_err("bad pginfo->type=%x", pginfo->type);
-               ret = -EFAULT;
-               break;
-       }
-       return ret;
-} /* end ehca_set_pagebuf() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * check MR if it is a max-MR, i.e. uses whole memory
- * in case it's a max-MR 1 is returned, else 0
- */
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start)
-{
-       /* a MR is treated as max-MR only if it fits following: */
-       if ((size == ehca_mr_len) &&
-           (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
-               ehca_gen_dbg("this is a max-MR");
-               return 1;
-       } else
-               return 0;
-} /* end ehca_mr_is_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/* map access control for MR/MW. This routine is used for MR and MW. */
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl)
-{
-       *hipz_acl = 0;
-       if (ib_acl & IB_ACCESS_REMOTE_READ)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
-       if (ib_acl & IB_ACCESS_REMOTE_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
-       if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
-       if (ib_acl & IB_ACCESS_LOCAL_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
-       if (ib_acl & IB_ACCESS_MW_BIND)
-               *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
-} /* end ehca_mrmw_map_acl() */
-
-/*----------------------------------------------------------------------*/
-
-/* sets page size in hipz access control for MR/MW. */
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
-{
-       *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
-} /* end ehca_mrmw_set_pgsize_hipz_acl() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * reverse map access control for MR/MW.
- * This routine is used for MR and MW.
- */
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl) /*OUT*/
-{
-       *ib_acl = 0;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
-               *ib_acl |= IB_ACCESS_REMOTE_READ;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
-               *ib_acl |= IB_ACCESS_REMOTE_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
-               *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
-               *ib_acl |= IB_ACCESS_LOCAL_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
-               *ib_acl |= IB_ACCESS_MW_BIND;
-} /* end ehca_mrmw_reverse_map_acl() */
-
-
-/*----------------------------------------------------------------------*/
-
-/*
- * MR destructor and constructor
- * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
- * except struct ib_mr and spinlock
- */
-void ehca_mr_deletenew(struct ehca_mr *mr)
-{
-       mr->flags = 0;
-       mr->num_kpages = 0;
-       mr->num_hwpages = 0;
-       mr->acl = 0;
-       mr->start = NULL;
-       mr->fmr_page_size = 0;
-       mr->fmr_max_pages = 0;
-       mr->fmr_max_maps = 0;
-       mr->fmr_map_cnt = 0;
-       memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
-       memset(&mr->galpas, 0, sizeof(mr->galpas));
-} /* end ehca_mr_deletenew() */
-
-int ehca_init_mrmw_cache(void)
-{
-       mr_cache = kmem_cache_create("ehca_cache_mr",
-                                    sizeof(struct ehca_mr), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mr_cache)
-               return -ENOMEM;
-       mw_cache = kmem_cache_create("ehca_cache_mw",
-                                    sizeof(struct ehca_mw), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mw_cache) {
-               kmem_cache_destroy(mr_cache);
-               mr_cache = NULL;
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-void ehca_cleanup_mrmw_cache(void)
-{
-       if (mr_cache)
-               kmem_cache_destroy(mr_cache);
-       if (mw_cache)
-               kmem_cache_destroy(mw_cache);
-}
-
-static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
-                                    int dir)
-{
-       if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
-               ehca_top_bmap->dir[dir] =
-                       kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
-               if (!ehca_top_bmap->dir[dir])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
-       }
-       return 0;
-}
-
-static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
-{
-       if (!ehca_bmap_valid(ehca_bmap->top[top])) {
-               ehca_bmap->top[top] =
-                       kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
-               if (!ehca_bmap->top[top])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
-       }
-       return ehca_init_top_bmap(ehca_bmap->top[top], dir);
-}
-
-static inline int ehca_calc_index(unsigned long i, unsigned long s)
-{
-       return (i >> s) & EHCA_INDEX_MASK;
-}
-
-void ehca_destroy_busmap(void)
-{
-       int top, dir;
-
-       if (!ehca_bmap)
-               return;
-
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-                       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                               continue;
-
-                       kfree(ehca_bmap->top[top]->dir[dir]);
-               }
-
-               kfree(ehca_bmap->top[top]);
-       }
-
-       kfree(ehca_bmap);
-       ehca_bmap = NULL;
-}
-
-static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
-{
-       unsigned long i, start_section, end_section;
-       int top, dir, idx;
-
-       if (!nr_pages)
-               return 0;
-
-       if (!ehca_bmap) {
-               ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
-               if (!ehca_bmap)
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
-       }
-
-       start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
-       end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
-       for (i = start_section; i < end_section; i++) {
-               int ret;
-               top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
-               dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
-               idx = i & EHCA_INDEX_MASK;
-
-               ret = ehca_init_bmap(ehca_bmap, top, dir);
-               if (ret) {
-                       ehca_destroy_busmap();
-                       return ret;
-               }
-               ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
-               ehca_mr_len += EHCA_SECTSIZE;
-       }
-       return 0;
-}
-
-static int ehca_is_hugepage(unsigned long pfn)
-{
-       int page_order;
-
-       if (pfn & EHCA_HUGEPAGE_PFN_MASK)
-               return 0;
-
-       page_order = compound_order(pfn_to_page(pfn));
-       if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
-               return 0;
-
-       return 1;
-}
-
-static int ehca_create_busmap_callback(unsigned long initial_pfn,
-                                      unsigned long total_nr_pages, void *arg)
-{
-       int ret;
-       unsigned long pfn, start_pfn, end_pfn, nr_pages;
-
-       if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
-               return ehca_update_busmap(initial_pfn, total_nr_pages);
-
-       /* Given chunk is >= 16GB -> check for hugepages */
-       start_pfn = initial_pfn;
-       end_pfn = initial_pfn + total_nr_pages;
-       pfn = start_pfn;
-
-       while (pfn < end_pfn) {
-               if (ehca_is_hugepage(pfn)) {
-                       /* Add mem found in front of the hugepage */
-                       nr_pages = pfn - start_pfn;
-                       ret = ehca_update_busmap(start_pfn, nr_pages);
-                       if (ret)
-                               return ret;
-                       /* Skip the hugepage */
-                       pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
-                       start_pfn = pfn;
-               } else
-                       pfn += (EHCA_SECTSIZE / PAGE_SIZE);
-       }
-
-       /* Add mem found behind the hugepage(s)  */
-       nr_pages = pfn - start_pfn;
-       return ehca_update_busmap(start_pfn, nr_pages);
-}
-
-int ehca_create_busmap(void)
-{
-       int ret;
-
-       ehca_mr_len = 0;
-       ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
-                                  ehca_create_busmap_callback);
-       return ret;
-}
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo)
-{
-       int top;
-       u64 hret, *kpage;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               return -ENOMEM;
-       }
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
-               if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
-                       break;
-       }
-
-       ehca_free_fw_ctrlblock(kpage);
-
-       if (hret == H_SUCCESS)
-               return 0; /* Everything is fine */
-       else {
-               ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
-                                "h_ret=%lli e_mr=%p top=%x lkey=%x "
-                                "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-               return ehca2ib_return_code(hret);
-       }
-}
-
-static u64 ehca_map_vaddr(void *caddr)
-{
-       int top, dir, idx;
-       unsigned long abs_addr, offset;
-       u64 entry;
-
-       if (!ehca_bmap)
-               return EHCA_INVAL_ADDR;
-
-       abs_addr = __pa(caddr);
-       top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]))
-               return EHCA_INVAL_ADDR;
-
-       dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-               return EHCA_INVAL_ADDR;
-
-       idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
-
-       entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
-       if (ehca_bmap_valid(entry)) {
-               offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
-               return entry | offset;
-       } else
-               return EHCA_INVAL_ADDR;
-}
-
-static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == EHCA_INVAL_ADDR;
-}
-
-static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
-                              size_t size, enum dma_data_direction direction)
-{
-       if (cpu_addr)
-               return ehca_map_vaddr(cpu_addr);
-       else
-               return EHCA_INVAL_ADDR;
-}
-
-static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
-                                 enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
-                            unsigned long offset, size_t size,
-                            enum dma_data_direction direction)
-{
-       u64 addr;
-
-       if (offset + size > PAGE_SIZE)
-               return EHCA_INVAL_ADDR;
-
-       addr = ehca_map_vaddr(page_address(page));
-       if (!ehca_dma_mapping_error(dev, addr))
-               addr += offset;
-
-       return addr;
-}
-
-static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                          int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i) {
-               u64 addr;
-               addr = ehca_map_vaddr(sg_virt(sg));
-               if (ehca_dma_mapping_error(dev, addr))
-                       return 0;
-
-               sg->dma_address = addr;
-               sg->dma_length = sg->length;
-       }
-       return nents;
-}
-
-static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
-                             int nents, enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
-                                        size_t size,
-                                        enum dma_data_direction dir)
-{
-       dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
-}
-
-static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
-                                           size_t size,
-                                           enum dma_data_direction dir)
-{
-       dma_sync_single_for_device(dev->dma_device, addr, size, dir);
-}
-
-static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                    u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-       u64 dma_addr;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p) {
-               addr = page_address(p);
-               dma_addr = ehca_map_vaddr(addr);
-               if (ehca_dma_mapping_error(dev, dma_addr)) {
-                       free_pages((unsigned long)addr, get_order(size));
-                       return NULL;
-               }
-               if (dma_handle)
-                       *dma_handle = dma_addr;
-               return addr;
-       }
-       return NULL;
-}
-
-static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
-                                  void *cpu_addr, u64 dma_handle)
-{
-       if (cpu_addr && size)
-               free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-
-struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
-       .mapping_error          = ehca_dma_mapping_error,
-       .map_single             = ehca_dma_map_single,
-       .unmap_single           = ehca_dma_unmap_single,
-       .map_page               = ehca_dma_map_page,
-       .unmap_page             = ehca_dma_unmap_page,
-       .map_sg                 = ehca_dma_map_sg,
-       .unmap_sg               = ehca_dma_unmap_sg,
-       .sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
-       .sync_single_for_device = ehca_dma_sync_single_for_device,
-       .alloc_coherent         = ehca_dma_alloc_coherent,
-       .free_coherent          = ehca_dma_free_coherent,
-};
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
deleted file mode 100644 (file)
index 50d8b51..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW declarations and inline functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _EHCA_MRMW_H_
-#define _EHCA_MRMW_H_
-
-enum ehca_reg_type {
-       EHCA_REG_MR,
-       EHCA_REG_BUSMAP_MR
-};
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey,
-               u32 *rkey,
-               enum ehca_reg_type reg_type);
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo);
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int mr_access_flags,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey);
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr);
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey,
-                u32 *rkey);
-
-int ehca_reg_internal_maxmr(struct ehca_shca *shca,
-                           struct ehca_pd *e_pd,
-                           struct ehca_mr **maxmr);
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey);
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
-
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size);
-
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len);
-
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage);
-
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start);
-
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl);
-
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
-
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl);
-
-void ehca_mr_deletenew(struct ehca_mr *mr);
-
-int ehca_create_busmap(void);
-
-void ehca_destroy_busmap(void);
-
-extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
-#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
deleted file mode 100644 (file)
index 351577a..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  PD functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-
-static struct kmem_cache *pd_cache;
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context, struct ib_udata *udata)
-{
-       struct ehca_pd *pd;
-       int i;
-
-       pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
-       if (!pd) {
-               ehca_err(device, "device=%p context=%p out of memory",
-                        device, context);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       for (i = 0; i < 2; i++) {
-               INIT_LIST_HEAD(&pd->free[i]);
-               INIT_LIST_HEAD(&pd->full[i]);
-       }
-       mutex_init(&pd->lock);
-
-       /*
-        * Kernel PD: when device = -1, 0
-        * User   PD: when context != -1
-        */
-       if (!context) {
-               /*
-                * Kernel PDs after init reuses always
-                * the one created in ehca_shca_reopen()
-                */
-               struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                                     ib_device);
-               pd->fw_pd.value = shca->pd->fw_pd.value;
-       } else
-               pd->fw_pd.value = (u64)pd;
-
-       return &pd->ib_pd;
-}
-
-int ehca_dealloc_pd(struct ib_pd *pd)
-{
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       int i, leftovers = 0;
-       struct ipz_small_queue_page *page, *tmp;
-
-       for (i = 0; i < 2; i++) {
-               list_splice(&my_pd->full[i], &my_pd->free[i]);
-               list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
-                       leftovers = 1;
-                       free_page(page->page);
-                       kmem_cache_free(small_qp_cache, page);
-               }
-       }
-
-       if (leftovers)
-               ehca_warn(pd->device,
-                         "Some small queue pages were not freed");
-
-       kmem_cache_free(pd_cache, my_pd);
-
-       return 0;
-}
-
-int ehca_init_pd_cache(void)
-{
-       pd_cache = kmem_cache_create("ehca_cache_pd",
-                                    sizeof(struct ehca_pd), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!pd_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_pd_cache(void)
-{
-       if (pd_cache)
-               kmem_cache_destroy(pd_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_qes.h b/drivers/infiniband/hw/ehca/ehca_qes.h
deleted file mode 100644 (file)
index 90c4efa..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Hardware request structures
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef _EHCA_QES_H_
-#define _EHCA_QES_H_
-
-#include "ehca_tools.h"
-
-/* virtual scatter gather entry to specify remote addresses with length */
-struct ehca_vsgentry {
-       u64 vaddr;
-       u32 lkey;
-       u32 length;
-};
-
-#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
-#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
-#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
-#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
-#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
-#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
-#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
-
-/*
- * Unreliable Datagram Address Vector Format
- * see IBTA Vol1 chapter 8.3 Global Routing Header
- */
-struct ehca_ud_av {
-       u8 sl;
-       u8 lnh;
-       u16 dlid;
-       u8 reserved1;
-       u8 reserved2;
-       u8 reserved3;
-       u8 slid_path_bits;
-       u8 reserved4;
-       u8 ipd;
-       u8 reserved5;
-       u8 pmtu;
-       u32 reserved6;
-       u64 reserved7;
-       union {
-               struct {
-                       u64 word_0; /* always set to 6  */
-                       /*should be 0x1B for IB transport */
-                       u64 word_1;
-                       u64 word_2;
-                       u64 word_3;
-                       u64 word_4;
-               } grh;
-               struct {
-                       u32 wd_0;
-                       u32 wd_1;
-                       /* DWord_1 --> SGID */
-
-                       u32 sgid_wd3;
-                       u32 sgid_wd2;
-
-                       u32 sgid_wd1;
-                       u32 sgid_wd0;
-                       /* DWord_3 --> DGID */
-
-                       u32 dgid_wd3;
-                       u32 dgid_wd2;
-
-                       u32 dgid_wd1;
-                       u32 dgid_wd0;
-               } grh_l;
-       };
-};
-
-/* maximum number of sg entries allowed in a WQE */
-#define MAX_WQE_SG_ENTRIES 252
-
-#define WQE_OPTYPE_SEND             0x80
-#define WQE_OPTYPE_RDMAREAD         0x40
-#define WQE_OPTYPE_RDMAWRITE        0x20
-#define WQE_OPTYPE_CMPSWAP          0x10
-#define WQE_OPTYPE_FETCHADD         0x08
-#define WQE_OPTYPE_BIND             0x04
-
-#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
-#define WQE_WRFLAG_FENCE            0x40
-#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
-#define WQE_WRFLAG_SOLIC_EVENT      0x10
-
-#define WQEF_CACHE_HINT             0x80
-#define WQEF_CACHE_HINT_RD_WR       0x40
-#define WQEF_TIMED_WQE              0x20
-#define WQEF_PURGE                  0x08
-#define WQEF_HIGH_NIBBLE            0xF0
-
-#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
-#define MW_BIND_ACCESSCTRL_R_READ    0x20
-#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
-
-struct ehca_wqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 wr_flag;
-       u16 pkeyi;
-       u8 wqef;
-       u8 nr_of_data_seg;
-       u16 wqe_provided_slid;
-       u32 destination_qp_number;
-       u32 resync_psn_sqp;
-       u32 local_ee_context_qkey;
-       u32 immediate_data;
-       union {
-               struct {
-                       u64 remote_virtual_address;
-                       u32 rkey;
-                       u32 reserved;
-                       u64 atomic_1st_op_dma_len;
-                       u64 atomic_2nd_op;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-
-               } nud;
-               struct {
-                       u64 ehca_ud_av_ptr;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } ud_avp;
-               struct {
-                       struct ehca_ud_av ud_av;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
-                                                    2];
-               } ud_av;
-               struct {
-                       u64 reserved0;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } all_rcv;
-
-               struct {
-                       u64 reserved;
-                       u32 rkey;
-                       u32 old_rkey;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 virtual_address;
-                       u32 reserved3;
-                       u32 length;
-                       u32 reserved4;
-                       u16 reserved5;
-                       u8 reserved6;
-                       u8 lr_ctl;
-                       u32 lkey;
-                       u32 reserved7;
-                       u64 reserved8;
-                       u64 reserved9;
-                       u64 reserved10;
-                       u64 reserved11;
-               } bind;
-               struct {
-                       u64 reserved12;
-                       u64 reserved13;
-                       u32 size;
-                       u32 start;
-               } inline_data;
-       } u;
-
-};
-
-#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
-#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
-#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
-#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
-#define WC_STATUS_ERROR_BIT 0x80000000
-#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
-#define WC_STATUS_PURGE_BIT 0x10
-#define WC_SEND_RECEIVE_BIT 0x80
-
-struct ehca_cqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 w_completion_flags;
-       u16 reserved1;
-       u32 nr_bytes_transferred;
-       u32 immediate_data;
-       u32 local_qp_number;
-       u8 freed_resource_count;
-       u8 service_level;
-       u16 wqe_count;
-       u32 qp_token;
-       u32 qkey_ee_token;
-       u32 remote_qp_number;
-       u16 dlid;
-       u16 rlid;
-       u16 reserved2;
-       u16 pkey_index;
-       u32 cqe_timestamp;
-       u32 wqe_timestamp;
-       u8 wqe_timestamp_valid;
-       u8 reserved3;
-       u8 reserved4;
-       u8 cqe_flags;
-       u32 status;
-};
-
-struct ehca_eqe {
-       u64 entry;
-};
-
-struct ehca_mrte {
-       u64 starting_va;
-       u64 length; /* length of memory region in bytes*/
-       u32 pd;
-       u8 key_instance;
-       u8 pagesize;
-       u8 mr_control;
-       u8 local_remote_access_ctrl;
-       u8 reserved[0x20 - 0x18];
-       u64 at_pointer[4];
-};
-#endif /*_EHCA_QES_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
deleted file mode 100644 (file)
index 2e89356..0000000
+++ /dev/null
@@ -1,2257 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  QP functions
- *
- *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
- *           Stefan Roscher <stefan.roscher@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-static struct kmem_cache *qp_cache;
-
-/*
- * attributes not supported by query qp
- */
-#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
-                                    IB_QP_EN_SQD_ASYNC_NOTIFY)
-
-/*
- * ehca (internal) qp state values
- */
-enum ehca_qp_state {
-       EHCA_QPS_RESET = 1,
-       EHCA_QPS_INIT = 2,
-       EHCA_QPS_RTR = 3,
-       EHCA_QPS_RTS = 5,
-       EHCA_QPS_SQD = 6,
-       EHCA_QPS_SQE = 8,
-       EHCA_QPS_ERR = 128
-};
-
-/*
- * qp state transitions as defined by IB Arch Rel 1.1 page 431
- */
-enum ib_qp_statetrans {
-       IB_QPST_ANY2RESET,
-       IB_QPST_ANY2ERR,
-       IB_QPST_RESET2INIT,
-       IB_QPST_INIT2RTR,
-       IB_QPST_INIT2INIT,
-       IB_QPST_RTR2RTS,
-       IB_QPST_RTS2SQD,
-       IB_QPST_RTS2RTS,
-       IB_QPST_SQD2RTS,
-       IB_QPST_SQE2RTS,
-       IB_QPST_SQD2SQD,
-       IB_QPST_MAX     /* nr of transitions, this must be last!!! */
-};
-
-/*
- * ib2ehca_qp_state maps IB to ehca qp_state
- * returns ehca qp state corresponding to given ib qp state
- */
-static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
-{
-       switch (ib_qp_state) {
-       case IB_QPS_RESET:
-               return EHCA_QPS_RESET;
-       case IB_QPS_INIT:
-               return EHCA_QPS_INIT;
-       case IB_QPS_RTR:
-               return EHCA_QPS_RTR;
-       case IB_QPS_RTS:
-               return EHCA_QPS_RTS;
-       case IB_QPS_SQD:
-               return EHCA_QPS_SQD;
-       case IB_QPS_SQE:
-               return EHCA_QPS_SQE;
-       case IB_QPS_ERR:
-               return EHCA_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca2ib_qp_state maps ehca to IB qp_state
- * returns ib qp state corresponding to given ehca qp state
- */
-static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
-                                               ehca_qp_state)
-{
-       switch (ehca_qp_state) {
-       case EHCA_QPS_RESET:
-               return IB_QPS_RESET;
-       case EHCA_QPS_INIT:
-               return IB_QPS_INIT;
-       case EHCA_QPS_RTR:
-               return IB_QPS_RTR;
-       case EHCA_QPS_RTS:
-               return IB_QPS_RTS;
-       case EHCA_QPS_SQD:
-               return IB_QPS_SQD;
-       case EHCA_QPS_SQE:
-               return IB_QPS_SQE;
-       case EHCA_QPS_ERR:
-               return IB_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca_qp_type used as index for req_attr and opt_attr of
- * struct ehca_modqp_statetrans
- */
-enum ehca_qp_type {
-       QPT_RC = 0,
-       QPT_UC = 1,
-       QPT_UD = 2,
-       QPT_SQP = 3,
-       QPT_MAX
-};
-
-/*
- * ib2ehcaqptype maps Ib to ehca qp_type
- * returns ehca qp type corresponding to ib qp type
- */
-static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return QPT_SQP;
-       case IB_QPT_RC:
-               return QPT_RC;
-       case IB_QPT_UC:
-               return QPT_UC;
-       case IB_QPT_UD:
-               return QPT_UD;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
-                                                        int ib_tostate)
-{
-       int index = -EINVAL;
-       switch (ib_tostate) {
-       case IB_QPS_RESET:
-               index = IB_QPST_ANY2RESET;
-               break;
-       case IB_QPS_INIT:
-               switch (ib_fromstate) {
-               case IB_QPS_RESET:
-                       index = IB_QPST_RESET2INIT;
-                       break;
-               case IB_QPS_INIT:
-                       index = IB_QPST_INIT2INIT;
-                       break;
-               }
-               break;
-       case IB_QPS_RTR:
-               if (ib_fromstate == IB_QPS_INIT)
-                       index = IB_QPST_INIT2RTR;
-               break;
-       case IB_QPS_RTS:
-               switch (ib_fromstate) {
-               case IB_QPS_RTR:
-                       index = IB_QPST_RTR2RTS;
-                       break;
-               case IB_QPS_RTS:
-                       index = IB_QPST_RTS2RTS;
-                       break;
-               case IB_QPS_SQD:
-                       index = IB_QPST_SQD2RTS;
-                       break;
-               case IB_QPS_SQE:
-                       index = IB_QPST_SQE2RTS;
-                       break;
-               }
-               break;
-       case IB_QPS_SQD:
-               if (ib_fromstate == IB_QPS_RTS)
-                       index = IB_QPST_RTS2SQD;
-               break;
-       case IB_QPS_SQE:
-               break;
-       case IB_QPS_ERR:
-               index = IB_QPST_ANY2ERR;
-               break;
-       default:
-               break;
-       }
-       return index;
-}
-
-/*
- * ibqptype2servicetype returns hcp service type corresponding to given
- * ib qp type used by create_qp()
- */
-static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return ST_UD;
-       case IB_QPT_RC:
-               return ST_RC;
-       case IB_QPT_UC:
-               return ST_UC;
-       case IB_QPT_UD:
-               return ST_UD;
-       case IB_QPT_RAW_IPV6:
-               return -EINVAL;
-       case IB_QPT_RAW_ETHERTYPE:
-               return -EINVAL;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-/*
- * init userspace queue info from ipz_queue data
- */
-static inline void queue2resp(struct ipzu_queue_resp *resp,
-                             struct ipz_queue *queue)
-{
-       resp->qe_size = queue->qe_size;
-       resp->act_nr_of_sg = queue->act_nr_of_sg;
-       resp->queue_length = queue->queue_length;
-       resp->pagesize = queue->pagesize;
-       resp->toggle_state = queue->toggle_state;
-       resp->offset = queue->offset;
-}
-
-/*
- * init_qp_queue initializes/constructs r/squeue and registers queue pages.
- */
-static inline int init_qp_queue(struct ehca_shca *shca,
-                               struct ehca_pd *pd,
-                               struct ehca_qp *my_qp,
-                               struct ipz_queue *queue,
-                               int q_type,
-                               u64 expected_hret,
-                               struct ehca_alloc_queue_parms *parms,
-                               int wqe_size)
-{
-       int ret, cnt, ipz_rc, nr_q_pages;
-       void *vpage;
-       u64 rpage, h_ret;
-       struct ib_device *ib_dev = &shca->ib_device;
-       struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
-
-       if (!parms->queue_size)
-               return 0;
-
-       if (parms->is_small) {
-               nr_q_pages = 1;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       128 << parms->page_size,
-                                       wqe_size, parms->act_nr_sges, 1);
-       } else {
-               nr_q_pages = parms->queue_size;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       EHCA_PAGESIZE, wqe_size,
-                                       parms->act_nr_sges, 0);
-       }
-
-       if (!ipz_rc) {
-               ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
-                        ipz_rc);
-               return -EBUSY;
-       }
-
-       /* register queue pages */
-       for (cnt = 0; cnt < nr_q_pages; cnt++) {
-               vpage = ipz_qpageit_get_inc(queue);
-               if (!vpage) {
-                       ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                "failed p_vpage= %p", vpage);
-                       ret = -EINVAL;
-                       goto init_qp_queue1;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
-                                                my_qp->ipz_qp_handle,
-                                                NULL, 0, q_type,
-                                                rpage, parms->is_small ? 0 : 1,
-                                                my_qp->galpas.kernel);
-               if (cnt == (nr_q_pages - 1)) {  /* last page! */
-                       if (h_ret != expected_hret) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-                       vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
-                       if (vpage) {
-                               ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                        "should not succeed vpage=%p", vpage);
-                               ret = -EINVAL;
-                               goto init_qp_queue1;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(queue);
-
-       return 0;
-
-init_qp_queue1:
-       ipz_queue_dtor(pd, queue);
-       return ret;
-}
-
-static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
-{
-       if (is_llqp)
-               return 128 << act_nr_sge;
-       else
-               return offsetof(struct ehca_wqe,
-                               u.nud.sg_list[act_nr_sge]);
-}
-
-static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
-                                      int req_nr_sge, int is_llqp)
-{
-       u32 wqe_size, q_size;
-       int act_nr_sge = req_nr_sge;
-
-       if (!is_llqp)
-               /* round up #SGEs so WQE size is a power of 2 */
-               for (act_nr_sge = 4; act_nr_sge <= 252;
-                    act_nr_sge = 4 + 2 * act_nr_sge)
-                       if (act_nr_sge >= req_nr_sge)
-                               break;
-
-       wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
-       q_size = wqe_size * (queue->max_wr + 1);
-
-       if (q_size <= 512)
-               queue->page_size = 2;
-       else if (q_size <= 1024)
-               queue->page_size = 3;
-       else
-               queue->page_size = 0;
-
-       queue->is_small = (queue->page_size != 0);
-}
-
-/* needs to be called with cq->spinlock held */
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
-{
-       struct list_head *list, *node;
-
-       /* TODO: support low latency QPs */
-       if (qp->ext_type == EQPT_LLQP)
-               return;
-
-       if (on_sq) {
-               list = &qp->send_cq->sqp_err_list;
-               node = &qp->sq_err_node;
-       } else {
-               list = &qp->recv_cq->rqp_err_list;
-               node = &qp->rq_err_node;
-       }
-
-       if (list_empty(node))
-               list_add_tail(node, list);
-
-       return;
-}
-
-static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-
-       if (!list_empty(node))
-               list_del_init(node);
-
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-}
-
-static void reset_queue_map(struct ehca_queue_map *qmap)
-{
-       int i;
-
-       qmap->tail = qmap->entries - 1;
-       qmap->left_to_poll = 0;
-       qmap->next_wqe_idx = 0;
-       for (i = 0; i < qmap->entries; i++) {
-               qmap->map[i].reported = 1;
-               qmap->map[i].cqe_req = 0;
-       }
-}
-
-/*
- * Create an ib_qp struct that is either a QP or an SRQ, depending on
- * the value of the is_srq parameter. If init_attr and srq_init_attr share
- * fields, the field out of init_attr is used.
- */
-static struct ehca_qp *internal_create_qp(
-       struct ib_pd *pd,
-       struct ib_qp_init_attr *init_attr,
-       struct ib_srq_init_attr *srq_init_attr,
-       struct ib_udata *udata, int is_srq)
-{
-       struct ehca_qp *my_qp, *my_srq = NULL;
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct ib_ucontext *context = NULL;
-       u64 h_ret;
-       int is_llqp = 0, has_srq = 0, is_user = 0;
-       int qp_type, max_send_sge, max_recv_sge, ret;
-
-       /* h_call's out parameters */
-       struct ehca_alloc_qp_parms parms;
-       u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
-       unsigned long flags;
-
-       if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
-               ehca_err(pd->device, "Unable to create QP, max number of %i "
-                        "QPs reached.", shca->max_num_qps);
-               ehca_err(pd->device, "To increase the maximum number of QPs "
-                        "use the number_of_qps module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       if (init_attr->create_flags) {
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       memset(&parms, 0, sizeof(parms));
-       qp_type = init_attr->qp_type;
-
-       if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
-               init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
-               ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
-                        init_attr->sq_sig_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* save LLQP info */
-       if (qp_type & 0x80) {
-               is_llqp = 1;
-               parms.ext_type = EQPT_LLQP;
-               parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
-       }
-       qp_type &= 0x1F;
-       init_attr->qp_type &= 0x1F;
-
-       /* handle SRQ base QPs */
-       if (init_attr->srq) {
-               my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
-
-               if (qp_type == IB_QPT_UC) {
-                       ehca_err(pd->device, "UC with SRQ not supported");
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-
-               has_srq = 1;
-               parms.ext_type = EQPT_SRQBASE;
-               parms.srq_qpn = my_srq->real_qp_num;
-       }
-
-       if (is_llqp && has_srq) {
-               ehca_err(pd->device, "LLQPs can't have an SRQ");
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* handle SRQs */
-       if (is_srq) {
-               parms.ext_type = EQPT_SRQ;
-               parms.srq_limit = srq_init_attr->attr.srq_limit;
-               if (init_attr->cap.max_recv_sge > 3) {
-                       ehca_err(pd->device, "no more than three SGEs "
-                                "supported for SRQ  pd=%p  max_sge=%x",
-                                pd, init_attr->cap.max_recv_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       /* check QP type */
-       if (qp_type != IB_QPT_UD &&
-           qp_type != IB_QPT_UC &&
-           qp_type != IB_QPT_RC &&
-           qp_type != IB_QPT_SMI &&
-           qp_type != IB_QPT_GSI) {
-               ehca_err(pd->device, "wrong QP Type=%x", qp_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       if (is_llqp) {
-               switch (qp_type) {
-               case IB_QPT_RC:
-                       if ((init_attr->cap.max_send_wr > 255) ||
-                           (init_attr->cap.max_recv_wr > 255)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_sq_wr=%x "
-                                        "or max_rq_wr=%x for RC LLQP",
-                                        init_attr->cap.max_send_wr,
-                                        init_attr->cap.max_recv_wr);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               case IB_QPT_UD:
-                       if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
-                               ehca_err(pd->device, "UD LLQP not supported "
-                                        "by this adapter");
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-ENOSYS);
-                       }
-                       if (!(init_attr->cap.max_send_sge <= 5
-                           && init_attr->cap.max_send_sge >= 1
-                           && init_attr->cap.max_recv_sge <= 5
-                           && init_attr->cap.max_recv_sge >= 1)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_send_sge=%x "
-                                        "or max_recv_sge=%x for UD LLQP",
-                                        init_attr->cap.max_send_sge,
-                                        init_attr->cap.max_recv_sge);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       } else if (init_attr->cap.max_send_wr > 255) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of "
-                                        "max_send_wr=%x for UD QP_TYPE=%x",
-                                        init_attr->cap.max_send_wr, qp_type);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               default:
-                       ehca_err(pd->device, "unsupported LL QP Type=%x",
-                                qp_type);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       } else {
-               int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
-                              || qp_type == IB_QPT_GSI) ? 250 : 252;
-
-               if (init_attr->cap.max_send_sge > max_sge
-                   || init_attr->cap.max_recv_sge > max_sge) {
-                       ehca_err(pd->device, "Invalid number of SGEs requested "
-                                "send_sge=%x recv_sge=%x max_sge=%x",
-                                init_attr->cap.max_send_sge,
-                                init_attr->cap.max_recv_sge, max_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
-       if (!my_qp) {
-               ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       if (pd->uobject && udata) {
-               is_user = 1;
-               context = pd->uobject->context;
-       }
-
-       atomic_set(&my_qp->nr_events, 0);
-       init_waitqueue_head(&my_qp->wait_completion);
-       spin_lock_init(&my_qp->spinlock_s);
-       spin_lock_init(&my_qp->spinlock_r);
-       my_qp->qp_type = qp_type;
-       my_qp->ext_type = parms.ext_type;
-       my_qp->state = IB_QPS_RESET;
-
-       if (init_attr->recv_cq)
-               my_qp->recv_cq =
-                       container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
-       if (init_attr->send_cq)
-               my_qp->send_cq =
-                       container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-
-       ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
-       if (ret >= 0)
-               my_qp->token = ret;
-
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-       idr_preload_end();
-       if (ret < 0) {
-               if (ret == -ENOSPC) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid number of qp");
-               } else {
-                       ret = -ENOMEM;
-                       ehca_err(pd->device, "Can't allocate new idr entry.");
-               }
-               goto create_qp_exit0;
-       }
-
-       if (has_srq)
-               parms.srq_token = my_qp->token;
-
-       parms.servicetype = ibqptype2servicetype(qp_type);
-       if (parms.servicetype < 0) {
-               ret = -EINVAL;
-               ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
-               goto create_qp_exit1;
-       }
-
-       /* Always signal by WQE so we can hide circ. WQEs */
-       parms.sigtype = HCALL_SIGT_BY_WQE;
-
-       /* UD_AV CIRCUMVENTION */
-       max_send_sge = init_attr->cap.max_send_sge;
-       max_recv_sge = init_attr->cap.max_recv_sge;
-       if (parms.servicetype == ST_UD && !is_llqp) {
-               max_send_sge += 2;
-               max_recv_sge += 2;
-       }
-
-       parms.token = my_qp->token;
-       parms.eq_handle = shca->eq.ipz_eq_handle;
-       parms.pd = my_pd->fw_pd;
-       if (my_qp->send_cq)
-               parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
-       if (my_qp->recv_cq)
-               parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
-
-       parms.squeue.max_wr = init_attr->cap.max_send_wr;
-       parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
-       parms.squeue.max_sge = max_send_sge;
-       parms.rqueue.max_sge = max_recv_sge;
-
-       /* RC QPs need one more SWQE for unsolicited ack circumvention */
-       if (qp_type == IB_QPT_RC)
-               parms.squeue.max_wr++;
-
-       if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
-               if (HAS_SQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.squeue, max_send_sge, is_llqp);
-               if (HAS_RQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.rqueue, max_recv_sge, is_llqp);
-               parms.qp_storage =
-                       (parms.squeue.is_small || parms.rqueue.is_small);
-       }
-
-       h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
-                        h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto create_qp_exit1;
-       }
-
-       ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
-       my_qp->ipz_qp_handle = parms.qp_handle;
-       my_qp->galpas = parms.galpas;
-
-       swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
-       rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
-
-       switch (qp_type) {
-       case IB_QPT_RC:
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               }
-               /* hide the extra WQE */
-               parms.squeue.act_nr_wqes--;
-               break;
-       case IB_QPT_UD:
-       case IB_QPT_GSI:
-       case IB_QPT_SMI:
-               /* UD circumvention */
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               } else {
-                       parms.squeue.act_nr_sges -= 2;
-                       parms.rqueue.act_nr_sges -= 2;
-               }
-
-               if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
-                       parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
-                       parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
-                       parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
-                       parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
-                       ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
-               }
-
-               break;
-
-       default:
-               break;
-       }
-
-       /* initialize r/squeue and register queue pages */
-       if (HAS_SQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
-                       HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
-                       &parms.squeue, swqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize squeue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit2;
-               }
-
-               if (!is_user) {
-                       my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
-                               my_qp->ipz_squeue.qe_size;
-                       my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->sq_map.map) {
-                               ehca_err(pd->device, "Couldn't allocate squeue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit3;
-                       }
-                       INIT_LIST_HEAD(&my_qp->sq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->sq_map);
-               }
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
-                       H_SUCCESS, &parms.rqueue, rwqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize rqueue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit4;
-               }
-               if (!is_user) {
-                       my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
-                               my_qp->ipz_rqueue.qe_size;
-                       my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->rq_map.map) {
-                               ehca_err(pd->device, "Couldn't allocate squeue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit5;
-                       }
-                       INIT_LIST_HEAD(&my_qp->rq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->rq_map);
-               }
-       } else if (init_attr->srq && !is_user) {
-               /* this is a base QP, use the queue map of the SRQ */
-               my_qp->rq_map = my_srq->rq_map;
-               INIT_LIST_HEAD(&my_qp->rq_err_node);
-
-               my_qp->ipz_rqueue = my_srq->ipz_rqueue;
-       }
-
-       if (is_srq) {
-               my_qp->ib_srq.pd = &my_pd->ib_pd;
-               my_qp->ib_srq.device = my_pd->ib_pd.device;
-
-               my_qp->ib_srq.srq_context = init_attr->qp_context;
-               my_qp->ib_srq.event_handler = init_attr->event_handler;
-       } else {
-               my_qp->ib_qp.qp_num = ib_qp_num;
-               my_qp->ib_qp.pd = &my_pd->ib_pd;
-               my_qp->ib_qp.device = my_pd->ib_pd.device;
-
-               my_qp->ib_qp.recv_cq = init_attr->recv_cq;
-               my_qp->ib_qp.send_cq = init_attr->send_cq;
-
-               my_qp->ib_qp.qp_type = qp_type;
-               my_qp->ib_qp.srq = init_attr->srq;
-
-               my_qp->ib_qp.qp_context = init_attr->qp_context;
-               my_qp->ib_qp.event_handler = init_attr->event_handler;
-       }
-
-       init_attr->cap.max_inline_data = 0; /* not supported yet */
-       init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
-       init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
-       init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
-       init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
-       my_qp->init_attr = *init_attr;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                       &my_qp->ib_qp;
-               if (ehca_nr_ports < 0) {
-                       /* alloc array to cache subsequent modify qp parms
-                        * for autodetect mode
-                        */
-                       my_qp->mod_qp_parm =
-                               kzalloc(EHCA_MOD_QP_PARM_MAX *
-                                       sizeof(*my_qp->mod_qp_parm),
-                                       GFP_KERNEL);
-                       if (!my_qp->mod_qp_parm) {
-                               ehca_err(pd->device,
-                                        "Could not alloc mod_qp_parm");
-                               goto create_qp_exit5;
-                       }
-               }
-       }
-
-       /* NOTE: define_apq0() not supported yet */
-       if (qp_type == IB_QPT_GSI) {
-               h_ret = ehca_define_sqp(shca, my_qp, init_attr);
-               if (h_ret != H_SUCCESS) {
-                       kfree(my_qp->mod_qp_parm);
-                       my_qp->mod_qp_parm = NULL;
-                       /* the QP pointer is no longer valid */
-                       shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                               NULL;
-                       ret = ehca2ib_return_code(h_ret);
-                       goto create_qp_exit6;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
-               if (ret) {
-                       ehca_err(pd->device,
-                                "Couldn't assign qp to send_cq ret=%i", ret);
-                       goto create_qp_exit7;
-               }
-       }
-
-       /* copy queues, galpa data to user space */
-       if (context && udata) {
-               struct ehca_create_qp_resp resp;
-               memset(&resp, 0, sizeof(resp));
-
-               resp.qp_num = my_qp->real_qp_num;
-               resp.token = my_qp->token;
-               resp.qp_type = my_qp->qp_type;
-               resp.ext_type = my_qp->ext_type;
-               resp.qkey = my_qp->qkey;
-               resp.real_qp_num = my_qp->real_qp_num;
-
-               if (HAS_SQ(my_qp))
-                       queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
-               if (HAS_RQ(my_qp))
-                       queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
-               resp.fw_handle_ofs = (u32)
-                       (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
-
-               if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
-                       ehca_err(pd->device, "Copy to udata failed");
-                       ret = -EINVAL;
-                       goto create_qp_exit8;
-               }
-       }
-
-       return my_qp;
-
-create_qp_exit8:
-       ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
-
-create_qp_exit7:
-       kfree(my_qp->mod_qp_parm);
-
-create_qp_exit6:
-       if (HAS_RQ(my_qp) && !is_user)
-               vfree(my_qp->rq_map.map);
-
-create_qp_exit5:
-       if (HAS_RQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-
-create_qp_exit4:
-       if (HAS_SQ(my_qp) && !is_user)
-               vfree(my_qp->sq_map.map);
-
-create_qp_exit3:
-       if (HAS_SQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-
-create_qp_exit2:
-       hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-
-create_qp_exit1:
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-create_qp_exit0:
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return ERR_PTR(ret);
-}
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *qp_init_attr,
-                            struct ib_udata *udata)
-{
-       struct ehca_qp *ret;
-
-       ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
-       return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *srq_init_attr,
-                              struct ib_udata *udata)
-{
-       struct ib_qp_init_attr qp_init_attr;
-       struct ehca_qp *my_qp;
-       struct ib_srq *ret;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 hret, update_mask;
-
-       if (srq_init_attr->srq_type != IB_SRQT_BASIC)
-               return ERR_PTR(-ENOSYS);
-
-       /* For common attributes, internal_create_qp() takes its info
-        * out of qp_init_attr, so copy all common attrs there.
-        */
-       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
-       qp_init_attr.event_handler = srq_init_attr->event_handler;
-       qp_init_attr.qp_context = srq_init_attr->srq_context;
-       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-       qp_init_attr.qp_type = IB_QPT_RC;
-       qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
-       qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
-
-       my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
-       if (IS_ERR(my_qp))
-               return (struct ib_srq *)my_qp;
-
-       /* copy back return values */
-       srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
-       srq_init_attr->attr.max_sge = 3;
-
-       /* drive SRQ into RTR state */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(pd->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               ret = ERR_PTR(-ENOMEM);
-               goto create_srq1;
-       }
-
-       mqpcb->qp_state = EHCA_QPS_INIT;
-       mqpcb->prim_phys_port = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to INIT "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_enable = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not enable SRQ "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_state  = EHCA_QPS_RTR;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to RTR "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return &my_qp->ib_srq;
-
-create_srq2:
-       ret = ERR_PTR(ehca2ib_return_code(hret));
-       ehca_free_fw_ctrlblock(mqpcb);
-
-create_srq1:
-       internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
-
-       return ret;
-}
-
-/*
- * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts
- * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe
- * returns total number of bad wqes in bad_wqe_cnt
- */
-static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
-                          int *bad_wqe_cnt)
-{
-       u64 h_ret;
-       struct ipz_queue *squeue;
-       void *bad_send_wqe_p, *bad_send_wqe_v;
-       u64 q_ofs;
-       struct ehca_wqe *wqe;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* get send wqe pointer */
-       h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                                          my_qp->ipz_qp_handle, &my_qp->pf,
-                                          &bad_send_wqe_p, NULL, 2);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
-                        " ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp_num, h_ret);
-               return ehca2ib_return_code(h_ret);
-       }
-       bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
-       ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
-                qp_num, bad_send_wqe_p);
-       /* convert wqe pointer to vadr */
-       bad_send_wqe_v = __va((u64)bad_send_wqe_p);
-       if (ehca_debug_level >= 2)
-               ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
-       squeue = &my_qp->ipz_squeue;
-       if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
-               ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
-                        " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
-               return -EFAULT;
-       }
-
-       /* loop sets wqe's purge bit */
-       wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-       *bad_wqe_cnt = 0;
-       while (wqe->optype != 0xff && wqe->wqef != 0xff) {
-               if (ehca_debug_level >= 2)
-                       ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
-               wqe->nr_of_data_seg = 0; /* suppress data access */
-               wqe->wqef = WQEF_PURGE; /* WQE to be purged */
-               q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-               *bad_wqe_cnt = (*bad_wqe_cnt)+1;
-       }
-       /*
-        * bad wqe will be reprocessed and ignored when pol_cq() is called,
-        *  i.e. nr of wqes with flush error status is one less
-        */
-       ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
-                qp_num, (*bad_wqe_cnt)-1);
-       wqe->wqef = 0;
-
-       return 0;
-}
-
-static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
-                         struct ehca_queue_map *qmap)
-{
-       void *wqe_v;
-       u64 q_ofs;
-       u32 wqe_idx;
-       unsigned int tail_idx;
-
-       /* convert real to abs address */
-       wqe_p = wqe_p & (~(1UL << 63));
-
-       wqe_v = __va(wqe_p);
-
-       if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
-               ehca_gen_err("Invalid offset for calculating left cqes "
-                               "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
-               return -EFAULT;
-       }
-
-       tail_idx = next_index(qmap->tail, qmap->entries);
-       wqe_idx = q_ofs / ipz_queue->qe_size;
-
-       /* check all processed wqes, whether a cqe is requested or not */
-       while (tail_idx != wqe_idx) {
-               if (qmap->map[tail_idx].cqe_req)
-                       qmap->left_to_poll++;
-               tail_idx = next_index(tail_idx, qmap->entries);
-       }
-       /* save index in queue, where we have to start flushing */
-       qmap->next_wqe_idx = wqe_idx;
-       return 0;
-}
-
-static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
-{
-       u64 h_ret;
-       void *send_wqe_p, *recv_wqe_p;
-       int ret;
-       unsigned long flags;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* this hcall is not supported on base QPs */
-       if (my_qp->ext_type != EQPT_SRQBASE) {
-               /* get send and receive wqe pointer */
-               h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle, &my_qp->pf,
-                               &send_wqe_p, &recv_wqe_p, 4);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "disable_and_get_wqe() "
-                                "failed ehca_qp=%p qp_num=%x h_ret=%lli",
-                                my_qp, qp_num, h_ret);
-                       return ehca2ib_return_code(h_ret);
-               }
-
-               /*
-                * acquire lock to ensure that nobody is polling the cq which
-                * could mean that the qmap->tail pointer is in an
-                * inconsistent state.
-                */
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
-                               &my_qp->sq_map);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
-                               &my_qp->rq_map);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-       } else {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               my_qp->sq_map.left_to_poll = 0;
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               my_qp->rq_map.left_to_poll = 0;
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-       }
-
-       /* this assures flush cqes being generated only for pending wqes */
-       if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ehca_add_to_err_list(my_qp, 1);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               if (HAS_RQ(my_qp)) {
-                       spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-                       ehca_add_to_err_list(my_qp, 0);
-                       spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
-                                       flags);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * internal_modify_qp with circumvention to handle aqp0 properly
- * smi_reset2init indicates if this is an internal reset-to-init-call for
- * smi. This flag must always be zero if called from ehca_modify_qp()!
- * This internal func was intorduced to avoid recursion of ehca_modify_qp()!
- */
-static int internal_modify_qp(struct ib_qp *ibqp,
-                             struct ib_qp_attr *attr,
-                             int attr_mask, int smi_reset2init)
-{
-       enum ib_qp_state qp_cur_state, qp_new_state;
-       int cnt, qp_attr_idx, ret = 0;
-       enum ib_qp_statetrans statetrans;
-       struct hcp_modify_qp_control_block *mqpcb;
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca =
-               container_of(ibqp->pd->device, struct ehca_shca, ib_device);
-       u64 update_mask;
-       u64 h_ret;
-       int bad_wqe_cnt = 0;
-       int is_user = 0;
-       int squeue_locked = 0;
-       unsigned long flags = 0;
-
-       /* do query_qp to obtain current attr values */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!mqpcb) {
-               ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               mqpcb, my_qp->galpas.kernel);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ibqp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, ibqp->qp_num, h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto modify_qp_exit1;
-       }
-       if (ibqp->uobject)
-               is_user = 1;
-
-       qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
-
-       if (qp_cur_state == -EINVAL) {  /* invalid qp state */
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        mqpcb->qp_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-       /*
-        * circumvention to set aqp0 initial state to init
-        * as expected by IB spec
-        */
-       if (smi_reset2init == 0 &&
-           ibqp->qp_type == IB_QPT_SMI &&
-           qp_cur_state == IB_QPS_RESET &&
-           (attr_mask & IB_QP_STATE) &&
-           attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
-               struct ib_qp_attr smiqp_attr = {
-                       .qp_state = IB_QPS_INIT,
-                       .port_num = my_qp->init_attr.port_num,
-                       .pkey_index = 0,
-                       .qkey = 0
-               };
-               int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
-                       IB_QP_PKEY_INDEX | IB_QP_QKEY;
-               int smirc = internal_modify_qp(
-                       ibqp, &smiqp_attr, smiqp_attr_mask, 1);
-               if (smirc) {
-                       ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
-                                "ehca_modify_qp() rc=%i", smirc);
-                       ret = H_PARAMETER;
-                       goto modify_qp_exit1;
-               }
-               qp_cur_state = IB_QPS_INIT;
-               ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
-       }
-       /* is transmitted current state  equal to "real" current state */
-       if ((attr_mask & IB_QP_CUR_STATE) &&
-           qp_cur_state != attr->cur_qp_state) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
-                        " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
-                        attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
-                "new qp_state=%x attribute_mask=%x",
-                my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
-
-       qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
-       if (!smi_reset2init &&
-           !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
-                               attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid qp transition new_state=%x cur_state=%x "
-                        "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
-                        qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
-               goto modify_qp_exit1;
-       }
-
-       mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
-       if (mqpcb->qp_state)
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       else {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid new qp state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qp_new_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       /* retrieve state transition struct to get req and opt attrs */
-       statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
-       if (statetrans < 0) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
-                        "new_qp_state=%x State_xsition=%x ehca_qp=%p "
-                        "qp_num=%x", qp_cur_state, qp_new_state,
-                        statetrans, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
-
-       if (qp_attr_idx < 0) {
-               ret = qp_attr_idx;
-               ehca_err(ibqp->device,
-                        "Invalid QP type=%x ehca_qp=%p qp_num=%x",
-                        ibqp->qp_type, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device,
-                "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
-                my_qp, ibqp->qp_num, statetrans);
-
-       /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
-        * in non-LL UD QPs.
-        */
-       if ((my_qp->qp_type == IB_QPT_UD) &&
-           (my_qp->ext_type != EQPT_LLQP) &&
-           (statetrans == IB_QPST_INIT2RTR) &&
-           (shca->hw_level >= 0x22)) {
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-               mqpcb->send_grh_flag = 1;
-       }
-
-       /* sqe -> rts: set purge bit of bad wqe before actual trans */
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* mark next free wqe if kernel */
-               if (!ibqp->uobject) {
-                       struct ehca_wqe *wqe;
-                       /* lock send queue */
-                       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-                       squeue_locked = 1;
-                       /* mark next free wqe */
-                       wqe = (struct ehca_wqe *)
-                               ipz_qeit_get(&my_qp->ipz_squeue);
-                       wqe->optype = wqe->wqef = 0xff;
-                       ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
-                                ibqp->qp_num, wqe);
-               }
-               ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
-               if (ret) {
-                       ehca_err(ibqp->device, "prepare_sqe_rts() failed "
-                                "ehca_qp=%p qp_num=%x ret=%i",
-                                my_qp, ibqp->qp_num, ret);
-                       goto modify_qp_exit2;
-               }
-       }
-
-       /*
-        * enable RDMA_Atomic_Control if reset->init und reliable con
-        * this is necessary since gen2 does not provide that flag,
-        * but pHyp requires it
-        */
-       if (statetrans == IB_QPST_RESET2INIT &&
-           (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
-               mqpcb->rdma_atomic_ctrl = 3;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
-       }
-       /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
-       if (statetrans == IB_QPST_INIT2RTR &&
-           (ibqp->qp_type == IB_QPT_UC) &&
-           !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
-               mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX) {
-               if (attr->pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_p_key_idx = attr->pkey_index;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
-       }
-       if (attr_mask & IB_QP_PORT) {
-               struct ehca_sport *sport;
-               struct ehca_qp *aqp1;
-               if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               sport = &shca->sport[attr->port_num - 1];
-               if (!sport->ibqp_sqp[IB_QPT_GSI]) {
-                       /* should not occur */
-                       ret = -EFAULT;
-                       ehca_err(ibqp->device, "AQP1 was not created for "
-                                "port=%x", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
-                                   struct ehca_qp, ib_qp);
-               if (ibqp->qp_type != IB_QPT_GSI &&
-                   ibqp->qp_type != IB_QPT_SMI &&
-                   aqp1->mod_qp_parm) {
-                       /*
-                        * firmware will reject this modify_qp() because
-                        * port is not activated/initialized fully
-                        */
-                       ret = -EFAULT;
-                       ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
-                                 "either port is being activated (try again) "
-                                 "or cabling issue", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_phys_port = attr->port_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
-       }
-       if (attr_mask & IB_QP_QKEY) {
-               mqpcb->qkey = attr->qkey;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
-       }
-       if (attr_mask & IB_QP_AV) {
-               mqpcb->dlid = attr->ah_attr.dlid;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
-               mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
-               mqpcb->service_level = attr->ah_attr.sl;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
-
-               if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
-                                 attr->ah_attr.static_rate,
-                                 &mqpcb->max_static_rate)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-
-               /*
-                * only if GRH is TRUE we might consider SOURCE_GID_IDX
-                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
-                */
-               if (attr->ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag = 1;
-
-                       mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid.byte[cnt] =
-                                       attr->ah_attr.grh.dgid.raw[cnt];
-
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
-                       mqpcb->flow_label = attr->ah_attr.grh.flow_label;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
-                       mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
-                       mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_PATH_MTU) {
-               /* store ld(MTU) */
-               my_qp->mtu_shift = attr->path_mtu + 7;
-               mqpcb->path_mtu = attr->path_mtu;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
-       }
-       if (attr_mask & IB_QP_TIMEOUT) {
-               mqpcb->timeout = attr->timeout;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
-       }
-       if (attr_mask & IB_QP_RETRY_CNT) {
-               mqpcb->retry_count = attr->retry_cnt;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RNR_RETRY) {
-               mqpcb->rnr_retry_count = attr->rnr_retry;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RQ_PSN) {
-               mqpcb->receive_psn = attr->rq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
-       }
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
-               mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
-                       attr->max_dest_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
-               mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
-                       attr->max_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET
-                       (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
-       }
-       if (attr_mask & IB_QP_ALT_PATH) {
-               if (attr->alt_port_num < 1
-                   || attr->alt_port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->alt_port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_phys_port = attr->alt_port_num;
-
-               if (attr->alt_pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_p_key_idx = attr->alt_pkey_index;
-
-               mqpcb->timeout_al = attr->alt_timeout;
-               mqpcb->dlid_al = attr->alt_ah_attr.dlid;
-               mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
-               mqpcb->service_level_al = attr->alt_ah_attr.sl;
-
-               if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
-                                 attr->alt_ah_attr.static_rate,
-                                 &mqpcb->max_static_rate_al)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-
-               /* OpenIB doesn't support alternate retry counts - copy them */
-               mqpcb->retry_count_al = mqpcb->retry_count;
-               mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
-
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
-
-               /*
-                * only if GRH is TRUE we might consider SOURCE_GID_IDX
-                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
-                */
-               if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag_al = 1;
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid_al.byte[cnt] =
-                                       attr->alt_ah_attr.grh.dgid.raw[cnt];
-                       mqpcb->source_gid_idx_al =
-                               attr->alt_ah_attr.grh.sgid_index;
-                       mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
-                       mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
-                       mqpcb->traffic_class_al =
-                               attr->alt_ah_attr.grh.traffic_class;
-
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
-                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
-               mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
-       }
-
-       if (attr_mask & IB_QP_SQ_PSN) {
-               mqpcb->send_psn = attr->sq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
-       }
-
-       if (attr_mask & IB_QP_DEST_QPN) {
-               mqpcb->dest_qp_nr = attr->dest_qp_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
-       }
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE) {
-               if (attr->path_mig_state != IB_MIG_REARM
-                   && attr->path_mig_state != IB_MIG_MIGRATED) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid mig_state=%x",
-                                attr->path_mig_state);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->path_migration_state = attr->path_mig_state + 1;
-               if (attr->path_mig_state == IB_MIG_REARM)
-                       my_qp->mig_armed = 1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
-       }
-
-       if (attr_mask & IB_QP_CAP) {
-               mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
-               mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
-               /* no support for max_send/recv_sge yet */
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                my_qp->ipz_qp_handle,
-                                &my_qp->pf,
-                                update_mask,
-                                mqpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
-               goto modify_qp_exit2;
-       }
-
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* doorbell to reprocessing wqes */
-               iosync(); /* serialize GAL register access */
-               hipz_update_sqa(my_qp, bad_wqe_cnt-1);
-               ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
-       }
-
-       if (statetrans == IB_QPST_RESET2INIT ||
-           statetrans == IB_QPST_INIT2INIT) {
-               mqpcb->qp_enable = 1;
-               mqpcb->qp_state = EHCA_QPS_INIT;
-               update_mask = 0;
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-
-               h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                        my_qp->ipz_qp_handle,
-                                        &my_qp->pf,
-                                        update_mask,
-                                        mqpcb,
-                                        my_qp->galpas.kernel);
-
-               if (h_ret != H_SUCCESS) {
-                       ret = ehca2ib_return_code(h_ret);
-                       ehca_err(ibqp->device, "ENABLE in context of "
-                                "RESET_2_INIT failed! Maybe you didn't get "
-                                "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
-                                h_ret, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-       }
-       if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
-           && !is_user) {
-               ret = check_for_left_cqes(my_qp, shca);
-               if (ret)
-                       goto modify_qp_exit2;
-       }
-
-       if (statetrans == IB_QPST_ANY2RESET) {
-               ipz_qeit_reset(&my_qp->ipz_rqueue);
-               ipz_qeit_reset(&my_qp->ipz_squeue);
-
-               if (qp_cur_state == IB_QPS_ERR && !is_user) {
-                       del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-                       if (HAS_RQ(my_qp))
-                               del_from_err_list(my_qp->recv_cq,
-                                                 &my_qp->rq_err_node);
-               }
-               if (!is_user)
-                       reset_queue_map(&my_qp->sq_map);
-
-               if (HAS_RQ(my_qp) && !is_user)
-                       reset_queue_map(&my_qp->rq_map);
-       }
-
-       if (attr_mask & IB_QP_QKEY)
-               my_qp->qkey = attr->qkey;
-
-modify_qp_exit2:
-       if (squeue_locked) { /* this means: sqe -> rts */
-               spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-               my_qp->sqerr_purgeflag = 1;
-       }
-
-modify_qp_exit1:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata)
-{
-       int ret = 0;
-
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-
-       /* The if-block below caches qp_attr to be modified for GSI and SMI
-        * qps during the initialization by ib_mad. When the respective port
-        * is activated, ie we got an event PORT_ACTIVE, we'll replay the
-        * cached modify calls sequence, see ehca_recover_sqs() below.
-        * Why that is required:
-        * 1) If one port is connected, older code requires that port one
-        *    to be connected and module option nr_ports=1 to be given by
-        *    user, which is very inconvenient for end user.
-        * 2) Firmware accepts modify_qp() only if respective port has become
-        *    active. Older code had a wait loop of 30sec create_qp()/
-        *    define_aqp1(), which is not appropriate in practice. This
-        *    code now removes that wait loop, see define_aqp1(), and always
-        *    reports all ports to ib_mad resp. users. Only activated ports
-        *    will then usable for the users.
-        */
-       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
-               int port = my_qp->init_attr.port_num;
-               struct ehca_sport *sport = &shca->sport[port - 1];
-               unsigned long flags;
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               /* cache qp_attr only during init */
-               if (my_qp->mod_qp_parm) {
-                       struct ehca_mod_qp_parm *p;
-                       if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
-                               ehca_err(&shca->ib_device,
-                                        "mod_qp_parm overflow state=%x port=%x"
-                                        " type=%x", attr->qp_state,
-                                        my_qp->init_attr.port_num,
-                                        ibqp->qp_type);
-                               spin_unlock_irqrestore(&sport->mod_sqp_lock,
-                                                      flags);
-                               return -EINVAL;
-                       }
-                       p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
-                       p->mask = attr_mask;
-                       p->attr = *attr;
-                       my_qp->mod_qp_parm_idx++;
-                       ehca_dbg(&shca->ib_device,
-                                "Saved qp_attr for state=%x port=%x type=%x",
-                                attr->qp_state, my_qp->init_attr.port_num,
-                                ibqp->qp_type);
-                       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-                       goto out;
-               }
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
-
-out:
-       if ((ret == 0) && (attr_mask & IB_QP_STATE))
-               my_qp->state = attr->qp_state;
-
-       return ret;
-}
-
-void ehca_recover_sqp(struct ib_qp *sqp)
-{
-       struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
-       int port = my_sqp->init_attr.port_num;
-       struct ib_qp_attr attr;
-       struct ehca_mod_qp_parm *qp_parm;
-       int i, qp_parm_idx, ret;
-       unsigned long flags, wr_cnt;
-
-       if (!my_sqp->mod_qp_parm)
-               return;
-       ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
-
-       qp_parm = my_sqp->mod_qp_parm;
-       qp_parm_idx = my_sqp->mod_qp_parm_idx;
-       for (i = 0; i < qp_parm_idx; i++) {
-               attr = qp_parm[i].attr;
-               ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
-               if (ret) {
-                       ehca_err(sqp->device, "Could not modify SQP port=%x "
-                                "qp_num=%x ret=%x", port, sqp->qp_num, ret);
-                       goto free_qp_parm;
-               }
-               ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
-                        port, sqp->qp_num, attr.qp_state);
-       }
-
-       /* re-trigger posted recv wrs */
-       wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
-               my_sqp->ipz_rqueue.qe_size;
-       if (wr_cnt) {
-               spin_lock_irqsave(&my_sqp->spinlock_r, flags);
-               hipz_update_rqa(my_sqp, wr_cnt);
-               spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
-               ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
-                        port, sqp->qp_num, wr_cnt);
-       }
-
-free_qp_parm:
-       kfree(qp_parm);
-       /* this prevents subsequent calls to modify_qp() to cache qp_attr */
-       my_sqp->mod_qp_parm = NULL;
-}
-
-int ehca_query_qp(struct ib_qp *qp,
-                 struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int cnt, ret = 0;
-       u64 h_ret;
-
-       if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
-               ehca_err(qp->device, "Invalid attribute mask "
-                        "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
-                        my_qp, qp->qp_num, qp_attr_mask);
-               return -EINVAL;
-       }
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(qp->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(qp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp->qp_num, h_ret);
-               goto query_qp_exit1;
-       }
-
-       qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
-       qp_attr->qp_state = qp_attr->cur_qp_state;
-
-       if (qp_attr->cur_qp_state == -EINVAL) {
-               ret = -EINVAL;
-               ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qpcb->qp_state, my_qp, qp->qp_num);
-               goto query_qp_exit1;
-       }
-
-       if (qp_attr->qp_state == IB_QPS_SQD)
-               qp_attr->sq_draining = 1;
-
-       qp_attr->qkey = qpcb->qkey;
-       qp_attr->path_mtu = qpcb->path_mtu;
-       qp_attr->path_mig_state = qpcb->path_migration_state - 1;
-       qp_attr->rq_psn = qpcb->receive_psn;
-       qp_attr->sq_psn = qpcb->send_psn;
-       qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
-       qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
-       qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
-       /* UD_AV CIRCUMVENTION */
-       if (my_qp->qp_type == IB_QPT_UD) {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe - 2;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe - 2;
-       } else {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe;
-       }
-
-       qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
-       qp_attr->dest_qp_num = qpcb->dest_qp_nr;
-
-       qp_attr->pkey_index = qpcb->prim_p_key_idx;
-       qp_attr->port_num = qpcb->prim_phys_port;
-       qp_attr->timeout = qpcb->timeout;
-       qp_attr->retry_cnt = qpcb->retry_count;
-       qp_attr->rnr_retry = qpcb->rnr_retry_count;
-
-       qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
-       qp_attr->alt_port_num = qpcb->alt_phys_port;
-       qp_attr->alt_timeout = qpcb->timeout_al;
-
-       qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
-       qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
-
-       /* primary av */
-       qp_attr->ah_attr.sl = qpcb->service_level;
-
-       if (qpcb->send_grh_flag) {
-               qp_attr->ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
-       qp_attr->ah_attr.dlid = qpcb->dlid;
-       qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
-       qp_attr->ah_attr.port_num = qp_attr->port_num;
-
-       /* primary GRH */
-       qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
-       qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
-       qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
-       qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid.byte[cnt];
-
-       /* alternate AV */
-       qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
-       if (qpcb->send_grh_flag_al) {
-               qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
-       qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
-       qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
-
-       /* alternate GRH */
-       qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
-       qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
-       qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
-       qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid_al.byte[cnt];
-
-       /* return init attributes given in ehca_create_qp */
-       if (qp_init_attr)
-               *qp_init_attr = my_qp->init_attr;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
-
-query_qp_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
-{
-       struct ehca_qp *my_qp =
-               container_of(ibsrq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca =
-               container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 update_mask;
-       u64 h_ret;
-       int ret = 0;
-
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       update_mask = 0;
-       if (attr_mask & IB_SRQ_LIMIT) {
-               attr_mask &= ~IB_SRQ_LIMIT;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
-               mqpcb->curr_srq_limit = attr->srq_limit;
-               mqpcb->qp_aff_asyn_ev_log_reg =
-                       EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
-       }
-
-       /* by now, all bits in attr_mask should have been cleared */
-       if (attr_mask) {
-               ehca_err(ibsrq->device, "invalid attribute mask bits set  "
-                        "attr_mask=%x", attr_mask);
-               ret = -EINVAL;
-               goto modify_srq_exit0;
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
-                                NULL, update_mask, mqpcb,
-                                my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x",
-                        h_ret, my_qp, my_qp->real_qp_num);
-       }
-
-modify_srq_exit0:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
-{
-       struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int ret = 0;
-       u64 h_ret;
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(srq->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
-                               NULL, qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(srq->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, h_ret);
-               goto query_srq_exit1;
-       }
-
-       srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
-       srq_attr->max_sge = 3;
-       srq_attr->srq_limit = qpcb->curr_srq_limit;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-query_srq_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject)
-{
-       struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
-       struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
-                                            ib_pd);
-       struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
-       u32 qp_num = my_qp->real_qp_num;
-       int ret;
-       u64 h_ret;
-       u8 port_num;
-       int is_user = 0;
-       enum ib_qp_type qp_type;
-       unsigned long flags;
-
-       if (uobject) {
-               is_user = 1;
-               if (my_qp->mm_count_galpa ||
-                   my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
-                       ehca_err(dev, "Resources still referenced in "
-                                "user space qp_num=%x", qp_num);
-                       return -EINVAL;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
-               if (ret) {
-                       ehca_err(dev, "Couldn't unassign qp from "
-                                "send_cq ret=%i qp_num=%x cq_num=%x", ret,
-                                qp_num, my_qp->send_cq->cq_number);
-                       return ret;
-               }
-       }
-
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-       /*
-        * SRQs will never get into an error list and do not have a recv_cq,
-        * so we need to skip them here.
-        */
-       if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
-
-       if (HAS_SQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
-
-       h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
-               return ehca2ib_return_code(h_ret);
-       }
-
-       port_num = my_qp->init_attr.port_num;
-       qp_type  = my_qp->init_attr.qp_type;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               kfree(my_qp->mod_qp_parm);
-               my_qp->mod_qp_parm = NULL;
-               shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       /* no support for IB_QPT_SMI yet */
-       if (qp_type == IB_QPT_GSI) {
-               struct ib_event event;
-               ehca_info(dev, "device %s: port %x is inactive.",
-                               shca->ib_device.name, port_num);
-               event.device = &shca->ib_device;
-               event.event = IB_EVENT_PORT_ERR;
-               event.element.port_num = port_num;
-               shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
-               ib_dispatch_event(&event);
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-               if (!is_user)
-                       vfree(my_qp->rq_map.map);
-       }
-       if (HAS_SQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-               if (!is_user)
-                       vfree(my_qp->sq_map.map);
-       }
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return 0;
-}
-
-int ehca_destroy_qp(struct ib_qp *qp)
-{
-       return internal_destroy_qp(qp->device,
-                                  container_of(qp, struct ehca_qp, ib_qp),
-                                  qp->uobject);
-}
-
-int ehca_destroy_srq(struct ib_srq *srq)
-{
-       return internal_destroy_qp(srq->device,
-                                  container_of(srq, struct ehca_qp, ib_srq),
-                                  srq->uobject);
-}
-
-int ehca_init_qp_cache(void)
-{
-       qp_cache = kmem_cache_create("ehca_cache_qp",
-                                    sizeof(struct ehca_qp), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!qp_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_qp_cache(void)
-{
-       if (qp_cache)
-               kmem_cache_destroy(qp_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
deleted file mode 100644 (file)
index 47f9498..0000000
+++ /dev/null
@@ -1,953 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  post_send/recv, poll_cq, req_notify
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-/* in RC traffic, insert an empty RDMA READ every this many packets */
-#define ACK_CIRC_THRESHOLD 2000000
-
-static u64 replace_wr_id(u64 wr_id, u16 idx)
-{
-       u64 ret;
-
-       ret = wr_id & ~QMAP_IDX_MASK;
-       ret |= idx & QMAP_IDX_MASK;
-
-       return ret;
-}
-
-static u16 get_app_wr_id(u64 wr_id)
-{
-       return wr_id & QMAP_IDX_MASK;
-}
-
-static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
-                                 struct ehca_wqe *wqe_p,
-                                 struct ib_recv_wr *recv_wr,
-                                 u32 rq_map_idx)
-{
-       u8 cnt_ds;
-       if (unlikely((recv_wr->num_sge < 0) ||
-                    (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sqe=%x max_nr_of_sg=%x",
-                        recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
-       wqe_p->nr_of_data_seg = recv_wr->num_sge;
-
-       for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
-               wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
-                       recv_wr->sg_list[cnt_ds].addr;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
-                       recv_wr->sg_list[cnt_ds].lkey;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].length =
-                       recv_wr->sg_list[cnt_ds].length;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
-                            ipz_rqueue);
-               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
-       }
-
-       return 0;
-}
-
-#if defined(DEBUG_GSI_SEND_WR)
-
-/* need ib_mad struct */
-#include <rdma/ib_mad.h>
-
-static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
-{
-       int idx;
-       int j;
-       while (send_wr) {
-               struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
-               struct ib_sge *sge = send_wr->sg_list;
-               ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
-                            "send_flags=%x opcode=%x", idx, send_wr->wr_id,
-                            send_wr->num_sge, send_wr->send_flags,
-                            send_wr->opcode);
-               if (mad_hdr) {
-                       ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
-                                    "mgmt_class=%x class_version=%x method=%x "
-                                    "status=%x class_specific=%x tid=%lx "
-                                    "attr_id=%x resv=%x attr_mod=%x",
-                                    idx, mad_hdr->base_version,
-                                    mad_hdr->mgmt_class,
-                                    mad_hdr->class_version, mad_hdr->method,
-                                    mad_hdr->status, mad_hdr->class_specific,
-                                    mad_hdr->tid, mad_hdr->attr_id,
-                                    mad_hdr->resv,
-                                    mad_hdr->attr_mod);
-               }
-               for (j = 0; j < send_wr->num_sge; j++) {
-                       u8 *data = __va(sge->addr);
-                       ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
-                                    "lkey=%x",
-                                    idx, j, data, sge->length, sge->lkey);
-                       /* assume length is n*16 */
-                       ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
-                                idx, j);
-                       sge++;
-               } /* eof for j */
-               idx++;
-               send_wr = send_wr->next;
-       } /* eof while send_wr */
-}
-
-#endif /* DEBUG_GSI_SEND_WR */
-
-static inline int ehca_write_swqe(struct ehca_qp *qp,
-                                 struct ehca_wqe *wqe_p,
-                                 const struct ib_send_wr *send_wr,
-                                 u32 sq_map_idx,
-                                 int hidden)
-{
-       u32 idx;
-       u64 dma_length;
-       struct ehca_av *my_av;
-       u32 remote_qkey = send_wr->wr.ud.remote_qkey;
-       struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
-
-       if (unlikely((send_wr->num_sge < 0) ||
-                    (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sqe=%x max_nr_of_sg=%x",
-                        send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
-
-       qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
-       qmap_entry->reported = 0;
-       qmap_entry->cqe_req = 0;
-
-       switch (send_wr->opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_SEND;
-               break;
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
-               break;
-       case IB_WR_RDMA_READ:
-               wqe_p->optype = WQE_OPTYPE_RDMAREAD;
-               break;
-       default:
-               ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
-               return -EINVAL; /* invalid opcode */
-       }
-
-       wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
-
-       wqe_p->wr_flag = 0;
-
-       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
-           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
-           && !hidden) {
-               wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
-               qmap_entry->cqe_req = 1;
-       }
-
-       if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
-           send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
-               /* this might not work as long as HW does not support it */
-               wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
-               wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
-       }
-
-       wqe_p->nr_of_data_seg = send_wr->num_sge;
-
-       switch (qp->qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               /* no break is intential here */
-       case IB_QPT_UD:
-               /* IB 1.2 spec C10-15 compliance */
-               if (send_wr->wr.ud.remote_qkey & 0x80000000)
-                       remote_qkey = qp->qkey;
-
-               wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
-               wqe_p->local_ee_context_qkey = remote_qkey;
-               if (unlikely(!send_wr->wr.ud.ah)) {
-                       ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
-                       return -EINVAL;
-               }
-               if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
-                       ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
-                       return -EINVAL;
-               }
-               my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
-               wqe_p->u.ud_av.ud_av = my_av->av;
-
-               /*
-                * omitted check of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.ud_av.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.ud_av.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.ud_av.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-               } /* eof for idx */
-               if (qp->qp_type == IB_QPT_SMI ||
-                   qp->qp_type == IB_QPT_GSI)
-                       wqe_p->u.ud_av.ud_av.pmtu = 1;
-               if (qp->qp_type == IB_QPT_GSI) {
-                       wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
-#ifdef DEBUG_GSI_SEND_WR
-                       trace_send_wr_ud(send_wr);
-#endif /* DEBUG_GSI_SEND_WR */
-               }
-               break;
-
-       case IB_QPT_UC:
-               if (send_wr->send_flags & IB_SEND_FENCE)
-                       wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
-               /* no break is intentional here */
-       case IB_QPT_RC:
-               /* TODO: atomic not implemented */
-               wqe_p->u.nud.remote_virtual_address =
-                       send_wr->wr.rdma.remote_addr;
-               wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
-
-               /*
-                * omitted checking of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               dma_length = 0;
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.nud.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.nud.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.nud.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-                       dma_length += send_wr->sg_list[idx].length;
-               } /* eof idx */
-               wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
-
-               /* unsolicited ack circumvention */
-               if (send_wr->opcode == IB_WR_RDMA_READ) {
-                       /* on RDMA read, switch on and reset counters */
-                       qp->message_count = qp->packet_count = 0;
-                       qp->unsol_ack_circ = 1;
-               } else
-                       /* else estimate #packets */
-                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
-
-               break;
-
-       default:
-               ehca_gen_err("Invalid qptype=%x", qp->qp_type);
-               return -EINVAL;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
-               ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
-       }
-       return 0;
-}
-
-/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
-static inline void map_ib_wc_status(u32 cqe_status,
-                                   enum ib_wc_status *wc_status)
-{
-       if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
-               switch (cqe_status & 0x3F) {
-               case 0x01:
-               case 0x21:
-                       *wc_status = IB_WC_LOC_LEN_ERR;
-                       break;
-               case 0x02:
-               case 0x22:
-                       *wc_status = IB_WC_LOC_QP_OP_ERR;
-                       break;
-               case 0x03:
-               case 0x23:
-                       *wc_status = IB_WC_LOC_EEC_OP_ERR;
-                       break;
-               case 0x04:
-               case 0x24:
-                       *wc_status = IB_WC_LOC_PROT_ERR;
-                       break;
-               case 0x05:
-               case 0x25:
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               case 0x06:
-                       *wc_status = IB_WC_MW_BIND_ERR;
-                       break;
-               case 0x07: /* remote error - look into bits 20:24 */
-                       switch ((cqe_status
-                                & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
-                       case 0x0:
-                               /*
-                                * PSN Sequence Error!
-                                * couldn't find a matching status!
-                                */
-                               *wc_status = IB_WC_GENERAL_ERR;
-                               break;
-                       case 0x1:
-                               *wc_status = IB_WC_REM_INV_REQ_ERR;
-                               break;
-                       case 0x2:
-                               *wc_status = IB_WC_REM_ACCESS_ERR;
-                               break;
-                       case 0x3:
-                               *wc_status = IB_WC_REM_OP_ERR;
-                               break;
-                       case 0x4:
-                               *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
-                               break;
-                       }
-                       break;
-               case 0x08:
-                       *wc_status = IB_WC_RETRY_EXC_ERR;
-                       break;
-               case 0x09:
-                       *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
-                       break;
-               case 0x0A:
-               case 0x2D:
-                       *wc_status = IB_WC_REM_ABORT_ERR;
-                       break;
-               case 0x0B:
-               case 0x2E:
-                       *wc_status = IB_WC_INV_EECN_ERR;
-                       break;
-               case 0x0C:
-               case 0x2F:
-                       *wc_status = IB_WC_INV_EEC_STATE_ERR;
-                       break;
-               case 0x0D:
-                       *wc_status = IB_WC_BAD_RESP_ERR;
-                       break;
-               case 0x10:
-                       /* WQE purged */
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               default:
-                       *wc_status = IB_WC_FATAL_ERR;
-
-               }
-       } else
-               *wc_status = IB_WC_SUCCESS;
-}
-
-static inline int post_one_send(struct ehca_qp *my_qp,
-                        struct ib_send_wr *cur_send_wr,
-                        int hidden)
-{
-       struct ehca_wqe *wqe_p;
-       int ret;
-       u32 sq_map_idx;
-       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-
-       /* get pointer next to free WQE */
-       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-       if (unlikely(!wqe_p)) {
-               /* too many posted work requests: queue overflow */
-               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -ENOMEM;
-       }
-
-       /*
-        * Get the index of the WQE in the send queue. The same index is used
-        * for writing into the sq_map.
-        */
-       sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
-
-       /* write a SEND WQE into the QUEUE */
-       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
-       /*
-        * if something failed,
-        * reset the free entry pointer to the start value
-        */
-       if (unlikely(ret)) {
-               my_qp->ipz_squeue.current_q_offset = start_offset;
-               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_post_send(struct ib_qp *qp,
-                  struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       int wqe_cnt = 0;
-       int ret = 0;
-       unsigned long flags;
-
-       /* Reject WR if QP is in RESET, INIT or RTR state */
-       if (unlikely(my_qp->state < IB_QPS_RTS)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-
-       /* Send an empty extra RDMA read if:
-        *  1) there has been an RDMA read on this connection before
-        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
-        *  3) we can be sure that any previous extra RDMA read has been
-        *     processed so we don't overflow the SQ
-        */
-       if (unlikely(my_qp->unsol_ack_circ &&
-                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
-                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
-               /* insert an empty RDMA READ to fix up the remote QP state */
-               struct ib_send_wr circ_wr;
-               memset(&circ_wr, 0, sizeof(circ_wr));
-               circ_wr.opcode = IB_WR_RDMA_READ;
-               post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
-               wqe_cnt++;
-               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
-               my_qp->message_count = my_qp->packet_count = 0;
-       }
-
-       /* loop processes list of send reqs */
-       while (send_wr) {
-               ret = post_one_send(my_qp, send_wr, 0);
-               if (unlikely(ret)) {
-                       goto post_send_exit0;
-               }
-               wqe_cnt++;
-               send_wr = send_wr->next;
-       }
-
-post_send_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_sqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-               ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                        my_qp, qp->qp_num, wqe_cnt, ret);
-       my_qp->message_count += wqe_cnt;
-       spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-
-out:
-       if (ret)
-               *bad_send_wr = send_wr;
-       return ret;
-}
-
-static int internal_post_recv(struct ehca_qp *my_qp,
-                             struct ib_device *dev,
-                             struct ib_recv_wr *recv_wr,
-                             struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_wqe *wqe_p;
-       int wqe_cnt = 0;
-       int ret = 0;
-       u32 rq_map_idx;
-       unsigned long flags;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (unlikely(!HAS_RQ(my_qp))) {
-               ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
-                        my_qp, my_qp->real_qp_num, my_qp->ext_type);
-               ret = -ENODEV;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_r, flags);
-
-       /* loop processes list of recv reqs */
-       while (recv_wr) {
-               u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
-               /* get pointer next to free WQE */
-               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
-               if (unlikely(!wqe_p)) {
-                       /* too many posted work requests: queue overflow */
-                       ret = -ENOMEM;
-                       ehca_err(dev, "Too many posted WQEs "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-               /*
-                * Get the index of the WQE in the recv queue. The same index
-                * is used for writing into the rq_map.
-                */
-               rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
-
-               /* write a RECV WQE into the QUEUE */
-               ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
-                               rq_map_idx);
-               /*
-                * if something failed,
-                * reset the free entry pointer to the start value
-                */
-               if (unlikely(ret)) {
-                       my_qp->ipz_rqueue.current_q_offset = start_offset;
-                       ret = -EINVAL;
-                       ehca_err(dev, "Could not write WQE "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-
-               qmap_entry = &my_qp->rq_map.map[rq_map_idx];
-               qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
-               qmap_entry->reported = 0;
-               qmap_entry->cqe_req = 1;
-
-               wqe_cnt++;
-               recv_wr = recv_wr->next;
-       } /* eof for recv_wr */
-
-post_recv_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_rqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-           ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                    my_qp, my_qp->real_qp_num, wqe_cnt, ret);
-       spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
-
-out:
-       if (ret)
-               *bad_recv_wr = recv_wr;
-
-       return ret;
-}
-
-int ehca_post_recv(struct ib_qp *qp,
-                  struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-
-       /* Reject WR if QP is in RESET state */
-       if (unlikely(my_qp->state == IB_QPS_RESET)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               *bad_recv_wr = recv_wr;
-               return -EINVAL;
-       }
-
-       return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
-}
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr)
-{
-       return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
-                                 srq->device, recv_wr, bad_recv_wr);
-}
-
-/*
- * ib_wc_opcode table converts ehca wc opcode to ib
- * Since we use zero to indicate invalid opcode, the actual ib opcode must
- * be decremented!!!
- */
-static const u8 ib_wc_opcode[255] = {
-       [0x01] = IB_WC_RECV+1,
-       [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
-       [0x04] = IB_WC_BIND_MW+1,
-       [0x08] = IB_WC_FETCH_ADD+1,
-       [0x10] = IB_WC_COMP_SWAP+1,
-       [0x20] = IB_WC_RDMA_WRITE+1,
-       [0x40] = IB_WC_RDMA_READ+1,
-       [0x80] = IB_WC_SEND+1
-};
-
-/* internal function to poll one entry of cq */
-static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
-{
-       int ret = 0, qmap_tail_idx;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       struct ehca_cqe *cqe;
-       struct ehca_qp *my_qp;
-       struct ehca_qmap_entry *qmap_entry;
-       struct ehca_queue_map *qmap;
-       int cqe_count = 0, is_error;
-
-repoll:
-       cqe = (struct ehca_cqe *)
-               ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
-       if (!cqe) {
-               ret = -EAGAIN;
-               if (ehca_debug_level >= 3)
-                       ehca_dbg(cq->device, "Completion queue is empty  "
-                                "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
-               goto poll_cq_one_exit0;
-       }
-
-       /* prevents loads being reordered across this point */
-       rmb();
-
-       cqe_count++;
-       if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
-               struct ehca_qp *qp;
-               int purgeflag;
-               unsigned long flags;
-
-               qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
-               if (!qp) {
-                       ehca_err(cq->device, "cq_num=%x qp_num=%x "
-                                "could not find qp -> ignore cqe",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       /* ignore this purged cqe */
-                       goto repoll;
-               }
-               spin_lock_irqsave(&qp->spinlock_s, flags);
-               purgeflag = qp->sqerr_purgeflag;
-               spin_unlock_irqrestore(&qp->spinlock_s, flags);
-
-               if (purgeflag) {
-                       ehca_dbg(cq->device,
-                                "Got CQE with purged bit qp_num=%x src_qp=%x",
-                                cqe->local_qp_number, cqe->remote_qp_number);
-                       if (ehca_debug_level >= 2)
-                               ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
-                                        cqe->local_qp_number,
-                                        cqe->remote_qp_number);
-                       /*
-                        * ignore this to avoid double cqes of bad wqe
-                        * that caused sqe and turn off purge flag
-                        */
-                       qp->sqerr_purgeflag = 0;
-                       goto repoll;
-               }
-       }
-
-       is_error = cqe->status & WC_STATUS_ERROR_BIT;
-
-       /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
-       if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
-               ehca_dbg(cq->device,
-                        "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
-                        is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               ehca_dbg(cq->device,
-                        "ehca_cq=%p cq_num=%x -------------------------",
-                        my_cq, my_cq->cq_number);
-       }
-
-       read_lock(&ehca_qp_idr_lock);
-       my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
-       read_unlock(&ehca_qp_idr_lock);
-       if (!my_qp)
-               goto repoll;
-       wc->qp = &my_qp->ib_qp;
-
-       qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
-       if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
-               /* We got a send completion. */
-               qmap = &my_qp->sq_map;
-       else
-               /* We got a receive completion. */
-               qmap = &my_qp->rq_map;
-
-       /* advance the tail pointer */
-       qmap->tail = qmap_tail_idx;
-
-       if (is_error) {
-               /*
-                * set left_to_poll to 0 because in error state, we will not
-                * get any additional CQEs
-                */
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               my_qp->sq_map.left_to_poll = 0;
-               ehca_add_to_err_list(my_qp, 1);
-
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               my_qp->rq_map.left_to_poll = 0;
-               if (HAS_RQ(my_qp))
-                       ehca_add_to_err_list(my_qp, 0);
-       }
-
-       qmap_entry = &qmap->map[qmap_tail_idx];
-       if (qmap_entry->reported) {
-               ehca_warn(cq->device, "Double cqe on qp_num=%#x",
-                               my_qp->real_qp_num);
-               /* found a double cqe, discard it and read next one */
-               goto repoll;
-       }
-
-       wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
-       qmap_entry->reported = 1;
-
-       /* if left_to_poll is decremented to 0, add the QP to the error list */
-       if (qmap->left_to_poll > 0) {
-               qmap->left_to_poll--;
-               if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-                       ehca_add_to_err_list(my_qp, 1);
-                       if (HAS_RQ(my_qp))
-                               ehca_add_to_err_list(my_qp, 0);
-               }
-       }
-
-       /* eval ib_wc_opcode */
-       wc->opcode = ib_wc_opcode[cqe->optype]-1;
-       if (unlikely(wc->opcode == -1)) {
-               ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
-                        "ehca_cq=%p cq_num=%x",
-                        cqe->optype, cqe->status, my_cq, my_cq->cq_number);
-               /* dump cqe for other infos */
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               /* update also queue adder to throw away this entry!!! */
-               goto repoll;
-       }
-
-       /* eval ib_wc_status */
-       if (unlikely(is_error)) {
-               /* complete with errors */
-               map_ib_wc_status(cqe->status, &wc->status);
-               wc->vendor_err = wc->status;
-       } else
-               wc->status = IB_WC_SUCCESS;
-
-       wc->byte_len = cqe->nr_bytes_transferred;
-       wc->pkey_index = cqe->pkey_index;
-       wc->slid = cqe->rlid;
-       wc->dlid_path_bits = cqe->dlid;
-       wc->src_qp = cqe->remote_qp_number;
-       /*
-        * HW has "Immed data present" and "GRH present" in bits 6 and 5.
-        * SW defines those in bits 1 and 0, so we can just shift and mask.
-        */
-       wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
-       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
-       wc->sl = cqe->service_level;
-
-poll_cq_one_exit0:
-       if (cqe_count > 0)
-               hipz_update_feca(my_cq, cqe_count);
-
-       return ret;
-}
-
-static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
-                              struct ib_wc *wc, int num_entries,
-                              struct ipz_queue *ipz_queue, int on_sq)
-{
-       int nr = 0;
-       struct ehca_wqe *wqe;
-       u64 offset;
-       struct ehca_queue_map *qmap;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (on_sq)
-               qmap = &my_qp->sq_map;
-       else
-               qmap = &my_qp->rq_map;
-
-       qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-       while ((nr < num_entries) && (qmap_entry->reported == 0)) {
-               /* generate flush CQE */
-
-               memset(wc, 0, sizeof(*wc));
-
-               offset = qmap->next_wqe_idx * ipz_queue->qe_size;
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
-               if (!wqe) {
-                       ehca_err(cq->device, "Invalid wqe offset=%#llx on "
-                                "qp_num=%#x", offset, my_qp->real_qp_num);
-                       return nr;
-               }
-
-               wc->wr_id = replace_wr_id(wqe->work_request_id,
-                                         qmap_entry->app_wr_id);
-
-               if (on_sq) {
-                       switch (wqe->optype) {
-                       case WQE_OPTYPE_SEND:
-                               wc->opcode = IB_WC_SEND;
-                               break;
-                       case WQE_OPTYPE_RDMAWRITE:
-                               wc->opcode = IB_WC_RDMA_WRITE;
-                               break;
-                       case WQE_OPTYPE_RDMAREAD:
-                               wc->opcode = IB_WC_RDMA_READ;
-                               break;
-                       default:
-                               ehca_err(cq->device, "Invalid optype=%x",
-                                               wqe->optype);
-                               return nr;
-                       }
-               } else
-                       wc->opcode = IB_WC_RECV;
-
-               if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
-                       wc->ex.imm_data = wqe->immediate_data;
-                       wc->wc_flags |= IB_WC_WITH_IMM;
-               }
-
-               wc->status = IB_WC_WR_FLUSH_ERR;
-
-               wc->qp = &my_qp->ib_qp;
-
-               /* mark as reported and advance next_wqe pointer */
-               qmap_entry->reported = 1;
-               qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
-                                               qmap->entries);
-               qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-               wc++; nr++;
-       }
-
-       return nr;
-
-}
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int nr;
-       struct ehca_qp *err_qp;
-       struct ib_wc *current_wc = wc;
-       int ret = 0;
-       unsigned long flags;
-       int entries_left = num_entries;
-
-       if (num_entries < 1) {
-               ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
-                        "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
-               ret = -EINVAL;
-               goto poll_cq_exit0;
-       }
-
-       spin_lock_irqsave(&my_cq->spinlock, flags);
-
-       /* generate flush cqes for send queues */
-       list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_squeue, 1);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       /* generate flush cqes for receive queues */
-       list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_rqueue, 0);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       for (nr = 0; nr < entries_left; nr++) {
-               ret = ehca_poll_cq_one(cq, current_wc);
-               if (ret)
-                       break;
-               current_wc++;
-       } /* eof for nr */
-       entries_left -= nr;
-
-       spin_unlock_irqrestore(&my_cq->spinlock, flags);
-       if (ret == -EAGAIN  || !ret)
-               ret = num_entries - entries_left;
-
-poll_cq_exit0:
-       return ret;
-}
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int ret = 0;
-
-       switch (notify_flags & IB_CQ_SOLICITED_MASK) {
-       case IB_CQ_SOLICITED:
-               hipz_set_cqx_n0(my_cq, 1);
-               break;
-       case IB_CQ_NEXT_COMP:
-               hipz_set_cqx_n1(my_cq, 1);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-               unsigned long spl_flags;
-               spin_lock_irqsave(&my_cq->spinlock, spl_flags);
-               ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
-               spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
-       }
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
deleted file mode 100644 (file)
index 376b031..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  SQP functions
- *
- *  Authors: Khadija Souissi <souissi@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <rdma/ib_mad.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define IB_MAD_STATUS_REDIRECT         cpu_to_be16(0x0002)
-#define IB_MAD_STATUS_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_MAD_STATUS_UNSUP_METHOD     cpu_to_be16(0x0008)
-
-#define IB_PMA_CLASS_PORT_INFO         cpu_to_be16(0x0001)
-
-/**
- * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
- * pair is created successfully, the corresponding port gets active.
- *
- * Define Special Queue pair 0 (SMI QP) is still not supported.
- *
- * @qp_init_attr: Queue pair init attributes with port and queue pair type
- */
-
-u64 ehca_define_sqp(struct ehca_shca *shca,
-                   struct ehca_qp *ehca_qp,
-                   struct ib_qp_init_attr *qp_init_attr)
-{
-       u32 pma_qp_nr, bma_qp_nr;
-       u64 ret;
-       u8 port = qp_init_attr->port_num;
-       int counter;
-
-       shca->sport[port - 1].port_state = IB_PORT_DOWN;
-
-       switch (qp_init_attr->qp_type) {
-       case IB_QPT_SMI:
-               /* function not supported yet */
-               break;
-       case IB_QPT_GSI:
-               ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
-                                        ehca_qp->ipz_qp_handle,
-                                        ehca_qp->galpas.kernel,
-                                        (u32) qp_init_attr->port_num,
-                                        &pma_qp_nr, &bma_qp_nr);
-
-               if (ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device,
-                                "Can't define AQP1 for port %x. h_ret=%lli",
-                                port, ret);
-                       return ret;
-               }
-               shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
-               ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
-                        port, pma_qp_nr);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "invalid qp_type=%x",
-                        qp_init_attr->qp_type);
-               return H_PARAMETER;
-       }
-
-       if (ehca_nr_ports < 0) /* autodetect mode */
-               return H_SUCCESS;
-
-       for (counter = 0;
-            shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
-                    counter < ehca_port_act_time;
-            counter++) {
-               ehca_dbg(&shca->ib_device, "... wait until port %x is active",
-                        port);
-               msleep_interruptible(1000);
-       }
-
-       if (counter == ehca_port_act_time) {
-               ehca_err(&shca->ib_device, "Port %x is not active.", port);
-               return H_HARDWARE;
-       }
-
-       return H_SUCCESS;
-}
-
-struct ib_perf {
-       struct ib_mad_hdr mad_hdr;
-       u8 reserved[40];
-       u8 data[192];
-} __attribute__ ((packed));
-
-/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
-struct tcslfl {
-       u32 tc:8;
-       u32 sl:4;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-/* IP Version/TC/FL packed into 32 bits, as in GRH */
-struct vertcfl {
-       u32 ver:4;
-       u32 tc:8;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
-                            const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                            const struct ib_mad *in_mad, struct ib_mad *out_mad)
-{
-       const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
-       struct ib_perf *out_perf = (struct ib_perf *)out_mad;
-       struct ib_class_port_info *poi =
-               (struct ib_class_port_info *)out_perf->data;
-       struct tcslfl *tcslfl =
-               (struct tcslfl *)&poi->redirect_tcslfl;
-       struct ehca_shca *shca =
-               container_of(ibdev, struct ehca_shca, ib_device);
-       struct ehca_sport *sport = &shca->sport[port_num - 1];
-
-       ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
-
-       *out_mad = *in_mad;
-
-       if (in_perf->mad_hdr.class_version != 1) {
-               ehca_warn(ibdev, "Unsupported class_version=%x",
-                         in_perf->mad_hdr.class_version);
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
-               goto perf_reply;
-       }
-
-       switch (in_perf->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-       case IB_MGMT_METHOD_SET:
-               /* set class port info for redirection */
-               out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
-               out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
-               memset(poi, 0, sizeof(*poi));
-               poi->base_version = 1;
-               poi->class_version = 1;
-               poi->resp_time_value = 18;
-
-               /* copy local routing information from WC where applicable */
-               tcslfl->sl         = in_wc->sl;
-               poi->redirect_lid  =
-                       sport->saved_attr.lid | in_wc->dlid_path_bits;
-               poi->redirect_qp   = sport->pma_qp_nr;
-               poi->redirect_qkey = IB_QP1_QKEY;
-
-               ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
-                               &poi->redirect_pkey);
-
-               /* if request was globally routed, copy route info */
-               if (in_grh) {
-                       const struct vertcfl *vertcfl =
-                               (const struct vertcfl *)&in_grh->version_tclass_flow;
-                       memcpy(poi->redirect_gid, in_grh->dgid.raw,
-                              sizeof(poi->redirect_gid));
-                       tcslfl->tc        = vertcfl->tc;
-                       tcslfl->fl        = vertcfl->fl;
-               } else
-                       /* else only fill in default GID */
-                       ehca_query_gid(ibdev, port_num, 0,
-                                      (union ib_gid *)&poi->redirect_gid);
-
-               ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
-                        sport->saved_attr.lid, sport->pma_qp_nr);
-               break;
-
-       case IB_MGMT_METHOD_GET_RESP:
-               return IB_MAD_RESULT_FAILURE;
-
-       default:
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
-               break;
-       }
-
-perf_reply:
-       out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
-
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index)
-{
-       int ret;
-       const struct ib_mad *in_mad = (const struct ib_mad *)in;
-       struct ib_mad *out_mad = (struct ib_mad *)out;
-
-       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-                        *out_mad_size != sizeof(*out_mad)))
-               return IB_MAD_RESULT_FAILURE;
-
-       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
-               return IB_MAD_RESULT_FAILURE;
-
-       /* accept only pma request */
-       if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
-               return IB_MAD_RESULT_SUCCESS;
-
-       ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
-       ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
-                               in_mad, out_mad);
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
deleted file mode 100644 (file)
index d280b12..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  auxiliary functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef EHCA_TOOLS_H
-#define EHCA_TOOLS_H
-
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-#include <linux/idr.h>
-#include <linux/kthread.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/vmalloc.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/device.h>
-
-#include <linux/atomic.h>
-#include <asm/ibmebus.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/hvcall.h>
-
-extern int ehca_debug_level;
-
-#define ehca_dbg(ib_dev, format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
-                                  "PU%04x EHCA_DBG:%s " format "\n", \
-                                  raw_smp_processor_id(), __func__, \
-                                  ## arg); \
-       } while (0)
-
-#define ehca_info(ib_dev, format, arg...) \
-       dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_warn(ib_dev, format, arg...) \
-       dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_err(ib_dev, format, arg...) \
-       dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
-               raw_smp_processor_id(), __func__, ## arg)
-
-/* use this one only if no ib_dev available */
-#define ehca_gen_dbg(format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
-                              raw_smp_processor_id(), __func__, ## arg); \
-       } while (0)
-
-#define ehca_gen_warn(format, arg...) \
-       printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_gen_err(format, arg...) \
-       printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-/**
- * ehca_dmp - printk a memory block, whose length is n*8 bytes.
- * Each line has the following layout:
- * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
- */
-#define ehca_dmp(adr, len, format, args...) \
-       do { \
-               unsigned int x; \
-               unsigned int l = (unsigned int)(len); \
-               unsigned char *deb = (unsigned char *)(adr); \
-               for (x = 0; x < l; x += 16) { \
-                       printk(KERN_INFO "EHCA_DMP:%s " format \
-                              " adr=%p ofs=%04x %016llx %016llx\n", \
-                              __func__, ##args, deb, x, \
-                              *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
-                       deb += 16; \
-               } \
-       } while (0)
-
-/* define a bitmask, little endian version */
-#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
-
-/* define a bitmask, the ibm way... */
-#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
-
-/* internal function, don't use */
-#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
-
-/* internal function, don't use */
-#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
-
-/**
- * EHCA_BMASK_SET - return value shifted and masked by mask
- * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
- * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
- * in variable
- */
-#define EHCA_BMASK_SET(mask, value) \
-       ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
-
-/**
- * EHCA_BMASK_GET - extract a parameter from value by mask
- */
-#define EHCA_BMASK_GET(mask, value) \
-       (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
-
-/* Converts ehca to ib return code */
-int ehca2ib_return_code(u64 ehca_rc);
-
-#endif /* EHCA_TOOLS_H */
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
deleted file mode 100644 (file)
index 1a1d5d9..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  userspace support verbs
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata)
-{
-       struct ehca_ucontext *my_context;
-
-       my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
-       if (!my_context) {
-               ehca_err(device, "Out of memory device=%p", device);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       return &my_context->ib_ucontext;
-}
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context)
-{
-       kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
-       return 0;
-}
-
-static void ehca_mm_open(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)++;
-       if (!(*count))
-               ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static void ehca_mm_close(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)--;
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static const struct vm_operations_struct vm_ops = {
-       .open = ehca_mm_open,
-       .close = ehca_mm_close,
-};
-
-static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
-                       u32 *mm_count)
-{
-       int ret;
-       u64 vsize, physical;
-
-       vsize = vma->vm_end - vma->vm_start;
-       if (vsize < EHCA_PAGESIZE) {
-               ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
-               return -EINVAL;
-       }
-
-       physical = galpas->user.fw_handle;
-       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-       ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
-       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
-       ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
-                          vma->vm_page_prot);
-       if (unlikely(ret)) {
-               ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
-               return -ENOMEM;
-       }
-
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
-                          u32 *mm_count)
-{
-       int ret;
-       u64 start, ofs;
-       struct page *page;
-
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-       start = vma->vm_start;
-       for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
-               u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
-               page = virt_to_page(virt_addr);
-               ret = vm_insert_page(vma, start, page);
-               if (unlikely(ret)) {
-                       ehca_gen_err("vm_insert_page() failed rc=%i", ret);
-                       return ret;
-               }
-               start += PAGE_SIZE;
-       }
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
-               ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_fw() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* cq queue_addr */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
-               ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_queue() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
-                        rsrc_type, cq->cq_number);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
-               ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "remap_pfn_range() failed ret=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return -ENOMEM;
-               }
-               break;
-
-       case 1: /* qp rqueue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
-                                     &qp->mm_count_rqueue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       case 2: /* qp squeue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
-                                     &qp->mm_count_squeue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
-                        rsrc_type, qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       u64 fileoffset = vma->vm_pgoff;
-       u32 idr_handle = fileoffset & 0x1FFFFFF;
-       u32 q_type = (fileoffset >> 27) & 0x1;    /* CQ, QP,...        */
-       u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
-       u32 ret;
-       struct ehca_cq *cq;
-       struct ehca_qp *qp;
-       struct ib_uobject *uobject;
-
-       switch (q_type) {
-       case  0: /* CQ */
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, idr_handle);
-               read_unlock(&ehca_cq_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!cq)
-                       return -EINVAL;
-
-               if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_cq(vma, cq, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_cq() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* QP */
-               read_lock(&ehca_qp_idr_lock);
-               qp = idr_find(&ehca_qp_idr, idr_handle);
-               read_unlock(&ehca_qp_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!qp)
-                       return -EINVAL;
-
-               uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
-               if (!uobject || uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_qp(vma, qp, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_qp() failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_gen_err("bad queue type %x", q_type);
-               return -EINVAL;
-       }
-
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
deleted file mode 100644 (file)
index 89517ff..0000000
+++ /dev/null
@@ -1,949 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <asm/hvcall.h>
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hcp_phyp.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
-#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
-#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
-#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
-#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
-#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
-#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
-#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
-#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
-#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
-#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
-
-#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
-#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
-#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
-
-#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
-#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
-#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
-
-#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
-#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
-
-#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
-
-#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
-#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
-#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
-
-#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
-#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
-#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
-
-static DEFINE_SPINLOCK(hcall_lock);
-
-static long ehca_plpar_hcall_norets(unsigned long opcode,
-                                   unsigned long arg1,
-                                   unsigned long arg2,
-                                   unsigned long arg3,
-                                   unsigned long arg4,
-                                   unsigned long arg5,
-                                   unsigned long arg6,
-                                   unsigned long arg7)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
-                            opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
-                                        arg5, arg6, arg7);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS)
-                       ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
-                                    opcode, ret, arg1, arg2, arg3,
-                                    arg4, arg5, arg6, arg7);
-               else
-                       if (unlikely(ehca_debug_level >= 2))
-                               ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
-
-               return ret;
-       }
-
-       return H_BUSY;
-}
-
-static long ehca_plpar_hcall9(unsigned long opcode,
-                             unsigned long *outs, /* array of 9 outputs */
-                             unsigned long arg1,
-                             unsigned long arg2,
-                             unsigned long arg3,
-                             unsigned long arg4,
-                             unsigned long arg5,
-                             unsigned long arg6,
-                             unsigned long arg7,
-                             unsigned long arg8,
-                             unsigned long arg9)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
-                            arg1, arg2, arg3, arg4, arg5,
-                            arg6, arg7, arg8, arg9);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall9(opcode, outs,
-                                  arg1, arg2, arg3, arg4, arg5,
-                                  arg6, arg7, arg8, arg9);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS) {
-                       ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
-                                    opcode, arg1, arg2, arg3, arg4, arg5,
-                                    arg6, arg7, arg8, arg9);
-                       ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               } else if (unlikely(ehca_debug_level >= 2))
-                       ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               return ret;
-       }
-
-       return H_BUSY;
-}
-
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 *act_nr_of_entries,
-                            u32 *act_pages,
-                            u32 *eq_ist)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       u64 allocate_controls;
-
-       /* resource type */
-       allocate_controls = 3ULL;
-
-       /* ISN is associated */
-       if (neq_control != 1)
-               allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
-       else /* notification event queue */
-               allocate_controls = (1ULL << 63) | allocate_controls;
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,  /* r4 */
-                               allocate_controls,      /* r5 */
-                               number_of_entries,      /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       eq_handle->handle = outs[0];
-       *act_nr_of_entries = (u32)outs[3];
-       *act_pages = (u32)outs[4];
-       *eq_ist = (u32)outs[5];
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resource - ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask)
-{
-       return ehca_plpar_hcall_norets(H_RESET_EVENTS,
-                                      adapter_handle.handle, /* r4 */
-                                      eq_handle.handle,      /* r5 */
-                                      event_mask,            /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param)
-{
-       int rc;
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,   /* r4  */
-                               2,                       /* r5  */
-                               param->eq_handle.handle, /* r6  */
-                               cq->token,               /* r7  */
-                               param->nr_cqe,           /* r8  */
-                               0, 0, 0, 0);
-       cq->ipz_cq_handle.handle = outs[0];
-       param->act_nr_of_entries = (u32)outs[3];
-       param->act_pages = (u32)outs[4];
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[5]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               cq->ipz_cq_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user)
-{
-       int rc;
-       u64 ret;
-       u64 allocate_controls, max_r10_reg, r11, r12;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       allocate_controls =
-               EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
-                                parms->squeue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
-                                parms->rqueue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_RECV_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_SEND_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
-                                parms->ud_av_l_key_ctl)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
-
-       max_r10_reg =
-               EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
-                              parms->squeue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
-                                parms->rqueue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
-                                parms->squeue.max_sge)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
-                                parms->rqueue.max_sge);
-
-       r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
-
-       if (parms->ext_type == EQPT_SRQ)
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
-       else
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,             /* r4  */
-                               allocate_controls,                 /* r5  */
-                               parms->send_cq_handle.handle,
-                               parms->recv_cq_handle.handle,
-                               parms->eq_handle.handle,
-                               ((u64)parms->token << 32) | parms->pd.value,
-                               max_r10_reg, r11, r12);
-
-       parms->qp_handle.handle = outs[0];
-       parms->real_qp_num = (u32)outs[1];
-       parms->squeue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
-       parms->rqueue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
-       parms->squeue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
-       parms->rqueue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
-       parms->squeue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
-       parms->rqueue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[6]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               parms->qp_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block)
-{
-       u64 ret;
-       u64 r_cb = __pa(query_port_response_block);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response block not page aligned");
-               return H_PARAMETER;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
-                                     adapter_handle.handle, /* r4 */
-                                     port_id,               /* r5 */
-                                     r_cb,                  /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(query_port_response_block, 64, "response_block");
-
-       return ret;
-}
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask)
-{
-       u64 port_attributes = port_cap;
-
-       if (modify_mask & IB_PORT_SHUTDOWN)
-               port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
-       if (modify_mask & IB_PORT_INIT_TYPE)
-               port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
-       if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
-               port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
-
-       return ehca_plpar_hcall_norets(H_MODIFY_PORT,
-                                      adapter_handle.handle, /* r4 */
-                                      port_id,               /* r5 */
-                                      port_attributes,       /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock)
-{
-       u64 r_cb = __pa(query_hca_rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response_block=%p not page aligned",
-                            query_hca_rblock);
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_QUERY_HCA,
-                                      adapter_handle.handle, /* r4 */
-                                      r_cb,                  /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count)
-{
-       return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
-                                      adapter_handle.handle,      /* r4  */
-                                      (u64)queue_type | ((u64)pagesize) << 8,
-                                      /* r5  */
-                                      resource_handle,            /* r6  */
-                                      logical_address_of_page,    /* r7  */
-                                      count,                      /* r8  */
-                                      0, 0);
-}
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       if (count != 1) {
-               ehca_gen_err("Ppage counter=%llx", count);
-               return H_PARAMETER;
-       }
-       return hipz_h_register_rpage(adapter_handle,
-                                    pagesize,
-                                    queue_type,
-                                    eq_handle.handle,
-                                    logical_address_of_page, count);
-}
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
-                          u32 ist)
-{
-       u64 ret;
-       ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
-                                     adapter_handle.handle, /* r4 */
-                                     ist,                   /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret != H_SUCCESS && ret != H_BUSY)
-               ehca_gen_err("Could not query interrupt state.");
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal)
-{
-       if (count != 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    cq_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa)
-{
-       if (count > 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    qp_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe2processed,
-                              void **log_addr_next_rq_wqe2processed,
-                              int dis_and_get_function_code)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               dis_and_get_function_code, /* r5 */
-                               qp_handle.handle,          /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (log_addr_next_sq_wqe2processed)
-               *log_addr_next_sq_wqe2processed = (void *)outs[0];
-       if (log_addr_next_rq_wqe2processed)
-               *log_addr_next_rq_wqe2processed = (void *)outs[1];
-
-       return ret;
-}
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               update_mask,           /* r6 */
-                               __pa(mqpcb),           /* r7 */
-                               0, 0, 0, 0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Insufficient resources ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal)
-{
-       return ehca_plpar_hcall_norets(H_QUERY_QP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      __pa(qqpcb),           /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = hcp_galpas_dtor(&qp->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct qp->galpas");
-               return H_RESOURCE;
-       }
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               /* function code */
-                               1,                         /* r5 */
-                               qp->ipz_qp_handle.handle,  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (ret == H_HARDWARE)
-               ehca_gen_err("HCA not operational. ret=%lli", ret);
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     qp->ipz_qp_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource still in use. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port)
-{
-       return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      port,                  /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               port,                  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       *pma_qp_nr = (u32)outs[0];
-       *bma_qp_nr = (u32)outs[1];
-
-       if (ret == H_ALIAS_EXIST)
-               ehca_gen_err("AQP1 already exists. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       u64 ret;
-
-       ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
-                                     adapter_handle.handle,  /* r4 */
-                                     qp_handle.handle,       /* r5 */
-                                     mcg_dlid,               /* r6 */
-                                     interface_id,           /* r7 */
-                                     subnet_prefix,          /* r8 */
-                                     0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       return ehca_plpar_hcall_norets(H_DETACH_MCQP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      mcg_dlid,              /* r6 */
-                                      interface_id,          /* r7 */
-                                      subnet_prefix,         /* r8 */
-                                      0, 0);
-}
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&cq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct cp->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     cq->ipz_cq_handle.handle,  /* r5 */
-                                     force_flag != 0 ? 1L : 0L, /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&eq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct eq->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     eq->ipz_eq_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource in use. ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,            /* r4 */
-                               5,                                /* r5 */
-                               vaddr,                            /* r6 */
-                               length,                           /* r7 */
-                               (((u64)access_ctrl) << 32ULL),    /* r8 */
-                               pd.value,                         /* r9 */
-                               0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       u64 ret;
-
-       if (unlikely(ehca_debug_level >= 3)) {
-               if (count > 1) {
-                       u64 *kpage;
-                       int i;
-                       kpage = __va(logical_address_of_page);
-                       for (i = 0; i < count; i++)
-                               ehca_gen_dbg("kpage[%d]=%p",
-                                            i, (void *)kpage[i]);
-               } else
-                       ehca_gen_dbg("kpage=%p",
-                                    (void *)logical_address_of_page);
-       }
-
-       if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
-               ehca_gen_err("logical_address_of_page not on a 4k boundary "
-                            "adapter_handle=%llx mr=%p mr_handle=%llx "
-                            "pagesize=%x queue_type=%x "
-                            "logical_address_of_page=%llx count=%llx",
-                            adapter_handle.handle, mr,
-                            mr->ipz_mr_handle.handle, pagesize, queue_type,
-                            logical_address_of_page, count);
-               ret = H_PARAMETER;
-       } else
-               ret = hipz_h_register_rpage(adapter_handle, pagesize,
-                                           queue_type,
-                                           mr->ipz_mr_handle.handle,
-                                           logical_address_of_page, count);
-       return ret;
-}
-
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
-                               adapter_handle.handle,     /* r4 */
-                               mr->ipz_mr_handle.handle,  /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->len = outs[0];
-       outparms->vaddr = outs[1];
-       outparms->acl  = outs[4] >> 32;
-       outparms->lkey = (u32)(outs[5] >> 32);
-       outparms->rkey = (u32)(outs[5] & (0xffffffff));
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mr->ipz_mr_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mr->ipz_mr_handle.handle, /* r5 */
-                               vaddr_in,                 /* r6 */
-                               length,                   /* r7 */
-                               /* r8 */
-                               ((((u64)access_ctrl) << 32ULL) | pd.value),
-                               mr_addr_cb,               /* r9 */
-                               0, 0, 0);
-       outparms->vaddr = outs[1];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
-                               adapter_handle.handle,            /* r4 */
-                               orig_mr->ipz_mr_handle.handle,    /* r5 */
-                               vaddr_in,                         /* r6 */
-                               (((u64)access_ctrl) << 32ULL),    /* r7 */
-                               pd.value,                         /* r8 */
-                               0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,      /* r4 */
-                               6,                          /* r5 */
-                               pd.value,                   /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mw->ipz_mw_handle.handle, /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mw->ipz_mw_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count)
-{
-       u64 r_cb = __pa(rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("rblock not page aligned.");
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_ERROR_DATA,
-                                      adapter_handle.handle,
-                                      ressource_handle,
-                                      r_cb,
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_eoi(int irq)
-{
-       unsigned long xirr;
-
-       iosync();
-       xirr = (0xffULL << 24) | irq;
-
-       return plpar_hcall_norets(H_EOI, xirr);
-}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
deleted file mode 100644 (file)
index a46e514..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_IF_H__
-#define __HCP_IF_H__
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "hipz_hw.h"
-
-/*
- * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
- * resources, create the empty EQPT (ring).
- */
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 * act_nr_of_entries,
-                            u32 * act_pages,
-                            u32 * eq_ist);
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask);
-/*
- * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize
- * resources, create the empty CQPT (ring).
- */
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param);
-
-
-/*
- * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
- * initialize resources, create empty QPPTs (2 rings).
- */
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user);
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block);
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask);
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock);
-
-/*
- * hipz_h_register_rpage internal function in hcp_if.h for all
- * hcp_H_REGISTER_RPAGE calls.
- */
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count);
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle
-                          hcp_adapter_handle,
-                          u32 ist);
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal);
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa);
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe_tb_processed,
-                              void **log_addr_next_rq_wqe_tb_processed,
-                              int dis_and_get_function_code);
-enum hcall_sigt {
-       HCALL_SIGT_NO_CQE = 0,
-       HCALL_SIGT_BY_WQE = 1,
-       HCALL_SIGT_EVERY = 2
-};
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal);
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal);
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp);
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port);
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr);
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag);
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq);
-
-/*
- * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize
- * resources.
- */
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-/* hipz_h_query_mr queries MR in HW and FW */
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mr frees MR resources in HW and FW */
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr);
-
-/* hipz_h_reregister_pmr reregisters MR in HW and FW */
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_smr register shared MR in HW and FW */
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms);
-
-/*
- * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize
- * resources.
- */
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_query_mw queries MW in HW and FW */
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mw frees MW resources in HW and FW */
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw);
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count);
-u64 hipz_h_eoi(int irq);
-
-#endif /* __HCP_IF_H__ */
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
deleted file mode 100644 (file)
index 077376f..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *   load store abstraction for ehca register access with tracing
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-u64 hcall_map_page(u64 physaddr)
-{
-       return (u64)ioremap(physaddr, EHCA_PAGESIZE);
-}
-
-int hcall_unmap_page(u64 mapaddr)
-{
-       iounmap((volatile void __iomem *) mapaddr);
-       return 0;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user)
-{
-       if (!is_user) {
-               galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
-               if (!galpas->kernel.fw_handle)
-                       return -ENOMEM;
-       } else
-               galpas->kernel.fw_handle = 0;
-
-       galpas->user.fw_handle = paddr_user;
-
-       return 0;
-}
-
-int hcp_galpas_dtor(struct h_galpas *galpas)
-{
-       if (galpas->kernel.fw_handle) {
-               int ret = hcall_unmap_page(galpas->kernel.fw_handle);
-               if (ret)
-                       return ret;
-       }
-
-       galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
-
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
deleted file mode 100644 (file)
index d1b0299..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware calls
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_PHYP_H__
-#define __HCP_PHYP_H__
-
-
-/*
- * eHCA page (mapped into memory)
- * resource to access eHCA register pages in CPU address space
-*/
-struct h_galpa {
-       u64 fw_handle;
-       /* for pSeries this is a 64bit memory address where
-          I/O memory is mapped into CPU address space (kv) */
-};
-
-/*
- * resource to access eHCA address space registers, all types
- */
-struct h_galpas {
-       u32 pid;                /*PID of userspace galpa checking */
-       struct h_galpa user;    /* user space accessible resource,
-                                  set to 0 if unused */
-       struct h_galpa kernel;  /* kernel space accessible resource,
-                                  set to 0 if unused */
-};
-
-static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
-{
-       u64 addr = galpa.fw_handle + offset;
-       return *(volatile u64 __force *)addr;
-}
-
-static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
-{
-       u64 addr = galpa.fw_handle + offset;
-       *(volatile u64 __force *)addr = value;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user);
-
-int hcp_galpas_dtor(struct h_galpas *galpas);
-
-u64 hcall_map_page(u64 physaddr);
-
-int hcall_unmap_page(u64 mapaddr);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns.h b/drivers/infiniband/hw/ehca/hipz_fns.h
deleted file mode 100644 (file)
index 9dac93d..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_H__
-#define __HIPZ_FNS_H__
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-#include "hipz_fns_core.h"
-
-#define hipz_galpa_store_eq(gal, offset, value) \
-       hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_eq(gal, offset) \
-       hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qped(gal, offset, value) \
-       hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_qped(gal, offset) \
-       hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
-
-#define hipz_galpa_store_mrmw(gal, offset, value) \
-       hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_mrmw(gal, offset) \
-       hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns_core.h b/drivers/infiniband/hw/ehca/hipz_fns_core.h
deleted file mode 100644 (file)
index 868735f..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_CORE_H__
-#define __HIPZ_FNS_CORE_H__
-
-#include "hcp_phyp.h"
-#include "hipz_hw.h"
-
-#define hipz_galpa_store_cq(gal, offset, value) \
-       hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_cq(gal, offset) \
-       hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qp(gal, offset, value) \
-       hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
-#define hipz_galpa_load_qp(gal, offset) \
-       hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
-
-static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
-                           EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
-}
-
-static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
-                           EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
-}
-
-static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
-{
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
-                           EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
-}
-
-static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n0_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
-                           EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
-                                          value));
-       cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
-}
-
-static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n1_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
-                           EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
-       cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
-}
-
-#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/infiniband/hw/ehca/hipz_hw.h b/drivers/infiniband/hw/ehca/hipz_hw.h
deleted file mode 100644 (file)
index bf996c7..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  eHCA register definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_HW_H__
-#define __HIPZ_HW_H__
-
-#include "ehca_tools.h"
-
-#define EHCA_MAX_MTU 4
-
-/* QP Table Entry Memory Map */
-struct hipz_qptemm {
-       u64 qpx_hcr;
-       u64 qpx_c;
-       u64 qpx_herr;
-       u64 qpx_aer;
-/* 0x20*/
-       u64 qpx_sqa;
-       u64 qpx_sqc;
-       u64 qpx_rqa;
-       u64 qpx_rqc;
-/* 0x40*/
-       u64 qpx_st;
-       u64 qpx_pmstate;
-       u64 qpx_pmfa;
-       u64 qpx_pkey;
-/* 0x60*/
-       u64 qpx_pkeya;
-       u64 qpx_pkeyb;
-       u64 qpx_pkeyc;
-       u64 qpx_pkeyd;
-/* 0x80*/
-       u64 qpx_qkey;
-       u64 qpx_dqp;
-       u64 qpx_dlidp;
-       u64 qpx_portp;
-/* 0xa0*/
-       u64 qpx_slidp;
-       u64 qpx_slidpp;
-       u64 qpx_dlida;
-       u64 qpx_porta;
-/* 0xc0*/
-       u64 qpx_slida;
-       u64 qpx_slidpa;
-       u64 qpx_slvl;
-       u64 qpx_ipd;
-/* 0xe0*/
-       u64 qpx_mtu;
-       u64 qpx_lato;
-       u64 qpx_rlimit;
-       u64 qpx_rnrlimit;
-/* 0x100*/
-       u64 qpx_t;
-       u64 qpx_sqhp;
-       u64 qpx_sqptp;
-       u64 qpx_nspsn;
-/* 0x120*/
-       u64 qpx_nspsnhwm;
-       u64 reserved1;
-       u64 qpx_sdsi;
-       u64 qpx_sdsbc;
-/* 0x140*/
-       u64 qpx_sqwsize;
-       u64 qpx_sqwts;
-       u64 qpx_lsn;
-       u64 qpx_nssn;
-/* 0x160 */
-       u64 qpx_mor;
-       u64 qpx_cor;
-       u64 qpx_sqsize;
-       u64 qpx_erc;
-/* 0x180*/
-       u64 qpx_rnrrc;
-       u64 qpx_ernrwt;
-       u64 qpx_rnrresp;
-       u64 qpx_lmsna;
-/* 0x1a0 */
-       u64 qpx_sqhpc;
-       u64 qpx_sqcptp;
-       u64 qpx_sigt;
-       u64 qpx_wqecnt;
-/* 0x1c0*/
-       u64 qpx_rqhp;
-       u64 qpx_rqptp;
-       u64 qpx_rqsize;
-       u64 qpx_nrr;
-/* 0x1e0*/
-       u64 qpx_rdmac;
-       u64 qpx_nrpsn;
-       u64 qpx_lapsn;
-       u64 qpx_lcr;
-/* 0x200*/
-       u64 qpx_rwc;
-       u64 qpx_rwva;
-       u64 qpx_rdsi;
-       u64 qpx_rdsbc;
-/* 0x220*/
-       u64 qpx_rqwsize;
-       u64 qpx_crmsn;
-       u64 qpx_rdd;
-       u64 qpx_larpsn;
-/* 0x240*/
-       u64 qpx_pd;
-       u64 qpx_scqn;
-       u64 qpx_rcqn;
-       u64 qpx_aeqn;
-/* 0x260*/
-       u64 qpx_aaelog;
-       u64 qpx_ram;
-       u64 qpx_rdmaqe0;
-       u64 qpx_rdmaqe1;
-/* 0x280*/
-       u64 qpx_rdmaqe2;
-       u64 qpx_rdmaqe3;
-       u64 qpx_nrpsnhwm;
-/* 0x298*/
-       u64 reserved[(0x400 - 0x298) / 8];
-/* 0x400 extended data */
-       u64 reserved_ext[(0x500 - 0x400) / 8];
-/* 0x500 */
-       u64 reserved2[(0x1000 - 0x500) / 8];
-/* 0x1000      */
-};
-
-#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
-
-#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
-
-/* MRMWPT Entry Memory Map */
-struct hipz_mrmwmm {
-       /* 0x00 */
-       u64 mrx_hcr;
-
-       u64 mrx_c;
-       u64 mrx_herr;
-       u64 mrx_aer;
-       /* 0x20 */
-       u64 mrx_pp;
-       u64 reserved1;
-       u64 reserved2;
-       u64 reserved3;
-       /* 0x40 */
-       u64 reserved4[(0x200 - 0x40) / 8];
-       /* 0x200 */
-       u64 mrx_ctl[64];
-
-};
-
-#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
-
-struct hipz_qpedmm {
-       /* 0x00 */
-       u64 reserved0[(0x400) / 8];
-       /* 0x400 */
-       u64 qpedx_phh;
-       u64 qpedx_ppsgp;
-       /* 0x410 */
-       u64 qpedx_ppsgu;
-       u64 qpedx_ppdgp;
-       /* 0x420 */
-       u64 qpedx_ppdgu;
-       u64 qpedx_aph;
-       /* 0x430 */
-       u64 qpedx_apsgp;
-       u64 qpedx_apsgu;
-       /* 0x440 */
-       u64 qpedx_apdgp;
-       u64 qpedx_apdgu;
-       /* 0x450 */
-       u64 qpedx_apav;
-       u64 qpedx_apsav;
-       /* 0x460  */
-       u64 qpedx_hcr;
-       u64 reserved1[4];
-       /* 0x488 */
-       u64 qpedx_rrl0;
-       /* 0x490 */
-       u64 qpedx_rrrkey0;
-       u64 qpedx_rrva0;
-       /* 0x4a0 */
-       u64 reserved2;
-       u64 qpedx_rrl1;
-       /* 0x4b0 */
-       u64 qpedx_rrrkey1;
-       u64 qpedx_rrva1;
-       /* 0x4c0 */
-       u64 reserved3;
-       u64 qpedx_rrl2;
-       /* 0x4d0 */
-       u64 qpedx_rrrkey2;
-       u64 qpedx_rrva2;
-       /* 0x4e0 */
-       u64 reserved4;
-       u64 qpedx_rrl3;
-       /* 0x4f0 */
-       u64 qpedx_rrrkey3;
-       u64 qpedx_rrva3;
-};
-
-#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
-
-/* CQ Table Entry Memory Map */
-struct hipz_cqtemm {
-       u64 cqx_hcr;
-       u64 cqx_c;
-       u64 cqx_herr;
-       u64 cqx_aer;
-/* 0x20  */
-       u64 cqx_ptp;
-       u64 cqx_tp;
-       u64 cqx_fec;
-       u64 cqx_feca;
-/* 0x40  */
-       u64 cqx_ep;
-       u64 cqx_eq;
-/* 0x50  */
-       u64 reserved1;
-       u64 cqx_n0;
-/* 0x60  */
-       u64 cqx_n1;
-       u64 reserved2[(0x1000 - 0x60) / 8];
-/* 0x1000 */
-};
-
-#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
-#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
-#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-
-#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
-
-/* EQ Table Entry Memory Map */
-struct hipz_eqtemm {
-       u64 eqx_hcr;
-       u64 eqx_c;
-
-       u64 eqx_herr;
-       u64 eqx_aer;
-/* 0x20 */
-       u64 eqx_ptp;
-       u64 eqx_tp;
-       u64 eqx_ssba;
-       u64 eqx_psba;
-
-/* 0x40 */
-       u64 eqx_cec;
-       u64 eqx_meql;
-       u64 eqx_xisbi;
-       u64 eqx_xisc;
-/* 0x60 */
-       u64 eqx_it;
-
-};
-
-#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
-
-/* access control defines for MR/MW */
-#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
-#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
-#define HIPZ_ACCESSCTRL_R_READ   0x00200000
-#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
-#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
-
-/* query hca response block */
-struct hipz_query_hca {
-       u32 cur_reliable_dg;
-       u32 cur_qp;
-       u32 cur_cq;
-       u32 cur_eq;
-       u32 cur_mr;
-       u32 cur_mw;
-       u32 cur_ee_context;
-       u32 cur_mcast_grp;
-       u32 cur_qp_attached_mcast_grp;
-       u32 reserved1;
-       u32 cur_ipv6_qp;
-       u32 cur_eth_qp;
-       u32 cur_hp_mr;
-       u32 reserved2[3];
-       u32 max_rd_domain;
-       u32 max_qp;
-       u32 max_cq;
-       u32 max_eq;
-       u32 max_mr;
-       u32 max_hp_mr;
-       u32 max_mw;
-       u32 max_mrwpte;
-       u32 max_special_mrwpte;
-       u32 max_rd_ee_context;
-       u32 max_mcast_grp;
-       u32 max_total_mcast_qp_attach;
-       u32 max_mcast_qp_attach;
-       u32 max_raw_ipv6_qp;
-       u32 max_raw_ethy_qp;
-       u32 internal_clock_frequency;
-       u32 max_pd;
-       u32 max_ah;
-       u32 max_cqe;
-       u32 max_wqes_wq;
-       u32 max_partitions;
-       u32 max_rr_ee_context;
-       u32 max_rr_qp;
-       u32 max_rr_hca;
-       u32 max_act_wqs_ee_context;
-       u32 max_act_wqs_qp;
-       u32 max_sge;
-       u32 max_sge_rd;
-       u32 memory_page_size_supported;
-       u64 max_mr_size;
-       u32 local_ca_ack_delay;
-       u32 num_ports;
-       u32 vendor_id;
-       u32 vendor_part_id;
-       u32 hw_ver;
-       u64 node_guid;
-       u64 hca_cap_indicators;
-       u32 data_counter_register_size;
-       u32 max_shared_rq;
-       u32 max_isns_eq;
-       u32 max_neq;
-} __attribute__ ((packed));
-
-#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
-#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
-#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
-#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
-#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
-#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
-#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
-#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
-#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
-#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
-#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
-#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
-#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
-#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
-#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
-#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
-#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
-#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
-
-/* query port response block */
-struct hipz_query_port {
-       u32 state;
-       u32 bad_pkey_cntr;
-       u32 lmc;
-       u32 lid;
-       u32 subnet_timeout;
-       u32 qkey_viol_cntr;
-       u32 sm_sl;
-       u32 sm_lid;
-       u32 capability_mask;
-       u32 init_type_reply;
-       u32 pkey_tbl_len;
-       u32 gid_tbl_len;
-       u64 gid_prefix;
-       u32 port_nr;
-       u16 pkey_entries[16];
-       u8  reserved1[32];
-       u32 trent_size;
-       u32 trbuf_size;
-       u64 max_msg_sz;
-       u32 max_mtu;
-       u32 vl_cap;
-       u32 phys_pstate;
-       u32 phys_state;
-       u32 phys_speed;
-       u32 phys_width;
-       u8  reserved2[1884];
-       u64 guid_entries[255];
-} __attribute__ ((packed));
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
deleted file mode 100644 (file)
index 7ffc748..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ipz_pt_fn.h"
-#include "ehca_classes.h"
-
-#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
-
-struct kmem_cache *small_qp_cache;
-
-void *ipz_qpageit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->pagesize;
-       if (queue->current_q_offset > queue->queue_length) {
-               queue->current_q_offset -= queue->pagesize;
-               ret = NULL;
-       }
-       if (((u64)ret) % queue->pagesize) {
-               ehca_gen_err("ERROR!! not at PAGE-Boundary");
-               return NULL;
-       }
-       return ret;
-}
-
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u64 last_entry_in_q = queue->queue_length - queue->qe_size;
-
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset > last_entry_in_q) {
-               queue->current_q_offset = 0;
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
-{
-       int i;
-       for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
-               u64 page = __pa(queue->queue_pages[i]);
-               if (addr >= page && addr < page + queue->pagesize) {
-                       *q_offset = addr - page + i * queue->pagesize;
-                       return 0;
-               }
-       }
-       return -EINVAL;
-}
-
-#if PAGE_SHIFT < EHCA_PAGESHIFT
-#error Kernel pages must be at least as large than eHCA pages (4K) !
-#endif
-
-/*
- * allocate pages for queue:
- * outer loop allocates whole kernel pages (page aligned) and
- * inner loop divides a kernel page into smaller hca queue pages
- */
-static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
-{
-       int k, f = 0;
-       u8 *kpage;
-
-       while (f < nr_of_pages) {
-               kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
-               if (!kpage)
-                       goto out;
-
-               for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
-                       queue->queue_pages[f] = (struct ipz_page *)kpage;
-                       kpage += EHCA_PAGESIZE;
-                       f++;
-               }
-       }
-       return 1;
-
-out:
-       for (f = 0; f < nr_of_pages && queue->queue_pages[f];
-            f += PAGES_PER_KPAGE)
-               free_page((unsigned long)(queue->queue_pages)[f]);
-       return 0;
-}
-
-static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page;
-       unsigned long bit;
-
-       mutex_lock(&pd->lock);
-
-       if (!list_empty(&pd->free[order]))
-               page = list_entry(pd->free[order].next,
-                                 struct ipz_small_queue_page, list);
-       else {
-               page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
-               if (!page)
-                       goto out;
-
-               page->page = get_zeroed_page(GFP_KERNEL);
-               if (!page->page) {
-                       kmem_cache_free(small_qp_cache, page);
-                       goto out;
-               }
-
-               list_add(&page->list, &pd->free[order]);
-       }
-
-       bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
-       __set_bit(bit, page->bitmap);
-       page->fill++;
-
-       if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
-               list_move(&page->list, &pd->full[order]);
-
-       mutex_unlock(&pd->lock);
-
-       queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
-       queue->small_page = page;
-       queue->offset = bit << (order + 9);
-       return 1;
-
-out:
-       ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
-       mutex_unlock(&pd->lock);
-       return 0;
-}
-
-static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page = queue->small_page;
-       unsigned long bit;
-       int free_page = 0;
-
-       bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
-               >> (order + 9);
-
-       mutex_lock(&pd->lock);
-
-       __clear_bit(bit, page->bitmap);
-       page->fill--;
-
-       if (page->fill == 0) {
-               list_del(&page->list);
-               free_page = 1;
-       }
-
-       if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
-               /* the page was full until we freed the chunk */
-               list_move_tail(&page->list, &pd->free[order]);
-
-       mutex_unlock(&pd->lock);
-
-       if (free_page) {
-               free_page(page->page);
-               kmem_cache_free(small_qp_cache, page);
-       }
-}
-
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small)
-{
-       if (pagesize > PAGE_SIZE) {
-               ehca_gen_err("FATAL ERROR: pagesize=%x "
-                            "is greater than kernel page size", pagesize);
-               return 0;
-       }
-
-       /* init queue fields */
-       queue->queue_length = nr_of_pages * pagesize;
-       queue->pagesize = pagesize;
-       queue->qe_size = qe_size;
-       queue->act_nr_of_sg = nr_of_sg;
-       queue->current_q_offset = 0;
-       queue->toggle_state = 1;
-       queue->small_page = NULL;
-
-       /* allocate queue page pointers */
-       queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
-                                    GFP_KERNEL | __GFP_NOWARN);
-       if (!queue->queue_pages) {
-               queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
-               if (!queue->queue_pages) {
-                       ehca_gen_err("Couldn't allocate queue page list");
-                       return 0;
-               }
-       }
-
-       /* allocate actual queue pages */
-       if (is_small) {
-               if (!alloc_small_queue_page(queue, pd))
-                       goto ipz_queue_ctor_exit0;
-       } else
-               if (!alloc_queue_pages(queue, nr_of_pages))
-                       goto ipz_queue_ctor_exit0;
-
-       return 1;
-
-ipz_queue_ctor_exit0:
-       ehca_gen_err("Couldn't alloc pages queue=%p "
-                "nr_of_pages=%x",  queue, nr_of_pages);
-       kvfree(queue->queue_pages);
-
-       return 0;
-}
-
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
-{
-       int i, nr_pages;
-
-       if (!queue || !queue->queue_pages) {
-               ehca_gen_dbg("queue or queue_pages is NULL");
-               return 0;
-       }
-
-       if (queue->small_page)
-               free_small_queue_page(queue, pd);
-       else {
-               nr_pages = queue->queue_length / queue->pagesize;
-               for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
-                       free_page((unsigned long)queue->queue_pages[i]);
-       }
-
-       kvfree(queue->queue_pages);
-
-       return 1;
-}
-
-int ehca_init_small_qp_cache(void)
-{
-       small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
-                                          sizeof(struct ipz_small_queue_page),
-                                          0, SLAB_HWCACHE_ALIGN, NULL);
-       if (!small_qp_cache)
-               return -ENOMEM;
-
-       return 0;
-}
-
-void ehca_cleanup_small_qp_cache(void)
-{
-       kmem_cache_destroy(small_qp_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
deleted file mode 100644 (file)
index a801274..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __IPZ_PT_FN_H__
-#define __IPZ_PT_FN_H__
-
-#define EHCA_PAGESHIFT   12
-#define EHCA_PAGESIZE   4096UL
-#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
-#define EHCA_PT_ENTRIES 512UL
-
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-
-struct ehca_pd;
-struct ipz_small_queue_page;
-
-extern struct kmem_cache *small_qp_cache;
-
-/* struct generic ehca page */
-struct ipz_page {
-       u8 entries[EHCA_PAGESIZE];
-};
-
-#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
-
-struct ipz_small_queue_page {
-       unsigned long page;
-       unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
-       int fill;
-       void *mapped_addr;
-       u32 mmap_count;
-       struct list_head list;
-};
-
-/* struct generic queue in linux kernel virtual memory (kv) */
-struct ipz_queue {
-       u64 current_q_offset;   /* current queue entry */
-
-       struct ipz_page **queue_pages;  /* array of pages belonging to queue */
-       u32 qe_size;            /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length;       /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;       /* toggle flag - per page */
-       u32 offset; /* save offset within page for small_qp */
-       struct ipz_small_queue_page *small_page;
-};
-
-/*
- * return current Queue Entry for a certain q_offset
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
-{
-       struct ipz_page *current_page;
-       if (q_offset >= queue->queue_length)
-               return NULL;
-       current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
-       return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
-}
-
-/*
- * return current Queue Entry
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_get(struct ipz_queue *queue)
-{
-       return ipz_qeit_calc(queue, queue->current_q_offset);
-}
-
-/*
- * return current Queue Page , increment Queue Page iterator from
- * page to page in struct ipz_queue, last increment will return 0! and
- * NOT wrap
- * returns address (kv) of Queue Page
- * warning don't use in parallel with ipz_QE_get_inc()
- */
-void *ipz_qpageit_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset >= queue->queue_length) {
-               queue->current_q_offset = 0;
-               /* toggle the valid flag */
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-/*
- * return a bool indicating whether current Queue Entry is valid
- */
-static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
-{
-       struct ehca_cqe *cqe = ipz_qeit_get(queue);
-       return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
-}
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
-{
-       return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
-}
-
-/*
- * returns and resets Queue Entry iterator
- * returns address (kv) of first Queue Entry
- */
-static inline void *ipz_qeit_reset(struct ipz_queue *queue)
-{
-       queue->current_q_offset = 0;
-       return ipz_qeit_get(queue);
-}
-
-/*
- * return the q_offset corresponding to an absolute address
- */
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
-
-/*
- * return the next queue offset. don't modify the queue.
- */
-static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
-{
-       offset += queue->qe_size;
-       if (offset >= queue->queue_length) offset = 0;
-       return offset;
-}
-
-/* struct generic page table */
-struct ipz_pt {
-       u64 entries[EHCA_PT_ENTRIES];
-};
-
-/* struct page table for a queue, only to be used in pf */
-struct ipz_qpt {
-       /* queue page tables (kv), use u64 because we know the element length */
-       u64 *qpts;
-       u32 n_qpts;
-       u32 n_ptes;       /*  number of page table entries */
-       u64 *current_pte_addr;
-};
-
-/*
- * constructor for a ipz_queue_t, placement new for ipz_queue_t,
- * new for all dependent datastructors
- * all QP Tables are the same
- * flow:
- *    allocate+pin queue
- * see ipz_qpt_ctor()
- * returns true if ok, false if out of memory
- */
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small);
-
-/*
- * destructor for a ipz_queue_t
- *  -# free queue
- *  see ipz_queue_ctor()
- *  returns true if ok, false if queue was NULL-ptr of free failed
- */
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
-
-/*
- * constructor for a ipz_qpt_t,
- * placement new for struct ipz_queue, new for all dependent datastructors
- * all QP Tables are the same,
- * flow:
- * -# allocate+pin queue
- * -# initialise ptcb
- * -# allocate+pin PTs
- * -# link PTs to a ring, according to HCA Arch, set bit62 id needed
- * -# the ring must have room for exactly nr_of_PTEs
- * see ipz_qpt_ctor()
- */
-void ipz_qpt_ctor(struct ipz_qpt *qpt,
-                 const u32 nr_of_qes,
-                 const u32 pagesize,
-                 const u32 qe_size,
-                 const u8 lowbyte, const u8 toggle,
-                 u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- * fix EQ page problems
- */
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Event Queue Entry, increment Queue Entry iterator
- * by one step in struct ipz_queue if valid, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_queue_QPageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- */
-static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       ipz_qeit_eq_get_inc(queue); /* this is a good one */
-       return ret;
-}
-
-static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       return ret;
-}
-
-/* returns address (GX) of first queue entry */
-static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
-{
-       return be64_to_cpu(qpt->qpts[0]);
-}
-
-/* returns address (kv) of first page of queue page table */
-static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
-{
-       return qpt->qpts;
-}
-
-#endif                         /* __IPZ_PT_FN_H__ */
index 7258818..e449e39 100644 (file)
@@ -908,7 +908,7 @@ static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct qib_file_vm_ops = {
+static const struct vm_operations_struct qib_file_vm_ops = {
        .fault = qib_file_vma_fault,
 };
 
index 146cf29..34927b7 100644 (file)
@@ -75,7 +75,7 @@ static void qib_vma_close(struct vm_area_struct *vma)
        kref_put(&ip->ref, qib_release_mmap_info);
 }
 
-static struct vm_operations_struct qib_vm_ops = {
+static const struct vm_operations_struct qib_vm_ops = {
        .open =     qib_vma_open,
        .close =    qib_vma_close,
 };
index dc439a4..403bd29 100644 (file)
@@ -3095,7 +3095,7 @@ out:
 
 static int
 isert_setup_np(struct iscsi_np *np,
-              struct __kernel_sockaddr_storage *ksockaddr)
+              struct sockaddr_storage *ksockaddr)
 {
        struct isert_np *isert_np;
        struct rdma_cm_id *isert_lid;
@@ -3117,7 +3117,7 @@ isert_setup_np(struct iscsi_np *np,
         * in iscsi_target_configfs.c code..
         */
        memcpy(&np->np_sockaddr, ksockaddr,
-              sizeof(struct __kernel_sockaddr_storage));
+              sizeof(struct sockaddr_storage));
 
        isert_lid = isert_setup_id(isert_np);
        if (IS_ERR(isert_lid)) {
@@ -3199,32 +3199,11 @@ isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
 {
        struct rdma_cm_id *cm_id = isert_conn->cm_id;
        struct rdma_route *cm_route = &cm_id->route;
-       struct sockaddr_in *sock_in;
-       struct sockaddr_in6 *sock_in6;
 
        conn->login_family = np->np_sockaddr.ss_family;
 
-       if (np->np_sockaddr.ss_family == AF_INET6) {
-               sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr;
-               snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c",
-                        &sock_in6->sin6_addr.in6_u);
-               conn->login_port = ntohs(sock_in6->sin6_port);
-
-               sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr;
-               snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c",
-                        &sock_in6->sin6_addr.in6_u);
-               conn->local_port = ntohs(sock_in6->sin6_port);
-       } else {
-               sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr;
-               sprintf(conn->login_ip, "%pI4",
-                       &sock_in->sin_addr.s_addr);
-               conn->login_port = ntohs(sock_in->sin_port);
-
-               sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr;
-               sprintf(conn->local_ip, "%pI4",
-                       &sock_in->sin_addr.s_addr);
-               conn->local_port = ntohs(sock_in->sin_port);
-       }
+       conn->login_sockaddr = cm_route->addr.dst_addr;
+       conn->local_sockaddr = cm_route->addr.src_addr;
 }
 
 static int
index 9d35499..08d4964 100644 (file)
@@ -290,19 +290,14 @@ static int evdev_flush(struct file *file, fl_owner_t id)
 {
        struct evdev_client *client = file->private_data;
        struct evdev *evdev = client->evdev;
-       int retval;
 
-       retval = mutex_lock_interruptible(&evdev->mutex);
-       if (retval)
-               return retval;
+       mutex_lock(&evdev->mutex);
 
-       if (!evdev->exist || client->revoked)
-               retval = -ENODEV;
-       else
-               retval = input_flush_device(&evdev->handle, file);
+       if (evdev->exist && !client->revoked)
+               input_flush_device(&evdev->handle, file);
 
        mutex_unlock(&evdev->mutex);
-       return retval;
+       return 0;
 }
 
 static void evdev_free(struct device *dev)
index d2ea863..2165f3d 100644 (file)
@@ -5,8 +5,6 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
- *
- * <<Power management needs to be implemented>>.
  */
 
 #include <linux/clk.h>
index 1f7e15c..4f5ef5b 100644 (file)
@@ -118,6 +118,7 @@ static const struct of_device_id ab8500_ponkey_match[] = {
        { .compatible = "stericsson,ab8500-ponkey", },
        {}
 };
+MODULE_DEVICE_TABLE(of, ab8500_ponkey_match);
 #endif
 
 static struct platform_driver ab8500_ponkey_driver = {
index e82edf8..f2261ab 100644 (file)
@@ -173,6 +173,7 @@ static const struct of_device_id pwm_beeper_match[] = {
        { .compatible = "pwm-beeper", },
        { },
 };
+MODULE_DEVICE_TABLE(of, pwm_beeper_match);
 #endif
 
 static struct platform_driver pwm_beeper_driver = {
index 6bf3f10..a804705 100644 (file)
@@ -249,6 +249,7 @@ static const struct of_device_id regulator_haptic_dt_match[] = {
        { .compatible = "regulator-haptic" },
        { /* sentinel */ },
 };
+MODULE_DEVICE_TABLE(of, regulator_haptic_dt_match);
 
 static struct platform_driver regulator_haptic_driver = {
        .probe          = regulator_haptic_probe,
index 54116e5..6f997aa 100644 (file)
@@ -253,6 +253,7 @@ static const struct of_device_id bbc_beep_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, bbc_beep_match);
 
 static struct platform_driver bbc_beep_driver = {
        .driver = {
@@ -332,6 +333,7 @@ static const struct of_device_id grover_beep_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, grover_beep_match);
 
 static struct platform_driver grover_beep_driver = {
        .driver = {
index 95599e4..23d0549 100644 (file)
@@ -232,7 +232,7 @@ static int xenkbd_connect_backend(struct xenbus_device *dev,
        struct xenbus_transaction xbt;
 
        ret = gnttab_grant_foreign_access(dev->otherend_id,
-                                         virt_to_mfn(info->page), 0);
+                                         virt_to_gfn(info->page), 0);
        if (ret < 0)
                return ret;
        info->gref = ret;
@@ -255,7 +255,7 @@ static int xenkbd_connect_backend(struct xenbus_device *dev,
                goto error_irqh;
        }
        ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
-                           virt_to_mfn(info->page));
+                           virt_to_gfn(info->page));
        if (ret)
                goto error_xenbus;
        ret = xenbus_printf(xbt, dev->nodename, "page-gref", "%u", info->gref);
index e2b7420..fa94530 100644 (file)
@@ -1170,6 +1170,7 @@ static const struct acpi_device_id elan_acpi_id[] = {
        { "ELAN0000", 0 },
        { "ELAN0100", 0 },
        { "ELAN0600", 0 },
+       { "ELAN1000", 0 },
        { }
 };
 MODULE_DEVICE_TABLE(acpi, elan_acpi_id);
index c9c98f0..db91de5 100644 (file)
@@ -877,7 +877,7 @@ static int __init i8042_check_aux(void)
 static int i8042_controller_check(void)
 {
        if (i8042_flush()) {
-               pr_err("No controller found\n");
+               pr_info("No controller found\n");
                return -ENODEV;
        }
 
index 059edeb..600dcce 100644 (file)
@@ -479,6 +479,18 @@ config TOUCHSCREEN_MTOUCH
          To compile this driver as a module, choose M here: the
          module will be called mtouch.
 
+config TOUCHSCREEN_IMX6UL_TSC
+       tristate "Freescale i.MX6UL touchscreen controller"
+       depends on (OF && GPIOLIB) || COMPILE_TEST
+       help
+         Say Y here if you have a Freescale i.MX6UL, and want to
+         use the internal touchscreen controller.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called imx6ul_tsc.
+
 config TOUCHSCREEN_INEXIO
        tristate "iNexio serial touchscreens"
        select SERIO
@@ -1040,4 +1052,16 @@ config TOUCHSCREEN_ZFORCE
          To compile this driver as a module, choose M here: the
          module will be called zforce_ts.
 
+config TOUCHSCREEN_COLIBRI_VF50
+       tristate "Toradex Colibri on board touchscreen driver"
+       depends on GPIOLIB && IIO && VF610_ADC
+       help
+         Say Y here if you have a Colibri VF50 and plan to use
+         the on-board provided 4-wire touchscreen driver.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called colibri_vf50_ts.
+
 endif
index c85aae2..1b79cc0 100644 (file)
@@ -38,6 +38,7 @@ obj-$(CONFIG_TOUCHSCREEN_EGALAX)      += egalax_ts.o
 obj-$(CONFIG_TOUCHSCREEN_FUJITSU)      += fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GOODIX)       += goodix.o
 obj-$(CONFIG_TOUCHSCREEN_ILI210X)      += ili210x.o
+obj-$(CONFIG_TOUCHSCREEN_IMX6UL_TSC)   += imx6ul_tsc.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)       += inexio.o
 obj-$(CONFIG_TOUCHSCREEN_INTEL_MID)    += intel-mid-touch.o
 obj-$(CONFIG_TOUCHSCREEN_IPROC)                += bcm_iproc_tsc.o
@@ -85,3 +86,4 @@ obj-$(CONFIG_TOUCHSCREEN_W90X900)     += w90p910_ts.o
 obj-$(CONFIG_TOUCHSCREEN_SX8654)       += sx8654.o
 obj-$(CONFIG_TOUCHSCREEN_TPS6507X)     += tps6507x-ts.o
 obj-$(CONFIG_TOUCHSCREEN_ZFORCE)       += zforce_ts.o
+obj-$(CONFIG_TOUCHSCREEN_COLIBRI_VF50) += colibri-vf50-ts.o
diff --git a/drivers/input/touchscreen/colibri-vf50-ts.c b/drivers/input/touchscreen/colibri-vf50-ts.c
new file mode 100644 (file)
index 0000000..5d4903a
--- /dev/null
@@ -0,0 +1,386 @@
+/*
+ * Toradex Colibri VF50 Touchscreen driver
+ *
+ * Copyright 2015 Toradex AG
+ *
+ * Originally authored by Stefan Agner for 3.0 kernel
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/iio/consumer.h>
+#include <linux/iio/types.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define DRIVER_NAME                    "colibri-vf50-ts"
+#define DRV_VERSION                    "1.0"
+
+#define VF_ADC_MAX                     ((1 << 12) - 1)
+
+#define COLI_TOUCH_MIN_DELAY_US                1000
+#define COLI_TOUCH_MAX_DELAY_US                2000
+#define COLI_PULLUP_MIN_DELAY_US       10000
+#define COLI_PULLUP_MAX_DELAY_US       11000
+#define COLI_TOUCH_NO_OF_AVGS          5
+#define COLI_TOUCH_REQ_ADC_CHAN                4
+
+struct vf50_touch_device {
+       struct platform_device *pdev;
+       struct input_dev *ts_input;
+       struct iio_channel *channels;
+       struct gpio_desc *gpio_xp;
+       struct gpio_desc *gpio_xm;
+       struct gpio_desc *gpio_yp;
+       struct gpio_desc *gpio_ym;
+       int pen_irq;
+       int min_pressure;
+       bool stop_touchscreen;
+};
+
+/*
+ * Enables given plates and measures touch parameters using ADC
+ */
+static int adc_ts_measure(struct iio_channel *channel,
+                         struct gpio_desc *plate_p, struct gpio_desc *plate_m)
+{
+       int i, value = 0, val = 0;
+       int error;
+
+       gpiod_set_value(plate_p, 1);
+       gpiod_set_value(plate_m, 1);
+
+       usleep_range(COLI_TOUCH_MIN_DELAY_US, COLI_TOUCH_MAX_DELAY_US);
+
+       for (i = 0; i < COLI_TOUCH_NO_OF_AVGS; i++) {
+               error = iio_read_channel_raw(channel, &val);
+               if (error < 0) {
+                       value = error;
+                       goto error_iio_read;
+               }
+
+               value += val;
+       }
+
+       value /= COLI_TOUCH_NO_OF_AVGS;
+
+error_iio_read:
+       gpiod_set_value(plate_p, 0);
+       gpiod_set_value(plate_m, 0);
+
+       return value;
+}
+
+/*
+ * Enable touch detection using falling edge detection on XM
+ */
+static void vf50_ts_enable_touch_detection(struct vf50_touch_device *vf50_ts)
+{
+       /* Enable plate YM (needs to be strong GND, high active) */
+       gpiod_set_value(vf50_ts->gpio_ym, 1);
+
+       /*
+        * Let the platform mux to idle state in order to enable
+        * Pull-Up on GPIO
+        */
+       pinctrl_pm_select_idle_state(&vf50_ts->pdev->dev);
+
+       /* Wait for the pull-up to be stable on high */
+       usleep_range(COLI_PULLUP_MIN_DELAY_US, COLI_PULLUP_MAX_DELAY_US);
+}
+
+/*
+ * ADC touch screen sampling bottom half irq handler
+ */
+static irqreturn_t vf50_ts_irq_bh(int irq, void *private)
+{
+       struct vf50_touch_device *vf50_ts = private;
+       struct device *dev = &vf50_ts->pdev->dev;
+       int val_x, val_y, val_z1, val_z2, val_p = 0;
+       bool discard_val_on_start = true;
+
+       /* Disable the touch detection plates */
+       gpiod_set_value(vf50_ts->gpio_ym, 0);
+
+       /* Let the platform mux to default state in order to mux as ADC */
+       pinctrl_pm_select_default_state(dev);
+
+       while (!vf50_ts->stop_touchscreen) {
+               /* X-Direction */
+               val_x = adc_ts_measure(&vf50_ts->channels[0],
+                               vf50_ts->gpio_xp, vf50_ts->gpio_xm);
+               if (val_x < 0)
+                       break;
+
+               /* Y-Direction */
+               val_y = adc_ts_measure(&vf50_ts->channels[1],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_ym);
+               if (val_y < 0)
+                       break;
+
+               /*
+                * Touch pressure
+                * Measure on XP/YM
+                */
+               val_z1 = adc_ts_measure(&vf50_ts->channels[2],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_xm);
+               if (val_z1 < 0)
+                       break;
+               val_z2 = adc_ts_measure(&vf50_ts->channels[3],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_xm);
+               if (val_z2 < 0)
+                       break;
+
+               /* Validate signal (avoid calculation using noise) */
+               if (val_z1 > 64 && val_x > 64) {
+                       /*
+                        * Calculate resistance between the plates
+                        * lower resistance means higher pressure
+                        */
+                       int r_x = (1000 * val_x) / VF_ADC_MAX;
+
+                       val_p = (r_x * val_z2) / val_z1 - r_x;
+
+               } else {
+                       val_p = 2000;
+               }
+
+               val_p = 2000 - val_p;
+               dev_dbg(dev,
+                       "Measured values: x: %d, y: %d, z1: %d, z2: %d, p: %d\n",
+                       val_x, val_y, val_z1, val_z2, val_p);
+
+               /*
+                * If touch pressure is too low, stop measuring and reenable
+                * touch detection
+                */
+               if (val_p < vf50_ts->min_pressure || val_p > 2000)
+                       break;
+
+               /*
+                * The pressure may not be enough for the first x and the
+                * second y measurement, but, the pressure is ok when the
+                * driver is doing the third and fourth measurement. To
+                * take care of this, we drop the first measurement always.
+                */
+               if (discard_val_on_start) {
+                       discard_val_on_start = false;
+               } else {
+                       /*
+                        * Report touch position and sleep for
+                        * the next measurement.
+                        */
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_X, VF_ADC_MAX - val_x);
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_Y, VF_ADC_MAX - val_y);
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_PRESSURE, val_p);
+                       input_report_key(vf50_ts->ts_input, BTN_TOUCH, 1);
+                       input_sync(vf50_ts->ts_input);
+               }
+
+               usleep_range(COLI_PULLUP_MIN_DELAY_US,
+                            COLI_PULLUP_MAX_DELAY_US);
+       }
+
+       /* Report no more touch, re-enable touch detection */
+       input_report_abs(vf50_ts->ts_input, ABS_PRESSURE, 0);
+       input_report_key(vf50_ts->ts_input, BTN_TOUCH, 0);
+       input_sync(vf50_ts->ts_input);
+
+       vf50_ts_enable_touch_detection(vf50_ts);
+
+       return IRQ_HANDLED;
+}
+
+static int vf50_ts_open(struct input_dev *dev_input)
+{
+       struct vf50_touch_device *touchdev = input_get_drvdata(dev_input);
+       struct device *dev = &touchdev->pdev->dev;
+
+       dev_dbg(dev, "Input device %s opened, starting touch detection\n",
+               dev_input->name);
+
+       touchdev->stop_touchscreen = false;
+
+       /* Mux detection before request IRQ, wait for pull-up to settle */
+       vf50_ts_enable_touch_detection(touchdev);
+
+       return 0;
+}
+
+static void vf50_ts_close(struct input_dev *dev_input)
+{
+       struct vf50_touch_device *touchdev = input_get_drvdata(dev_input);
+       struct device *dev = &touchdev->pdev->dev;
+
+       touchdev->stop_touchscreen = true;
+
+       /* Make sure IRQ is not running past close */
+       mb();
+       synchronize_irq(touchdev->pen_irq);
+
+       gpiod_set_value(touchdev->gpio_ym, 0);
+       pinctrl_pm_select_default_state(dev);
+
+       dev_dbg(dev, "Input device %s closed, disable touch detection\n",
+               dev_input->name);
+}
+
+static int vf50_ts_get_gpiod(struct device *dev, struct gpio_desc **gpio_d,
+                            const char *con_id, enum gpiod_flags flags)
+{
+       int error;
+
+       *gpio_d = devm_gpiod_get(dev, con_id, flags);
+       if (IS_ERR(*gpio_d)) {
+               error = PTR_ERR(*gpio_d);
+               dev_err(dev, "Could not get gpio_%s %d\n", con_id, error);
+               return error;
+       }
+
+       return 0;
+}
+
+static void vf50_ts_channel_release(void *data)
+{
+       struct iio_channel *channels = data;
+
+       iio_channel_release_all(channels);
+}
+
+static int vf50_ts_probe(struct platform_device *pdev)
+{
+       struct input_dev *input;
+       struct iio_channel *channels;
+       struct device *dev = &pdev->dev;
+       struct vf50_touch_device *touchdev;
+       int num_adc_channels;
+       int error;
+
+       channels = iio_channel_get_all(dev);
+       if (IS_ERR(channels))
+               return PTR_ERR(channels);
+
+       error = devm_add_action(dev, vf50_ts_channel_release, channels);
+       if (error) {
+               iio_channel_release_all(channels);
+               dev_err(dev, "Failed to register iio channel release action");
+               return error;
+       }
+
+       num_adc_channels = 0;
+       while (channels[num_adc_channels].indio_dev)
+               num_adc_channels++;
+
+       if (num_adc_channels != COLI_TOUCH_REQ_ADC_CHAN) {
+               dev_err(dev, "Inadequate ADC channels specified\n");
+               return -EINVAL;
+       }
+
+       touchdev = devm_kzalloc(dev, sizeof(*touchdev), GFP_KERNEL);
+       if (!touchdev)
+               return -ENOMEM;
+
+       touchdev->pdev = pdev;
+       touchdev->channels = channels;
+
+       error = of_property_read_u32(dev->of_node, "vf50-ts-min-pressure",
+                                &touchdev->min_pressure);
+       if (error)
+               return error;
+
+       input = devm_input_allocate_device(dev);
+       if (!input) {
+               dev_err(dev, "Failed to allocate TS input device\n");
+               return -ENOMEM;
+       }
+
+       platform_set_drvdata(pdev, touchdev);
+
+       input->name = DRIVER_NAME;
+       input->id.bustype = BUS_HOST;
+       input->dev.parent = dev;
+       input->open = vf50_ts_open;
+       input->close = vf50_ts_close;
+
+       input_set_capability(input, EV_KEY, BTN_TOUCH);
+       input_set_abs_params(input, ABS_X, 0, VF_ADC_MAX, 0, 0);
+       input_set_abs_params(input, ABS_Y, 0, VF_ADC_MAX, 0, 0);
+       input_set_abs_params(input, ABS_PRESSURE, 0, VF_ADC_MAX, 0, 0);
+
+       touchdev->ts_input = input;
+       input_set_drvdata(input, touchdev);
+
+       error = input_register_device(input);
+       if (error) {
+               dev_err(dev, "Failed to register input device\n");
+               return error;
+       }
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_xp, "xp", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_xm,
+                               "xm", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_yp, "yp", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_ym, "ym", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       touchdev->pen_irq = platform_get_irq(pdev, 0);
+       if (touchdev->pen_irq < 0)
+               return touchdev->pen_irq;
+
+       error = devm_request_threaded_irq(dev, touchdev->pen_irq,
+                                         NULL, vf50_ts_irq_bh, IRQF_ONESHOT,
+                                         "vf50 touch", touchdev);
+       if (error) {
+               dev_err(dev, "Failed to request IRQ %d: %d\n",
+                       touchdev->pen_irq, error);
+               return error;
+       }
+
+       return 0;
+}
+
+static const struct of_device_id vf50_touch_of_match[] = {
+       { .compatible = "toradex,vf50-touchscreen", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, vf50_touch_of_match);
+
+static struct platform_driver vf50_touch_driver = {
+       .driver = {
+               .name = "toradex,vf50_touchctrl",
+               .of_match_table = vf50_touch_of_match,
+       },
+       .probe = vf50_ts_probe,
+};
+module_platform_driver(vf50_touch_driver);
+
+MODULE_AUTHOR("Sanchayan Maity");
+MODULE_DESCRIPTION("Colibri VF50 Touchscreen driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
index 9a323dd..a9f95c7 100644 (file)
@@ -86,4 +86,3 @@ module_i2c_driver(cyttsp4_i2c_driver);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cypress TrueTouch(R) Standard Product (TTSP) I2C driver");
 MODULE_AUTHOR("Cypress");
-MODULE_ALIAS("i2c:cyttsp4");
index 519e2de..eee51b3 100644 (file)
@@ -86,4 +86,3 @@ module_i2c_driver(cyttsp_i2c_driver);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cypress TrueTouch(R) Standard Product (TTSP) I2C driver");
 MODULE_AUTHOR("Cypress");
-MODULE_ALIAS("i2c:cyttsp");
index ddac134..17cc20e 100644 (file)
 #define ELAN_FW_PAGESIZE       132
 
 /* calibration timeout definition */
-#define ELAN_CALI_TIMEOUT_MSEC 10000
+#define ELAN_CALI_TIMEOUT_MSEC 12000
 
 #define ELAN_POWERON_DELAY_USEC        500
 #define ELAN_RESET_DELAY_MSEC  20
diff --git a/drivers/input/touchscreen/imx6ul_tsc.c b/drivers/input/touchscreen/imx6ul_tsc.c
new file mode 100644 (file)
index 0000000..ff0b758
--- /dev/null
@@ -0,0 +1,523 @@
+/*
+ * Freescale i.MX6UL touchscreen controller driver
+ *
+ * Copyright (C) 2015 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/gpio/consumer.h>
+#include <linux/input.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+
+/* ADC configuration registers field define */
+#define ADC_AIEN               (0x1 << 7)
+#define ADC_CONV_DISABLE       0x1F
+#define ADC_CAL                        (0x1 << 7)
+#define ADC_CALF               0x2
+#define ADC_12BIT_MODE         (0x2 << 2)
+#define ADC_IPG_CLK            0x00
+#define ADC_CLK_DIV_8          (0x03 << 5)
+#define ADC_SHORT_SAMPLE_MODE  (0x0 << 4)
+#define ADC_HARDWARE_TRIGGER   (0x1 << 13)
+#define SELECT_CHANNEL_4       0x04
+#define SELECT_CHANNEL_1       0x01
+#define DISABLE_CONVERSION_INT (0x0 << 7)
+
+/* ADC registers */
+#define REG_ADC_HC0            0x00
+#define REG_ADC_HC1            0x04
+#define REG_ADC_HC2            0x08
+#define REG_ADC_HC3            0x0C
+#define REG_ADC_HC4            0x10
+#define REG_ADC_HS             0x14
+#define REG_ADC_R0             0x18
+#define REG_ADC_CFG            0x2C
+#define REG_ADC_GC             0x30
+#define REG_ADC_GS             0x34
+
+#define ADC_TIMEOUT            msecs_to_jiffies(100)
+
+/* TSC registers */
+#define REG_TSC_BASIC_SETING   0x00
+#define REG_TSC_PRE_CHARGE_TIME        0x10
+#define REG_TSC_FLOW_CONTROL   0x20
+#define REG_TSC_MEASURE_VALUE  0x30
+#define REG_TSC_INT_EN         0x40
+#define REG_TSC_INT_SIG_EN     0x50
+#define REG_TSC_INT_STATUS     0x60
+#define REG_TSC_DEBUG_MODE     0x70
+#define REG_TSC_DEBUG_MODE2    0x80
+
+/* TSC configuration registers field define */
+#define DETECT_4_WIRE_MODE     (0x0 << 4)
+#define AUTO_MEASURE           0x1
+#define MEASURE_SIGNAL         0x1
+#define DETECT_SIGNAL          (0x1 << 4)
+#define VALID_SIGNAL           (0x1 << 8)
+#define MEASURE_INT_EN         0x1
+#define MEASURE_SIG_EN         0x1
+#define VALID_SIG_EN           (0x1 << 8)
+#define DE_GLITCH_2            (0x2 << 29)
+#define START_SENSE            (0x1 << 12)
+#define TSC_DISABLE            (0x1 << 16)
+#define DETECT_MODE            0x2
+
+struct imx6ul_tsc {
+       struct device *dev;
+       struct input_dev *input;
+       void __iomem *tsc_regs;
+       void __iomem *adc_regs;
+       struct clk *tsc_clk;
+       struct clk *adc_clk;
+       struct gpio_desc *xnur_gpio;
+
+       int measure_delay_time;
+       int pre_charge_time;
+
+       struct completion completion;
+};
+
+/*
+ * TSC module need ADC to get the measure value. So
+ * before config TSC, we should initialize ADC module.
+ */
+static void imx6ul_adc_init(struct imx6ul_tsc *tsc)
+{
+       int adc_hc = 0;
+       int adc_gc;
+       int adc_gs;
+       int adc_cfg;
+       int timeout;
+
+       reinit_completion(&tsc->completion);
+
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_CFG);
+       adc_cfg |= ADC_12BIT_MODE | ADC_IPG_CLK;
+       adc_cfg |= ADC_CLK_DIV_8 | ADC_SHORT_SAMPLE_MODE;
+       adc_cfg &= ~ADC_HARDWARE_TRIGGER;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_CFG);
+
+       /* enable calibration interrupt */
+       adc_hc |= ADC_AIEN;
+       adc_hc |= ADC_CONV_DISABLE;
+       writel(adc_hc, tsc->adc_regs + REG_ADC_HC0);
+
+       /* start ADC calibration */
+       adc_gc = readl(tsc->adc_regs + REG_ADC_GC);
+       adc_gc |= ADC_CAL;
+       writel(adc_gc, tsc->adc_regs + REG_ADC_GC);
+
+       timeout = wait_for_completion_timeout
+                       (&tsc->completion, ADC_TIMEOUT);
+       if (timeout == 0)
+               dev_err(tsc->dev, "Timeout for adc calibration\n");
+
+       adc_gs = readl(tsc->adc_regs + REG_ADC_GS);
+       if (adc_gs & ADC_CALF)
+               dev_err(tsc->dev, "ADC calibration failed\n");
+
+       /* TSC need the ADC work in hardware trigger */
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_CFG);
+       adc_cfg |= ADC_HARDWARE_TRIGGER;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_CFG);
+}
+
+/*
+ * This is a TSC workaround. Currently TSC misconnect two
+ * ADC channels, this function remap channel configure for
+ * hardware trigger.
+ */
+static void imx6ul_tsc_channel_config(struct imx6ul_tsc *tsc)
+{
+       int adc_hc0, adc_hc1, adc_hc2, adc_hc3, adc_hc4;
+
+       adc_hc0 = DISABLE_CONVERSION_INT;
+       writel(adc_hc0, tsc->adc_regs + REG_ADC_HC0);
+
+       adc_hc1 = DISABLE_CONVERSION_INT | SELECT_CHANNEL_4;
+       writel(adc_hc1, tsc->adc_regs + REG_ADC_HC1);
+
+       adc_hc2 = DISABLE_CONVERSION_INT;
+       writel(adc_hc2, tsc->adc_regs + REG_ADC_HC2);
+
+       adc_hc3 = DISABLE_CONVERSION_INT | SELECT_CHANNEL_1;
+       writel(adc_hc3, tsc->adc_regs + REG_ADC_HC3);
+
+       adc_hc4 = DISABLE_CONVERSION_INT;
+       writel(adc_hc4, tsc->adc_regs + REG_ADC_HC4);
+}
+
+/*
+ * TSC setting, confige the pre-charge time and measure delay time.
+ * different touch screen may need different pre-charge time and
+ * measure delay time.
+ */
+static void imx6ul_tsc_set(struct imx6ul_tsc *tsc)
+{
+       int basic_setting = 0;
+       int start;
+
+       basic_setting |= tsc->measure_delay_time << 8;
+       basic_setting |= DETECT_4_WIRE_MODE | AUTO_MEASURE;
+       writel(basic_setting, tsc->tsc_regs + REG_TSC_BASIC_SETING);
+
+       writel(DE_GLITCH_2, tsc->tsc_regs + REG_TSC_DEBUG_MODE2);
+
+       writel(tsc->pre_charge_time, tsc->tsc_regs + REG_TSC_PRE_CHARGE_TIME);
+       writel(MEASURE_INT_EN, tsc->tsc_regs + REG_TSC_INT_EN);
+       writel(MEASURE_SIG_EN | VALID_SIG_EN,
+               tsc->tsc_regs + REG_TSC_INT_SIG_EN);
+
+       /* start sense detection */
+       start = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       start |= START_SENSE;
+       start &= ~TSC_DISABLE;
+       writel(start, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+}
+
+static void imx6ul_tsc_init(struct imx6ul_tsc *tsc)
+{
+       imx6ul_adc_init(tsc);
+       imx6ul_tsc_channel_config(tsc);
+       imx6ul_tsc_set(tsc);
+}
+
+static void imx6ul_tsc_disable(struct imx6ul_tsc *tsc)
+{
+       int tsc_flow;
+       int adc_cfg;
+
+       /* TSC controller enters to idle status */
+       tsc_flow = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       tsc_flow |= TSC_DISABLE;
+       writel(tsc_flow, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+
+       /* ADC controller enters to stop mode */
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_HC0);
+       adc_cfg |= ADC_CONV_DISABLE;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_HC0);
+}
+
+/* Delay some time (max 2ms), wait the pre-charge done. */
+static bool tsc_wait_detect_mode(struct imx6ul_tsc *tsc)
+{
+       unsigned long timeout = jiffies + msecs_to_jiffies(2);
+       int state_machine;
+       int debug_mode2;
+
+       do {
+               if (time_after(jiffies, timeout))
+                       return false;
+
+               usleep_range(200, 400);
+               debug_mode2 = readl(tsc->tsc_regs + REG_TSC_DEBUG_MODE2);
+               state_machine = (debug_mode2 >> 20) & 0x7;
+       } while (state_machine != DETECT_MODE);
+
+       usleep_range(200, 400);
+       return true;
+}
+
+static irqreturn_t tsc_irq_fn(int irq, void *dev_id)
+{
+       struct imx6ul_tsc *tsc = dev_id;
+       int status;
+       int value;
+       int x, y;
+       int start;
+
+       status = readl(tsc->tsc_regs + REG_TSC_INT_STATUS);
+
+       /* write 1 to clear the bit measure-signal */
+       writel(MEASURE_SIGNAL | DETECT_SIGNAL,
+               tsc->tsc_regs + REG_TSC_INT_STATUS);
+
+       /* It's a HW self-clean bit. Set this bit and start sense detection */
+       start = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       start |= START_SENSE;
+       writel(start, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+
+       if (status & MEASURE_SIGNAL) {
+               value = readl(tsc->tsc_regs + REG_TSC_MEASURE_VALUE);
+               x = (value >> 16) & 0x0fff;
+               y = value & 0x0fff;
+
+               /*
+                * In detect mode, we can get the xnur gpio value,
+                * otherwise assume contact is stiull active.
+                */
+               if (!tsc_wait_detect_mode(tsc) ||
+                   gpiod_get_value_cansleep(tsc->xnur_gpio)) {
+                       input_report_key(tsc->input, BTN_TOUCH, 1);
+                       input_report_abs(tsc->input, ABS_X, x);
+                       input_report_abs(tsc->input, ABS_Y, y);
+               } else {
+                       input_report_key(tsc->input, BTN_TOUCH, 0);
+               }
+
+               input_sync(tsc->input);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t adc_irq_fn(int irq, void *dev_id)
+{
+       struct imx6ul_tsc *tsc = dev_id;
+       int coco;
+       int value;
+
+       coco = readl(tsc->adc_regs + REG_ADC_HS);
+       if (coco & 0x01) {
+               value = readl(tsc->adc_regs + REG_ADC_R0);
+               complete(&tsc->completion);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int imx6ul_tsc_open(struct input_dev *input_dev)
+{
+       struct imx6ul_tsc *tsc = input_get_drvdata(input_dev);
+       int err;
+
+       err = clk_prepare_enable(tsc->adc_clk);
+       if (err) {
+               dev_err(tsc->dev,
+                       "Could not prepare or enable the adc clock: %d\n",
+                       err);
+               return err;
+       }
+
+       err = clk_prepare_enable(tsc->tsc_clk);
+       if (err) {
+               dev_err(tsc->dev,
+                       "Could not prepare or enable the tsc clock: %d\n",
+                       err);
+               clk_disable_unprepare(tsc->adc_clk);
+               return err;
+       }
+
+       imx6ul_tsc_init(tsc);
+
+       return 0;
+}
+
+static void imx6ul_tsc_close(struct input_dev *input_dev)
+{
+       struct imx6ul_tsc *tsc = input_get_drvdata(input_dev);
+
+       imx6ul_tsc_disable(tsc);
+
+       clk_disable_unprepare(tsc->tsc_clk);
+       clk_disable_unprepare(tsc->adc_clk);
+}
+
+static int imx6ul_tsc_probe(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct imx6ul_tsc *tsc;
+       struct input_dev *input_dev;
+       struct resource *tsc_mem;
+       struct resource *adc_mem;
+       int err;
+       int tsc_irq;
+       int adc_irq;
+
+       tsc = devm_kzalloc(&pdev->dev, sizeof(struct imx6ul_tsc), GFP_KERNEL);
+       if (!tsc)
+               return -ENOMEM;
+
+       input_dev = devm_input_allocate_device(&pdev->dev);
+       if (!input_dev)
+               return -ENOMEM;
+
+       input_dev->name = "iMX6UL TouchScreen Controller";
+       input_dev->id.bustype = BUS_HOST;
+
+       input_dev->open = imx6ul_tsc_open;
+       input_dev->close = imx6ul_tsc_close;
+
+       input_set_capability(input_dev, EV_KEY, BTN_TOUCH);
+       input_set_abs_params(input_dev, ABS_X, 0, 0xFFF, 0, 0);
+       input_set_abs_params(input_dev, ABS_Y, 0, 0xFFF, 0, 0);
+
+       input_set_drvdata(input_dev, tsc);
+
+       tsc->dev = &pdev->dev;
+       tsc->input = input_dev;
+       init_completion(&tsc->completion);
+
+       tsc->xnur_gpio = devm_gpiod_get(&pdev->dev, "xnur", GPIOD_IN);
+       if (IS_ERR(tsc->xnur_gpio)) {
+               err = PTR_ERR(tsc->xnur_gpio);
+               dev_err(&pdev->dev,
+                       "failed to request GPIO tsc_X- (xnur): %d\n", err);
+               return err;
+       }
+
+       tsc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       tsc->tsc_regs = devm_ioremap_resource(&pdev->dev, tsc_mem);
+       if (IS_ERR(tsc->tsc_regs)) {
+               err = PTR_ERR(tsc->tsc_regs);
+               dev_err(&pdev->dev, "failed to remap tsc memory: %d\n", err);
+               return err;
+       }
+
+       adc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       tsc->adc_regs = devm_ioremap_resource(&pdev->dev, adc_mem);
+       if (IS_ERR(tsc->adc_regs)) {
+               err = PTR_ERR(tsc->adc_regs);
+               dev_err(&pdev->dev, "failed to remap adc memory: %d\n", err);
+               return err;
+       }
+
+       tsc->tsc_clk = devm_clk_get(&pdev->dev, "tsc");
+       if (IS_ERR(tsc->tsc_clk)) {
+               err = PTR_ERR(tsc->tsc_clk);
+               dev_err(&pdev->dev, "failed getting tsc clock: %d\n", err);
+               return err;
+       }
+
+       tsc->adc_clk = devm_clk_get(&pdev->dev, "adc");
+       if (IS_ERR(tsc->adc_clk)) {
+               err = PTR_ERR(tsc->adc_clk);
+               dev_err(&pdev->dev, "failed getting adc clock: %d\n", err);
+               return err;
+       }
+
+       tsc_irq = platform_get_irq(pdev, 0);
+       if (tsc_irq < 0) {
+               dev_err(&pdev->dev, "no tsc irq resource?\n");
+               return tsc_irq;
+       }
+
+       adc_irq = platform_get_irq(pdev, 1);
+       if (adc_irq <= 0) {
+               dev_err(&pdev->dev, "no adc irq resource?\n");
+               return adc_irq;
+       }
+
+       err = devm_request_threaded_irq(tsc->dev, tsc_irq,
+                                       NULL, tsc_irq_fn, IRQF_ONESHOT,
+                                       dev_name(&pdev->dev), tsc);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed requesting tsc irq %d: %d\n",
+                       tsc_irq, err);
+               return err;
+       }
+
+       err = devm_request_irq(tsc->dev, adc_irq, adc_irq_fn, 0,
+                               dev_name(&pdev->dev), tsc);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed requesting adc irq %d: %d\n",
+                       adc_irq, err);
+               return err;
+       }
+
+       err = of_property_read_u32(np, "measure-delay-time",
+                                  &tsc->measure_delay_time);
+       if (err)
+               tsc->measure_delay_time = 0xffff;
+
+       err = of_property_read_u32(np, "pre-charge-time",
+                                  &tsc->pre_charge_time);
+       if (err)
+               tsc->pre_charge_time = 0xfff;
+
+       err = input_register_device(tsc->input);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed to register input device: %d\n", err);
+               return err;
+       }
+
+       platform_set_drvdata(pdev, tsc);
+       return 0;
+}
+
+static int __maybe_unused imx6ul_tsc_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct imx6ul_tsc *tsc = platform_get_drvdata(pdev);
+       struct input_dev *input_dev = tsc->input;
+
+       mutex_lock(&input_dev->mutex);
+
+       if (input_dev->users) {
+               imx6ul_tsc_disable(tsc);
+
+               clk_disable_unprepare(tsc->tsc_clk);
+               clk_disable_unprepare(tsc->adc_clk);
+       }
+
+       mutex_unlock(&input_dev->mutex);
+
+       return 0;
+}
+
+static int __maybe_unused imx6ul_tsc_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct imx6ul_tsc *tsc = platform_get_drvdata(pdev);
+       struct input_dev *input_dev = tsc->input;
+       int retval = 0;
+
+       mutex_lock(&input_dev->mutex);
+
+       if (input_dev->users) {
+               retval = clk_prepare_enable(tsc->adc_clk);
+               if (retval)
+                       goto out;
+
+               retval = clk_prepare_enable(tsc->tsc_clk);
+               if (retval) {
+                       clk_disable_unprepare(tsc->adc_clk);
+                       goto out;
+               }
+
+               imx6ul_tsc_init(tsc);
+       }
+
+out:
+       mutex_unlock(&input_dev->mutex);
+       return retval;
+}
+
+static SIMPLE_DEV_PM_OPS(imx6ul_tsc_pm_ops,
+                        imx6ul_tsc_suspend, imx6ul_tsc_resume);
+
+static const struct of_device_id imx6ul_tsc_match[] = {
+       { .compatible = "fsl,imx6ul-tsc", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, imx6ul_tsc_match);
+
+static struct platform_driver imx6ul_tsc_driver = {
+       .driver         = {
+               .name   = "imx6ul-tsc",
+               .of_match_table = imx6ul_tsc_match,
+               .pm     = &imx6ul_tsc_pm_ops,
+       },
+       .probe          = imx6ul_tsc_probe,
+};
+module_platform_driver(imx6ul_tsc_driver);
+
+MODULE_AUTHOR("Haibo Chen <haibo.chen@freescale.com>");
+MODULE_DESCRIPTION("Freescale i.MX6UL Touchscreen controller driver");
+MODULE_LICENSE("GPL v2");
index c011699..4857943 100644 (file)
@@ -191,7 +191,7 @@ static void sun4i_ts_close(struct input_dev *dev)
        writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC);
 }
 
-static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
+static int sun4i_get_temp(const struct sun4i_ts_data *ts, int *temp)
 {
        /* No temp_data until the first irq */
        if (ts->temp_data == -1)
@@ -202,7 +202,7 @@ static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
        return 0;
 }
 
-static int sun4i_get_tz_temp(void *data, long *temp)
+static int sun4i_get_tz_temp(void *data, int *temp)
 {
        return sun4i_get_temp(data, temp);
 }
@@ -215,14 +215,14 @@ static ssize_t show_temp(struct device *dev, struct device_attribute *devattr,
                         char *buf)
 {
        struct sun4i_ts_data *ts = dev_get_drvdata(dev);
-       long temp;
+       int temp;
        int error;
 
        error = sun4i_get_temp(ts, &temp);
        if (error)
                return error;
 
-       return sprintf(buf, "%ld\n", temp);
+       return sprintf(buf, "%d\n", temp);
 }
 
 static ssize_t show_temp_label(struct device *dev,
index 0717aa9..9bc20e2 100644 (file)
@@ -135,8 +135,9 @@ __dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
 static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
                             struct seq_file *s)
 {
-       return seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+       seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
                          (cr->cam & MMU_CAM_P) ? 1 : 0);
+       return 0;
 }
 
 static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
index d5415ee..3e01e6f 100644 (file)
@@ -393,7 +393,7 @@ config DM_MULTIPATH
        # of SCSI_DH if the latter isn't defined but if
        # it is, DM_MULTIPATH must depend on it.  We get a build
        # error if SCSI_DH=m and DM_MULTIPATH=y
-       depends on SCSI_DH || !SCSI_DH
+       depends on !SCSI_DH || SCSI
        ---help---
          Allow volume managers to support multipath hardware.
 
index eff7bdd..5a67671 100644 (file)
@@ -159,12 +159,9 @@ static struct priority_group *alloc_priority_group(void)
 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 {
        struct pgpath *pgpath, *tmp;
-       struct multipath *m = ti->private;
 
        list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
                list_del(&pgpath->list);
-               if (m->hw_handler_name)
-                       scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
                dm_put_device(ti, pgpath->path.dev);
                free_pgpath(pgpath);
        }
@@ -580,6 +577,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
                q = bdev_get_queue(p->path.dev->bdev);
 
        if (m->retain_attached_hw_handler) {
+retain:
                attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
                if (attached_handler_name) {
                        /*
@@ -599,20 +597,14 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
        }
 
        if (m->hw_handler_name) {
-               /*
-                * Increments scsi_dh reference, even when using an
-                * already-attached handler.
-                */
                r = scsi_dh_attach(q, m->hw_handler_name);
                if (r == -EBUSY) {
-                       /*
-                        * Already attached to different hw_handler:
-                        * try to reattach with correct one.
-                        */
-                       scsi_dh_detach(q);
-                       r = scsi_dh_attach(q, m->hw_handler_name);
-               }
+                       char b[BDEVNAME_SIZE];
 
+                       printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
+                               bdevname(p->path.dev->bdev, b));
+                       goto retain;
+               }
                if (r < 0) {
                        ti->error = "error attaching hardware handler";
                        dm_put_device(ti, p->path.dev);
@@ -624,7 +616,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
                        if (r < 0) {
                                ti->error = "unable to set hardware "
                                                        "handler parameters";
-                               scsi_dh_detach(q);
                                dm_put_device(ti, p->path.dev);
                                goto bad;
                        }
@@ -734,12 +725,6 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
                return 0;
 
        m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
-       if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
-                                    "scsi_dh_%s", m->hw_handler_name)) {
-               ti->error = "unknown hardware handler type";
-               ret = -EINVAL;
-               goto fail;
-       }
 
        if (hw_argc > 1) {
                char *p;
index dc2aaab..217d613 100644 (file)
@@ -10,6 +10,7 @@ config VIDEO_OMAP2_VOUT
        select OMAP2_DSS if HAS_IOMEM && ARCH_OMAP2PLUS
        select OMAP2_VRFB if ARCH_OMAP2 || ARCH_OMAP3
        select VIDEO_OMAP2_VOUT_VRFB if VIDEO_OMAP2_VOUT && OMAP2_VRFB
+       select FRAME_VECTOR
        default n
        ---help---
          V4L2 Display driver support for OMAP2/3 based boards.
index f09c5f1..70c28d1 100644 (file)
@@ -195,46 +195,34 @@ static int omap_vout_try_format(struct v4l2_pix_format *pix)
 }
 
 /*
- * omap_vout_uservirt_to_phys: This inline function is used to convert user
- * space virtual address to physical address.
+ * omap_vout_get_userptr: Convert user space virtual address to physical
+ * address.
  */
-static unsigned long omap_vout_uservirt_to_phys(unsigned long virtp)
+static int omap_vout_get_userptr(struct videobuf_buffer *vb, u32 virtp,
+                                u32 *physp)
 {
-       unsigned long physp = 0;
-       struct vm_area_struct *vma;
-       struct mm_struct *mm = current->mm;
+       struct frame_vector *vec;
+       int ret;
 
        /* For kernel direct-mapped memory, take the easy way */
-       if (virtp >= PAGE_OFFSET)
-               return virt_to_phys((void *) virtp);
-
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(mm, virtp);
-       if (vma && (vma->vm_flags & VM_IO) && vma->vm_pgoff) {
-               /* this will catch, kernel-allocated, mmaped-to-usermode
-                  addresses */
-               physp = (vma->vm_pgoff << PAGE_SHIFT) + (virtp - vma->vm_start);
-               up_read(&current->mm->mmap_sem);
-       } else {
-               /* otherwise, use get_user_pages() for general userland pages */
-               int res, nr_pages = 1;
-               struct page *pages;
+       if (virtp >= PAGE_OFFSET) {
+               *physp = virt_to_phys((void *)virtp);
+               return 0;
+       }
 
-               res = get_user_pages(current, current->mm, virtp, nr_pages, 1,
-                               0, &pages, NULL);
-               up_read(&current->mm->mmap_sem);
+       vec = frame_vector_create(1);
+       if (!vec)
+               return -ENOMEM;
 
-               if (res == nr_pages) {
-                       physp =  __pa(page_address(&pages[0]) +
-                                       (virtp & ~PAGE_MASK));
-               } else {
-                       printk(KERN_WARNING VOUT_NAME
-                                       "get_user_pages failed\n");
-                       return 0;
-               }
+       ret = get_vaddr_frames(virtp, 1, true, false, vec);
+       if (ret != 1) {
+               frame_vector_destroy(vec);
+               return -EINVAL;
        }
+       *physp = __pfn_to_phys(frame_vector_pfns(vec)[0]);
+       vb->priv = vec;
 
-       return physp;
+       return 0;
 }
 
 /*
@@ -784,11 +772,15 @@ static int omap_vout_buffer_prepare(struct videobuf_queue *q,
         * address of the buffer
         */
        if (V4L2_MEMORY_USERPTR == vb->memory) {
+               int ret;
+
                if (0 == vb->baddr)
                        return -EINVAL;
                /* Physical address */
-               vout->queued_buf_addr[vb->i] = (u8 *)
-                       omap_vout_uservirt_to_phys(vb->baddr);
+               ret = omap_vout_get_userptr(vb, vb->baddr,
+                               (u32 *)&vout->queued_buf_addr[vb->i]);
+               if (ret < 0)
+                       return ret;
        } else {
                unsigned long addr, dma_addr;
                unsigned long size;
@@ -834,12 +826,13 @@ static void omap_vout_buffer_queue(struct videobuf_queue *q,
 static void omap_vout_buffer_release(struct videobuf_queue *q,
                            struct videobuf_buffer *vb)
 {
-       struct omap_vout_device *vout = q->priv_data;
-
        vb->state = VIDEOBUF_NEEDS_INIT;
+       if (vb->memory == V4L2_MEMORY_USERPTR && vb->priv) {
+               struct frame_vector *vec = vb->priv;
 
-       if (V4L2_MEMORY_MMAP != vout->memory)
-               return;
+               put_vaddr_frames(vec);
+               frame_vector_destroy(vec);
+       }
 }
 
 /*
@@ -872,7 +865,7 @@ static void omap_vout_vm_close(struct vm_area_struct *vma)
        vout->mmap_count--;
 }
 
-static struct vm_operations_struct omap_vout_vm_ops = {
+static const struct vm_operations_struct omap_vout_vm_ops = {
        .open   = omap_vout_vm_open,
        .close  = omap_vout_vm_close,
 };
index b4b0229..82876a6 100644 (file)
@@ -84,6 +84,7 @@ config VIDEOBUF2_CORE
 
 config VIDEOBUF2_MEMOPS
        tristate
+       select FRAME_VECTOR
 
 config VIDEOBUF2_DMA_CONTIG
        tristate
index f1022d8..4f59b7e 100644 (file)
@@ -1691,9 +1691,7 @@ static int __buf_prepare(struct vb2_buffer *vb, const struct v4l2_buffer *b)
                ret = __qbuf_mmap(vb, b);
                break;
        case V4L2_MEMORY_USERPTR:
-               down_read(&current->mm->mmap_sem);
                ret = __qbuf_userptr(vb, b);
-               up_read(&current->mm->mmap_sem);
                break;
        case V4L2_MEMORY_DMABUF:
                ret = __qbuf_dmabuf(vb, b);
index 94c1e64..2397ceb 100644 (file)
@@ -32,15 +32,13 @@ struct vb2_dc_buf {
        dma_addr_t                      dma_addr;
        enum dma_data_direction         dma_dir;
        struct sg_table                 *dma_sgt;
+       struct frame_vector             *vec;
 
        /* MMAP related */
        struct vb2_vmarea_handler       handler;
        atomic_t                        refcount;
        struct sg_table                 *sgt_base;
 
-       /* USERPTR related */
-       struct vm_area_struct           *vma;
-
        /* DMABUF related */
        struct dma_buf_attachment       *db_attach;
 };
@@ -49,24 +47,6 @@ struct vb2_dc_buf {
 /*        scatterlist table functions        */
 /*********************************************/
 
-
-static void vb2_dc_sgt_foreach_page(struct sg_table *sgt,
-       void (*cb)(struct page *pg))
-{
-       struct scatterlist *s;
-       unsigned int i;
-
-       for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
-               struct page *page = sg_page(s);
-               unsigned int n_pages = PAGE_ALIGN(s->offset + s->length)
-                       >> PAGE_SHIFT;
-               unsigned int j;
-
-               for (j = 0; j < n_pages; ++j, ++page)
-                       cb(page);
-       }
-}
-
 static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
 {
        struct scatterlist *s;
@@ -429,92 +409,12 @@ static struct dma_buf *vb2_dc_get_dmabuf(void *buf_priv, unsigned long flags)
 /*       callbacks for USERPTR buffers       */
 /*********************************************/
 
-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
-static int vb2_dc_get_user_pfn(unsigned long start, int n_pages,
-       struct vm_area_struct *vma, unsigned long *res)
-{
-       unsigned long pfn, start_pfn, prev_pfn;
-       unsigned int i;
-       int ret;
-
-       if (!vma_is_io(vma))
-               return -EFAULT;
-
-       ret = follow_pfn(vma, start, &pfn);
-       if (ret)
-               return ret;
-
-       start_pfn = pfn;
-       start += PAGE_SIZE;
-
-       for (i = 1; i < n_pages; ++i, start += PAGE_SIZE) {
-               prev_pfn = pfn;
-               ret = follow_pfn(vma, start, &pfn);
-
-               if (ret) {
-                       pr_err("no page for address %lu\n", start);
-                       return ret;
-               }
-               if (pfn != prev_pfn + 1)
-                       return -EINVAL;
-       }
-
-       *res = start_pfn;
-       return 0;
-}
-
-static int vb2_dc_get_user_pages(unsigned long start, struct page **pages,
-       int n_pages, struct vm_area_struct *vma,
-       enum dma_data_direction dma_dir)
-{
-       if (vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < n_pages; ++i, start += PAGE_SIZE) {
-                       unsigned long pfn;
-                       int ret = follow_pfn(vma, start, &pfn);
-
-                       if (!pfn_valid(pfn))
-                               return -EINVAL;
-
-                       if (ret) {
-                               pr_err("no page for address %lu\n", start);
-                               return ret;
-                       }
-                       pages[i] = pfn_to_page(pfn);
-               }
-       } else {
-               int n;
-
-               n = get_user_pages(current, current->mm, start & PAGE_MASK,
-                       n_pages, dma_dir == DMA_FROM_DEVICE, 1, pages, NULL);
-               /* negative error means that no page was pinned */
-               n = max(n, 0);
-               if (n != n_pages) {
-                       pr_err("got only %d of %d user pages\n", n, n_pages);
-                       while (n)
-                               put_page(pages[--n]);
-                       return -EFAULT;
-               }
-       }
-
-       return 0;
-}
-
-static void vb2_dc_put_dirty_page(struct page *page)
-{
-       set_page_dirty_lock(page);
-       put_page(page);
-}
-
 static void vb2_dc_put_userptr(void *buf_priv)
 {
        struct vb2_dc_buf *buf = buf_priv;
        struct sg_table *sgt = buf->dma_sgt;
+       int i;
+       struct page **pages;
 
        if (sgt) {
                DEFINE_DMA_ATTRS(attrs);
@@ -526,13 +426,15 @@ static void vb2_dc_put_userptr(void *buf_priv)
                 */
                dma_unmap_sg_attrs(buf->dev, sgt->sgl, sgt->orig_nents,
                                   buf->dma_dir, &attrs);
-               if (!vma_is_io(buf->vma))
-                       vb2_dc_sgt_foreach_page(sgt, vb2_dc_put_dirty_page);
-
+               pages = frame_vector_pages(buf->vec);
+               /* sgt should exist only if vector contains pages... */
+               BUG_ON(IS_ERR(pages));
+               for (i = 0; i < frame_vector_count(buf->vec); i++)
+                       set_page_dirty_lock(pages[i]);
                sg_free_table(sgt);
                kfree(sgt);
        }
-       vb2_put_vma(buf->vma);
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
@@ -572,13 +474,10 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
 {
        struct vb2_dc_conf *conf = alloc_ctx;
        struct vb2_dc_buf *buf;
-       unsigned long start;
-       unsigned long end;
+       struct frame_vector *vec;
        unsigned long offset;
-       struct page **pages;
-       int n_pages;
+       int n_pages, i;
        int ret = 0;
-       struct vm_area_struct *vma;
        struct sg_table *sgt;
        unsigned long contig_size;
        unsigned long dma_align = dma_get_cache_alignment();
@@ -604,72 +503,43 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->dev = conf->dev;
        buf->dma_dir = dma_dir;
 
-       start = vaddr & PAGE_MASK;
        offset = vaddr & ~PAGE_MASK;
-       end = PAGE_ALIGN(vaddr + size);
-       n_pages = (end - start) >> PAGE_SHIFT;
-
-       pages = kmalloc(n_pages * sizeof(pages[0]), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               pr_err("failed to allocate pages table\n");
+       vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec)) {
+               ret = PTR_ERR(vec);
                goto fail_buf;
        }
+       buf->vec = vec;
+       n_pages = frame_vector_count(vec);
+       ret = frame_vector_to_pages(vec);
+       if (ret < 0) {
+               unsigned long *nums = frame_vector_pfns(vec);
 
-       /* current->mm->mmap_sem is taken by videobuf2 core */
-       vma = find_vma(current->mm, vaddr);
-       if (!vma) {
-               pr_err("no vma for address %lu\n", vaddr);
-               ret = -EFAULT;
-               goto fail_pages;
-       }
-
-       if (vma->vm_end < vaddr + size) {
-               pr_err("vma at %lu is too small for %lu bytes\n", vaddr, size);
-               ret = -EFAULT;
-               goto fail_pages;
-       }
-
-       buf->vma = vb2_get_vma(vma);
-       if (!buf->vma) {
-               pr_err("failed to copy vma\n");
-               ret = -ENOMEM;
-               goto fail_pages;
-       }
-
-       /* extract page list from userspace mapping */
-       ret = vb2_dc_get_user_pages(start, pages, n_pages, vma, dma_dir);
-       if (ret) {
-               unsigned long pfn;
-               if (vb2_dc_get_user_pfn(start, n_pages, vma, &pfn) == 0) {
-                       buf->dma_addr = vb2_dc_pfn_to_dma(buf->dev, pfn);
-                       buf->size = size;
-                       kfree(pages);
-                       return buf;
-               }
-
-               pr_err("failed to get user pages\n");
-               goto fail_vma;
+               /*
+                * Failed to convert to pages... Check the memory is physically
+                * contiguous and use direct mapping
+                */
+               for (i = 1; i < n_pages; i++)
+                       if (nums[i-1] + 1 != nums[i])
+                               goto fail_pfnvec;
+               buf->dma_addr = vb2_dc_pfn_to_dma(buf->dev, nums[0]);
+               goto out;
        }
 
        sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
        if (!sgt) {
                pr_err("failed to allocate sg table\n");
                ret = -ENOMEM;
-               goto fail_get_user_pages;
+               goto fail_pfnvec;
        }
 
-       ret = sg_alloc_table_from_pages(sgt, pages, n_pages,
+       ret = sg_alloc_table_from_pages(sgt, frame_vector_pages(vec), n_pages,
                offset, size, GFP_KERNEL);
        if (ret) {
                pr_err("failed to initialize sg table\n");
                goto fail_sgt;
        }
 
-       /* pages are no longer needed */
-       kfree(pages);
-       pages = NULL;
-
        /*
         * No need to sync to the device, this will happen later when the
         * prepare() memop is called.
@@ -691,8 +561,9 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        }
 
        buf->dma_addr = sg_dma_address(sgt->sgl);
-       buf->size = size;
        buf->dma_sgt = sgt;
+out:
+       buf->size = size;
 
        return buf;
 
@@ -701,23 +572,13 @@ fail_map_sg:
                           buf->dma_dir, &attrs);
 
 fail_sgt_init:
-       if (!vma_is_io(buf->vma))
-               vb2_dc_sgt_foreach_page(sgt, put_page);
        sg_free_table(sgt);
 
 fail_sgt:
        kfree(sgt);
 
-fail_get_user_pages:
-       if (pages && !vma_is_io(buf->vma))
-               while (n_pages)
-                       put_page(pages[--n_pages]);
-
-fail_vma:
-       vb2_put_vma(buf->vma);
-
-fail_pages:
-       kfree(pages); /* kfree is NULL-proof */
+fail_pfnvec:
+       vb2_destroy_framevec(vec);
 
 fail_buf:
        kfree(buf);
index 7289b81..be7bd65 100644 (file)
@@ -38,6 +38,7 @@ struct vb2_dma_sg_buf {
        struct device                   *dev;
        void                            *vaddr;
        struct page                     **pages;
+       struct frame_vector             *vec;
        int                             offset;
        enum dma_data_direction         dma_dir;
        struct sg_table                 sg_table;
@@ -51,7 +52,6 @@ struct vb2_dma_sg_buf {
        unsigned int                    num_pages;
        atomic_t                        refcount;
        struct vb2_vmarea_handler       handler;
-       struct vm_area_struct           *vma;
 
        struct dma_buf_attachment       *db_attach;
 };
@@ -225,25 +225,17 @@ static void vb2_dma_sg_finish(void *buf_priv)
        dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
 }
 
-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
 static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
                                    unsigned long size,
                                    enum dma_data_direction dma_dir)
 {
        struct vb2_dma_sg_conf *conf = alloc_ctx;
        struct vb2_dma_sg_buf *buf;
-       unsigned long first, last;
-       int num_pages_from_user;
-       struct vm_area_struct *vma;
        struct sg_table *sgt;
        DEFINE_DMA_ATTRS(attrs);
+       struct frame_vector *vec;
 
        dma_set_attr(DMA_ATTR_SKIP_CPU_SYNC, &attrs);
-
        buf = kzalloc(sizeof *buf, GFP_KERNEL);
        if (!buf)
                return NULL;
@@ -254,61 +246,19 @@ static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->offset = vaddr & ~PAGE_MASK;
        buf->size = size;
        buf->dma_sgt = &buf->sg_table;
+       vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec))
+               goto userptr_fail_pfnvec;
+       buf->vec = vec;
 
-       first = (vaddr           & PAGE_MASK) >> PAGE_SHIFT;
-       last  = ((vaddr + size - 1) & PAGE_MASK) >> PAGE_SHIFT;
-       buf->num_pages = last - first + 1;
-
-       buf->pages = kzalloc(buf->num_pages * sizeof(struct page *),
-                            GFP_KERNEL);
-       if (!buf->pages)
-               goto userptr_fail_alloc_pages;
-
-       vma = find_vma(current->mm, vaddr);
-       if (!vma) {
-               dprintk(1, "no vma for address %lu\n", vaddr);
-               goto userptr_fail_find_vma;
-       }
-
-       if (vma->vm_end < vaddr + size) {
-               dprintk(1, "vma at %lu is too small for %lu bytes\n",
-                       vaddr, size);
-               goto userptr_fail_find_vma;
-       }
-
-       buf->vma = vb2_get_vma(vma);
-       if (!buf->vma) {
-               dprintk(1, "failed to copy vma\n");
-               goto userptr_fail_find_vma;
-       }
-
-       if (vma_is_io(buf->vma)) {
-               for (num_pages_from_user = 0;
-                    num_pages_from_user < buf->num_pages;
-                    ++num_pages_from_user, vaddr += PAGE_SIZE) {
-                       unsigned long pfn;
-
-                       if (follow_pfn(vma, vaddr, &pfn)) {
-                               dprintk(1, "no page for address %lu\n", vaddr);
-                               break;
-                       }
-                       buf->pages[num_pages_from_user] = pfn_to_page(pfn);
-               }
-       } else
-               num_pages_from_user = get_user_pages(current, current->mm,
-                                            vaddr & PAGE_MASK,
-                                            buf->num_pages,
-                                            buf->dma_dir == DMA_FROM_DEVICE,
-                                            1, /* force */
-                                            buf->pages,
-                                            NULL);
-
-       if (num_pages_from_user != buf->num_pages)
-               goto userptr_fail_get_user_pages;
+       buf->pages = frame_vector_pages(vec);
+       if (IS_ERR(buf->pages))
+               goto userptr_fail_sgtable;
+       buf->num_pages = frame_vector_count(vec);
 
        if (sg_alloc_table_from_pages(buf->dma_sgt, buf->pages,
                        buf->num_pages, buf->offset, size, 0))
-               goto userptr_fail_alloc_table_from_pages;
+               goto userptr_fail_sgtable;
 
        sgt = &buf->sg_table;
        /*
@@ -324,17 +274,9 @@ static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
 
 userptr_fail_map:
        sg_free_table(&buf->sg_table);
-userptr_fail_alloc_table_from_pages:
-userptr_fail_get_user_pages:
-       dprintk(1, "get_user_pages requested/got: %d/%d]\n",
-               buf->num_pages, num_pages_from_user);
-       if (!vma_is_io(buf->vma))
-               while (--num_pages_from_user >= 0)
-                       put_page(buf->pages[num_pages_from_user]);
-       vb2_put_vma(buf->vma);
-userptr_fail_find_vma:
-       kfree(buf->pages);
-userptr_fail_alloc_pages:
+userptr_fail_sgtable:
+       vb2_destroy_framevec(vec);
+userptr_fail_pfnvec:
        kfree(buf);
        return NULL;
 }
@@ -362,11 +304,8 @@ static void vb2_dma_sg_put_userptr(void *buf_priv)
        while (--i >= 0) {
                if (buf->dma_dir == DMA_FROM_DEVICE)
                        set_page_dirty_lock(buf->pages[i]);
-               if (!vma_is_io(buf->vma))
-                       put_page(buf->pages[i]);
        }
-       kfree(buf->pages);
-       vb2_put_vma(buf->vma);
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
index 0d49b79..48c6a49 100644 (file)
 #include <media/videobuf2-memops.h>
 
 /**
- * vb2_get_vma() - acquire and lock the virtual memory area
- * @vma:       given virtual memory area
+ * vb2_create_framevec() - map virtual addresses to pfns
+ * @start:     Virtual user address where we start mapping
+ * @length:    Length of a range to map
+ * @write:     Should we map for writing into the area
  *
- * This function attempts to acquire an area mapped in the userspace for
- * the duration of a hardware operation. The area is "locked" by performing
- * the same set of operation that are done when process calls fork() and
- * memory areas are duplicated.
- *
- * Returns a copy of a virtual memory region on success or NULL.
- */
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma)
-{
-       struct vm_area_struct *vma_copy;
-
-       vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
-       if (vma_copy == NULL)
-               return NULL;
-
-       if (vma->vm_ops && vma->vm_ops->open)
-               vma->vm_ops->open(vma);
-
-       if (vma->vm_file)
-               get_file(vma->vm_file);
-
-       memcpy(vma_copy, vma, sizeof(*vma));
-
-       vma_copy->vm_mm = NULL;
-       vma_copy->vm_next = NULL;
-       vma_copy->vm_prev = NULL;
-
-       return vma_copy;
-}
-EXPORT_SYMBOL_GPL(vb2_get_vma);
-
-/**
- * vb2_put_userptr() - release a userspace virtual memory area
- * @vma:       virtual memory region associated with the area to be released
- *
- * This function releases the previously acquired memory area after a hardware
- * operation.
+ * This function allocates and fills in a vector with pfns corresponding to
+ * virtual address range passed in arguments. If pfns have corresponding pages,
+ * page references are also grabbed to pin pages in memory. The function
+ * returns pointer to the vector on success and error pointer in case of
+ * failure. Returned vector needs to be freed via vb2_destroy_pfnvec().
  */
-void vb2_put_vma(struct vm_area_struct *vma)
+struct frame_vector *vb2_create_framevec(unsigned long start,
+                                        unsigned long length,
+                                        bool write)
 {
-       if (!vma)
-               return;
-
-       if (vma->vm_ops && vma->vm_ops->close)
-               vma->vm_ops->close(vma);
-
-       if (vma->vm_file)
-               fput(vma->vm_file);
-
-       kfree(vma);
+       int ret;
+       unsigned long first, last;
+       unsigned long nr;
+       struct frame_vector *vec;
+
+       first = start >> PAGE_SHIFT;
+       last = (start + length - 1) >> PAGE_SHIFT;
+       nr = last - first + 1;
+       vec = frame_vector_create(nr);
+       if (!vec)
+               return ERR_PTR(-ENOMEM);
+       ret = get_vaddr_frames(start, nr, write, 1, vec);
+       if (ret < 0)
+               goto out_destroy;
+       /* We accept only complete set of PFNs */
+       if (ret != nr) {
+               ret = -EFAULT;
+               goto out_release;
+       }
+       return vec;
+out_release:
+       put_vaddr_frames(vec);
+out_destroy:
+       frame_vector_destroy(vec);
+       return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(vb2_put_vma);
+EXPORT_SYMBOL(vb2_create_framevec);
 
 /**
- * vb2_get_contig_userptr() - lock physically contiguous userspace mapped memory
- * @vaddr:     starting virtual address of the area to be verified
- * @size:      size of the area
- * @res_paddr: will return physical address for the given vaddr
- * @res_vma:   will return locked copy of struct vm_area for the given area
- *
- * This function will go through memory area of size @size mapped at @vaddr and
- * verify that the underlying physical pages are contiguous. If they are
- * contiguous the virtual memory area is locked and a @res_vma is filled with
- * the copy and @res_pa set to the physical address of the buffer.
+ * vb2_destroy_framevec() - release vector of mapped pfns
+ * @vec:       vector of pfns / pages to release
  *
- * Returns 0 on success.
+ * This releases references to all pages in the vector @vec (if corresponding
+ * pfns are backed by pages) and frees the passed vector.
  */
-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-                          struct vm_area_struct **res_vma, dma_addr_t *res_pa)
+void vb2_destroy_framevec(struct frame_vector *vec)
 {
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long offset, start, end;
-       unsigned long this_pfn, prev_pfn;
-       dma_addr_t pa = 0;
-
-       start = vaddr;
-       offset = start & ~PAGE_MASK;
-       end = start + size;
-
-       vma = find_vma(mm, start);
-
-       if (vma == NULL || vma->vm_end < end)
-               return -EFAULT;
-
-       for (prev_pfn = 0; start < end; start += PAGE_SIZE) {
-               int ret = follow_pfn(vma, start, &this_pfn);
-               if (ret)
-                       return ret;
-
-               if (prev_pfn == 0)
-                       pa = this_pfn << PAGE_SHIFT;
-               else if (this_pfn != prev_pfn + 1)
-                       return -EFAULT;
-
-               prev_pfn = this_pfn;
-       }
-
-       /*
-        * Memory is contiguous, lock vma and return to the caller
-        */
-       *res_vma = vb2_get_vma(vma);
-       if (*res_vma == NULL)
-               return -ENOMEM;
-
-       *res_pa = pa + offset;
-       return 0;
+       put_vaddr_frames(vec);
+       frame_vector_destroy(vec);
 }
-EXPORT_SYMBOL_GPL(vb2_get_contig_userptr);
+EXPORT_SYMBOL(vb2_destroy_framevec);
 
 /**
  * vb2_common_vm_open() - increase refcount of the vma
index 2fe4c27..ecb8f0c 100644 (file)
 
 struct vb2_vmalloc_buf {
        void                            *vaddr;
-       struct page                     **pages;
-       struct vm_area_struct           *vma;
+       struct frame_vector             *vec;
        enum dma_data_direction         dma_dir;
        unsigned long                   size;
-       unsigned int                    n_pages;
        atomic_t                        refcount;
        struct vb2_vmarea_handler       handler;
        struct dma_buf                  *dbuf;
@@ -76,10 +74,8 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
                                     enum dma_data_direction dma_dir)
 {
        struct vb2_vmalloc_buf *buf;
-       unsigned long first, last;
-       int n_pages, offset;
-       struct vm_area_struct *vma;
-       dma_addr_t physp;
+       struct frame_vector *vec;
+       int n_pages, offset, i;
 
        buf = kzalloc(sizeof(*buf), GFP_KERNEL);
        if (!buf)
@@ -88,51 +84,36 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->dma_dir = dma_dir;
        offset = vaddr & ~PAGE_MASK;
        buf->size = size;
-
-
-       vma = find_vma(current->mm, vaddr);
-       if (vma && (vma->vm_flags & VM_PFNMAP) && (vma->vm_pgoff)) {
-               if (vb2_get_contig_userptr(vaddr, size, &vma, &physp))
-                       goto fail_pages_array_alloc;
-               buf->vma = vma;
-               buf->vaddr = (__force void *)ioremap_nocache(physp, size);
-               if (!buf->vaddr)
-                       goto fail_pages_array_alloc;
+       vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec))
+               goto fail_pfnvec_create;
+       buf->vec = vec;
+       n_pages = frame_vector_count(vec);
+       if (frame_vector_to_pages(vec) < 0) {
+               unsigned long *nums = frame_vector_pfns(vec);
+
+               /*
+                * We cannot get page pointers for these pfns. Check memory is
+                * physically contiguous and use direct mapping.
+                */
+               for (i = 1; i < n_pages; i++)
+                       if (nums[i-1] + 1 != nums[i])
+                               goto fail_map;
+               buf->vaddr = (__force void *)
+                               ioremap_nocache(nums[0] << PAGE_SHIFT, size);
        } else {
-               first = vaddr >> PAGE_SHIFT;
-               last  = (vaddr + size - 1) >> PAGE_SHIFT;
-               buf->n_pages = last - first + 1;
-               buf->pages = kzalloc(buf->n_pages * sizeof(struct page *),
-                                    GFP_KERNEL);
-               if (!buf->pages)
-                       goto fail_pages_array_alloc;
-
-               /* current->mm->mmap_sem is taken by videobuf2 core */
-               n_pages = get_user_pages(current, current->mm,
-                                        vaddr & PAGE_MASK, buf->n_pages,
-                                        dma_dir == DMA_FROM_DEVICE,
-                                        1, /* force */
-                                        buf->pages, NULL);
-               if (n_pages != buf->n_pages)
-                       goto fail_get_user_pages;
-
-               buf->vaddr = vm_map_ram(buf->pages, buf->n_pages, -1,
+               buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
                                        PAGE_KERNEL);
-               if (!buf->vaddr)
-                       goto fail_get_user_pages;
        }
 
+       if (!buf->vaddr)
+               goto fail_map;
        buf->vaddr += offset;
        return buf;
 
-fail_get_user_pages:
-       pr_debug("get_user_pages requested/got: %d/%d]\n", n_pages,
-                buf->n_pages);
-       while (--n_pages >= 0)
-               put_page(buf->pages[n_pages]);
-       kfree(buf->pages);
-
-fail_pages_array_alloc:
+fail_map:
+       vb2_destroy_framevec(vec);
+fail_pfnvec_create:
        kfree(buf);
 
        return NULL;
@@ -143,20 +124,21 @@ static void vb2_vmalloc_put_userptr(void *buf_priv)
        struct vb2_vmalloc_buf *buf = buf_priv;
        unsigned long vaddr = (unsigned long)buf->vaddr & PAGE_MASK;
        unsigned int i;
+       struct page **pages;
+       unsigned int n_pages;
 
-       if (buf->pages) {
+       if (!buf->vec->is_pfns) {
+               n_pages = frame_vector_count(buf->vec);
+               pages = frame_vector_pages(buf->vec);
                if (vaddr)
-                       vm_unmap_ram((void *)vaddr, buf->n_pages);
-               for (i = 0; i < buf->n_pages; ++i) {
-                       if (buf->dma_dir == DMA_FROM_DEVICE)
-                               set_page_dirty_lock(buf->pages[i]);
-                       put_page(buf->pages[i]);
-               }
-               kfree(buf->pages);
+                       vm_unmap_ram((void *)vaddr, n_pages);
+               if (buf->dma_dir == DMA_FROM_DEVICE)
+                       for (i = 0; i < n_pages; i++)
+                               set_page_dirty_lock(pages[i]);
        } else {
-               vb2_put_vma(buf->vma);
                iounmap((__force void __iomem *)buf->vaddr);
        }
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
index c49d244..70e62d6 100644 (file)
@@ -418,7 +418,7 @@ static void genwqe_vma_close(struct vm_area_struct *vma)
        kfree(dma_map);
 }
 
-static struct vm_operations_struct genwqe_vma_ops = {
+static const struct vm_operations_struct genwqe_vma_ops = {
        .open   = genwqe_vma_open,
        .close  = genwqe_vma_close,
 };
index 2bc0f50..b346638 100644 (file)
@@ -364,6 +364,7 @@ int mei_watchdog_register(struct mei_device *dev)
 
        int ret;
 
+       amt_wd_dev.parent = dev->dev;
        /* unlock to perserve correct locking order */
        mutex_unlock(&dev->device_lock);
        ret = watchdog_register_device(&amt_wd_dev);
index 289e204..9d56515 100644 (file)
@@ -418,7 +418,7 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
        core_writel(priv, port, CORE_FAST_AGE_PORT);
 
        reg = core_readl(priv, CORE_FAST_AGE_CTRL);
-       reg |= EN_AGE_PORT | FAST_AGE_STR_DONE;
+       reg |= EN_AGE_PORT | EN_AGE_DYNAMIC | FAST_AGE_STR_DONE;
        core_writel(priv, reg, CORE_FAST_AGE_CTRL);
 
        do {
@@ -432,6 +432,8 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
        if (!timeout)
                return -ETIMEDOUT;
 
+       core_writel(priv, 0, CORE_FAST_AGE_CTRL);
+
        return 0;
 }
 
@@ -507,7 +509,7 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
        u32 reg;
 
        reg = core_readl(priv, CORE_G_PCTL_PORT(port));
-       cur_hw_state = reg >> G_MISTP_STATE_SHIFT;
+       cur_hw_state = reg & (G_MISTP_STATE_MASK << G_MISTP_STATE_SHIFT);
 
        switch (state) {
        case BR_STATE_DISABLED:
@@ -531,10 +533,12 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
        }
 
        /* Fast-age ARL entries if we are moving a port from Learning or
-        * Forwarding state to Disabled, Blocking or Listening state
+        * Forwarding (cur_hw_state) state to Disabled, Blocking or Listening
+        * state (hw_state)
         */
        if (cur_hw_state != hw_state) {
-               if (cur_hw_state & 4 && !(hw_state & 4)) {
+               if (cur_hw_state >= G_MISTP_LEARN_STATE &&
+                   hw_state <= G_MISTP_LISTEN_STATE) {
                        ret = bcm_sf2_sw_fast_age_port(ds, port);
                        if (ret) {
                                pr_err("%s: fast-ageing failed\n", __func__);
index 22e2ebf..789d7b7 100644 (file)
@@ -112,8 +112,8 @@ static inline u64 name##_readq(struct bcm_sf2_priv *priv, u32 off)  \
        spin_unlock(&priv->indir_lock);                                 \
        return (u64)indir << 32 | dir;                                  \
 }                                                                      \
-static inline void name##_writeq(struct bcm_sf2_priv *priv, u32 off,   \
-                                                       u64 val)        \
+static inline void name##_writeq(struct bcm_sf2_priv *priv, u64 val,   \
+                                                       u32 off)        \
 {                                                                      \
        spin_lock(&priv->indir_lock);                                   \
        reg_writel(priv, upper_32_bits(val), REG_DIR_DATA_WRITE);       \
index d54b740..c2daaf0 100644 (file)
@@ -117,6 +117,11 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
        .port_join_bridge       = mv88e6xxx_join_bridge,
        .port_leave_bridge      = mv88e6xxx_leave_bridge,
        .port_stp_update        = mv88e6xxx_port_stp_update,
+       .port_pvid_get          = mv88e6xxx_port_pvid_get,
+       .port_pvid_set          = mv88e6xxx_port_pvid_set,
+       .port_vlan_add          = mv88e6xxx_port_vlan_add,
+       .port_vlan_del          = mv88e6xxx_port_vlan_del,
+       .vlan_getnext           = mv88e6xxx_vlan_getnext,
        .port_fdb_add           = mv88e6xxx_port_fdb_add,
        .port_fdb_del           = mv88e6xxx_port_fdb_del,
        .port_fdb_getnext       = mv88e6xxx_port_fdb_getnext,
index da48e66..fe64482 100644 (file)
@@ -511,8 +511,7 @@ static int tse_poll(struct napi_struct *napi, int budget)
 
        if (rxcomplete < budget) {
 
-               napi_gro_flush(napi, false);
-               __napi_complete(napi);
+               napi_complete(napi);
 
                netdev_dbg(priv->dev,
                           "NAPI Complete, did %d packets with budget %d\n",
@@ -1518,6 +1517,7 @@ static int altera_tse_probe(struct platform_device *pdev)
        spin_lock_init(&priv->tx_lock);
        spin_lock_init(&priv->rxdma_irq_lock);
 
+       netif_carrier_off(ndev);
        ret = register_netdev(ndev);
        if (ret) {
                dev_err(&pdev->dev, "failed to register TSE net device\n");
index 0660dee..f683d97 100644 (file)
@@ -818,10 +818,9 @@ static int setup_glist(struct lio *lio)
        INIT_LIST_HEAD(&lio->glist);
 
        for (i = 0; i < lio->tx_qsize; i++) {
-               g = kmalloc(sizeof(*g), GFP_KERNEL);
+               g = kzalloc(sizeof(*g), GFP_KERNEL);
                if (!g)
                        break;
-               memset(g, 0, sizeof(struct octnic_gather));
 
                g->sg_size =
                        ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
index eb22d58..f5dcde2 100644 (file)
@@ -4568,28 +4568,23 @@ static void free_some_resources(struct adapter *adapter)
 
 static int get_chip_type(struct pci_dev *pdev, u32 pl_rev)
 {
-       int ver, chip;
        u16 device_id;
 
        /* Retrieve adapter's device ID */
        pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);
-       ver = device_id >> 12;
-       switch (ver) {
+
+       switch (device_id >> 12) {
        case CHELSIO_T4:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
        case CHELSIO_T5:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev);
        case CHELSIO_T6:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev);
        default:
                dev_err(&pdev->dev, "Device %d is not supported\n",
                        device_id);
-               return -EINVAL;
        }
-       return chip;
+       return -EINVAL;
 }
 
 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -4724,8 +4719,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                        err = -ENOMEM;
                        goto out_free_adapter;
                }
-               t4_write_reg(adapter, SGE_STAT_CFG_A,
-                            STATSOURCE_T5_V(7) | STATMODE_V(0));
        }
 
        setup_memwin(adapter);
@@ -4737,6 +4730,11 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                goto out_unmap_bar;
 
+       /* configure SGE_STAT_CFG_A to read WC stats */
+       if (!is_t4(adapter->params.chip))
+               t4_write_reg(adapter, SGE_STAT_CFG_A,
+                            STATSOURCE_T5_V(7) | STATMODE_V(0));
+
        for_each_port(adapter, i) {
                struct net_device *netdev;
 
index 78f446c..9162746 100644 (file)
@@ -807,7 +807,7 @@ static inline unsigned int calc_tx_flits(const struct sk_buff *skb)
         * message or, if we're doing a Large Send Offload, an LSO CPL message
         * with an embedded TX Packet Write CPL message.
         */
-       flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4;
+       flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
        if (skb_shinfo(skb)->gso_size)
                flits += (sizeof(struct fw_eth_tx_pkt_wr) +
                          sizeof(struct cpl_tx_pkt_lso_core) +
index ab46746..a32de30 100644 (file)
@@ -762,8 +762,6 @@ enum fw_ldst_func_mod_index {
 
 struct fw_ldst_cmd {
        __be32 op_to_addrspace;
-#define FW_LDST_CMD_ADDRSPACE_S                0
-#define FW_LDST_CMD_ADDRSPACE_V(x)     ((x) << FW_LDST_CMD_ADDRSPACE_S)
        __be32 cycles_to_len16;
        union fw_ldst {
                struct fw_ldst_addrval {
@@ -788,6 +786,13 @@ struct fw_ldst_cmd {
                        __be16 vctl;
                        __be16 rval;
                } mdio;
+               struct fw_ldst_cim_rq {
+                       u8 req_first64[8];
+                       u8 req_second64[8];
+                       u8 resp_first64[8];
+                       u8 resp_second64[8];
+                       __be32 r3[2];
+               } cim_rq;
                union fw_ldst_mps {
                        struct fw_ldst_mps_rplc {
                                __be16 fid_idx;
@@ -828,9 +833,33 @@ struct fw_ldst_cmd {
                        __be16 nset_pkd;
                        __be32 data[12];
                } pcie;
+               struct fw_ldst_i2c_deprecated {
+                       u8 pid_pkd;
+                       u8 base;
+                       u8 boffset;
+                       u8 data;
+                       __be32 r9;
+               } i2c_deprecated;
+               struct fw_ldst_i2c {
+                       u8 pid;
+                       u8 did;
+                       u8 boffset;
+                       u8 blen;
+                       __be32 r9;
+                       __u8   data[48];
+               } i2c;
+               struct fw_ldst_le {
+                       __be32 index;
+                       __be32 r9;
+                       u8 val[33];
+                       u8 r11[7];
+               } le;
        } u;
 };
 
+#define FW_LDST_CMD_ADDRSPACE_S                0
+#define FW_LDST_CMD_ADDRSPACE_V(x)     ((x) << FW_LDST_CMD_ADDRSPACE_S)
+
 #define FW_LDST_CMD_MSG_S       31
 #define FW_LDST_CMD_MSG_V(x)   ((x) << FW_LDST_CMD_MSG_S)
 
index 92bafa7..c4b262c 100644 (file)
@@ -36,8 +36,8 @@
 #define __T4FW_VERSION_H__
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x0D
-#define T4FW_VERSION_MICRO 0x20
+#define T4FW_VERSION_MINOR 0x0E
+#define T4FW_VERSION_MICRO 0x04
 #define T4FW_VERSION_BUILD 0x00
 
 #define T4FW_MIN_VERSION_MAJOR 0x01
@@ -45,8 +45,8 @@
 #define T4FW_MIN_VERSION_MICRO 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x0D
-#define T5FW_VERSION_MICRO 0x20
+#define T5FW_VERSION_MINOR 0x0E
+#define T5FW_VERSION_MICRO 0x04
 #define T5FW_VERSION_BUILD 0x00
 
 #define T5FW_MIN_VERSION_MAJOR 0x00
@@ -54,8 +54,8 @@
 #define T5FW_MIN_VERSION_MICRO 0x00
 
 #define T6FW_VERSION_MAJOR 0x01
-#define T6FW_VERSION_MINOR 0x0D
-#define T6FW_VERSION_MICRO 0x2D
+#define T6FW_VERSION_MINOR 0x0E
+#define T6FW_VERSION_MICRO 0x04
 #define T6FW_VERSION_BUILD 0x00
 
 #define T6FW_MIN_VERSION_MAJOR 0x00
index c0a7813..cf94b72 100644 (file)
@@ -1226,7 +1226,7 @@ static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
        if (int_status & ISR_PRS)
                dm9000_rx(dev);
 
-       /* Trnasmit Interrupt check */
+       /* Transmit Interrupt check */
        if (int_status & ISR_PTS)
                dm9000_tx_done(dev, db);
 
index 3be1fbd..eb32391 100644 (file)
@@ -1968,7 +1968,7 @@ static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value)
                        memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN);
        }
 
-       status = be_mcc_notify(adapter);
+       status = be_mcc_notify_wait(adapter);
 err:
        spin_unlock_bh(&adapter->mcc_lock);
        return status;
index 442410c..a2c96fd 100644 (file)
@@ -1132,10 +1132,6 @@ static int ethoc_probe(struct platform_device *pdev)
                memcpy(netdev->dev_addr, pdata->hwaddr, IFHWADDRLEN);
                priv->phy_id = pdata->phy_id;
        } else {
-               priv->phy_id = -1;
-
-#ifdef CONFIG_OF
-               {
                const uint8_t *mac;
 
                mac = of_get_property(pdev->dev.of_node,
@@ -1143,8 +1139,7 @@ static int ethoc_probe(struct platform_device *pdev)
                                      NULL);
                if (mac)
                        memcpy(netdev->dev_addr, mac, IFHWADDRLEN);
-               }
-#endif
+               priv->phy_id = -1;
        }
 
        /* Check that the given MAC address is valid. If it isn't, read the
index 91925e3..dd4ca39 100644 (file)
@@ -1816,11 +1816,13 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
        struct fec_enet_private *fep = bus->priv;
        struct device *dev = &fep->pdev->dev;
        unsigned long time_left;
-       int ret = 0;
+       int ret;
 
        ret = pm_runtime_get_sync(dev);
        if (ret < 0)
                return ret;
+       else
+               ret = 0;
 
        fep->mii_timeout = 0;
        reinit_completion(&fep->mdio_done);
@@ -3029,6 +3031,14 @@ fec_set_mac_address(struct net_device *ndev, void *p)
                memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len);
        }
 
+       /* Add netif status check here to avoid system hang in below case:
+        * ifconfig ethx down; ifconfig ethx hw ether xx:xx:xx:xx:xx:xx;
+        * After ethx down, fec all clocks are gated off and then register
+        * access causes system hang.
+        */
+       if (!netif_running(ndev))
+               return 0;
+
        writel(ndev->dev_addr[3] | (ndev->dev_addr[2] << 8) |
                (ndev->dev_addr[1] << 16) | (ndev->dev_addr[0] << 24),
                fep->hwp + FEC_ADDR_LOW);
index 6e9a792..060dd39 100644 (file)
@@ -583,7 +583,7 @@ jme_setup_tx_resources(struct jme_adapter *jme)
        atomic_set(&txring->next_to_clean, 0);
        atomic_set(&txring->nr_free, jme->tx_ring_size);
 
-       txring->bufinf          = kmalloc(sizeof(struct jme_buffer_info) *
+       txring->bufinf          = kzalloc(sizeof(struct jme_buffer_info) *
                                        jme->tx_ring_size, GFP_ATOMIC);
        if (unlikely(!(txring->bufinf)))
                goto err_free_txring;
@@ -592,8 +592,6 @@ jme_setup_tx_resources(struct jme_adapter *jme)
         * Initialize Transmit Descriptors
         */
        memset(txring->alloc, 0, TX_RING_ALLOC_SIZE(jme->tx_ring_size));
-       memset(txring->bufinf, 0,
-               sizeof(struct jme_buffer_info) * jme->tx_ring_size);
 
        return 0;
 
@@ -845,7 +843,7 @@ jme_setup_rx_resources(struct jme_adapter *jme)
        rxring->next_to_use     = 0;
        atomic_set(&rxring->next_to_clean, 0);
 
-       rxring->bufinf          = kmalloc(sizeof(struct jme_buffer_info) *
+       rxring->bufinf          = kzalloc(sizeof(struct jme_buffer_info) *
                                        jme->rx_ring_size, GFP_ATOMIC);
        if (unlikely(!(rxring->bufinf)))
                goto err_free_rxring;
@@ -853,8 +851,6 @@ jme_setup_rx_resources(struct jme_adapter *jme)
        /*
         * Initiallize Receive Descriptors
         */
-       memset(rxring->bufinf, 0,
-               sizeof(struct jme_buffer_info) * jme->rx_ring_size);
        for (i = 0 ; i < jme->rx_ring_size ; ++i) {
                if (unlikely(jme_make_new_rx_buf(jme, i))) {
                        jme_free_rx_resources(jme);
index d52639b..960169e 100644 (file)
@@ -1859,14 +1859,11 @@ oom:
                return;
        }
 
-       mc_spec = kmalloc(0x200, GFP_ATOMIC);
+       mc_spec = kzalloc(0x200, GFP_ATOMIC);
        if (mc_spec == NULL)
                goto oom;
        mc_other = mc_spec + (0x100 >> 2);
 
-       memset(mc_spec, 0, 0x100);
-       memset(mc_other, 0, 0x100);
-
        netdev_for_each_mc_addr(ha, dev) {
                u8 *a = ha->addr;
                u32 *table;
index 5ab3adf..9f0bdd9 100644 (file)
@@ -918,8 +918,6 @@ int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx,
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        temp = adapter->ahw->fw_hal_version << 29;
                        mbx->req.arg[0] = (type | (mbx->req.num << 16) | temp);
                        mbx->cmd_op = type;
index 6e6f18f..a5f422f 100644 (file)
@@ -73,8 +73,6 @@ int qlcnic_82xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx,
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        mbx->req.arg[0] = type;
                        break;
                }
index 546cd5f..7327b72 100644 (file)
@@ -729,8 +729,6 @@ static int qlcnic_sriov_alloc_bc_mbx_args(struct qlcnic_cmd_args *mbx, u32 type)
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        mbx->req.arg[0] = (type | (mbx->req.num << 16) |
                                           (3 << 29));
                        mbx->rsp.arg[0] = (type & 0xffff) | mbx->rsp.num << 16;
index 24dcbe6..2b32e0c 100644 (file)
@@ -833,7 +833,8 @@ struct rtl8169_private {
        unsigned features;
 
        struct mii_if_info mii;
-       struct rtl8169_counters counters;
+       dma_addr_t counters_phys_addr;
+       struct rtl8169_counters *counters;
        struct rtl8169_tc_offsets tc_offset;
        u32 saved_wolopts;
        u32 opts1_mask;
@@ -2190,53 +2191,37 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset)
        }
 }
 
-static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev,
-                                                    dma_addr_t *paddr,
-                                                    u32 counter_cmd)
+DECLARE_RTL_COND(rtl_counters_cond)
 {
-       struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct device *d = &tp->pci_dev->dev;
-       struct rtl8169_counters *counters;
-       u32 cmd;
 
-       counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL);
-       if (counters) {
-               RTL_W32(CounterAddrHigh, (u64)*paddr >> 32);
-               cmd = (u64)*paddr & DMA_BIT_MASK(32);
-               RTL_W32(CounterAddrLow, cmd);
-               RTL_W32(CounterAddrLow, cmd | counter_cmd);
-       }
-       return counters;
+       return RTL_R32(CounterAddrLow) & (CounterReset | CounterDump);
 }
 
-static void rtl8169_unmap_counters (struct net_device *dev,
-                                   dma_addr_t paddr,
-                                   struct rtl8169_counters *counters)
+static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct device *d = &tp->pci_dev->dev;
+       dma_addr_t paddr = tp->counters_phys_addr;
+       u32 cmd;
+       bool ret;
 
-       RTL_W32(CounterAddrLow, 0);
-       RTL_W32(CounterAddrHigh, 0);
+       RTL_W32(CounterAddrHigh, (u64)paddr >> 32);
+       cmd = (u64)paddr & DMA_BIT_MASK(32);
+       RTL_W32(CounterAddrLow, cmd);
+       RTL_W32(CounterAddrLow, cmd | counter_cmd);
 
-       dma_free_coherent(d, sizeof(*counters), counters, paddr);
-}
+       ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000);
 
-DECLARE_RTL_COND(rtl_reset_counters_cond)
-{
-       void __iomem *ioaddr = tp->mmio_addr;
+       RTL_W32(CounterAddrLow, 0);
+       RTL_W32(CounterAddrHigh, 0);
 
-       return RTL_R32(CounterAddrLow) & CounterReset;
+       return ret;
 }
 
 static bool rtl8169_reset_counters(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
-       struct rtl8169_counters *counters;
-       dma_addr_t paddr;
-       bool ret = true;
 
        /*
         * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the
@@ -2245,32 +2230,13 @@ static bool rtl8169_reset_counters(struct net_device *dev)
        if (tp->mac_version < RTL_GIGA_MAC_VER_19)
                return true;
 
-       counters = rtl8169_map_counters(dev, &paddr, CounterReset);
-       if (!counters)
-               return false;
-
-       if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000))
-               ret = false;
-
-       rtl8169_unmap_counters(dev, paddr, counters);
-
-       return ret;
-}
-
-DECLARE_RTL_COND(rtl_counters_cond)
-{
-       void __iomem *ioaddr = tp->mmio_addr;
-
-       return RTL_R32(CounterAddrLow) & CounterDump;
+       return rtl8169_do_counters(dev, CounterReset);
 }
 
 static bool rtl8169_update_counters(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct rtl8169_counters *counters;
-       dma_addr_t paddr;
-       bool ret = true;
 
        /*
         * Some chips are unable to dump tally counters when the receiver
@@ -2279,23 +2245,13 @@ static bool rtl8169_update_counters(struct net_device *dev)
        if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
                return true;
 
-       counters = rtl8169_map_counters(dev, &paddr, CounterDump);
-       if (!counters)
-               return false;
-
-       if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000))
-               memcpy(&tp->counters, counters, sizeof(*counters));
-       else
-               ret = false;
-
-       rtl8169_unmap_counters(dev, paddr, counters);
-
-       return ret;
+       return rtl8169_do_counters(dev, CounterDump);
 }
 
 static bool rtl8169_init_counter_offsets(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
+       struct rtl8169_counters *counters = tp->counters;
        bool ret = false;
 
        /*
@@ -2323,9 +2279,9 @@ static bool rtl8169_init_counter_offsets(struct net_device *dev)
        if (rtl8169_update_counters(dev))
                ret = true;
 
-       tp->tc_offset.tx_errors = tp->counters.tx_errors;
-       tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision;
-       tp->tc_offset.tx_aborted = tp->counters.tx_aborted;
+       tp->tc_offset.tx_errors = counters->tx_errors;
+       tp->tc_offset.tx_multi_collision = counters->tx_multi_collision;
+       tp->tc_offset.tx_aborted = counters->tx_aborted;
        tp->tc_offset.inited = true;
 
        return ret;
@@ -2335,24 +2291,25 @@ static void rtl8169_get_ethtool_stats(struct net_device *dev,
                                      struct ethtool_stats *stats, u64 *data)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
+       struct rtl8169_counters *counters = tp->counters;
 
        ASSERT_RTNL();
 
        rtl8169_update_counters(dev);
 
-       data[0] = le64_to_cpu(tp->counters.tx_packets);
-       data[1] = le64_to_cpu(tp->counters.rx_packets);
-       data[2] = le64_to_cpu(tp->counters.tx_errors);
-       data[3] = le32_to_cpu(tp->counters.rx_errors);
-       data[4] = le16_to_cpu(tp->counters.rx_missed);
-       data[5] = le16_to_cpu(tp->counters.align_errors);
-       data[6] = le32_to_cpu(tp->counters.tx_one_collision);
-       data[7] = le32_to_cpu(tp->counters.tx_multi_collision);
-       data[8] = le64_to_cpu(tp->counters.rx_unicast);
-       data[9] = le64_to_cpu(tp->counters.rx_broadcast);
-       data[10] = le32_to_cpu(tp->counters.rx_multicast);
-       data[11] = le16_to_cpu(tp->counters.tx_aborted);
-       data[12] = le16_to_cpu(tp->counters.tx_underun);
+       data[0] = le64_to_cpu(counters->tx_packets);
+       data[1] = le64_to_cpu(counters->rx_packets);
+       data[2] = le64_to_cpu(counters->tx_errors);
+       data[3] = le32_to_cpu(counters->rx_errors);
+       data[4] = le16_to_cpu(counters->rx_missed);
+       data[5] = le16_to_cpu(counters->align_errors);
+       data[6] = le32_to_cpu(counters->tx_one_collision);
+       data[7] = le32_to_cpu(counters->tx_multi_collision);
+       data[8] = le64_to_cpu(counters->rx_unicast);
+       data[9] = le64_to_cpu(counters->rx_broadcast);
+       data[10] = le32_to_cpu(counters->rx_multicast);
+       data[11] = le16_to_cpu(counters->tx_aborted);
+       data[12] = le16_to_cpu(counters->tx_underun);
 }
 
 static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data)
@@ -7780,6 +7737,7 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
+       struct rtl8169_counters *counters = tp->counters;
        unsigned int start;
 
        if (netif_running(dev))
@@ -7816,11 +7774,11 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
         * Subtract values fetched during initalization.
         * See rtl8169_init_counter_offsets for a description why we do that.
         */
-       stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) -
+       stats->tx_errors = le64_to_cpu(counters->tx_errors) -
                le64_to_cpu(tp->tc_offset.tx_errors);
-       stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) -
+       stats->collisions = le32_to_cpu(counters->tx_multi_collision) -
                le32_to_cpu(tp->tc_offset.tx_multi_collision);
-       stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) -
+       stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) -
                le16_to_cpu(tp->tc_offset.tx_aborted);
 
        return stats;
@@ -8022,6 +7980,9 @@ static void rtl_remove_one(struct pci_dev *pdev)
 
        unregister_netdev(dev);
 
+       dma_free_coherent(&tp->pci_dev->dev, sizeof(*tp->counters),
+                         tp->counters, tp->counters_phys_addr);
+
        rtl_release_firmware(tp);
 
        if (pci_dev_run_wake(pdev))
@@ -8447,9 +8408,16 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        tp->rtl_fw = RTL_FIRMWARE_UNKNOWN;
 
+       tp->counters = dma_alloc_coherent (&pdev->dev, sizeof(*tp->counters),
+                                          &tp->counters_phys_addr, GFP_KERNEL);
+       if (!tp->counters) {
+               rc = -ENOMEM;
+               goto err_out_msi_4;
+       }
+
        rc = register_netdev(dev);
        if (rc < 0)
-               goto err_out_msi_4;
+               goto err_out_cnt_5;
 
        pci_set_drvdata(pdev, dev);
 
@@ -8483,6 +8451,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 out:
        return rc;
 
+err_out_cnt_5:
+       dma_free_coherent(&pdev->dev, sizeof(*tp->counters), tp->counters,
+                         tp->counters_phys_addr);
 err_out_msi_4:
        netif_napi_del(&tp->napi);
        rtl_disable_msi(pdev, tp);
index 864b476..925f2f8 100644 (file)
@@ -837,8 +837,11 @@ static int stmmac_init_phy(struct net_device *dev)
                                     interface);
        }
 
-       if (IS_ERR(phydev)) {
+       if (IS_ERR_OR_NULL(phydev)) {
                pr_err("%s: Could not attach to PHY\n", dev->name);
+               if (!phydev)
+                       return -ENODEV;
+
                return PTR_ERR(phydev);
        }
 
index a8f3151..8276ee5 100644 (file)
@@ -20,7 +20,7 @@ config SYNOPSYS_DWC_ETH_QOS
        select PHYLIB
        select CRC32
        select MII
-       depends on OF
+       depends on OF && HAS_DMA
        ---help---
          This driver supports the DWC Ethernet QoS from Synopsys
 
index d8757bf..a9acf71 100644 (file)
@@ -61,11 +61,21 @@ MODULE_VERSION(NTB_NETDEV_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
+/* Time in usecs for tx resource reaper */
+static unsigned int tx_time = 1;
+
+/* Number of descriptors to free before resuming tx */
+static unsigned int tx_start = 10;
+
+/* Number of descriptors still available before stop upper layer tx */
+static unsigned int tx_stop = 5;
+
 struct ntb_netdev {
        struct list_head list;
        struct pci_dev *pdev;
        struct net_device *ndev;
        struct ntb_transport_qp *qp;
+       struct timer_list tx_timer;
 };
 
 #define        NTB_TX_TIMEOUT_MS       1000
@@ -136,11 +146,42 @@ enqueue_again:
        }
 }
 
+static int __ntb_netdev_maybe_stop_tx(struct net_device *netdev,
+                                     struct ntb_transport_qp *qp, int size)
+{
+       struct ntb_netdev *dev = netdev_priv(netdev);
+
+       netif_stop_queue(netdev);
+       /* Make sure to see the latest value of ntb_transport_tx_free_entry()
+        * since the queue was last started.
+        */
+       smp_mb();
+
+       if (likely(ntb_transport_tx_free_entry(qp) < size)) {
+               mod_timer(&dev->tx_timer, jiffies + usecs_to_jiffies(tx_time));
+               return -EBUSY;
+       }
+
+       netif_start_queue(netdev);
+       return 0;
+}
+
+static int ntb_netdev_maybe_stop_tx(struct net_device *ndev,
+                                   struct ntb_transport_qp *qp, int size)
+{
+       if (netif_queue_stopped(ndev) ||
+           (ntb_transport_tx_free_entry(qp) >= size))
+               return 0;
+
+       return __ntb_netdev_maybe_stop_tx(ndev, qp, size);
+}
+
 static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
                                  void *data, int len)
 {
        struct net_device *ndev = qp_data;
        struct sk_buff *skb;
+       struct ntb_netdev *dev = netdev_priv(ndev);
 
        skb = data;
        if (!skb || !ndev)
@@ -155,6 +196,15 @@ static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
        }
 
        dev_kfree_skb(skb);
+
+       if (ntb_transport_tx_free_entry(dev->qp) >= tx_start) {
+               /* Make sure anybody stopping the queue after this sees the new
+                * value of ntb_transport_tx_free_entry()
+                */
+               smp_mb();
+               if (netif_queue_stopped(ndev))
+                       netif_wake_queue(ndev);
+       }
 }
 
 static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
@@ -163,10 +213,15 @@ static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
        struct ntb_netdev *dev = netdev_priv(ndev);
        int rc;
 
+       ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
        rc = ntb_transport_tx_enqueue(dev->qp, skb, skb->data, skb->len);
        if (rc)
                goto err;
 
+       /* check for next submit */
+       ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
        return NETDEV_TX_OK;
 
 err:
@@ -175,6 +230,23 @@ err:
        return NETDEV_TX_BUSY;
 }
 
+static void ntb_netdev_tx_timer(unsigned long data)
+{
+       struct net_device *ndev = (struct net_device *)data;
+       struct ntb_netdev *dev = netdev_priv(ndev);
+
+       if (ntb_transport_tx_free_entry(dev->qp) < tx_stop) {
+               mod_timer(&dev->tx_timer, jiffies + msecs_to_jiffies(tx_time));
+       } else {
+               /* Make sure anybody stopping the queue after this sees the new
+                * value of ntb_transport_tx_free_entry()
+                */
+               smp_mb();
+               if (netif_queue_stopped(ndev))
+                       netif_wake_queue(ndev);
+       }
+}
+
 static int ntb_netdev_open(struct net_device *ndev)
 {
        struct ntb_netdev *dev = netdev_priv(ndev);
@@ -197,8 +269,11 @@ static int ntb_netdev_open(struct net_device *ndev)
                }
        }
 
+       setup_timer(&dev->tx_timer, ntb_netdev_tx_timer, (unsigned long)ndev);
+
        netif_carrier_off(ndev);
        ntb_transport_link_up(dev->qp);
+       netif_start_queue(ndev);
 
        return 0;
 
@@ -219,6 +294,8 @@ static int ntb_netdev_close(struct net_device *ndev)
        while ((skb = ntb_transport_rx_remove(dev->qp, &len)))
                dev_kfree_skb(skb);
 
+       del_timer_sync(&dev->tx_timer);
+
        return 0;
 }
 
index c07030d..c5ad98a 100644 (file)
@@ -127,6 +127,11 @@ config DP83867_PHY
        ---help---
          Currently supports the DP83867 PHY.
 
+config MICROCHIP_PHY
+       tristate "Drivers for Microchip PHYs"
+       help
+         Supports the LAN88XX PHYs.
+
 config FIXED_PHY
        tristate "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs"
        depends on PHYLIB
index 9bb1033..87f079c 100644 (file)
@@ -37,3 +37,4 @@ obj-$(CONFIG_MDIO_BUS_MUX_MMIOREG) += mdio-mux-mmioreg.o
 obj-$(CONFIG_MDIO_SUN4I)       += mdio-sun4i.o
 obj-$(CONFIG_MDIO_MOXART)      += mdio-moxart.o
 obj-$(CONFIG_MDIO_BCM_UNIMAC)  += mdio-bcm-unimac.o
+obj-$(CONFIG_MICROCHIP_PHY)    += microchip.o
index 12c7eb2..fb1299c 100644 (file)
@@ -325,7 +325,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
        phy_addr = phy_fixed_addr++;
        spin_unlock(&phy_fixed_addr_lock);
 
-       ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio);
+       ret = fixed_phy_add(irq, phy_addr, status, link_gpio);
        if (ret < 0)
                return ERR_PTR(ret);
 
diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
new file mode 100644 (file)
index 0000000..c0a20eb
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mii.h>
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/microchipphy.h>
+
+#define DRIVER_AUTHOR  "WOOJUNG HUH <woojung.huh@microchip.com>"
+#define DRIVER_DESC    "Microchip LAN88XX PHY driver"
+
+struct lan88xx_priv {
+       int     chip_id;
+       int     chip_rev;
+       __u32   wolopts;
+};
+
+static int lan88xx_phy_config_intr(struct phy_device *phydev)
+{
+       int rc;
+
+       if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+               /* unmask all source and clear them before enable */
+               rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF);
+               rc = phy_read(phydev, LAN88XX_INT_STS);
+               rc = phy_write(phydev, LAN88XX_INT_MASK,
+                              LAN88XX_INT_MASK_MDINTPIN_EN_ |
+                              LAN88XX_INT_MASK_LINK_CHANGE_);
+       } else {
+               rc = phy_write(phydev, LAN88XX_INT_MASK, 0);
+       }
+
+       return rc < 0 ? rc : 0;
+}
+
+static int lan88xx_phy_ack_interrupt(struct phy_device *phydev)
+{
+       int rc = phy_read(phydev, LAN88XX_INT_STS);
+
+       return rc < 0 ? rc : 0;
+}
+
+int lan88xx_suspend(struct phy_device *phydev)
+{
+       struct lan88xx_priv *priv = phydev->priv;
+
+       /* do not power down PHY when WOL is enabled */
+       if (!priv->wolopts)
+               genphy_suspend(phydev);
+
+       return 0;
+}
+
+static int lan88xx_probe(struct phy_device *phydev)
+{
+       struct device *dev = &phydev->dev;
+       struct lan88xx_priv *priv;
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->wolopts = 0;
+
+       /* these values can be used to identify internal PHY */
+       priv->chip_id = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_ID,
+                                             3, phydev->addr);
+       priv->chip_rev = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_REV,
+                                              3, phydev->addr);
+
+       phydev->priv = priv;
+
+       return 0;
+}
+
+static void lan88xx_remove(struct phy_device *phydev)
+{
+       struct device *dev = &phydev->dev;
+       struct lan88xx_priv *priv = phydev->priv;
+
+       if (priv)
+               devm_kfree(dev, priv);
+}
+
+static int lan88xx_set_wol(struct phy_device *phydev,
+                          struct ethtool_wolinfo *wol)
+{
+       struct lan88xx_priv *priv = phydev->priv;
+
+       priv->wolopts = wol->wolopts;
+
+       return 0;
+}
+
+static struct phy_driver microchip_phy_driver[] = {
+{
+       .phy_id         = 0x0007c130,
+       .phy_id_mask    = 0xfffffff0,
+       .name           = "Microchip LAN88xx",
+
+       .features       = (PHY_GBIT_FEATURES |
+                          SUPPORTED_Pause | SUPPORTED_Asym_Pause),
+       .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
+
+       .probe          = lan88xx_probe,
+       .remove         = lan88xx_remove,
+
+       .config_init    = genphy_config_init,
+       .config_aneg    = genphy_config_aneg,
+       .read_status    = genphy_read_status,
+
+       .ack_interrupt  = lan88xx_phy_ack_interrupt,
+       .config_intr    = lan88xx_phy_config_intr,
+
+       .suspend        = lan88xx_suspend,
+       .resume         = genphy_resume,
+       .set_wol        = lan88xx_set_wol,
+
+       .driver         = { .owner = THIS_MODULE, }
+} };
+
+module_phy_driver(microchip_phy_driver);
+
+static struct mdio_device_id __maybe_unused microchip_tbl[] = {
+       { 0x0007c130, 0xfffffff0 },
+       { }
+};
+
+MODULE_DEVICE_TABLE(mdio, microchip_tbl);
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
index 39364a4..a39518f 100644 (file)
@@ -1049,8 +1049,7 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
 {
        struct mii_if_info *mii = &dev->mii;
        struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
-       u16 ladv, radv;
-       int ret;
+       int ladv, radv, ret;
        u32 buf;
 
        /* clear PHY interrupt status */
@@ -1104,12 +1103,12 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
                }
 
                ladv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_ADVERTISE);
-               if (unlikely(ladv < 0))
-                       return -EIO;
+               if (ladv < 0)
+                       return ladv;
 
                radv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_LPA);
-               if (unlikely(radv < 0))
-                       return -EIO;
+               if (radv < 0)
+                       return radv;
 
                netif_dbg(dev, link, dev->net,
                          "speed: %u duplex: %d anadv: 0x%04x anlpa: 0x%04x",
index fe4ec32..d9427ca 100644 (file)
 #include <linux/mdio.h>
 #include <linux/usb/cdc.h>
 
-/* Version Information */
-#define DRIVER_VERSION "v1.08.1 (2015/07/28)"
+/* Information for net-next */
+#define NETNEXT_VERSION                "08"
+
+/* Information for net */
+#define NET_VERSION            "2"
+
+#define DRIVER_VERSION         "v1." NETNEXT_VERSION "." NET_VERSION
 #define DRIVER_AUTHOR "Realtek linux nic maintainers <nic_swsd@realtek.com>"
 #define DRIVER_DESC "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters"
 #define MODULENAME "r8152"
 #define OCP_EEE_ABLE           0xa5c4
 #define OCP_EEE_ADV            0xa5d0
 #define OCP_EEE_LPABLE         0xa5d2
+#define OCP_PHY_STATE          0xa708          /* nway state for 8153 */
 #define OCP_ADC_CFG            0xbc06
 
 /* SRAM Register */
 /* OCP_DOWN_SPEED */
 #define EN_10M_BGOFF           0x0080
 
+/* OCP_PHY_STATE */
+#define TXDIS_STATE            0x01
+#define ABD_STATE              0x02
+
 /* OCP_ADC_CFG */
 #define CKADSEL_L              0x0100
 #define ADC_EN                 0x0080
@@ -604,6 +614,7 @@ struct r8152 {
                void (*unload)(struct r8152 *);
                int (*eee_get)(struct r8152 *, struct ethtool_eee *);
                int (*eee_set)(struct r8152 *, struct ethtool_eee *);
+               bool (*in_nway)(struct r8152 *);
        } rtl_ops;
 
        int intr_interval;
@@ -2941,6 +2952,32 @@ static void rtl8153_down(struct r8152 *tp)
        r8153_enable_aldps(tp);
 }
 
+static bool rtl8152_in_nway(struct r8152 *tp)
+{
+       u16 nway_state;
+
+       ocp_write_word(tp, MCU_TYPE_PLA, PLA_OCP_GPHY_BASE, 0x2000);
+       tp->ocp_base = 0x2000;
+       ocp_write_byte(tp, MCU_TYPE_PLA, 0xb014, 0x4c);         /* phy state */
+       nway_state = ocp_read_word(tp, MCU_TYPE_PLA, 0xb01a);
+
+       /* bit 15: TXDIS_STATE, bit 14: ABD_STATE */
+       if (nway_state & 0xc000)
+               return false;
+       else
+               return true;
+}
+
+static bool rtl8153_in_nway(struct r8152 *tp)
+{
+       u16 phy_state = ocp_reg_read(tp, OCP_PHY_STATE) & 0xff;
+
+       if (phy_state == TXDIS_STATE || phy_state == ABD_STATE)
+               return false;
+       else
+               return true;
+}
+
 static void set_carrier(struct r8152 *tp)
 {
        struct net_device *netdev = tp->netdev;
@@ -3405,6 +3442,27 @@ static int rtl8152_post_reset(struct usb_interface *intf)
        return 0;
 }
 
+static bool delay_autosuspend(struct r8152 *tp)
+{
+       bool sw_linking = !!netif_carrier_ok(tp->netdev);
+       bool hw_linking = !!(rtl8152_get_speed(tp) & LINK_STATUS);
+
+       /* This means a linking change occurs and the driver doesn't detect it,
+        * yet. If the driver has disabled tx/rx and hw is linking on, the
+        * device wouldn't wake up by receiving any packet.
+        */
+       if (work_busy(&tp->schedule.work) || sw_linking != hw_linking)
+               return true;
+
+       /* If the linking down is occurred by nway, the device may miss the
+        * linking change event. And it wouldn't wake when linking on.
+        */
+       if (!sw_linking && tp->rtl_ops.in_nway(tp))
+               return true;
+       else
+               return false;
+}
+
 static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message)
 {
        struct r8152 *tp = usb_get_intfdata(intf);
@@ -3414,7 +3472,7 @@ static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message)
        mutex_lock(&tp->control);
 
        if (PMSG_IS_AUTO(message)) {
-               if (netif_running(netdev) && work_busy(&tp->schedule.work)) {
+               if (netif_running(netdev) && delay_autosuspend(tp)) {
                        ret = -EBUSY;
                        goto out1;
                }
@@ -4044,6 +4102,7 @@ static int rtl_ops_init(struct r8152 *tp)
                ops->unload             = rtl8152_unload;
                ops->eee_get            = r8152_get_eee;
                ops->eee_set            = r8152_set_eee;
+               ops->in_nway            = rtl8152_in_nway;
                break;
 
        case RTL_VER_03:
@@ -4058,6 +4117,7 @@ static int rtl_ops_init(struct r8152 *tp)
                ops->unload             = rtl8153_unload;
                ops->eee_get            = r8153_get_eee;
                ops->eee_set            = r8153_set_eee;
+               ops->in_nway            = rtl8153_in_nway;
                break;
 
        default:
index e049857..b4cf107 100644 (file)
@@ -428,12 +428,18 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb,
        old_state = entry->state;
        entry->state = state;
        __skb_unlink(skb, list);
-       spin_unlock(&list->lock);
-       spin_lock(&dev->done.lock);
+
+       /* defer_bh() is never called with list == &dev->done.
+        * spin_lock_nested() tells lockdep that it is OK to take
+        * dev->done.lock here with list->lock held.
+        */
+       spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING);
+
        __skb_queue_tail(&dev->done, skb);
        if (dev->done.qlen == 1)
                tasklet_schedule(&dev->bh);
-       spin_unlock_irqrestore(&dev->done.lock, flags);
+       spin_unlock(&dev->done.lock);
+       spin_unlock_irqrestore(&list->lock, flags);
        return old_state;
 }
 
@@ -749,6 +755,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);
 
 /*-------------------------------------------------------------------------*/
 
+static void wait_skb_queue_empty(struct sk_buff_head *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->lock, flags);
+       while (!skb_queue_empty(q)) {
+               spin_unlock_irqrestore(&q->lock, flags);
+               schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_lock_irqsave(&q->lock, flags);
+       }
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+
 // precondition: never called in_interrupt
 static void usbnet_terminate_urbs(struct usbnet *dev)
 {
@@ -762,14 +782,11 @@ static void usbnet_terminate_urbs(struct usbnet *dev)
                unlink_urbs(dev, &dev->rxq);
 
        /* maybe wait for deletions to finish. */
-       while (!skb_queue_empty(&dev->rxq)
-               && !skb_queue_empty(&dev->txq)
-               && !skb_queue_empty(&dev->done)) {
-                       schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       netif_dbg(dev, ifdown, dev->net,
-                                 "waited for %d urb completions\n", temp);
-       }
+       wait_skb_queue_empty(&dev->rxq);
+       wait_skb_queue_empty(&dev->txq);
+       wait_skb_queue_empty(&dev->done);
+       netif_dbg(dev, ifdown, dev->net,
+                 "waited for %d urb completions\n", temp);
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&dev->wait, &wait);
 }
index ce988fd..cf8b7f0 100644 (file)
@@ -1223,7 +1223,6 @@ drop:
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
        struct metadata_dst *tun_dst = NULL;
-       struct ip_tunnel_info *info;
        struct vxlan_sock *vs;
        struct vxlanhdr *vxh;
        u32 flags, vni;
@@ -1270,8 +1269,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                if (!tun_dst)
                        goto drop;
 
-               info = &tun_dst->u.tun_info;
-               md = ip_tunnel_info_opts(info);
+               md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
        } else {
                memset(md, 0, sizeof(*md));
        }
@@ -1286,7 +1284,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                md->gbp = ntohs(gbp->policy_id);
 
                if (tun_dst)
-                       info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+                       tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
 
                if (gbp->dont_learn)
                        md->gbp |= VXLAN_GBP_DONT_LEARN;
index 758c4ba..8fef8d8 100644 (file)
@@ -1358,6 +1358,8 @@ sbni_ioctl( struct net_device  *dev,  struct ifreq  *ifr,  int  cmd )
                if( !slave_dev  ||  !(slave_dev->flags & IFF_UP) ) {
                        netdev_err(dev, "trying to enslave non-active device %s\n",
                                   slave_name);
+                       if (slave_dev)
+                               dev_put(slave_dev);
                        return  -EPERM;
                }
 
index 613ca2b..d1a1e16 100644 (file)
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
        .llseek         = seq_lseek,
 };
 
+static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
+                           const char *prefix)
+{
+       seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
+}
+
 static void wil_print_ring(struct seq_file *s, const char *prefix,
                           void __iomem *off)
 {
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                   le16_to_cpu(hdr.seq), len,
                                   le16_to_cpu(hdr.type), hdr.flags);
                        if (len <= MAX_MBOXITEM_SIZE) {
-                               int n = 0;
-                               char printbuf[16 * 3 + 2];
                                unsigned char databuf[MAX_MBOXITEM_SIZE];
                                void __iomem *src = wmi_buffer(wil, d.addr) +
                                        sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                 * reading header
                                 */
                                wil_memcpy_fromio_32(databuf, src, len);
-                               while (n < len) {
-                                       int l = min(len - n, 16);
-
-                                       hex_dump_to_buffer(databuf + n, l,
-                                                          16, 1, printbuf,
-                                                          sizeof(printbuf),
-                                                          false);
-                                       seq_printf(s, "      : %s\n", printbuf);
-                                       n += l;
-                               }
+                               wil_seq_hexdump(s, databuf, len, "      : ");
                        }
                } else {
                        seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
        .open  = simple_open,
 };
 
-static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
-                           const char *prefix)
-{
-       char printbuf[16 * 3 + 2];
-       int i = 0;
-
-       while (i < len) {
-               int l = min(len - i, 16);
-
-               hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
-                                  sizeof(printbuf), false);
-               seq_printf(s, "%s%s\n", prefix, printbuf);
-               i += l;
-       }
-}
-
 static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
 {
        int i = 0;
index 6dc76c1..a7bf747 100644 (file)
@@ -200,11 +200,6 @@ struct xenvif_queue { /* Per-queue data for xenvif */
        struct xenvif_stats stats;
 };
 
-/* Maximum number of Rx slots a to-guest packet may use, including the
- * slot needed for GSO meta-data.
- */
-#define XEN_NETBK_RX_SLOTS_MAX (MAX_SKB_FRAGS + 1)
-
 enum state_bit_shift {
        /* This bit marks that the vif is connected */
        VIF_STATUS_CONNECTED,
@@ -317,11 +312,6 @@ int xenvif_dealloc_kthread(void *data);
 
 void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 
-/* Determine whether the needed number of slots (req) are available,
- * and set req_event if not.
- */
-bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed);
-
 void xenvif_carrier_on(struct xenvif *vif);
 
 /* Callback from stack when TX packet can be released */
index 42569b9..ec98d43 100644 (file)
@@ -149,9 +149,20 @@ static inline pending_ring_idx_t pending_index(unsigned i)
        return i & (MAX_PENDING_REQS-1);
 }
 
-bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed)
+static int xenvif_rx_ring_slots_needed(struct xenvif *vif)
+{
+       if (vif->gso_mask)
+               return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1;
+       else
+               return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
+}
+
+static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
 {
        RING_IDX prod, cons;
+       int needed;
+
+       needed = xenvif_rx_ring_slots_needed(queue->vif);
 
        do {
                prod = queue->rx.sring->req_prod;
@@ -314,7 +325,7 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb
                } else {
                        copy_gop->source.domid = DOMID_SELF;
                        copy_gop->source.u.gmfn =
-                               virt_to_mfn(page_address(page));
+                               virt_to_gfn(page_address(page));
                }
                copy_gop->source.offset = offset;
 
@@ -513,7 +524,7 @@ static void xenvif_rx_action(struct xenvif_queue *queue)
 
        skb_queue_head_init(&rxq);
 
-       while (xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX)
+       while (xenvif_rx_ring_slots_available(queue)
               && (skb = xenvif_rx_dequeue(queue)) != NULL) {
                queue->last_rx_time = jiffies;
 
@@ -1395,7 +1406,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
 
                queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
-                       virt_to_mfn(skb->data);
+                       virt_to_gfn(skb->data);
                queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
                queue->tx_copy_ops[*copy_ops].dest.offset =
                        offset_in_page(skb->data);
@@ -1938,8 +1949,7 @@ static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue)
        prod = queue->rx.sring->req_prod;
        cons = queue->rx.req_cons;
 
-       return !queue->stalled
-               && prod - cons < XEN_NETBK_RX_SLOTS_MAX
+       return !queue->stalled && prod - cons < 1
                && time_after(jiffies,
                              queue->last_rx_time + queue->vif->stall_timeout);
 }
@@ -1951,14 +1961,13 @@ static bool xenvif_rx_queue_ready(struct xenvif_queue *queue)
        prod = queue->rx.sring->req_prod;
        cons = queue->rx.req_cons;
 
-       return queue->stalled
-               && prod - cons >= XEN_NETBK_RX_SLOTS_MAX;
+       return queue->stalled && prod - cons >= 1;
 }
 
 static bool xenvif_have_rx_work(struct xenvif_queue *queue)
 {
        return (!skb_queue_empty(&queue->rx_queue)
-               && xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX))
+               && xenvif_rx_ring_slots_available(queue))
                || (queue->vif->stall_timeout &&
                    (xenvif_rx_queue_stalled(queue)
                     || xenvif_rx_queue_ready(queue)))
@@ -2105,8 +2114,11 @@ static int __init netback_init(void)
        if (!xen_domain())
                return -ENODEV;
 
-       /* Allow as many queues as there are CPUs, by default */
-       xenvif_max_queues = num_online_cpus();
+       /* Allow as many queues as there are CPUs if user has not
+        * specified a value.
+        */
+       if (xenvif_max_queues == 0)
+               xenvif_max_queues = num_online_cpus();
 
        if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
                pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
index e27e6d2..f821a97 100644 (file)
@@ -291,7 +291,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
                struct sk_buff *skb;
                unsigned short id;
                grant_ref_t ref;
-               unsigned long pfn;
+               unsigned long gfn;
                struct xen_netif_rx_request *req;
 
                skb = xennet_alloc_one_rx_buffer(queue);
@@ -307,12 +307,12 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
                BUG_ON((signed short)ref < 0);
                queue->grant_rx_ref[id] = ref;
 
-               pfn = page_to_pfn(skb_frag_page(&skb_shinfo(skb)->frags[0]));
+               gfn = xen_page_to_gfn(skb_frag_page(&skb_shinfo(skb)->frags[0]));
 
                req = RING_GET_REQUEST(&queue->rx, req_prod);
                gnttab_grant_foreign_access_ref(ref,
                                                queue->info->xbdev->otherend_id,
-                                               pfn_to_mfn(pfn),
+                                               gfn,
                                                0);
 
                req->id = id;
@@ -430,8 +430,10 @@ static struct xen_netif_tx_request *xennet_make_one_txreq(
        ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
        BUG_ON((signed short)ref < 0);
 
-       gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id,
-                                       page_to_mfn(page), GNTMAP_readonly);
+       gnttab_grant_foreign_access_ref(ref,
+                                       queue->info->xbdev->otherend_id,
+                                       xen_page_to_gfn(page),
+                                       GNTMAP_readonly);
 
        queue->tx_skbs[id].skb = skb;
        queue->grant_tx_page[id] = page;
@@ -2132,8 +2134,11 @@ static int __init netif_init(void)
 
        pr_info("Initialising Xen virtual ethernet driver\n");
 
-       /* Allow as many queues as there are CPUs, by default */
-       xennet_max_queues = num_online_cpus();
+       /* Allow as many queues as there are CPUs if user has not
+        * specified a value.
+        */
+       if (xennet_max_queues == 0)
+               xennet_max_queues = num_online_cpus();
 
        return xenbus_register_frontend(&netfront_driver);
 }
index 87751cf..865a3e3 100644 (file)
@@ -190,14 +190,17 @@ static inline int pdev_is_xeon(struct pci_dev *pdev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
        case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                return 1;
        }
        return 0;
@@ -237,7 +240,7 @@ static inline int ndev_ignore_unsafe(struct intel_ntb_dev *ndev,
 
 static int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx)
 {
-       if (idx < 0 || idx > ndev->mw_count)
+       if (idx < 0 || idx >= ndev->mw_count)
                return -EINVAL;
        return ndev->reg->mw_bar[idx];
 }
@@ -572,10 +575,13 @@ static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf,
                         "Connection Topology -\t%s\n",
                         ntb_topo_string(ndev->ntb.topo));
 
-       off += scnprintf(buf + off, buf_size - off,
-                        "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
-       off += scnprintf(buf + off, buf_size - off,
-                        "B2B MW Idx -\t\t%d\n", ndev->b2b_idx);
+       if (ndev->b2b_idx != UINT_MAX) {
+               off += scnprintf(buf + off, buf_size - off,
+                                "B2B MW Idx -\t\t%u\n", ndev->b2b_idx);
+               off += scnprintf(buf + off, buf_size - off,
+                                "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
+       }
+
        off += scnprintf(buf + off, buf_size - off,
                         "BAR4 Split -\t\t%s\n",
                         ndev->bar4_split ? "yes" : "no");
@@ -1484,7 +1490,7 @@ static int xeon_setup_b2b_mw(struct intel_ntb_dev *ndev,
        pdev = ndev_pdev(ndev);
        mmio = ndev->self_mmio;
 
-       if (ndev->b2b_idx >= ndev->mw_count) {
+       if (ndev->b2b_idx == UINT_MAX) {
                dev_dbg(ndev_dev(ndev), "not using b2b mw\n");
                b2b_bar = 0;
                ndev->b2b_off = 0;
@@ -1776,6 +1782,13 @@ static int xeon_init_ntb(struct intel_ntb_dev *ndev)
                        else
                                ndev->b2b_idx = b2b_mw_idx;
 
+                       if (ndev->b2b_idx >= ndev->mw_count) {
+                               dev_dbg(ndev_dev(ndev),
+                                       "b2b_mw_idx %d invalid for mw_count %u\n",
+                                       b2b_mw_idx, ndev->mw_count);
+                               return -EINVAL;
+                       }
+
                        dev_dbg(ndev_dev(ndev),
                                "setting up b2b mw idx %d means %d\n",
                                b2b_mw_idx, ndev->b2b_idx);
@@ -1843,6 +1856,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_SDOORBELL_LOCKUP;
                break;
        }
@@ -1857,6 +1873,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_SB01BASE_LOCKUP;
                break;
        }
@@ -1878,6 +1897,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_B2BDOORBELL_BIT14;
                break;
        }
@@ -1996,7 +2018,7 @@ static inline void ndev_init_struct(struct intel_ntb_dev *ndev,
        ndev->ntb.ops = &intel_ntb_ops;
 
        ndev->b2b_off = 0;
-       ndev->b2b_idx = INT_MAX;
+       ndev->b2b_idx = UINT_MAX;
 
        ndev->bar4_split = 0;
 
@@ -2234,14 +2256,17 @@ static const struct pci_device_id intel_ntb_pci_tbl[] = {
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BDX)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_BDX)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_BDX)},
        {0}
 };
 MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl);
index 7ddaf38..ea0612f 100644 (file)
@@ -67,6 +67,9 @@
 #define PCI_DEVICE_ID_INTEL_NTB_PS_HSX 0x2F0E
 #define PCI_DEVICE_ID_INTEL_NTB_SS_HSX 0x2F0F
 #define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD        0x0C4E
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BDX        0x6F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_BDX 0x6F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_BDX 0x6F0F
 
 /* Intel Xeon hardware */
 
index 1c6386d..6e3ee90 100644 (file)
@@ -119,7 +119,8 @@ struct ntb_transport_qp {
        struct ntb_transport_ctx *transport;
        struct ntb_dev *ndev;
        void *cb_data;
-       struct dma_chan *dma_chan;
+       struct dma_chan *tx_dma_chan;
+       struct dma_chan *rx_dma_chan;
 
        bool client_ready;
        bool link_is_up;
@@ -297,7 +298,7 @@ static LIST_HEAD(ntb_transport_list);
 
 static int ntb_bus_init(struct ntb_transport_ctx *nt)
 {
-       list_add(&nt->entry, &ntb_transport_list);
+       list_add_tail(&nt->entry, &ntb_transport_list);
        return 0;
 }
 
@@ -452,7 +453,7 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 
        out_offset = 0;
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "NTB QP stats\n");
+                              "\nNTB QP stats:\n\n");
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_bytes - \t%llu\n", qp->rx_bytes);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
@@ -470,11 +471,11 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_err_ver - \t%llu\n", qp->rx_err_ver);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "rx_buff - \t%p\n", qp->rx_buff);
+                              "rx_buff - \t0x%p\n", qp->rx_buff);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_index - \t%u\n", qp->rx_index);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "rx_max_entry - \t%u\n", qp->rx_max_entry);
+                              "rx_max_entry - \t%u\n\n", qp->rx_max_entry);
 
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_bytes - \t%llu\n", qp->tx_bytes);
@@ -489,15 +490,32 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_err_no_buf - %llu\n", qp->tx_err_no_buf);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "tx_mw - \t%p\n", qp->tx_mw);
+                              "tx_mw - \t0x%p\n", qp->tx_mw);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "tx_index - \t%u\n", qp->tx_index);
+                              "tx_index (H) - \t%u\n", qp->tx_index);
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "RRI (T) - \t%u\n",
+                              qp->remote_rx_info->entry);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_max_entry - \t%u\n", qp->tx_max_entry);
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "free tx - \t%u\n",
+                              ntb_transport_tx_free_entry(qp));
 
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "\nQP Link %s\n",
+                              "\n");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "Using TX DMA - \t%s\n",
+                              qp->tx_dma_chan ? "Yes" : "No");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "Using RX DMA - \t%s\n",
+                              qp->rx_dma_chan ? "Yes" : "No");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "QP Link - \t%s\n",
                               qp->link_is_up ? "Up" : "Down");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "\n");
+
        if (out_offset > out_count)
                out_offset = out_count;
 
@@ -535,6 +553,7 @@ static struct ntb_queue_entry *ntb_list_rm(spinlock_t *lock,
        }
        entry = list_first_entry(list, struct ntb_queue_entry, entry);
        list_del(&entry->entry);
+
 out:
        spin_unlock_irqrestore(lock, flags);
 
@@ -1206,7 +1225,7 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
 {
        struct dma_async_tx_descriptor *txd;
        struct ntb_transport_qp *qp = entry->qp;
-       struct dma_chan *chan = qp->dma_chan;
+       struct dma_chan *chan = qp->rx_dma_chan;
        struct dma_device *device;
        size_t pay_off, buff_off, len;
        struct dmaengine_unmap_data *unmap;
@@ -1219,18 +1238,18 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
                goto err;
 
        if (len < copy_bytes)
-               goto err_wait;
+               goto err;
 
        device = chan->device;
        pay_off = (size_t)offset & ~PAGE_MASK;
        buff_off = (size_t)buf & ~PAGE_MASK;
 
        if (!is_dma_copy_aligned(device, pay_off, buff_off, len))
-               goto err_wait;
+               goto err;
 
        unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
        if (!unmap)
-               goto err_wait;
+               goto err;
 
        unmap->len = len;
        unmap->addr[0] = dma_map_page(device->dev, virt_to_page(offset),
@@ -1273,12 +1292,6 @@ err_set_unmap:
        dmaengine_unmap_put(unmap);
 err_get_unmap:
        dmaengine_unmap_put(unmap);
-err_wait:
-       /* If the callbacks come out of order, the writing of the index to the
-        * last completed will be out of order.  This may result in the
-        * receive stalling forever.
-        */
-       dma_sync_wait(chan, qp->last_cookie);
 err:
        ntb_memcpy_rx(entry, offset);
        qp->rx_memcpy++;
@@ -1373,8 +1386,8 @@ static void ntb_transport_rxc_db(unsigned long data)
                        break;
        }
 
-       if (i && qp->dma_chan)
-               dma_async_issue_pending(qp->dma_chan);
+       if (i && qp->rx_dma_chan)
+               dma_async_issue_pending(qp->rx_dma_chan);
 
        if (i == qp->rx_max_entry) {
                /* there is more work to do */
@@ -1441,7 +1454,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 {
        struct ntb_payload_header __iomem *hdr;
        struct dma_async_tx_descriptor *txd;
-       struct dma_chan *chan = qp->dma_chan;
+       struct dma_chan *chan = qp->tx_dma_chan;
        struct dma_device *device;
        size_t dest_off, buff_off;
        struct dmaengine_unmap_data *unmap;
@@ -1634,14 +1647,27 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
        dma_cap_set(DMA_MEMCPY, dma_mask);
 
        if (use_dma) {
-               qp->dma_chan = dma_request_channel(dma_mask, ntb_dma_filter_fn,
-                                                  (void *)(unsigned long)node);
-               if (!qp->dma_chan)
-                       dev_info(&pdev->dev, "Unable to allocate DMA channel\n");
+               qp->tx_dma_chan =
+                       dma_request_channel(dma_mask, ntb_dma_filter_fn,
+                                           (void *)(unsigned long)node);
+               if (!qp->tx_dma_chan)
+                       dev_info(&pdev->dev, "Unable to allocate TX DMA channel\n");
+
+               qp->rx_dma_chan =
+                       dma_request_channel(dma_mask, ntb_dma_filter_fn,
+                                           (void *)(unsigned long)node);
+               if (!qp->rx_dma_chan)
+                       dev_info(&pdev->dev, "Unable to allocate RX DMA channel\n");
        } else {
-               qp->dma_chan = NULL;
+               qp->tx_dma_chan = NULL;
+               qp->rx_dma_chan = NULL;
        }
-       dev_dbg(&pdev->dev, "Using %s memcpy\n", qp->dma_chan ? "DMA" : "CPU");
+
+       dev_dbg(&pdev->dev, "Using %s memcpy for TX\n",
+               qp->tx_dma_chan ? "DMA" : "CPU");
+
+       dev_dbg(&pdev->dev, "Using %s memcpy for RX\n",
+               qp->rx_dma_chan ? "DMA" : "CPU");
 
        for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
                entry = kzalloc_node(sizeof(*entry), GFP_ATOMIC, node);
@@ -1676,8 +1702,10 @@ err2:
 err1:
        while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_free_q)))
                kfree(entry);
-       if (qp->dma_chan)
-               dma_release_channel(qp->dma_chan);
+       if (qp->tx_dma_chan)
+               dma_release_channel(qp->tx_dma_chan);
+       if (qp->rx_dma_chan)
+               dma_release_channel(qp->rx_dma_chan);
        nt->qp_bitmap_free |= qp_bit;
 err:
        return NULL;
@@ -1701,12 +1729,27 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 
        pdev = qp->ndev->pdev;
 
-       if (qp->dma_chan) {
-               struct dma_chan *chan = qp->dma_chan;
+       if (qp->tx_dma_chan) {
+               struct dma_chan *chan = qp->tx_dma_chan;
+               /* Putting the dma_chan to NULL will force any new traffic to be
+                * processed by the CPU instead of the DAM engine
+                */
+               qp->tx_dma_chan = NULL;
+
+               /* Try to be nice and wait for any queued DMA engine
+                * transactions to process before smashing it with a rock
+                */
+               dma_sync_wait(chan, qp->last_cookie);
+               dmaengine_terminate_all(chan);
+               dma_release_channel(chan);
+       }
+
+       if (qp->rx_dma_chan) {
+               struct dma_chan *chan = qp->rx_dma_chan;
                /* Putting the dma_chan to NULL will force any new traffic to be
                 * processed by the CPU instead of the DAM engine
                 */
-               qp->dma_chan = NULL;
+               qp->rx_dma_chan = NULL;
 
                /* Try to be nice and wait for any queued DMA engine
                 * transactions to process before smashing it with a rock
@@ -1843,7 +1886,7 @@ int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
        entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
        if (!entry) {
                qp->tx_err_no_buf++;
-               return -ENOMEM;
+               return -EBUSY;
        }
 
        entry->cb_data = cb;
@@ -1954,21 +1997,34 @@ EXPORT_SYMBOL_GPL(ntb_transport_qp_num);
 unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp)
 {
        unsigned int max;
+       unsigned int copy_align;
 
        if (!qp)
                return 0;
 
-       if (!qp->dma_chan)
+       if (!qp->tx_dma_chan && !qp->rx_dma_chan)
                return qp->tx_max_frame - sizeof(struct ntb_payload_header);
 
+       copy_align = max(qp->tx_dma_chan->device->copy_align,
+                        qp->rx_dma_chan->device->copy_align);
+
        /* If DMA engine usage is possible, try to find the max size for that */
        max = qp->tx_max_frame - sizeof(struct ntb_payload_header);
-       max -= max % (1 << qp->dma_chan->device->copy_align);
+       max -= max % (1 << copy_align);
 
        return max;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_max_size);
 
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
+{
+       unsigned int head = qp->tx_index;
+       unsigned int tail = qp->remote_rx_info->entry;
+
+       return tail > head ? tail - head : qp->tx_max_entry + tail - head;
+}
+EXPORT_SYMBOL_GPL(ntb_transport_tx_free_entry);
+
 static void ntb_transport_doorbell_callback(void *data, int vector)
 {
        struct ntb_transport_ctx *nt = data;
index 02ff84f..957b421 100644 (file)
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
        struct ioc *ioc = ioc_list;
 
        while (ioc != NULL) {
-               u32 *res_ptr = (u32 *)ioc->res_map;
-               int j;
-
-               for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
-                       if ((j & 7) == 0)
-                               seq_puts(m, "\n   ");
-                       seq_printf(m, "%08x", *res_ptr);
-                       res_ptr++;
-               }
-               seq_puts(m, "\n\n");
+               seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                            ioc->res_size, false);
+               seq_putc(m, '\n');
                ioc = ioc->next;
                break; /* XXX - remove me */
        }
index f1441e4..225049b 100644 (file)
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
 {
        struct sba_device *sba_dev = sba_list;
        struct ioc *ioc = &sba_dev->ioc[0];     /* FIXME: Multi-IOC support! */
-       unsigned int *res_ptr = (unsigned int *)ioc->res_map;
-       int i;
 
-       for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
-               if ((i & 7) == 0)
-                       seq_puts(m, "\n   ");
-               seq_printf(m, " %08x", *res_ptr);
-       }
+       seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                    ioc->res_size, false);
        seq_putc(m, '\n');
 
        return 0;
index 52a880c..dd652f2 100644 (file)
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
        pci_msi_shutdown(pci_dev);
        pci_msix_shutdown(pci_dev);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If this is a kexec reboot, turn off Bus Master bit on the
         * device to tell it to not continue to do DMA. Don't touch
index 69723e0..9638a00 100644 (file)
@@ -349,6 +349,9 @@ static bool pinctrl_ready_for_gpio_range(unsigned gpio)
        struct pinctrl_gpio_range *range = NULL;
        struct gpio_chip *chip = gpio_to_chip(gpio);
 
+       if (WARN(!chip, "no gpio_chip for gpio%i?", gpio))
+               return false;
+
        mutex_lock(&pinctrldev_list_mutex);
 
        /* Loop over the pin controllers */
index 461fffc..11f8b83 100644 (file)
@@ -337,9 +337,9 @@ static int dc_pinctrl_probe(struct platform_device *pdev)
        pmap->dev = &pdev->dev;
 
        pmap->pctl = pinctrl_register(pctl_desc, &pdev->dev, pmap);
-       if (!pmap->pctl) {
+       if (IS_ERR(pmap->pctl)) {
                dev_err(&pdev->dev, "pinctrl driver registration failed\n");
-               return -EINVAL;
+               return PTR_ERR(pmap->pctl);
        }
 
        ret = dc_gpiochip_add(pmap, pdev->dev.of_node);
index 67e08cb..29984b3 100644 (file)
@@ -313,8 +313,7 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev,
 
        /* See if this pctldev has this function */
        while (selector < nfuncs) {
-               const char *fname = ops->get_function_name(pctldev,
-                                                          selector);
+               const char *fname = ops->get_function_name(pctldev, selector);
 
                if (!strcmp(function, fname))
                        return selector;
index c978b31..e1a3721 100644 (file)
@@ -723,9 +723,9 @@ static int pm8xxx_gpio_probe(struct platform_device *pdev)
 #endif
 
        pctrl->pctrl = pinctrl_register(&pctrl->desc, &pdev->dev, pctrl);
-       if (!pctrl->pctrl) {
+       if (IS_ERR(pctrl->pctrl)) {
                dev_err(&pdev->dev, "couldn't register pm8xxx gpio driver\n");
-               return -ENODEV;
+               return PTR_ERR(pctrl->pctrl);
        }
 
        pctrl->chip = pm8xxx_gpio_template;
index 2d1b69f..6652b8d 100644 (file)
@@ -814,9 +814,9 @@ static int pm8xxx_mpp_probe(struct platform_device *pdev)
 #endif
 
        pctrl->pctrl = pinctrl_register(&pctrl->desc, &pdev->dev, pctrl);
-       if (!pctrl->pctrl) {
+       if (IS_ERR(pctrl->pctrl)) {
                dev_err(&pdev->dev, "couldn't register pm8xxx mpp driver\n");
-               return -ENODEV;
+               return PTR_ERR(pctrl->pctrl);
        }
 
        pctrl->chip = pm8xxx_mpp_template;
index 019844d..d168b39 100644 (file)
@@ -361,7 +361,7 @@ static inline void s3c24xx_demux_eint(struct irq_desc *desc,
                                      u32 offset, u32 range)
 {
        struct s3c24xx_eint_data *data = irq_desc_get_handler_data(desc);
-       struct irq_chip *chip = irq_desc_get_irq_chip(desc);
+       struct irq_chip *chip = irq_desc_get_chip(desc);
        struct samsung_pinctrl_drv_data *d = data->drvdata;
        unsigned int pend, mask;
 
index 1ef02da..460fa67 100644 (file)
@@ -346,8 +346,7 @@ static void acerhdf_check_param(struct thermal_zone_device *thermal)
  * as late as the polling interval is since we can't do that in the respective
  * accessors of the module parameters.
  */
-static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal,
-                              unsigned long *t)
+static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal, int *t)
 {
        int temp, err = 0;
 
@@ -453,7 +452,7 @@ static int acerhdf_get_trip_type(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
-                                unsigned long *temp)
+                                int *temp)
 {
        if (trip != 0)
                return -EINVAL;
@@ -464,7 +463,7 @@ static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
-                                unsigned long *temp)
+                                int *temp)
 {
        if (trip == 0)
                *temp = fanon;
@@ -477,7 +476,7 @@ static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_crit_temp(struct thermal_zone_device *thermal,
-                                unsigned long *temperature)
+                                int *temperature)
 {
        *temperature = ACERHDF_TEMP_CRIT;
        return 0;
index 0944e83..9f713b8 100644 (file)
@@ -132,7 +132,7 @@ static int is_valid_adc(uint16_t adc_val, uint16_t min, uint16_t max)
  * to achieve very close approximate temp value with less than
  * 0.5C error
  */
-static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
+static int adc_to_temp(int direct, uint16_t adc_val, int *tp)
 {
        int temp;
 
@@ -174,14 +174,13 @@ static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
  *
  * Can sleep
  */
-static int mid_read_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int mid_read_temp(struct thermal_zone_device *tzd, int *temp)
 {
        struct thermal_device_info *td_info = tzd->devdata;
        uint16_t adc_val, addr;
        uint8_t data = 0;
        int ret;
-       unsigned long curr_temp;
-
+       int curr_temp;
 
        addr = td_info->chnl_addr;
 
@@ -453,7 +452,7 @@ static SIMPLE_DEV_PM_OPS(mid_thermal_pm,
  *
  * Can sleep
  */
-static int read_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int read_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
        WARN_ON(tzd == NULL);
        return mid_read_temp(tzd, temp);
index 1c202cc..907293e 100644 (file)
@@ -619,7 +619,7 @@ static int cm_get_battery_temperature(struct charger_manager *cm,
 
 #ifdef CONFIG_THERMAL
        if (cm->tzd_batt) {
-               ret = thermal_zone_get_temp(cm->tzd_batt, (unsigned long *)temp);
+               ret = thermal_zone_get_temp(cm->tzd_batt, temp);
                if (!ret)
                        /* Calibrate temperature unit */
                        *temp /= 100;
index 869284c..456987c 100644 (file)
@@ -557,7 +557,7 @@ EXPORT_SYMBOL_GPL(power_supply_unreg_notifier);
 
 #ifdef CONFIG_THERMAL
 static int power_supply_read_temp(struct thermal_zone_device *tzd,
-               unsigned long *temp)
+               int *temp)
 {
        struct power_supply *psy;
        union power_supply_propval val;
index d2d2904..9aaf646 100644 (file)
@@ -89,6 +89,7 @@ static int ath79_reset_probe(struct platform_device *pdev)
        if (IS_ERR(ath79_reset->base))
                return PTR_ERR(ath79_reset->base);
 
+       spin_lock_init(&ath79_reset->lock);
        ath79_reset->rcdev.ops = &ath79_reset_ops;
        ath79_reset->rcdev.owner = THIS_MODULE;
        ath79_reset->rcdev.of_node = pdev->dev.of_node;
index 01bf1f5..4eb4554 100644 (file)
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
 static void sprinthx4(unsigned char *title, struct seq_file *m,
                      unsigned int *array, unsigned int len)
 {
-       int r;
-
        seq_printf(m, "\n%s\n", title);
-       for (r = 0; r < len; r++) {
-               if ((r % 8) == 0)
-                       seq_printf(m, "    ");
-               seq_printf(m, "%08X ", array[r]);
-               if ((r % 8) == 7)
-                       seq_putc(m, '\n');
-       }
+       seq_hex_dump(m, "    ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
        seq_putc(m, '\n');
 }
 
index 471d087..1a8c9b5 100644 (file)
@@ -172,6 +172,7 @@ scsi_mod-$(CONFIG_SYSCTL)   += scsi_sysctl.o
 scsi_mod-$(CONFIG_SCSI_PROC_FS)        += scsi_proc.o
 scsi_mod-y                     += scsi_trace.o scsi_logging.o
 scsi_mod-$(CONFIG_PM)          += scsi_pm.o
+scsi_mod-$(CONFIG_SCSI_DH)     += scsi_dh.o
 
 hv_storvsc-y                   := storvsc_drv.o
 
index edb43fd..c831e30 100644 (file)
@@ -983,7 +983,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
 {
        int err, i;
        u32 offs, size;
-       struct asd_ll_el *el;
+       struct asd_ll_el *el = NULL;
        struct asd_ctrla_phy_settings *ps;
        struct asd_ctrla_phy_settings dflt_ps;
 
@@ -1004,6 +1004,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
 
                size = sizeof(struct asd_ctrla_phy_settings);
                ps = &dflt_ps;
+               goto out_process;
        }
 
        if (size == 0)
@@ -1028,7 +1029,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
                ASD_DPRINTK("couldn't find ctrla phy settings struct\n");
                goto out2;
        }
-
+out_process:
        err = asd_process_ctrla_phy_settings(asd_ha, ps);
        if (err) {
                ASD_DPRINTK("couldn't process ctrla phy settings\n");
index 315d6d6..98f7e8c 100644 (file)
@@ -3665,19 +3665,19 @@ bfa_cb_sfp_state_query(struct bfa_sfp_s *sfp)
                if (sfp->state_query_cbfn)
                        sfp->state_query_cbfn(sfp->state_query_cbarg,
                                        sfp->status);
-                       sfp->media = NULL;
-               }
+               sfp->media = NULL;
+       }
 
-               if (sfp->portspeed) {
-                       sfp->status = bfa_sfp_speed_valid(sfp, sfp->portspeed);
-                       if (sfp->state_query_cbfn)
-                               sfp->state_query_cbfn(sfp->state_query_cbarg,
-                                               sfp->status);
-                               sfp->portspeed = BFA_PORT_SPEED_UNKNOWN;
-               }
+       if (sfp->portspeed) {
+               sfp->status = bfa_sfp_speed_valid(sfp, sfp->portspeed);
+               if (sfp->state_query_cbfn)
+                       sfp->state_query_cbfn(sfp->state_query_cbarg,
+                                       sfp->status);
+               sfp->portspeed = BFA_PORT_SPEED_UNKNOWN;
+       }
 
-               sfp->state_query_lock = 0;
-               sfp->state_query_cbfn = NULL;
+       sfp->state_query_lock = 0;
+       sfp->state_query_cbfn = NULL;
 }
 
 /*
@@ -3878,7 +3878,7 @@ bfa_sfp_show_comp(struct bfa_sfp_s *sfp, struct bfi_mbmsg_s *msg)
                bfa_trc(sfp, sfp->data_valid);
                if (sfp->data_valid) {
                        u32     size = sizeof(struct sfp_mem_s);
-                       u8 *des = (u8 *) &(sfp->sfpmem);
+                       u8 *des = (u8 *)(sfp->sfpmem);
                        memcpy(des, sfp->dbuf_kva, size);
                }
                /*
index 69abd0a..e5647d5 100644 (file)
@@ -3,7 +3,7 @@
 #
 
 menuconfig SCSI_DH
-       tristate "SCSI Device Handlers"
+       bool "SCSI Device Handlers"
        depends on SCSI
        default n
        help
index e1d2ea0..09866c5 100644 (file)
@@ -1,7 +1,6 @@
 #
 # SCSI Device Handler
 #
-obj-$(CONFIG_SCSI_DH)          += scsi_dh.o
 obj-$(CONFIG_SCSI_DH_RDAC)     += scsi_dh_rdac.o
 obj-$(CONFIG_SCSI_DH_HP_SW)    += scsi_dh_hp_sw.o
 obj-$(CONFIG_SCSI_DH_EMC)      += scsi_dh_emc.o
diff --git a/drivers/scsi/device_handler/scsi_dh.c b/drivers/scsi/device_handler/scsi_dh.c
deleted file mode 100644 (file)
index 1efebc9..0000000
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * SCSI device handler infrastruture.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2007
- *      Authors:
- *               Chandra Seetharaman <sekharan@us.ibm.com>
- *               Mike Anderson <andmike@linux.vnet.ibm.com>
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <scsi/scsi_dh.h>
-#include "../scsi_priv.h"
-
-static DEFINE_SPINLOCK(list_lock);
-static LIST_HEAD(scsi_dh_list);
-
-static struct scsi_device_handler *get_device_handler(const char *name)
-{
-       struct scsi_device_handler *tmp, *found = NULL;
-
-       spin_lock(&list_lock);
-       list_for_each_entry(tmp, &scsi_dh_list, list) {
-               if (!strncmp(tmp->name, name, strlen(tmp->name))) {
-                       found = tmp;
-                       break;
-               }
-       }
-       spin_unlock(&list_lock);
-       return found;
-}
-
-/*
- * device_handler_match_function - Match a device handler to a device
- * @sdev - SCSI device to be tested
- *
- * Tests @sdev against the match function of all registered device_handler.
- * Returns the found device handler or NULL if not found.
- */
-static struct scsi_device_handler *
-device_handler_match_function(struct scsi_device *sdev)
-{
-       struct scsi_device_handler *tmp_dh, *found_dh = NULL;
-
-       spin_lock(&list_lock);
-       list_for_each_entry(tmp_dh, &scsi_dh_list, list) {
-               if (tmp_dh->match && tmp_dh->match(sdev)) {
-                       found_dh = tmp_dh;
-                       break;
-               }
-       }
-       spin_unlock(&list_lock);
-       return found_dh;
-}
-
-/*
- * device_handler_match - Attach a device handler to a device
- * @scsi_dh - The device handler to match against or NULL
- * @sdev - SCSI device to be tested against @scsi_dh
- *
- * Tests @sdev against the device handler @scsi_dh or against
- * all registered device_handler if @scsi_dh == NULL.
- * Returns the found device handler or NULL if not found.
- */
-static struct scsi_device_handler *
-device_handler_match(struct scsi_device_handler *scsi_dh,
-                    struct scsi_device *sdev)
-{
-       struct scsi_device_handler *found_dh;
-
-       found_dh = device_handler_match_function(sdev);
-
-       if (scsi_dh && found_dh != scsi_dh)
-               found_dh = NULL;
-
-       return found_dh;
-}
-
-/*
- * scsi_dh_handler_attach - Attach a device handler to a device
- * @sdev - SCSI device the device handler should attach to
- * @scsi_dh - The device handler to attach
- */
-static int scsi_dh_handler_attach(struct scsi_device *sdev,
-                                 struct scsi_device_handler *scsi_dh)
-{
-       struct scsi_dh_data *d;
-
-       if (sdev->scsi_dh_data) {
-               if (sdev->scsi_dh_data->scsi_dh != scsi_dh)
-                       return -EBUSY;
-
-               kref_get(&sdev->scsi_dh_data->kref);
-               return 0;
-       }
-
-       if (!try_module_get(scsi_dh->module))
-               return -EINVAL;
-
-       d = scsi_dh->attach(sdev);
-       if (IS_ERR(d)) {
-               sdev_printk(KERN_ERR, sdev, "%s: Attach failed (%ld)\n",
-                           scsi_dh->name, PTR_ERR(d));
-               module_put(scsi_dh->module);
-               return PTR_ERR(d);
-       }
-
-       d->scsi_dh = scsi_dh;
-       kref_init(&d->kref);
-       d->sdev = sdev;
-
-       spin_lock_irq(sdev->request_queue->queue_lock);
-       sdev->scsi_dh_data = d;
-       spin_unlock_irq(sdev->request_queue->queue_lock);
-       return 0;
-}
-
-static void __detach_handler (struct kref *kref)
-{
-       struct scsi_dh_data *scsi_dh_data =
-               container_of(kref, struct scsi_dh_data, kref);
-       struct scsi_device_handler *scsi_dh = scsi_dh_data->scsi_dh;
-       struct scsi_device *sdev = scsi_dh_data->sdev;
-
-       scsi_dh->detach(sdev);
-
-       spin_lock_irq(sdev->request_queue->queue_lock);
-       sdev->scsi_dh_data = NULL;
-       spin_unlock_irq(sdev->request_queue->queue_lock);
-
-       sdev_printk(KERN_NOTICE, sdev, "%s: Detached\n", scsi_dh->name);
-       module_put(scsi_dh->module);
-}
-
-/*
- * scsi_dh_handler_detach - Detach a device handler from a device
- * @sdev - SCSI device the device handler should be detached from
- * @scsi_dh - Device handler to be detached
- *
- * Detach from a device handler. If a device handler is specified,
- * only detach if the currently attached handler matches @scsi_dh.
- */
-static void scsi_dh_handler_detach(struct scsi_device *sdev,
-                                  struct scsi_device_handler *scsi_dh)
-{
-       if (!sdev->scsi_dh_data)
-               return;
-
-       if (scsi_dh && scsi_dh != sdev->scsi_dh_data->scsi_dh)
-               return;
-
-       if (!scsi_dh)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-
-       if (scsi_dh)
-               kref_put(&sdev->scsi_dh_data->kref, __detach_handler);
-}
-
-/*
- * Functions for sysfs attribute 'dh_state'
- */
-static ssize_t
-store_dh_state(struct device *dev, struct device_attribute *attr,
-              const char *buf, size_t count)
-{
-       struct scsi_device *sdev = to_scsi_device(dev);
-       struct scsi_device_handler *scsi_dh;
-       int err = -EINVAL;
-
-       if (sdev->sdev_state == SDEV_CANCEL ||
-           sdev->sdev_state == SDEV_DEL)
-               return -ENODEV;
-
-       if (!sdev->scsi_dh_data) {
-               /*
-                * Attach to a device handler
-                */
-               if (!(scsi_dh = get_device_handler(buf)))
-                       return err;
-               err = scsi_dh_handler_attach(sdev, scsi_dh);
-       } else {
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-               if (!strncmp(buf, "detach", 6)) {
-                       /*
-                        * Detach from a device handler
-                        */
-                       scsi_dh_handler_detach(sdev, scsi_dh);
-                       err = 0;
-               } else if (!strncmp(buf, "activate", 8)) {
-                       /*
-                        * Activate a device handler
-                        */
-                       if (scsi_dh->activate)
-                               err = scsi_dh->activate(sdev, NULL, NULL);
-                       else
-                               err = 0;
-               }
-       }
-
-       return err<0?err:count;
-}
-
-static ssize_t
-show_dh_state(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct scsi_device *sdev = to_scsi_device(dev);
-
-       if (!sdev->scsi_dh_data)
-               return snprintf(buf, 20, "detached\n");
-
-       return snprintf(buf, 20, "%s\n", sdev->scsi_dh_data->scsi_dh->name);
-}
-
-static struct device_attribute scsi_dh_state_attr =
-       __ATTR(dh_state, S_IRUGO | S_IWUSR, show_dh_state,
-              store_dh_state);
-
-/*
- * scsi_dh_sysfs_attr_add - Callback for scsi_init_dh
- */
-static int scsi_dh_sysfs_attr_add(struct device *dev, void *data)
-{
-       struct scsi_device *sdev;
-       int err;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       err = device_create_file(&sdev->sdev_gendev,
-                                &scsi_dh_state_attr);
-
-       return 0;
-}
-
-/*
- * scsi_dh_sysfs_attr_remove - Callback for scsi_exit_dh
- */
-static int scsi_dh_sysfs_attr_remove(struct device *dev, void *data)
-{
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       device_remove_file(&sdev->sdev_gendev,
-                          &scsi_dh_state_attr);
-
-       return 0;
-}
-
-/*
- * scsi_dh_notifier - notifier chain callback
- */
-static int scsi_dh_notifier(struct notifier_block *nb,
-                           unsigned long action, void *data)
-{
-       struct device *dev = data;
-       struct scsi_device *sdev;
-       int err = 0;
-       struct scsi_device_handler *devinfo = NULL;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       if (action == BUS_NOTIFY_ADD_DEVICE) {
-               err = device_create_file(dev, &scsi_dh_state_attr);
-               /* don't care about err */
-               devinfo = device_handler_match(NULL, sdev);
-               if (devinfo)
-                       err = scsi_dh_handler_attach(sdev, devinfo);
-       } else if (action == BUS_NOTIFY_DEL_DEVICE) {
-               device_remove_file(dev, &scsi_dh_state_attr);
-               scsi_dh_handler_detach(sdev, NULL);
-       }
-       return err;
-}
-
-/*
- * scsi_dh_notifier_add - Callback for scsi_register_device_handler
- */
-static int scsi_dh_notifier_add(struct device *dev, void *data)
-{
-       struct scsi_device_handler *scsi_dh = data;
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       if (!get_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       if (device_handler_match(scsi_dh, sdev))
-               scsi_dh_handler_attach(sdev, scsi_dh);
-
-       put_device(dev);
-
-       return 0;
-}
-
-/*
- * scsi_dh_notifier_remove - Callback for scsi_unregister_device_handler
- */
-static int scsi_dh_notifier_remove(struct device *dev, void *data)
-{
-       struct scsi_device_handler *scsi_dh = data;
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       if (!get_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       scsi_dh_handler_detach(sdev, scsi_dh);
-
-       put_device(dev);
-
-       return 0;
-}
-
-/*
- * scsi_register_device_handler - register a device handler personality
- *      module.
- * @scsi_dh - device handler to be registered.
- *
- * Returns 0 on success, -EBUSY if handler already registered.
- */
-int scsi_register_device_handler(struct scsi_device_handler *scsi_dh)
-{
-
-       if (get_device_handler(scsi_dh->name))
-               return -EBUSY;
-
-       if (!scsi_dh->attach || !scsi_dh->detach)
-               return -EINVAL;
-
-       spin_lock(&list_lock);
-       list_add(&scsi_dh->list, &scsi_dh_list);
-       spin_unlock(&list_lock);
-
-       bus_for_each_dev(&scsi_bus_type, NULL, scsi_dh, scsi_dh_notifier_add);
-       printk(KERN_INFO "%s: device handler registered\n", scsi_dh->name);
-
-       return SCSI_DH_OK;
-}
-EXPORT_SYMBOL_GPL(scsi_register_device_handler);
-
-/*
- * scsi_unregister_device_handler - register a device handler personality
- *      module.
- * @scsi_dh - device handler to be unregistered.
- *
- * Returns 0 on success, -ENODEV if handler not registered.
- */
-int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh)
-{
-
-       if (!get_device_handler(scsi_dh->name))
-               return -ENODEV;
-
-       bus_for_each_dev(&scsi_bus_type, NULL, scsi_dh,
-                        scsi_dh_notifier_remove);
-
-       spin_lock(&list_lock);
-       list_del(&scsi_dh->list);
-       spin_unlock(&list_lock);
-       printk(KERN_INFO "%s: device handler unregistered\n", scsi_dh->name);
-
-       return SCSI_DH_OK;
-}
-EXPORT_SYMBOL_GPL(scsi_unregister_device_handler);
-
-/*
- * scsi_dh_activate - activate the path associated with the scsi_device
- *      corresponding to the given request queue.
- *     Returns immediately without waiting for activation to be completed.
- * @q    - Request queue that is associated with the scsi_device to be
- *         activated.
- * @fn   - Function to be called upon completion of the activation.
- *         Function fn is called with data (below) and the error code.
- *         Function fn may be called from the same calling context. So,
- *         do not hold the lock in the caller which may be needed in fn.
- * @data - data passed to the function fn upon completion.
- *
- */
-int scsi_dh_activate(struct request_queue *q, activate_complete fn, void *data)
-{
-       int err = 0;
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-       struct device *dev = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev) {
-               spin_unlock_irqrestore(q->queue_lock, flags);
-               err = SCSI_DH_NOSYS;
-               if (fn)
-                       fn(data, err);
-               return err;
-       }
-
-       if (sdev->scsi_dh_data)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-       dev = get_device(&sdev->sdev_gendev);
-       if (!scsi_dh || !dev ||
-           sdev->sdev_state == SDEV_CANCEL ||
-           sdev->sdev_state == SDEV_DEL)
-               err = SCSI_DH_NOSYS;
-       if (sdev->sdev_state == SDEV_OFFLINE)
-               err = SCSI_DH_DEV_OFFLINED;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (err) {
-               if (fn)
-                       fn(data, err);
-               goto out;
-       }
-
-       if (scsi_dh->activate)
-               err = scsi_dh->activate(sdev, fn, data);
-out:
-       put_device(dev);
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_activate);
-
-/*
- * scsi_dh_set_params - set the parameters for the device as per the
- *      string specified in params.
- * @q - Request queue that is associated with the scsi_device for
- *      which the parameters to be set.
- * @params - parameters in the following format
- *      "no_of_params\0param1\0param2\0param3\0...\0"
- *      for example, string for 2 parameters with value 10 and 21
- *      is specified as "2\010\021\0".
- */
-int scsi_dh_set_params(struct request_queue *q, const char *params)
-{
-       int err = -SCSI_DH_NOSYS;
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (sdev && sdev->scsi_dh_data)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-       if (scsi_dh && scsi_dh->set_params && get_device(&sdev->sdev_gendev))
-               err = 0;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (err)
-               return err;
-       err = scsi_dh->set_params(sdev, params);
-       put_device(&sdev->sdev_gendev);
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_set_params);
-
-/*
- * scsi_dh_handler_exist - Return TRUE(1) if a device handler exists for
- *     the given name. FALSE(0) otherwise.
- * @name - name of the device handler.
- */
-int scsi_dh_handler_exist(const char *name)
-{
-       return (get_device_handler(name) != NULL);
-}
-EXPORT_SYMBOL_GPL(scsi_dh_handler_exist);
-
-/*
- * scsi_dh_attach - Attach device handler
- * @q - Request queue that is associated with the scsi_device
- *      the handler should be attached to
- * @name - name of the handler to attach
- */
-int scsi_dh_attach(struct request_queue *q, const char *name)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh;
-       int err = 0;
-
-       scsi_dh = get_device_handler(name);
-       if (!scsi_dh)
-               return -EINVAL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               err = -ENODEV;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!err) {
-               err = scsi_dh_handler_attach(sdev, scsi_dh);
-               put_device(&sdev->sdev_gendev);
-       }
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_attach);
-
-/*
- * scsi_dh_detach - Detach device handler
- * @q - Request queue that is associated with the scsi_device
- *      the handler should be detached from
- *
- * This function will detach the device handler only
- * if the sdev is not part of the internal list, ie
- * if it has been attached manually.
- */
-void scsi_dh_detach(struct request_queue *q)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               sdev = NULL;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!sdev)
-               return;
-
-       if (sdev->scsi_dh_data) {
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-               scsi_dh_handler_detach(sdev, scsi_dh);
-       }
-       put_device(&sdev->sdev_gendev);
-}
-EXPORT_SYMBOL_GPL(scsi_dh_detach);
-
-/*
- * scsi_dh_attached_handler_name - Get attached device handler's name
- * @q - Request queue that is associated with the scsi_device
- *      that may have a device handler attached
- * @gfp - the GFP mask used in the kmalloc() call when allocating memory
- *
- * Returns name of attached handler, NULL if no handler is attached.
- * Caller must take care to free the returned string.
- */
-const char *scsi_dh_attached_handler_name(struct request_queue *q, gfp_t gfp)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       const char *handler_name = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               sdev = NULL;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!sdev)
-               return NULL;
-
-       if (sdev->scsi_dh_data)
-               handler_name = kstrdup(sdev->scsi_dh_data->scsi_dh->name, gfp);
-
-       put_device(&sdev->sdev_gendev);
-       return handler_name;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_attached_handler_name);
-
-static struct notifier_block scsi_dh_nb = {
-       .notifier_call = scsi_dh_notifier
-};
-
-static int __init scsi_dh_init(void)
-{
-       int r;
-
-       r = bus_register_notifier(&scsi_bus_type, &scsi_dh_nb);
-
-       if (!r)
-               bus_for_each_dev(&scsi_bus_type, NULL, NULL,
-                                scsi_dh_sysfs_attr_add);
-
-       return r;
-}
-
-static void __exit scsi_dh_exit(void)
-{
-       bus_for_each_dev(&scsi_bus_type, NULL, NULL,
-                        scsi_dh_sysfs_attr_remove);
-       bus_unregister_notifier(&scsi_bus_type, &scsi_dh_nb);
-}
-
-module_init(scsi_dh_init);
-module_exit(scsi_dh_exit);
-
-MODULE_DESCRIPTION("SCSI device handler");
-MODULE_AUTHOR("Chandra Seetharaman <sekharan@us.ibm.com>");
-MODULE_LICENSE("GPL");
index 854b568..cc2773b 100644 (file)
@@ -62,7 +62,6 @@
 #define ALUA_OPTIMIZE_STPG             1
 
 struct alua_dh_data {
-       struct scsi_dh_data     dh_data;
        int                     group_id;
        int                     rel_port;
        int                     tpgs;
@@ -86,11 +85,6 @@ struct alua_dh_data {
 static char print_alua_state(int);
 static int alua_check_sense(struct scsi_device *, struct scsi_sense_hdr *);
 
-static inline struct alua_dh_data *get_alua_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct alua_dh_data, dh_data);
-}
-
 static int realloc_buffer(struct alua_dh_data *h, unsigned len)
 {
        if (h->buff && h->buff != h->inq)
@@ -708,7 +702,7 @@ out:
  */
 static int alua_set_params(struct scsi_device *sdev, const char *params)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        unsigned int optimize = 0, argc;
        const char *p = params;
        int result = SCSI_DH_OK;
@@ -746,7 +740,7 @@ MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than
 static int alua_activate(struct scsi_device *sdev,
                        activate_complete fn, void *data)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        int err = SCSI_DH_OK;
        int stpg = 0;
 
@@ -804,7 +798,7 @@ out:
  */
 static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->state == TPGS_STATE_TRANSITIONING)
@@ -819,23 +813,18 @@ static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 
 }
 
-static bool alua_match(struct scsi_device *sdev)
-{
-       return (scsi_device_tpgs(sdev) != 0);
-}
-
 /*
  * alua_bus_attach - Attach device handler
  * @sdev: device to be attached to
  */
-static struct scsi_dh_data *alua_bus_attach(struct scsi_device *sdev)
+static int alua_bus_attach(struct scsi_device *sdev)
 {
        struct alua_dh_data *h;
        int err;
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->tpgs = TPGS_MODE_UNINITIALIZED;
        h->state = TPGS_STATE_OPTIMIZED;
        h->group_id = -1;
@@ -848,11 +837,11 @@ static struct scsi_dh_data *alua_bus_attach(struct scsi_device *sdev)
        if (err != SCSI_DH_OK && err != SCSI_DH_DEV_OFFLINED)
                goto failed;
 
-       sdev_printk(KERN_NOTICE, sdev, "%s: Attached\n", ALUA_DH_NAME);
-       return &h->dh_data;
+       sdev->handler_data = h;
+       return 0;
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 /*
@@ -861,10 +850,11 @@ failed:
  */
 static void alua_bus_detach(struct scsi_device *sdev)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
 
        if (h->buff && h->inq != h->buff)
                kfree(h->buff);
+       sdev->handler_data = NULL;
        kfree(h);
 }
 
@@ -877,7 +867,6 @@ static struct scsi_device_handler alua_dh = {
        .check_sense = alua_check_sense,
        .activate = alua_activate,
        .set_params = alua_set_params,
-       .match = alua_match,
 };
 
 static int __init alua_init(void)
index 6ed1caa..e6fb97c 100644 (file)
@@ -72,7 +72,6 @@ static const char * lun_state[] =
 };
 
 struct clariion_dh_data {
-       struct scsi_dh_data dh_data;
        /*
         * Flags:
         *  CLARIION_SHORT_TRESPASS
@@ -114,13 +113,6 @@ struct clariion_dh_data {
        int current_sp;
 };
 
-static inline struct clariion_dh_data
-                       *get_clariion_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct clariion_dh_data,
-                       dh_data);
-}
-
 /*
  * Parse MODE_SELECT cmd reply.
  */
@@ -450,7 +442,7 @@ static int clariion_check_sense(struct scsi_device *sdev,
 
 static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct clariion_dh_data *h = get_clariion_data(sdev);
+       struct clariion_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->lun_state != CLARIION_LUN_OWNED) {
@@ -533,7 +525,7 @@ retry:
 static int clariion_activate(struct scsi_device *sdev,
                                activate_complete fn, void *data)
 {
-       struct clariion_dh_data *csdev = get_clariion_data(sdev);
+       struct clariion_dh_data *csdev = sdev->handler_data;
        int result;
 
        result = clariion_send_inquiry(sdev, csdev);
@@ -574,7 +566,7 @@ done:
  */
 static int clariion_set_params(struct scsi_device *sdev, const char *params)
 {
-       struct clariion_dh_data *csdev = get_clariion_data(sdev);
+       struct clariion_dh_data *csdev = sdev->handler_data;
        unsigned int hr = 0, st = 0, argc;
        const char *p = params;
        int result = SCSI_DH_OK;
@@ -622,42 +614,14 @@ done:
        return result;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} clariion_dev_list[] = {
-       {"DGC", "RAID"},
-       {"DGC", "DISK"},
-       {"DGC", "VRAID"},
-       {NULL, NULL},
-};
-
-static bool clariion_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; clariion_dev_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, clariion_dev_list[i].vendor,
-                       strlen(clariion_dev_list[i].vendor)) &&
-                   !strncmp(sdev->model, clariion_dev_list[i].model,
-                       strlen(clariion_dev_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *clariion_bus_attach(struct scsi_device *sdev)
+static int clariion_bus_attach(struct scsi_device *sdev)
 {
        struct clariion_dh_data *h;
        int err;
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->lun_state = CLARIION_LUN_UNINITIALIZED;
        h->default_sp = CLARIION_UNBOUND_LU;
        h->current_sp = CLARIION_UNBOUND_LU;
@@ -675,18 +639,19 @@ static struct scsi_dh_data *clariion_bus_attach(struct scsi_device *sdev)
                    CLARIION_NAME, h->current_sp + 'A',
                    h->port, lun_state[h->lun_state],
                    h->default_sp + 'A');
-       return &h->dh_data;
+
+       sdev->handler_data = h;
+       return 0;
 
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void clariion_bus_detach(struct scsi_device *sdev)
 {
-       struct clariion_dh_data *h = get_clariion_data(sdev);
-
-       kfree(h);
+       kfree(sdev->handler_data);
+       sdev->handler_data = NULL;
 }
 
 static struct scsi_device_handler clariion_dh = {
@@ -698,7 +663,6 @@ static struct scsi_device_handler clariion_dh = {
        .activate       = clariion_activate,
        .prep_fn        = clariion_prep_fn,
        .set_params     = clariion_set_params,
-       .match          = clariion_match,
 };
 
 static int __init clariion_init(void)
index 485d995..9406d5f 100644 (file)
@@ -38,7 +38,6 @@
 #define HP_SW_PATH_PASSIVE             1
 
 struct hp_sw_dh_data {
-       struct scsi_dh_data dh_data;
        unsigned char sense[SCSI_SENSE_BUFFERSIZE];
        int path_state;
        int retries;
@@ -50,11 +49,6 @@ struct hp_sw_dh_data {
 
 static int hp_sw_start_stop(struct hp_sw_dh_data *);
 
-static inline struct hp_sw_dh_data *get_hp_sw_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct hp_sw_dh_data, dh_data);
-}
-
 /*
  * tur_done - Handle TEST UNIT READY return status
  * @sdev: sdev the command has been sent to
@@ -267,7 +261,7 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *h)
 
 static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
+       struct hp_sw_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->path_state != HP_SW_PATH_ACTIVE) {
@@ -292,7 +286,7 @@ static int hp_sw_activate(struct scsi_device *sdev,
                                activate_complete fn, void *data)
 {
        int ret = SCSI_DH_OK;
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
+       struct hp_sw_dh_data *h = sdev->handler_data;
 
        ret = hp_sw_tur(sdev, h);
 
@@ -311,43 +305,14 @@ static int hp_sw_activate(struct scsi_device *sdev,
        return 0;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} hp_sw_dh_data_list[] = {
-       {"COMPAQ", "MSA1000 VOLUME"},
-       {"COMPAQ", "HSV110"},
-       {"HP", "HSV100"},
-       {"DEC", "HSG80"},
-       {NULL, NULL},
-};
-
-static bool hp_sw_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; hp_sw_dh_data_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, hp_sw_dh_data_list[i].vendor,
-                       strlen(hp_sw_dh_data_list[i].vendor)) &&
-                   !strncmp(sdev->model, hp_sw_dh_data_list[i].model,
-                       strlen(hp_sw_dh_data_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *hp_sw_bus_attach(struct scsi_device *sdev)
+static int hp_sw_bus_attach(struct scsi_device *sdev)
 {
        struct hp_sw_dh_data *h;
        int ret;
 
        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->path_state = HP_SW_PATH_UNINITIALIZED;
        h->retries = HP_SW_RETRIES;
        h->sdev = sdev;
@@ -359,17 +324,18 @@ static struct scsi_dh_data *hp_sw_bus_attach(struct scsi_device *sdev)
        sdev_printk(KERN_INFO, sdev, "%s: attached to %s path\n",
                    HP_SW_NAME, h->path_state == HP_SW_PATH_ACTIVE?
                    "active":"passive");
-       return &h->dh_data;
+
+       sdev->handler_data = h;
+       return 0;
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void hp_sw_bus_detach( struct scsi_device *sdev )
 {
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
-
-       kfree(h);
+       kfree(sdev->handler_data);
+       sdev->handler_data = NULL;
 }
 
 static struct scsi_device_handler hp_sw_dh = {
@@ -379,7 +345,6 @@ static struct scsi_device_handler hp_sw_dh = {
        .detach         = hp_sw_bus_detach,
        .activate       = hp_sw_activate,
        .prep_fn        = hp_sw_prep_fn,
-       .match          = hp_sw_match,
 };
 
 static int __init hp_sw_init(void)
index b46ace3..3613581 100644 (file)
@@ -181,7 +181,6 @@ struct c2_inquiry {
 };
 
 struct rdac_dh_data {
-       struct scsi_dh_data     dh_data;
        struct rdac_controller  *ctlr;
 #define UNINITIALIZED_LUN      (1 << 8)
        unsigned                lun;
@@ -260,11 +259,6 @@ do { \
                sdev_printk(KERN_INFO, sdev, RDAC_NAME ": " f "\n", ## arg); \
 } while (0);
 
-static inline struct rdac_dh_data *get_rdac_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct rdac_dh_data, dh_data);
-}
-
 static struct request *get_rdac_req(struct scsi_device *sdev,
                        void *buffer, unsigned buflen, int rw)
 {
@@ -544,7 +538,7 @@ static int mode_select_handle_sense(struct scsi_device *sdev,
 {
        struct scsi_sense_hdr sense_hdr;
        int err = SCSI_DH_IO, ret;
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        ret = scsi_normalize_sense(sensebuf, SCSI_SENSE_BUFFERSIZE, &sense_hdr);
        if (!ret)
@@ -589,7 +583,7 @@ static void send_mode_select(struct work_struct *work)
                container_of(work, struct rdac_controller, ms_work);
        struct request *rq;
        struct scsi_device *sdev = ctlr->ms_sdev;
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        struct request_queue *q = sdev->request_queue;
        int err, retry_cnt = RDAC_RETRY_COUNT;
        struct rdac_queue_data *tmp, *qdata;
@@ -648,7 +642,7 @@ static int queue_mode_select(struct scsi_device *sdev,
        if (!qdata)
                return SCSI_DH_RETRY;
 
-       qdata->h = get_rdac_data(sdev);
+       qdata->h = sdev->handler_data;
        qdata->callback_fn = fn;
        qdata->callback_data = data;
 
@@ -667,7 +661,7 @@ static int queue_mode_select(struct scsi_device *sdev,
 static int rdac_activate(struct scsi_device *sdev,
                        activate_complete fn, void *data)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        int err = SCSI_DH_OK;
        int act = 0;
 
@@ -702,7 +696,7 @@ done:
 
 static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->state != RDAC_STATE_ACTIVE) {
@@ -716,7 +710,7 @@ static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 static int rdac_check_sense(struct scsi_device *sdev,
                                struct scsi_sense_hdr *sense_hdr)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        RDAC_LOG(RDAC_LOG_SENSE, sdev, "array %s, ctlr %d, "
                        "I/O returned with sense %02x/%02x/%02x",
@@ -778,56 +772,7 @@ static int rdac_check_sense(struct scsi_device *sdev,
        return SCSI_RETURN_NOT_HANDLED;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} rdac_dev_list[] = {
-       {"IBM", "1722"},
-       {"IBM", "1724"},
-       {"IBM", "1726"},
-       {"IBM", "1742"},
-       {"IBM", "1745"},
-       {"IBM", "1746"},
-       {"IBM", "1813"},
-       {"IBM", "1814"},
-       {"IBM", "1815"},
-       {"IBM", "1818"},
-       {"IBM", "3526"},
-       {"SGI", "TP9"},
-       {"SGI", "IS"},
-       {"STK", "OPENstorage D280"},
-       {"STK", "FLEXLINE 380"},
-       {"SUN", "CSM"},
-       {"SUN", "LCSM100"},
-       {"SUN", "STK6580_6780"},
-       {"SUN", "SUN_6180"},
-       {"SUN", "ArrayStorage"},
-       {"DELL", "MD3"},
-       {"NETAPP", "INF-01-00"},
-       {"LSI", "INF-01-00"},
-       {"ENGENIO", "INF-01-00"},
-       {NULL, NULL},
-};
-
-static bool rdac_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; rdac_dev_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, rdac_dev_list[i].vendor,
-                       strlen(rdac_dev_list[i].vendor)) &&
-                   !strncmp(sdev->model, rdac_dev_list[i].model,
-                       strlen(rdac_dev_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
+static int rdac_bus_attach(struct scsi_device *sdev)
 {
        struct rdac_dh_data *h;
        int err;
@@ -836,7 +781,7 @@ static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->lun = UNINITIALIZED_LUN;
        h->state = RDAC_STATE_ACTIVE;
 
@@ -861,7 +806,8 @@ static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
                    RDAC_NAME, h->lun, mode[(int)h->mode],
                    lun_state[(int)h->lun_state]);
 
-       return &h->dh_data;
+       sdev->handler_data = h;
+       return 0;
 
 clean_ctlr:
        spin_lock(&list_lock);
@@ -870,12 +816,12 @@ clean_ctlr:
 
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void rdac_bus_detach( struct scsi_device *sdev )
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        if (h->ctlr && h->ctlr->ms_queued)
                flush_workqueue(kmpath_rdacd);
@@ -884,6 +830,7 @@ static void rdac_bus_detach( struct scsi_device *sdev )
        if (h->ctlr)
                kref_put(&h->ctlr->kref, release_controller);
        spin_unlock(&list_lock);
+       sdev->handler_data = NULL;
        kfree(h);
 }
 
@@ -895,7 +842,6 @@ static struct scsi_device_handler rdac_dh = {
        .attach = rdac_bus_attach,
        .detach = rdac_bus_detach,
        .activate = rdac_activate,
-       .match = rdac_match,
 };
 
 static int __init rdac_init(void)
index ec193a8..d3eb80c 100644 (file)
@@ -364,7 +364,7 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
         * on the ethertype for the given device
         */
        fcoe->fcoe_packet_type.func = fcoe_rcv;
-       fcoe->fcoe_packet_type.type = __constant_htons(ETH_P_FCOE);
+       fcoe->fcoe_packet_type.type = htons(ETH_P_FCOE);
        fcoe->fcoe_packet_type.dev = netdev;
        dev_add_pack(&fcoe->fcoe_packet_type);
 
index 3411919..b62836d 100644 (file)
@@ -4555,7 +4555,7 @@ static ssize_t ipr_store_raw_mode(struct device *dev,
        spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags);
        res = (struct ipr_resource_entry *)sdev->hostdata;
        if (res) {
-               if (ioa_cfg->sis64 && ipr_is_af_dasd_device(res)) {
+               if (ipr_is_af_dasd_device(res)) {
                        res->raw_mode = simple_strtoul(buf, NULL, 10);
                        len = strlen(buf);
                        if (res->sdev)
@@ -6383,9 +6383,13 @@ static int ipr_queuecommand(struct Scsi_Host *shost,
            (!ipr_is_gscsi(res) || scsi_cmd->cmnd[0] == IPR_QUERY_RSRC_STATE)) {
                ioarcb->cmd_pkt.request_type = IPR_RQTYPE_IOACMD;
        }
-       if (res->raw_mode && ipr_is_af_dasd_device(res))
+       if (res->raw_mode && ipr_is_af_dasd_device(res)) {
                ioarcb->cmd_pkt.request_type = IPR_RQTYPE_PIPE;
 
+               if (scsi_cmd->underflow == 0)
+                       ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
+       }
+
        if (ioa_cfg->sis64)
                rc = ipr_build_ioadl64(ioa_cfg, ipr_cmd);
        else
index 98d9bb6..33c74d3 100644 (file)
@@ -853,12 +853,9 @@ static void iscsi_scsi_cmd_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
                                     SAM_STAT_CHECK_CONDITION;
                        scsi_build_sense_buffer(1, sc->sense_buffer,
                                                ILLEGAL_REQUEST, 0x10, ascq);
-                       sc->sense_buffer[7] = 0xc; /* Additional sense length */
-                       sc->sense_buffer[8] = 0;   /* Information desc type */
-                       sc->sense_buffer[9] = 0xa; /* Additional desc length */
-                       sc->sense_buffer[10] = 0x80; /* Validity bit */
-
-                       put_unaligned_be64(sector, &sc->sense_buffer[12]);
+                       scsi_set_sense_information(sc->sense_buffer,
+                                                  SCSI_SENSE_BUFFERSIZE,
+                                                  sector);
                        goto out;
                }
        }
index eb62772..4abb93a 100644 (file)
@@ -2284,7 +2284,7 @@ lpfc_mbx_cmpl_rdp_page_a2(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox)
                        (struct lpfc_rdp_context *)(mbox->context2);
 
        if (bf_get(lpfc_mqe_status, &mbox->u.mqe))
-               goto error;
+               goto error_mbuf_free;
 
        lpfc_sli_bemem_bcopy(mp->virt, &rdp_context->page_a2,
                                DMP_SFF_PAGE_A2_SIZE);
@@ -2299,13 +2299,14 @@ lpfc_mbx_cmpl_rdp_page_a2(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox)
        mbox->mbox_cmpl = lpfc_mbx_cmpl_rdp_link_stat;
        mbox->context2 = (struct lpfc_rdp_context *) rdp_context;
        if (lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT) == MBX_NOT_FINISHED)
-               goto error;
+               goto error_cmd_free;
 
        return;
 
-error:
+error_mbuf_free:
        lpfc_mbuf_free(phba, mp->virt, mp->phys);
        kfree(mp);
+error_cmd_free:
        lpfc_sli4_mbox_cmd_free(phba, mbox);
        rdp_context->cmpl(phba, rdp_context, FAILURE);
 }
index 6dec7cf..c167911 100644 (file)
@@ -112,9 +112,12 @@ _scsih_set_fwfault_debug(const char *val, struct kernel_param *kp)
        if (ret)
                return ret;
 
+       /* global ioc spinlock to protect controller list on list operations */
        printk(KERN_INFO "setting fwfault_debug(%d)\n", mpt2sas_fwfault_debug);
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list)
                ioc->fwfault_debug = mpt2sas_fwfault_debug;
+       spin_unlock(&gioc_lock);
        return 0;
 }
 
@@ -4437,6 +4440,8 @@ mpt2sas_base_free_resources(struct MPT2SAS_ADAPTER *ioc)
        dexitprintk(ioc, printk(MPT2SAS_INFO_FMT "%s\n", ioc->name,
            __func__));
 
+       /* synchronizing freeing resource with pci_access_mutex lock */
+       mutex_lock(&ioc->pci_access_mutex);
        if (ioc->chip_phys && ioc->chip) {
                _base_mask_interrupts(ioc);
                ioc->shost_recovery = 1;
@@ -4456,6 +4461,7 @@ mpt2sas_base_free_resources(struct MPT2SAS_ADAPTER *ioc)
                pci_disable_pcie_error_reporting(pdev);
                pci_disable_device(pdev);
        }
+       mutex_unlock(&ioc->pci_access_mutex);
        return;
 }
 
index caff8d1..97ea360 100644 (file)
  * @flags: MPT_TARGET_FLAGS_XXX flags
  * @deleted: target flaged for deletion
  * @tm_busy: target is busy with TM request.
+ * @sdev: The sas_device associated with this target
  */
 struct MPT2SAS_TARGET {
        struct scsi_target *starget;
@@ -248,6 +249,7 @@ struct MPT2SAS_TARGET {
        u32     flags;
        u8      deleted;
        u8      tm_busy;
+       struct _sas_device *sdev;
 };
 
 
@@ -376,8 +378,24 @@ struct _sas_device {
        u8      phy;
        u8      responding;
        u8      pfa_led_on;
+       struct kref refcount;
 };
 
+static inline void sas_device_get(struct _sas_device *s)
+{
+       kref_get(&s->refcount);
+}
+
+static inline void sas_device_free(struct kref *r)
+{
+       kfree(container_of(r, struct _sas_device, refcount));
+}
+
+static inline void sas_device_put(struct _sas_device *s)
+{
+       kref_put(&s->refcount, sas_device_free);
+}
+
 /**
  * struct _raid_device - raid volume link list
  * @list: sas device list
@@ -799,6 +817,12 @@ typedef void (*MPT2SAS_FLUSH_RUNNING_CMDS)(struct MPT2SAS_ADAPTER *ioc);
  * @delayed_tr_list: target reset link list
  * @delayed_tr_volume_list: volume target reset link list
  * @@temp_sensors_count: flag to carry the number of temperature sensors
+ * @pci_access_mutex: Mutex to synchronize ioctl,sysfs show path and
+ * pci resource handling. PCI resource freeing will lead to free
+ * vital hardware/memory resource, which might be in use by cli/sysfs
+ * path functions resulting in Null pointer reference followed by kernel
+ * crash. To avoid the above race condition we use mutex syncrhonization
+ * which ensures the syncrhonization between cli/sysfs_show path
  */
 struct MPT2SAS_ADAPTER {
        struct list_head list;
@@ -1015,6 +1039,7 @@ struct MPT2SAS_ADAPTER {
        u8              mfg_pg10_hide_flag;
        u8              hide_drives;
 
+       struct mutex pci_access_mutex;
 };
 
 typedef u8 (*MPT_CALLBACK)(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
@@ -1023,6 +1048,17 @@ typedef u8 (*MPT_CALLBACK)(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 
 /* base shared API */
 extern struct list_head mpt2sas_ioc_list;
+/* spinlock on list operations over IOCs
+ * Case: when multiple warpdrive cards(IOCs) are in use
+ * Each IOC will added to the ioc list stucture on initialization.
+ * Watchdog threads run at regular intervals to check IOC for any
+ * fault conditions which will trigger the dead_ioc thread to
+ * deallocate pci resource, resulting deleting the IOC netry from list,
+ * this deletion need to protected by spinlock to enusre that
+ * ioc removal is syncrhonized, if not synchronized it might lead to
+ * list_del corruption as the ioc list is traversed in cli path
+ */
+extern spinlock_t gioc_lock;
 void mpt2sas_base_start_watchdog(struct MPT2SAS_ADAPTER *ioc);
 void mpt2sas_base_stop_watchdog(struct MPT2SAS_ADAPTER *ioc);
 
@@ -1095,11 +1131,12 @@ struct _sas_node *mpt2sas_scsih_expander_find_by_handle(struct MPT2SAS_ADAPTER *
     u16 handle);
 struct _sas_node *mpt2sas_scsih_expander_find_by_sas_address(struct MPT2SAS_ADAPTER
     *ioc, u64 sas_address);
-struct _sas_device *mpt2sas_scsih_sas_device_find_by_sas_address(
+struct _sas_device *mpt2sas_get_sdev_by_addr(
+    struct MPT2SAS_ADAPTER *ioc, u64 sas_address);
+struct _sas_device *__mpt2sas_get_sdev_by_addr(
     struct MPT2SAS_ADAPTER *ioc, u64 sas_address);
 
 void mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc);
-
 void mpt2sas_scsih_reset_handler(struct MPT2SAS_ADAPTER *ioc, int reset_phase);
 
 /* config shared API */
index 4e50960..3694b63 100644 (file)
@@ -427,13 +427,16 @@ static int
 _ctl_verify_adapter(int ioc_number, struct MPT2SAS_ADAPTER **iocpp)
 {
        struct MPT2SAS_ADAPTER *ioc;
-
+       /* global ioc lock to protect controller on list operations */
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list) {
                if (ioc->id != ioc_number)
                        continue;
+               spin_unlock(&gioc_lock);
                *iocpp = ioc;
                return ioc_number;
        }
+       spin_unlock(&gioc_lock);
        *iocpp = NULL;
        return -1;
 }
@@ -522,10 +525,15 @@ _ctl_poll(struct file *filep, poll_table *wait)
 
        poll_wait(filep, &ctl_poll_wait, wait);
 
+       /* global ioc lock to protect controller on list operations */
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list) {
-               if (ioc->aen_event_read_flag)
+               if (ioc->aen_event_read_flag) {
+                       spin_unlock(&gioc_lock);
                        return POLLIN | POLLRDNORM;
+               }
        }
+       spin_unlock(&gioc_lock);
        return 0;
 }
 
@@ -2168,16 +2176,23 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
 
        if (_ctl_verify_adapter(ioctl_header.ioc_number, &ioc) == -1 || !ioc)
                return -ENODEV;
+       /* pci_access_mutex lock acquired by ioctl path */
+       mutex_lock(&ioc->pci_access_mutex);
        if (ioc->shost_recovery || ioc->pci_error_recovery ||
-           ioc->is_driver_loading)
-               return -EAGAIN;
+               ioc->is_driver_loading || ioc->remove_host) {
+               ret = -EAGAIN;
+               goto out_unlock_pciaccess;
+       }
 
        state = (file->f_flags & O_NONBLOCK) ? NON_BLOCKING : BLOCKING;
        if (state == NON_BLOCKING) {
-               if (!mutex_trylock(&ioc->ctl_cmds.mutex))
-                       return -EAGAIN;
+               if (!mutex_trylock(&ioc->ctl_cmds.mutex)) {
+                       ret = -EAGAIN;
+                       goto out_unlock_pciaccess;
+               }
        } else if (mutex_lock_interruptible(&ioc->ctl_cmds.mutex)) {
-               return -ERESTARTSYS;
+               ret = -ERESTARTSYS;
+               goto out_unlock_pciaccess;
        }
 
        switch (cmd) {
@@ -2258,6 +2273,8 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
        }
 
        mutex_unlock(&ioc->ctl_cmds.mutex);
+out_unlock_pciaccess:
+       mutex_unlock(&ioc->pci_access_mutex);
        return ret;
 }
 
@@ -2711,6 +2728,12 @@ _ctl_BRM_status_show(struct device *cdev, struct device_attribute *attr,
                    "warpdrive\n", ioc->name, __func__);
                goto out;
        }
+       /* pci_access_mutex lock acquired by sysfs show path */
+       mutex_lock(&ioc->pci_access_mutex);
+       if (ioc->pci_error_recovery || ioc->remove_host) {
+               mutex_unlock(&ioc->pci_access_mutex);
+               return 0;
+       }
 
        /* allocate upto GPIOVal 36 entries */
        sz = offsetof(Mpi2IOUnitPage3_t, GPIOVal) + (sizeof(u16) * 36);
@@ -2749,6 +2772,7 @@ _ctl_BRM_status_show(struct device *cdev, struct device_attribute *attr,
 
  out:
        kfree(io_unit_pg3);
+       mutex_unlock(&ioc->pci_access_mutex);
        return rc;
 }
 static DEVICE_ATTR(BRM_status, S_IRUGO, _ctl_BRM_status_show, NULL);
index 3f26147..0ad09b2 100644 (file)
@@ -79,7 +79,8 @@ static int _scsih_scan_finished(struct Scsi_Host *shost, unsigned long time);
 
 /* global parameters */
 LIST_HEAD(mpt2sas_ioc_list);
-
+/* global ioc lock for list operations */
+DEFINE_SPINLOCK(gioc_lock);
 /* local parameters */
 static u8 scsi_io_cb_idx = -1;
 static u8 tm_cb_idx = -1;
@@ -176,9 +177,37 @@ struct fw_event_work {
        u8                      VP_ID;
        u8                      ignore;
        u16                     event;
+       struct kref             refcount;
        char                    event_data[0] __aligned(4);
 };
 
+static void fw_event_work_free(struct kref *r)
+{
+       kfree(container_of(r, struct fw_event_work, refcount));
+}
+
+static void fw_event_work_get(struct fw_event_work *fw_work)
+{
+       kref_get(&fw_work->refcount);
+}
+
+static void fw_event_work_put(struct fw_event_work *fw_work)
+{
+       kref_put(&fw_work->refcount, fw_event_work_free);
+}
+
+static struct fw_event_work *alloc_fw_event_work(int len)
+{
+       struct fw_event_work *fw_event;
+
+       fw_event = kzalloc(sizeof(*fw_event) + len, GFP_ATOMIC);
+       if (!fw_event)
+               return NULL;
+
+       kref_init(&fw_event->refcount);
+       return fw_event;
+}
+
 /* raid transport support */
 static struct raid_template *mpt2sas_raid_template;
 
@@ -293,8 +322,10 @@ _scsih_set_debug_level(const char *val, struct kernel_param *kp)
                return ret;
 
        printk(KERN_INFO "setting logging_level(0x%08x)\n", logging_level);
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list)
                ioc->logging_level = logging_level;
+       spin_unlock(&gioc_lock);
        return 0;
 }
 module_param_call(logging_level, _scsih_set_debug_level, param_get_int,
@@ -526,8 +557,61 @@ _scsih_determine_boot_device(struct MPT2SAS_ADAPTER *ioc,
        }
 }
 
+static struct _sas_device *
+__mpt2sas_get_sdev_from_target(struct MPT2SAS_ADAPTER *ioc,
+               struct MPT2SAS_TARGET *tgt_priv)
+{
+       struct _sas_device *ret;
+
+       assert_spin_locked(&ioc->sas_device_lock);
+
+       ret = tgt_priv->sdev;
+       if (ret)
+               sas_device_get(ret);
+
+       return ret;
+}
+
+static struct _sas_device *
+mpt2sas_get_sdev_from_target(struct MPT2SAS_ADAPTER *ioc,
+               struct MPT2SAS_TARGET *tgt_priv)
+{
+       struct _sas_device *ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       ret = __mpt2sas_get_sdev_from_target(ioc, tgt_priv);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return ret;
+}
+
+
+struct _sas_device *
+__mpt2sas_get_sdev_by_addr(struct MPT2SAS_ADAPTER *ioc,
+    u64 sas_address)
+{
+       struct _sas_device *sas_device;
+
+       assert_spin_locked(&ioc->sas_device_lock);
+
+       list_for_each_entry(sas_device, &ioc->sas_device_list, list)
+               if (sas_device->sas_address == sas_address)
+                       goto found_device;
+
+       list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
+               if (sas_device->sas_address == sas_address)
+                       goto found_device;
+
+       return NULL;
+
+found_device:
+       sas_device_get(sas_device);
+       return sas_device;
+}
+
 /**
- * mpt2sas_scsih_sas_device_find_by_sas_address - sas device search
+ * mpt2sas_get_sdev_by_addr - sas device search
  * @ioc: per adapter object
  * @sas_address: sas address
  * Context: Calling function should acquire ioc->sas_device_lock
@@ -536,24 +620,44 @@ _scsih_determine_boot_device(struct MPT2SAS_ADAPTER *ioc,
  * object.
  */
 struct _sas_device *
-mpt2sas_scsih_sas_device_find_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
+mpt2sas_get_sdev_by_addr(struct MPT2SAS_ADAPTER *ioc,
     u64 sas_address)
 {
        struct _sas_device *sas_device;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
+                       sas_address);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return sas_device;
+}
+
+static struct _sas_device *
+__mpt2sas_get_sdev_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
+{
+       struct _sas_device *sas_device;
+
+       assert_spin_locked(&ioc->sas_device_lock);
 
        list_for_each_entry(sas_device, &ioc->sas_device_list, list)
-               if (sas_device->sas_address == sas_address)
-                       return sas_device;
+               if (sas_device->handle == handle)
+                       goto found_device;
 
        list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
-               if (sas_device->sas_address == sas_address)
-                       return sas_device;
+               if (sas_device->handle == handle)
+                       goto found_device;
 
        return NULL;
+
+found_device:
+       sas_device_get(sas_device);
+       return sas_device;
 }
 
 /**
- * _scsih_sas_device_find_by_handle - sas device search
+ * mpt2sas_get_sdev_by_handle - sas device search
  * @ioc: per adapter object
  * @handle: sas device handle (assigned by firmware)
  * Context: Calling function should acquire ioc->sas_device_lock
@@ -562,19 +666,16 @@ mpt2sas_scsih_sas_device_find_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
  * object.
  */
 static struct _sas_device *
-_scsih_sas_device_find_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
+mpt2sas_get_sdev_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        struct _sas_device *sas_device;
+       unsigned long flags;
 
-       list_for_each_entry(sas_device, &ioc->sas_device_list, list)
-               if (sas_device->handle == handle)
-                       return sas_device;
-
-       list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
-               if (sas_device->handle == handle)
-                       return sas_device;
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
-       return NULL;
+       return sas_device;
 }
 
 /**
@@ -583,7 +684,7 @@ _scsih_sas_device_find_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
  * @sas_device: the sas_device object
  * Context: This function will acquire ioc->sas_device_lock.
  *
- * Removing object and freeing associated memory from the ioc->sas_device_list.
+ * If sas_device is on the list, remove it and decrement its reference count.
  */
 static void
 _scsih_sas_device_remove(struct MPT2SAS_ADAPTER *ioc,
@@ -594,9 +695,15 @@ _scsih_sas_device_remove(struct MPT2SAS_ADAPTER *ioc,
        if (!sas_device)
                return;
 
+       /*
+        * The lock serializes access to the list, but we still need to verify
+        * that nobody removed the entry while we were waiting on the lock.
+        */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       list_del(&sas_device->list);
-       kfree(sas_device);
+       if (!list_empty(&sas_device->list)) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 }
 
@@ -620,6 +727,7 @@ _scsih_sas_device_add(struct MPT2SAS_ADAPTER *ioc,
            sas_device->handle, (unsigned long long)sas_device->sas_address));
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device_get(sas_device);
        list_add_tail(&sas_device->list, &ioc->sas_device_list);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -659,6 +767,7 @@ _scsih_sas_device_init_add(struct MPT2SAS_ADAPTER *ioc,
            sas_device->handle, (unsigned long long)sas_device->sas_address));
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device_get(sas_device);
        list_add_tail(&sas_device->list, &ioc->sas_device_init_list);
        _scsih_determine_boot_device(ioc, sas_device, 0);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -1208,12 +1317,15 @@ _scsih_change_queue_depth(struct scsi_device *sdev, int qdepth)
                goto not_sata;
        if ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME))
                goto not_sata;
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-          sas_device_priv_data->sas_target->sas_address);
-       if (sas_device && sas_device->device_info &
-           MPI2_SAS_DEVICE_INFO_SATA_DEVICE)
-               max_depth = MPT2SAS_SATA_QUEUE_DEPTH;
+       sas_device = __mpt2sas_get_sdev_from_target(ioc, sas_target_priv_data);
+       if (sas_device) {
+               if (sas_device->device_info & MPI2_SAS_DEVICE_INFO_SATA_DEVICE)
+                       max_depth = MPT2SAS_SATA_QUEUE_DEPTH;
+
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
  not_sata:
@@ -1271,18 +1383,20 @@ _scsih_target_alloc(struct scsi_target *starget)
        /* sas/sata devices */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        rphy = dev_to_rphy(starget->dev.parent);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
           rphy->identify.sas_address);
 
        if (sas_device) {
                sas_target_priv_data->handle = sas_device->handle;
                sas_target_priv_data->sas_address = sas_device->sas_address;
+               sas_target_priv_data->sdev = sas_device;
                sas_device->starget = starget;
                sas_device->id = starget->id;
                sas_device->channel = starget->channel;
                if (test_bit(sas_device->handle, ioc->pd_handles))
                        sas_target_priv_data->flags |=
                            MPT_TARGET_FLAGS_RAID_COMPONENT;
+
        }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -1324,13 +1438,21 @@ _scsih_target_destroy(struct scsi_target *starget)
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        rphy = dev_to_rphy(starget->dev.parent);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-          rphy->identify.sas_address);
+       sas_device = __mpt2sas_get_sdev_from_target(ioc, sas_target_priv_data);
        if (sas_device && (sas_device->starget == starget) &&
            (sas_device->id == starget->id) &&
            (sas_device->channel == starget->channel))
                sas_device->starget = NULL;
 
+       if (sas_device) {
+               /*
+                * Corresponding get() is in _scsih_target_alloc()
+                */
+               sas_target_priv_data->sdev = NULL;
+               sas_device_put(sas_device);
+
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
  out:
@@ -1386,7 +1508,7 @@ _scsih_slave_alloc(struct scsi_device *sdev)
 
        if (!(sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)) {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+               sas_device = __mpt2sas_get_sdev_by_addr(ioc,
                                sas_target_priv_data->sas_address);
                if (sas_device && (sas_device->starget == NULL)) {
                        sdev_printk(KERN_INFO, sdev,
@@ -1394,6 +1516,10 @@ _scsih_slave_alloc(struct scsi_device *sdev)
                             __func__, __LINE__);
                        sas_device->starget = starget;
                }
+
+               if (sas_device)
+                       sas_device_put(sas_device);
+
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
@@ -1428,10 +1554,13 @@ _scsih_slave_destroy(struct scsi_device *sdev)
 
        if (!(sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)) {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                  sas_target_priv_data->sas_address);
+               sas_device = __mpt2sas_get_sdev_from_target(ioc,
+                               sas_target_priv_data);
                if (sas_device && !sas_target_priv_data->num_luns)
                        sas_device->starget = NULL;
+
+               if (sas_device)
+                       sas_device_put(sas_device);
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
@@ -2078,7 +2207,7 @@ _scsih_slave_configure(struct scsi_device *sdev)
        }
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
           sas_device_priv_data->sas_target->sas_address);
        if (!sas_device) {
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -2112,17 +2241,18 @@ _scsih_slave_configure(struct scsi_device *sdev)
            (unsigned long long) sas_device->enclosure_logical_id,
            sas_device->slot);
 
+       sas_device_put(sas_device);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        if (!ssp_target)
                _scsih_display_sata_capabilities(ioc, handle, sdev);
 
-
        _scsih_change_queue_depth(sdev, qdepth);
 
        if (ssp_target) {
                sas_read_port_mode_page(sdev);
                _scsih_enable_tlr(ioc, sdev);
        }
+
        return 0;
 }
 
@@ -2509,8 +2639,7 @@ _scsih_tm_display_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                    device_str, (unsigned long long)priv_target->sas_address);
        } else {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                   priv_target->sas_address);
+               sas_device = __mpt2sas_get_sdev_from_target(ioc, priv_target);
                if (sas_device) {
                        if (priv_target->flags &
                            MPT_TARGET_FLAGS_RAID_COMPONENT) {
@@ -2529,6 +2658,8 @@ _scsih_tm_display_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                            "enclosure_logical_id(0x%016llx), slot(%d)\n",
                           (unsigned long long)sas_device->enclosure_logical_id,
                            sas_device->slot);
+
+                       sas_device_put(sas_device);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -2604,12 +2735,12 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
 {
        struct MPT2SAS_ADAPTER *ioc = shost_priv(scmd->device->host);
        struct MPT2SAS_DEVICE *sas_device_priv_data;
-       struct _sas_device *sas_device;
-       unsigned long flags;
+       struct _sas_device *sas_device = NULL;
        u16     handle;
        int r;
 
        struct scsi_target *starget = scmd->device->sdev_target;
+       struct MPT2SAS_TARGET *target_priv_data = starget->hostdata;
 
        starget_printk(KERN_INFO, starget, "attempting device reset! "
            "scmd(%p)\n", scmd);
@@ -2629,12 +2760,10 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
        handle = 0;
        if (sas_device_priv_data->sas_target->flags &
            MPT_TARGET_FLAGS_RAID_COMPONENT) {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc,
-                  sas_device_priv_data->sas_target->handle);
+               sas_device = mpt2sas_get_sdev_from_target(ioc,
+                               target_priv_data);
                if (sas_device)
                        handle = sas_device->volume_handle;
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        } else
                handle = sas_device_priv_data->sas_target->handle;
 
@@ -2651,6 +2780,10 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
  out:
        sdev_printk(KERN_INFO, scmd->device, "device reset: %s scmd(%p)\n",
            ((r == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
+
+       if (sas_device)
+               sas_device_put(sas_device);
+
        return r;
 }
 
@@ -2665,11 +2798,11 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
 {
        struct MPT2SAS_ADAPTER *ioc = shost_priv(scmd->device->host);
        struct MPT2SAS_DEVICE *sas_device_priv_data;
-       struct _sas_device *sas_device;
-       unsigned long flags;
+       struct _sas_device *sas_device = NULL;
        u16     handle;
        int r;
        struct scsi_target *starget = scmd->device->sdev_target;
+       struct MPT2SAS_TARGET *target_priv_data = starget->hostdata;
 
        starget_printk(KERN_INFO, starget, "attempting target reset! "
            "scmd(%p)\n", scmd);
@@ -2689,12 +2822,10 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
        handle = 0;
        if (sas_device_priv_data->sas_target->flags &
            MPT_TARGET_FLAGS_RAID_COMPONENT) {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc,
-                  sas_device_priv_data->sas_target->handle);
+               sas_device = mpt2sas_get_sdev_from_target(ioc,
+                               target_priv_data);
                if (sas_device)
                        handle = sas_device->volume_handle;
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        } else
                handle = sas_device_priv_data->sas_target->handle;
 
@@ -2711,6 +2842,10 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
  out:
        starget_printk(KERN_INFO, starget, "target reset: %s scmd(%p)\n",
            ((r == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
+
+       if (sas_device)
+               sas_device_put(sas_device);
+
        return r;
 }
 
@@ -2768,36 +2903,39 @@ _scsih_fw_event_add(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
                return;
 
        spin_lock_irqsave(&ioc->fw_event_lock, flags);
+       fw_event_work_get(fw_event);
        list_add_tail(&fw_event->list, &ioc->fw_event_list);
        INIT_DELAYED_WORK(&fw_event->delayed_work, _firmware_event_work);
+       fw_event_work_get(fw_event);
        queue_delayed_work(ioc->firmware_event_thread,
            &fw_event->delayed_work, 0);
        spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 }
 
 /**
- * _scsih_fw_event_free - delete fw_event
+ * _scsih_fw_event_del_from_list - delete fw_event from the list
  * @ioc: per adapter object
  * @fw_event: object describing the event
  * Context: This function will acquire ioc->fw_event_lock.
  *
- * This removes firmware event object from link list, frees associated memory.
+ * If the fw_event is on the fw_event_list, remove it and do a put.
  *
  * Return nothing.
  */
 static void
-_scsih_fw_event_free(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work
+_scsih_fw_event_del_from_list(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work
     *fw_event)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&ioc->fw_event_lock, flags);
-       list_del(&fw_event->list);
-       kfree(fw_event);
+       if (!list_empty(&fw_event->list)) {
+               list_del_init(&fw_event->list);
+               fw_event_work_put(fw_event);
+       }
        spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 }
 
-
 /**
  * _scsih_error_recovery_delete_devices - remove devices not responding
  * @ioc: per adapter object
@@ -2812,13 +2950,14 @@ _scsih_error_recovery_delete_devices(struct MPT2SAS_ADAPTER *ioc)
        if (ioc->is_driver_loading)
                return;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
 
        fw_event->event = MPT2SAS_REMOVE_UNRESPONDING_DEVICES;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -2832,12 +2971,29 @@ mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc)
 {
        struct fw_event_work *fw_event;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
        fw_event->event = MPT2SAS_PORT_ENABLE_COMPLETE;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
+}
+
+static struct fw_event_work *dequeue_next_fw_event(struct MPT2SAS_ADAPTER *ioc)
+{
+       unsigned long flags;
+       struct fw_event_work *fw_event = NULL;
+
+       spin_lock_irqsave(&ioc->fw_event_lock, flags);
+       if (!list_empty(&ioc->fw_event_list)) {
+               fw_event = list_first_entry(&ioc->fw_event_list,
+                               struct fw_event_work, list);
+               list_del_init(&fw_event->list);
+       }
+       spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
+
+       return fw_event;
 }
 
 /**
@@ -2852,17 +3008,25 @@ mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc)
 static void
 _scsih_fw_event_cleanup_queue(struct MPT2SAS_ADAPTER *ioc)
 {
-       struct fw_event_work *fw_event, *next;
+       struct fw_event_work *fw_event;
 
        if (list_empty(&ioc->fw_event_list) ||
             !ioc->firmware_event_thread || in_interrupt())
                return;
 
-       list_for_each_entry_safe(fw_event, next, &ioc->fw_event_list, list) {
-               if (cancel_delayed_work_sync(&fw_event->delayed_work)) {
-                       _scsih_fw_event_free(ioc, fw_event);
-                       continue;
-               }
+       while ((fw_event = dequeue_next_fw_event(ioc))) {
+               /*
+                * Wait on the fw_event to complete. If this returns 1, then
+                * the event was never executed, and we need a put for the
+                * reference the delayed_work had on the fw_event.
+                *
+                * If it did execute, we wait for it to finish, and the put will
+                * happen from _firmware_event_work()
+                */
+               if (cancel_delayed_work_sync(&fw_event->delayed_work))
+                       fw_event_work_put(fw_event);
+
+               fw_event_work_put(fw_event);
        }
 }
 
@@ -3002,15 +3166,15 @@ _scsih_block_io_to_children_attached_to_ex(struct MPT2SAS_ADAPTER *ioc,
 
        list_for_each_entry(mpt2sas_port,
           &sas_expander->sas_port_list, port_list) {
-               if (mpt2sas_port->remote_identify.device_type ==
-                   SAS_END_DEVICE) {
+               if (mpt2sas_port->remote_identify.device_type == SAS_END_DEVICE) {
                        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-                       sas_device =
-                           mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                          mpt2sas_port->remote_identify.sas_address);
-                       if (sas_device)
+                       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
+                                       mpt2sas_port->remote_identify.sas_address);
+                       if (sas_device) {
                                set_bit(sas_device->handle,
-                                   ioc->blocking_handles);
+                                               ioc->blocking_handles);
+                               sas_device_put(sas_device);
+                       }
                        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
                }
        }
@@ -3080,7 +3244,7 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        Mpi2SCSITaskManagementRequest_t *mpi_request;
        u16 smid;
-       struct _sas_device *sas_device;
+       struct _sas_device *sas_device = NULL;
        struct MPT2SAS_TARGET *sas_target_priv_data = NULL;
        u64 sas_address = 0;
        unsigned long flags;
@@ -3110,7 +3274,7 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device && sas_device->starget &&
             sas_device->starget->hostdata) {
                sas_target_priv_data = sas_device->starget->hostdata;
@@ -3131,14 +3295,14 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        if (!smid) {
                delayed_tr = kzalloc(sizeof(*delayed_tr), GFP_ATOMIC);
                if (!delayed_tr)
-                       return;
+                       goto out;
                INIT_LIST_HEAD(&delayed_tr->list);
                delayed_tr->handle = handle;
                list_add_tail(&delayed_tr->list, &ioc->delayed_tr_list);
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT
                    "DELAYED:tr:handle(0x%04x), (open)\n",
                    ioc->name, handle));
-               return;
+               goto out;
        }
 
        dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "tr_send:handle(0x%04x), "
@@ -3150,6 +3314,9 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        mpi_request->DevHandle = cpu_to_le16(handle);
        mpi_request->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET;
        mpt2sas_base_put_smid_hi_priority(ioc, smid);
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
 }
 
 
@@ -4068,7 +4235,6 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
        char *desc_scsi_state = ioc->tmp_string;
        u32 log_info = le32_to_cpu(mpi_reply->IOCLogInfo);
        struct _sas_device *sas_device = NULL;
-       unsigned long flags;
        struct scsi_target *starget = scmd->device->sdev_target;
        struct MPT2SAS_TARGET *priv_target = starget->hostdata;
        char *device_str = NULL;
@@ -4200,9 +4366,7 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                printk(MPT2SAS_WARN_FMT "\t%s wwid(0x%016llx)\n", ioc->name,
                    device_str, (unsigned long long)priv_target->sas_address);
        } else {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                   priv_target->sas_address);
+               sas_device = mpt2sas_get_sdev_from_target(ioc, priv_target);
                if (sas_device) {
                        printk(MPT2SAS_WARN_FMT "\tsas_address(0x%016llx), "
                            "phy(%d)\n", ioc->name, sas_device->sas_address,
@@ -4211,8 +4375,9 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                            "\tenclosure_logical_id(0x%016llx), slot(%d)\n",
                            ioc->name, sas_device->enclosure_logical_id,
                            sas_device->slot);
+
+                       sas_device_put(sas_device);
                }
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
        printk(MPT2SAS_WARN_FMT "\thandle(0x%04x), ioc_status(%s)(0x%04x), "
@@ -4259,7 +4424,7 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        Mpi2SepRequest_t mpi_request;
        struct _sas_device *sas_device;
 
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
        if (!sas_device)
                return;
 
@@ -4274,7 +4439,7 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
            &mpi_request)) != 0) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n", ioc->name,
                __FILE__, __LINE__, __func__);
-               return;
+               goto out;
        }
        sas_device->pfa_led_on = 1;
 
@@ -4284,8 +4449,10 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                 "enclosure_processor: ioc_status (0x%04x), loginfo(0x%08x)\n",
                 ioc->name, le16_to_cpu(mpi_reply.IOCStatus),
                 le32_to_cpu(mpi_reply.IOCLogInfo)));
-               return;
+               goto out;
        }
+out:
+       sas_device_put(sas_device);
 }
 
 /**
@@ -4340,13 +4507,14 @@ _scsih_send_event_to_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        struct fw_event_work *fw_event;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
        fw_event->event = MPT2SAS_TURN_ON_PFA_LED;
        fw_event->device_handle = handle;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -4370,19 +4538,17 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 
        /* only handle non-raid devices */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (!sas_device) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
        starget = sas_device->starget;
        sas_target_priv_data = starget->hostdata;
 
        if ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_RAID_COMPONENT) ||
-          ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME))) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+          ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)))
+               goto out_unlock;
+
        starget_printk(KERN_WARNING, starget, "predicted fault\n");
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -4396,7 +4562,7 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        if (!event_reply) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
-               return;
+               goto out;
        }
 
        event_reply->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
@@ -4413,6 +4579,14 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        event_data->SASAddress = cpu_to_le64(sas_target_priv_data->sas_address);
        mpt2sas_ctl_add_to_event_log(ioc, event_reply);
        kfree(event_reply);
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
+       return;
+
+out_unlock:
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+       goto out;
 }
 
 /**
@@ -5148,14 +5322,13 @@ _scsih_check_device(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        sas_address = le64_to_cpu(sas_device_pg0.SASAddress);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
 
        if (!sas_device) {
                printk(MPT2SAS_ERR_FMT "device is not present "
                    "handle(0x%04x), no sas_device!!!\n", ioc->name, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
 
        if (unlikely(sas_device->handle != handle)) {
@@ -5172,19 +5345,24 @@ _scsih_check_device(struct MPT2SAS_ADAPTER *ioc, u16 handle)
            MPI2_SAS_DEVICE0_FLAGS_DEVICE_PRESENT)) {
                printk(MPT2SAS_ERR_FMT "device is not present "
                    "handle(0x%04x), flags!!!\n", ioc->name, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
 
        /* check if there were any issues with discovery */
        if (_scsih_check_access_status(ioc, sas_address, handle,
-           sas_device_pg0.AccessStatus)) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+           sas_device_pg0.AccessStatus))
+               goto out_unlock;
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        _scsih_ublock_io_device(ioc, sas_address);
+       if (sas_device)
+               sas_device_put(sas_device);
+       return;
 
+out_unlock:
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+       if (sas_device)
+               sas_device_put(sas_device);
 }
 
 /**
@@ -5208,7 +5386,6 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
        u32 ioc_status;
        __le64 sas_address;
        u32 device_info;
-       unsigned long flags;
 
        if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply, &sas_device_pg0,
            MPI2_SAS_DEVICE_PGAD_FORM_HANDLE, handle))) {
@@ -5250,14 +5427,13 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
                return -1;
        }
 
-
-       spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
-       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
-       if (sas_device)
+       if (sas_device) {
+               sas_device_put(sas_device);
                return 0;
+       }
 
        sas_device = kzalloc(sizeof(struct _sas_device),
            GFP_KERNEL);
@@ -5267,6 +5443,7 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
                return -1;
        }
 
+       kref_init(&sas_device->refcount);
        sas_device->handle = handle;
        if (_scsih_get_sas_address(ioc, le16_to_cpu
                (sas_device_pg0.ParentDevHandle),
@@ -5296,6 +5473,7 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
        else
                _scsih_sas_device_add(ioc, sas_device);
 
+       sas_device_put(sas_device);
        return 0;
 }
 
@@ -5344,7 +5522,6 @@ _scsih_remove_device(struct MPT2SAS_ADAPTER *ioc,
            "handle(0x%04x), sas_addr(0x%016llx)\n", ioc->name, __func__,
            sas_device->handle, (unsigned long long)
            sas_device->sas_address));
-       kfree(sas_device);
 }
 /**
  * _scsih_device_remove_by_handle - removing device object by handle
@@ -5363,12 +5540,17 @@ _scsih_device_remove_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-       if (sas_device)
-               list_del(&sas_device->list);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
+       if (sas_device) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+
+       if (sas_device) {
                _scsih_remove_device(ioc, sas_device);
+               sas_device_put(sas_device);
+       }
 }
 
 /**
@@ -5389,13 +5571,17 @@ mpt2sas_device_remove_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-           sas_address);
-       if (sas_device)
-               list_del(&sas_device->list);
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc, sas_address);
+       if (sas_device) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+
+       if (sas_device) {
                _scsih_remove_device(ioc, sas_device);
+               sas_device_put(sas_device);
+       }
 }
 #ifdef CONFIG_SCSI_MPT2SAS_LOGGING
 /**
@@ -5716,26 +5902,28 @@ _scsih_sas_device_status_change_event(struct MPT2SAS_ADAPTER *ioc,
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        sas_address = le64_to_cpu(event_data->SASAddress);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
 
-       if (!sas_device || !sas_device->starget) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+       if (!sas_device || !sas_device->starget)
+               goto out;
 
        target_priv_data = sas_device->starget->hostdata;
-       if (!target_priv_data) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+       if (!target_priv_data)
+               goto out;
 
        if (event_data->ReasonCode ==
            MPI2_EVENT_SAS_DEV_STAT_RC_INTERNAL_DEVICE_RESET)
                target_priv_data->tm_busy = 1;
        else
                target_priv_data->tm_busy = 0;
+
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
 }
 
 #ifdef CONFIG_SCSI_MPT2SAS_LOGGING
@@ -6123,7 +6311,7 @@ _scsih_sas_pd_expose(struct MPT2SAS_ADAPTER *ioc,
        u16 handle = le16_to_cpu(element->PhysDiskDevHandle);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device) {
                sas_device->volume_handle = 0;
                sas_device->volume_wwid = 0;
@@ -6142,6 +6330,8 @@ _scsih_sas_pd_expose(struct MPT2SAS_ADAPTER *ioc,
        /* exposing raid component */
        if (starget)
                starget_for_each_device(starget, NULL, _scsih_reprobe_lun);
+
+       sas_device_put(sas_device);
 }
 
 /**
@@ -6170,7 +6360,7 @@ _scsih_sas_pd_hide(struct MPT2SAS_ADAPTER *ioc,
                    &volume_wwid);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device) {
                set_bit(handle, ioc->pd_handles);
                if (sas_device->starget && sas_device->starget->hostdata) {
@@ -6189,6 +6379,8 @@ _scsih_sas_pd_hide(struct MPT2SAS_ADAPTER *ioc,
        /* hiding raid component */
        if (starget)
                starget_for_each_device(starget, (void *)1, _scsih_reprobe_lun);
+
+       sas_device_put(sas_device);
 }
 
 /**
@@ -6221,7 +6413,6 @@ _scsih_sas_pd_add(struct MPT2SAS_ADAPTER *ioc,
     Mpi2EventIrConfigElement_t *element)
 {
        struct _sas_device *sas_device;
-       unsigned long flags;
        u16 handle = le16_to_cpu(element->PhysDiskDevHandle);
        Mpi2ConfigReply_t mpi_reply;
        Mpi2SasDevicePage0_t sas_device_pg0;
@@ -6231,11 +6422,11 @@ _scsih_sas_pd_add(struct MPT2SAS_ADAPTER *ioc,
 
        set_bit(handle, ioc->pd_handles);
 
-       spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+       sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+       if (sas_device) {
+               sas_device_put(sas_device);
                return;
+       }
 
        if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply, &sas_device_pg0,
            MPI2_SAS_DEVICE_PGAD_FORM_HANDLE, handle))) {
@@ -6509,7 +6700,6 @@ _scsih_sas_ir_physical_disk_event(struct MPT2SAS_ADAPTER *ioc,
        u16 handle, parent_handle;
        u32 state;
        struct _sas_device *sas_device;
-       unsigned long flags;
        Mpi2ConfigReply_t mpi_reply;
        Mpi2SasDevicePage0_t sas_device_pg0;
        u32 ioc_status;
@@ -6542,12 +6732,11 @@ _scsih_sas_ir_physical_disk_event(struct MPT2SAS_ADAPTER *ioc,
                if (!ioc->is_warpdrive)
                        set_bit(handle, ioc->pd_handles);
 
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-
-               if (sas_device)
+               sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        return;
+               }
 
                if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply,
                    &sas_device_pg0, MPI2_SAS_DEVICE_PGAD_FORM_HANDLE,
@@ -7015,6 +7204,7 @@ _scsih_remove_unresponding_sas_devices(struct MPT2SAS_ADAPTER *ioc)
        struct _raid_device *raid_device, *raid_device_next;
        struct list_head tmp_list;
        unsigned long flags;
+       LIST_HEAD(head);
 
        printk(MPT2SAS_INFO_FMT "removing unresponding devices: start\n",
            ioc->name);
@@ -7022,14 +7212,29 @@ _scsih_remove_unresponding_sas_devices(struct MPT2SAS_ADAPTER *ioc)
        /* removing unresponding end devices */
        printk(MPT2SAS_INFO_FMT "removing unresponding devices: end-devices\n",
            ioc->name);
+
+       /*
+        * Iterate, pulling off devices marked as non-responding. We become the
+        * owner for the reference the list had on any object we prune.
+        */
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_for_each_entry_safe(sas_device, sas_device_next,
-           &ioc->sas_device_list, list) {
+                       &ioc->sas_device_list, list) {
                if (!sas_device->responding)
-                       mpt2sas_device_remove_by_sas_address(ioc,
-                               sas_device->sas_address);
+                       list_move_tail(&sas_device->list, &head);
                else
                        sas_device->responding = 0;
        }
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       /*
+        * Now, uninitialize and remove the unresponding devices we pruned.
+        */
+       list_for_each_entry_safe(sas_device, sas_device_next, &head, list) {
+               _scsih_remove_device(ioc, sas_device);
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
 
        /* removing unresponding volumes */
        if (ioc->ir_firmware) {
@@ -7179,11 +7384,11 @@ _scsih_scan_for_devices_after_reset(struct MPT2SAS_ADAPTER *ioc)
                }
                phys_disk_num = pd_pg0.PhysDiskNum;
                handle = le16_to_cpu(pd_pg0.DevHandle);
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               if (sas_device)
+               sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        continue;
+               }
                if (mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply,
                    &sas_device_pg0, MPI2_SAS_DEVICE_PGAD_FORM_HANDLE,
                    handle) != 0)
@@ -7302,12 +7507,12 @@ _scsih_scan_for_devices_after_reset(struct MPT2SAS_ADAPTER *ioc)
                if (!(_scsih_is_end_device(
                    le32_to_cpu(sas_device_pg0.DeviceInfo))))
                        continue;
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+               sas_device = mpt2sas_get_sdev_by_addr(ioc,
                    le64_to_cpu(sas_device_pg0.SASAddress));
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               if (sas_device)
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        continue;
+               }
                parent_handle = le16_to_cpu(sas_device_pg0.ParentDevHandle);
                if (!_scsih_get_sas_address(ioc, parent_handle, &sas_address)) {
                        printk(MPT2SAS_INFO_FMT "\tBEFORE adding end device: "
@@ -7410,17 +7615,27 @@ _firmware_event_work(struct work_struct *work)
            struct fw_event_work, delayed_work.work);
        struct MPT2SAS_ADAPTER *ioc = fw_event->ioc;
 
+       _scsih_fw_event_del_from_list(ioc, fw_event);
+
        /* the queue is being flushed so ignore this event */
-       if (ioc->remove_host ||
-           ioc->pci_error_recovery) {
-               _scsih_fw_event_free(ioc, fw_event);
+       if (ioc->remove_host || ioc->pci_error_recovery) {
+               fw_event_work_put(fw_event);
                return;
        }
 
        switch (fw_event->event) {
        case MPT2SAS_REMOVE_UNRESPONDING_DEVICES:
-               while (scsi_host_in_recovery(ioc->shost) || ioc->shost_recovery)
+               while (scsi_host_in_recovery(ioc->shost) ||
+                               ioc->shost_recovery) {
+                       /*
+                        * If we're unloading, bail. Otherwise, this can become
+                        * an infinite loop.
+                        */
+                       if (ioc->remove_host)
+                               goto out;
+
                        ssleep(1);
+               }
                _scsih_remove_unresponding_sas_devices(ioc);
                _scsih_scan_for_devices_after_reset(ioc);
                break;
@@ -7469,7 +7684,8 @@ _firmware_event_work(struct work_struct *work)
                _scsih_sas_ir_operation_status_event(ioc, fw_event);
                break;
        }
-       _scsih_fw_event_free(ioc, fw_event);
+out:
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -7607,7 +7823,7 @@ mpt2sas_scsih_event_callback(struct MPT2SAS_ADAPTER *ioc, u8 msix_index,
        }
 
        sz = le16_to_cpu(mpi_reply->EventDataLength) * 4;
-       fw_event = kzalloc(sizeof(*fw_event) + sz, GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(sz);
        if (!fw_event) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
@@ -7620,6 +7836,7 @@ mpt2sas_scsih_event_callback(struct MPT2SAS_ADAPTER *ioc, u8 msix_index,
        fw_event->VP_ID = mpi_reply->VP_ID;
        fw_event->event = event;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
        return;
 }
 
@@ -7867,7 +8084,9 @@ _scsih_remove(struct pci_dev *pdev)
        sas_remove_host(shost);
        scsi_remove_host(shost);
        mpt2sas_base_detach(ioc);
+       spin_lock(&gioc_lock);
        list_del(&ioc->list);
+       spin_unlock(&gioc_lock);
        scsi_host_put(shost);
 }
 
@@ -7966,6 +8185,48 @@ _scsih_probe_raid(struct MPT2SAS_ADAPTER *ioc)
        }
 }
 
+static struct _sas_device *get_next_sas_device(struct MPT2SAS_ADAPTER *ioc)
+{
+       struct _sas_device *sas_device = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       if (!list_empty(&ioc->sas_device_init_list)) {
+               sas_device = list_first_entry(&ioc->sas_device_init_list,
+                               struct _sas_device, list);
+               sas_device_get(sas_device);
+       }
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return sas_device;
+}
+
+static void sas_device_make_active(struct MPT2SAS_ADAPTER *ioc,
+               struct _sas_device *sas_device)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+
+       /*
+        * Since we dropped the lock during the call to port_add(), we need to
+        * be careful here that somebody else didn't move or delete this item
+        * while we were busy with other things.
+        *
+        * If it was on the list, we need a put() for the reference the list
+        * had. Either way, we need a get() for the destination list.
+        */
+       if (!list_empty(&sas_device->list)) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
+
+       sas_device_get(sas_device);
+       list_add_tail(&sas_device->list, &ioc->sas_device_list);
+
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+}
+
 /**
  * _scsih_probe_sas - reporting sas devices to sas transport
  * @ioc: per adapter object
@@ -7975,34 +8236,30 @@ _scsih_probe_raid(struct MPT2SAS_ADAPTER *ioc)
 static void
 _scsih_probe_sas(struct MPT2SAS_ADAPTER *ioc)
 {
-       struct _sas_device *sas_device, *next;
-       unsigned long flags;
-
-       /* SAS Device List */
-       list_for_each_entry_safe(sas_device, next, &ioc->sas_device_init_list,
-           list) {
+       struct _sas_device *sas_device;
 
-               if (ioc->hide_drives)
-                       continue;
+       if (ioc->hide_drives)
+               return;
 
+       while ((sas_device = get_next_sas_device(ioc))) {
                if (!mpt2sas_transport_port_add(ioc, sas_device->handle,
-                   sas_device->sas_address_parent)) {
-                       list_del(&sas_device->list);
-                       kfree(sas_device);
+                               sas_device->sas_address_parent)) {
+                       _scsih_sas_device_remove(ioc, sas_device);
+                       sas_device_put(sas_device);
                        continue;
                } else if (!sas_device->starget) {
                        if (!ioc->is_driver_loading) {
                                mpt2sas_transport_port_remove(ioc,
-                                       sas_device->sas_address,
-                                       sas_device->sas_address_parent);
-                               list_del(&sas_device->list);
-                               kfree(sas_device);
+                                               sas_device->sas_address,
+                                               sas_device->sas_address_parent);
+                               _scsih_sas_device_remove(ioc, sas_device);
+                               sas_device_put(sas_device);
                                continue;
                        }
                }
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               list_move_tail(&sas_device->list, &ioc->sas_device_list);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+               sas_device_make_active(ioc, sas_device);
+               sas_device_put(sas_device);
        }
 }
 
@@ -8142,7 +8399,9 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        ioc = shost_priv(shost);
        memset(ioc, 0, sizeof(struct MPT2SAS_ADAPTER));
        INIT_LIST_HEAD(&ioc->list);
+       spin_lock(&gioc_lock);
        list_add_tail(&ioc->list, &mpt2sas_ioc_list);
+       spin_unlock(&gioc_lock);
        ioc->shost = shost;
        ioc->id = mpt_ids++;
        sprintf(ioc->name, "%s%d", MPT2SAS_DRIVER_NAME, ioc->id);
@@ -8167,6 +8426,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        ioc->schedule_dead_ioc_flush_running_cmds = &_scsih_flush_running_cmds;
        /* misc semaphores and spin locks */
        mutex_init(&ioc->reset_in_progress_mutex);
+       /* initializing pci_access_mutex lock */
+       mutex_init(&ioc->pci_access_mutex);
        spin_lock_init(&ioc->ioc_reset_in_progress_lock);
        spin_lock_init(&ioc->scsi_lookup_lock);
        spin_lock_init(&ioc->sas_device_lock);
@@ -8269,7 +8530,9 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  out_attach_fail:
        destroy_workqueue(ioc->firmware_event_thread);
  out_thread_fail:
+       spin_lock(&gioc_lock);
        list_del(&ioc->list);
+       spin_unlock(&gioc_lock);
        scsi_host_put(shost);
        return rv;
 }
index ff2500a..af86800 100644 (file)
@@ -1323,15 +1323,17 @@ _transport_get_enclosure_identifier(struct sas_rphy *rphy, u64 *identifier)
        int rc;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            rphy->identify.sas_address);
        if (sas_device) {
                *identifier = sas_device->enclosure_logical_id;
                rc = 0;
+               sas_device_put(sas_device);
        } else {
                *identifier = 0;
                rc = -ENXIO;
        }
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        return rc;
 }
@@ -1351,12 +1353,14 @@ _transport_get_bay_identifier(struct sas_rphy *rphy)
        int rc;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            rphy->identify.sas_address);
-       if (sas_device)
+       if (sas_device) {
                rc = sas_device->slot;
-       else
+               sas_device_put(sas_device);
+       } else {
                rc = -ENXIO;
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        return rc;
 }
index c34c115..ec27ad2 100644 (file)
@@ -8,7 +8,7 @@
  *                 scatter/gather formats.
  * Creation Date:  June 21, 2006
  *
- * mpi2.h Version:  02.00.31
+ * mpi2.h Version:  02.00.35
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     Added MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET.
  * 04-09-13  02.00.30  Bumped MPI2_HEADER_VERSION_UNIT.
  * 04-17-13  02.00.31  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 08-19-13  02.00.32  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 12-05-13  02.00.33  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 01-08-14  02.00.34  Bumped MPI2_HEADER_VERSION_UNIT
+ * 06-13-14  02.00.35  Bumped MPI2_HEADER_VERSION_UNIT.
  * --------------------------------------------------------------------------
  */
 
 #define MPI2_VERSION_02_05                  (0x0205)
 
 /*Unit and Dev versioning for this MPI header set */
-#define MPI2_HEADER_VERSION_UNIT            (0x1F)
+#define MPI2_HEADER_VERSION_UNIT            (0x23)
 #define MPI2_HEADER_VERSION_DEV             (0x00)
 #define MPI2_HEADER_VERSION_UNIT_MASK       (0xFF00)
 #define MPI2_HEADER_VERSION_UNIT_SHIFT      (8)
index e261a31..581fdb3 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI Configuration messages and pages
  * Creation Date:  November 10, 2006
  *
- *   mpi2_cnfg.h Version:  02.00.26
+ *   mpi2_cnfg.h Version:  02.00.29
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     match the specification.
  * 08-19-13  02.00.26  Added reserved words to MPI2_CONFIG_PAGE_IO_UNIT_7 for
  *                     future use.
+ * 12-05-13  02.00.27  Added MPI2_MANPAGE7_FLAG_BASE_ENCLOSURE_LEVEL for
+ *                    MPI2_CONFIG_PAGE_MAN_7.
+ *                    Added EnclosureLevel and ConnectorName fields to
+ *                    MPI2_CONFIG_PAGE_SAS_DEV_0.
+ *                    Added MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID for
+ *                    MPI2_CONFIG_PAGE_SAS_DEV_0.
+ *                    Added EnclosureLevel field to
+ *                    MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0.
+ *                    Added MPI2_SAS_ENCLS0_FLAGS_ENCL_LEVEL_VALID for
+ *                    MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0.
+ * 01-08-14  02.00.28  Added more defines for the BiosOptions field of
+ *                    MPI2_CONFIG_PAGE_BIOS_1.
+ * 06-13-14  02.00.29  Added SSUTimeout field to MPI2_CONFIG_PAGE_BIOS_1, and
+ *                    more defines for the BiosOptions field..
  * --------------------------------------------------------------------------
  */
 
@@ -724,6 +738,7 @@ typedef struct _MPI2_CONFIG_PAGE_MAN_7 {
 #define MPI2_MANUFACTURING7_PAGEVERSION                 (0x01)
 
 /*defines for the Flags field */
+#define MPI2_MANPAGE7_FLAG_BASE_ENCLOSURE_LEVEL         (0x00000008)
 #define MPI2_MANPAGE7_FLAG_EVENTREPLAY_SLOT_ORDER       (0x00000002)
 #define MPI2_MANPAGE7_FLAG_USE_SLOT_INFO                (0x00000001)
 
@@ -1311,7 +1326,9 @@ typedef struct _MPI2_CONFIG_PAGE_BIOS_1 {
        MPI2_CONFIG_PAGE_HEADER Header;                     /*0x00 */
        U32                     BiosOptions;                /*0x04 */
        U32                     IOCSettings;                /*0x08 */
-       U32                     Reserved1;                  /*0x0C */
+       U8                      SSUTimeout;                 /*0x0C */
+       U8                      Reserved1;                  /*0x0D */
+       U16                     Reserved2;                  /*0x0E */
        U32                     DeviceSettings;             /*0x10 */
        U16                     NumberOfDevices;            /*0x14 */
        U16                     UEFIVersion;                /*0x16 */
@@ -1323,9 +1340,24 @@ typedef struct _MPI2_CONFIG_PAGE_BIOS_1 {
        *PTR_MPI2_CONFIG_PAGE_BIOS_1,
        Mpi2BiosPage1_t, *pMpi2BiosPage1_t;
 
-#define MPI2_BIOSPAGE1_PAGEVERSION                      (0x05)
+#define MPI2_BIOSPAGE1_PAGEVERSION                      (0x07)
 
 /*values for BIOS Page 1 BiosOptions field */
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_MASK                         (0x00003800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_PBDHL                        (0x00000000)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_ENCSLOSURE                   (0x00000800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_LWWID                        (0x00001000)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_PSENS                        (0x00001800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_ESPHY                        (0x00002000)
+
+#define MPI2_BIOSPAGE1_OPTIONS_X86_DISABLE_BIOS                (0x00000400)
+
+#define MPI2_BIOSPAGE1_OPTIONS_MASK_REGISTRATION_UEFI_BSD      (0x00000300)
+#define MPI2_BIOSPAGE1_OPTIONS_USE_BIT0_REGISTRATION_UEFI_BSD  (0x00000000)
+#define MPI2_BIOSPAGE1_OPTIONS_FULL_REGISTRATION_UEFI_BSD      (0x00000100)
+#define MPI2_BIOSPAGE1_OPTIONS_ADAPTER_REGISTRATION_UEFI_BSD   (0x00000200)
+#define MPI2_BIOSPAGE1_OPTIONS_DISABLE_REGISTRATION_UEFI_BSD   (0x00000300)
+
 #define MPI2_BIOSPAGE1_OPTIONS_MASK_OEM_ID                  (0x000000F0)
 #define MPI2_BIOSPAGE1_OPTIONS_LSI_OEM_ID                   (0x00000000)
 
@@ -2633,9 +2665,9 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
        U8
                ControlGroup;           /*0x2E */
        U8
-               Reserved1;              /*0x2F */
+               EnclosureLevel;         /*0x2F */
        U32
-               Reserved2;              /*0x30 */
+               ConnectorName[4];       /*0x30 */
        U32
                Reserved3;              /*0x34 */
 } MPI2_CONFIG_PAGE_SAS_DEV_0,
@@ -2643,7 +2675,7 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
        Mpi2SasDevicePage0_t,
        *pMpi2SasDevicePage0_t;
 
-#define MPI2_SASDEVICE0_PAGEVERSION         (0x08)
+#define MPI2_SASDEVICE0_PAGEVERSION         (0x09)
 
 /*values for SAS Device Page 0 AccessStatus field */
 #define MPI2_SAS_DEVICE0_ASTATUS_NO_ERRORS                  (0x00)
@@ -2683,6 +2715,7 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
 #define MPI2_SAS_DEVICE0_FLAGS_SATA_NCQ_SUPPORTED           (0x0020)
 #define MPI2_SAS_DEVICE0_FLAGS_SATA_FUA_SUPPORTED           (0x0010)
 #define MPI2_SAS_DEVICE0_FLAGS_PORT_SELECTOR_ATTACH         (0x0008)
+#define MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID             (0x0002)
 #define MPI2_SAS_DEVICE0_FLAGS_DEVICE_PRESENT               (0x0001)
 
 
@@ -3019,8 +3052,10 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0 {
                NumSlots;                   /*0x18 */
        U16
                StartSlot;                  /*0x1A */
-       U16
+       U8
                Reserved2;                  /*0x1C */
+       U8
+               EnclosureLevel;             /*0x1D */
        U16
                SEPDevHandle;               /*0x1E */
        U32
@@ -3031,9 +3066,10 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0 {
        *PTR_MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0,
        Mpi2SasEnclosurePage0_t, *pMpi2SasEnclosurePage0_t;
 
-#define MPI2_SASENCLOSURE0_PAGEVERSION      (0x03)
+#define MPI2_SASENCLOSURE0_PAGEVERSION      (0x04)
 
 /*values for SAS Enclosure Page 0 Flags field */
+#define MPI2_SAS_ENCLS0_FLAGS_ENCL_LEVEL_VALID      (0x0010)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_MASK              (0x000F)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_UNKNOWN           (0x0000)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_IOC_SES           (0x0001)
index 4908309..d7598cc 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI IOC, Port, Event, FW Download, and FW Upload messages
  * Creation Date:  October 11, 2006
  *
- * mpi2_ioc.h Version:  02.00.23
+ * mpi2_ioc.h Version:  02.00.24
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     Added MPI2_IOCFACTS_CAPABILITY_RDPQ_ARRAY_CAPABLE.
  *                     Added MPI2_FW_DOWNLOAD_ITYPE_PUBLIC_KEY.
  *                     Added Encrypted Hash Extended Image.
+ * 12-05-13  02.00.24  Added MPI25_HASH_IMAGE_TYPE_BIOS.
  * --------------------------------------------------------------------------
  */
 
@@ -1598,6 +1599,7 @@ Mpi25EncryptedHashEntry_t, *pMpi25EncryptedHashEntry_t;
 /* values for HashImageType */
 #define MPI25_HASH_IMAGE_TYPE_UNUSED           (0x00)
 #define MPI25_HASH_IMAGE_TYPE_FIRMWARE         (0x01)
+#define MPI25_HASH_IMAGE_TYPE_BIOS              (0x02)
 
 /* values for HashAlgorithm */
 #define MPI25_HASH_ALGORITHM_UNUSED            (0x00)
index 904910d..1629e5b 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI diagnostic tool structures and definitions
  * Creation Date:  March 26, 2007
  *
- *   mpi2_tool.h Version:  02.00.11
+ *   mpi2_tool.h Version:  02.00.12
  *
  * Version History
  * ---------------
@@ -33,6 +33,7 @@
  * 07-26-12  02.00.10  Modified MPI2_TOOLBOX_DIAGNOSTIC_CLI_REQUEST so that
  *                     it uses MPI Chain SGE as well as MPI Simple SGE.
  * 08-19-13  02.00.11  Added MPI2_TOOLBOX_TEXT_DISPLAY_TOOL and related info.
+ * 01-08-14  02.00.12  Added MPI2_TOOLBOX_CLEAN_BIT26_PRODUCT_SPECIFIC.
  * --------------------------------------------------------------------------
  */
 
@@ -100,6 +101,7 @@ typedef struct _MPI2_TOOLBOX_CLEAN_REQUEST {
 #define MPI2_TOOLBOX_CLEAN_OTHER_PERSIST_PAGES      (0x20000000)
 #define MPI2_TOOLBOX_CLEAN_FW_CURRENT               (0x10000000)
 #define MPI2_TOOLBOX_CLEAN_FW_BACKUP                (0x08000000)
+#define MPI2_TOOLBOX_CLEAN_BIT26_PRODUCT_SPECIFIC   (0x04000000)
 #define MPI2_TOOLBOX_CLEAN_MEGARAID                 (0x02000000)
 #define MPI2_TOOLBOX_CLEAN_INITIALIZATION           (0x01000000)
 #define MPI2_TOOLBOX_CLEAN_FLASH                    (0x00000004)
index 43f87e9..d4f1dcd 100644 (file)
@@ -83,10 +83,10 @@ static int msix_disable = -1;
 module_param(msix_disable, int, 0);
 MODULE_PARM_DESC(msix_disable, " disable msix routed interrupts (default=0)");
 
-static int max_msix_vectors = 8;
+static int max_msix_vectors = -1;
 module_param(max_msix_vectors, int, 0);
 MODULE_PARM_DESC(max_msix_vectors,
-       " max msix vectors - (default=8)");
+       " max msix vectors");
 
 static int mpt3sas_fwfault_debug;
 MODULE_PARM_DESC(mpt3sas_fwfault_debug,
@@ -1009,8 +1009,30 @@ _base_interrupt(int irq, void *bus_id)
        }
 
        wmb();
-       writel(reply_q->reply_post_host_index | (msix_index <<
-           MPI2_RPHI_MSIX_INDEX_SHIFT), &ioc->chip->ReplyPostHostIndex);
+
+       /* Update Reply Post Host Index.
+        * For those HBA's which support combined reply queue feature
+        * 1. Get the correct Supplemental Reply Post Host Index Register.
+        *    i.e. (msix_index / 8)th entry from Supplemental Reply Post Host
+        *    Index Register address bank i.e replyPostRegisterIndex[],
+        * 2. Then update this register with new reply host index value
+        *    in ReplyPostIndex field and the MSIxIndex field with
+        *    msix_index value reduced to a value between 0 and 7,
+        *    using a modulo 8 operation. Since each Supplemental Reply Post
+        *    Host Index Register supports 8 MSI-X vectors.
+        *
+        * For other HBA's just update the Reply Post Host Index register with
+        * new reply host index value in ReplyPostIndex Field and msix_index
+        * value in MSIxIndex field.
+        */
+       if (ioc->msix96_vector)
+               writel(reply_q->reply_post_host_index | ((msix_index  & 7) <<
+                       MPI2_RPHI_MSIX_INDEX_SHIFT),
+                       ioc->replyPostRegisterIndex[msix_index/8]);
+       else
+               writel(reply_q->reply_post_host_index | (msix_index <<
+                       MPI2_RPHI_MSIX_INDEX_SHIFT),
+                       &ioc->chip->ReplyPostHostIndex);
        atomic_dec(&reply_q->busy);
        return IRQ_HANDLED;
 }
@@ -1338,7 +1360,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ADAPTER *ioc,
 
        sg_scmd = scsi_sglist(scmd);
        sges_left = scsi_dma_map(scmd);
-       if (!sges_left) {
+       if (sges_left < 0) {
                sdev_printk(KERN_ERR, scmd->device,
                        "pci_map_sg failed: request for %d bytes!\n",
                        scsi_bufflen(scmd));
@@ -1407,7 +1429,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ADAPTER *ioc,
  fill_in_last_segment:
 
        /* fill the last segment */
-       while (sges_left) {
+       while (sges_left > 0) {
                if (sges_left == 1)
                        _base_add_sg_single_ieee(sg_local,
                            simple_sgl_flags_last, 0, sg_dma_len(sg_scmd),
@@ -1560,8 +1582,6 @@ _base_check_enable_msix(struct MPT3SAS_ADAPTER *ioc)
 
        pci_read_config_word(ioc->pdev, base + 2, &message_control);
        ioc->msix_vector_count = (message_control & 0x3FF) + 1;
-       if (ioc->msix_vector_count > 8)
-               ioc->msix_vector_count = 8;
        dinitprintk(ioc, pr_info(MPT3SAS_FMT
                "msix is supported, vector_count(%d)\n",
                ioc->name, ioc->msix_vector_count));
@@ -1792,6 +1812,36 @@ _base_enable_msix(struct MPT3SAS_ADAPTER *ioc)
        return r;
 }
 
+/**
+ * mpt3sas_base_unmap_resources - free controller resources
+ * @ioc: per adapter object
+ */
+void
+mpt3sas_base_unmap_resources(struct MPT3SAS_ADAPTER *ioc)
+{
+       struct pci_dev *pdev = ioc->pdev;
+
+       dexitprintk(ioc, printk(MPT3SAS_FMT "%s\n",
+               ioc->name, __func__));
+
+       _base_free_irq(ioc);
+       _base_disable_msix(ioc);
+
+       if (ioc->msix96_vector)
+               kfree(ioc->replyPostRegisterIndex);
+
+       if (ioc->chip_phys) {
+               iounmap(ioc->chip);
+               ioc->chip_phys = 0;
+       }
+
+       if (pci_is_enabled(pdev)) {
+               pci_release_selected_regions(ioc->pdev, ioc->bars);
+               pci_disable_pcie_error_reporting(pdev);
+               pci_disable_device(pdev);
+       }
+}
+
 /**
  * mpt3sas_base_map_resources - map in controller resources (io/irq/memap)
  * @ioc: per adapter object
@@ -1882,6 +1932,36 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc)
        if (r)
                goto out_fail;
 
+       /* Use the Combined reply queue feature only for SAS3 C0 & higher
+        * revision HBAs and also only when reply queue count is greater than 8
+        */
+       if (ioc->msix96_vector && ioc->reply_queue_count > 8) {
+               /* Determine the Supplemental Reply Post Host Index Registers
+                * Addresse. Supplemental Reply Post Host Index Registers
+                * starts at offset MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET and
+                * each register is at offset bytes of
+                * MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET from previous one.
+                */
+               ioc->replyPostRegisterIndex = kcalloc(
+                    MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT,
+                    sizeof(resource_size_t *), GFP_KERNEL);
+               if (!ioc->replyPostRegisterIndex) {
+                       dfailprintk(ioc, printk(MPT3SAS_FMT
+                       "allocation for reply Post Register Index failed!!!\n",
+                                                                  ioc->name));
+                       r = -ENOMEM;
+                       goto out_fail;
+               }
+
+               for (i = 0; i < MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT; i++) {
+                       ioc->replyPostRegisterIndex[i] = (resource_size_t *)
+                            ((u8 *)&ioc->chip->Doorbell +
+                            MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET +
+                            (i * MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET));
+               }
+       } else
+               ioc->msix96_vector = 0;
+
        list_for_each_entry(reply_q, &ioc->reply_queue_list, list)
                pr_info(MPT3SAS_FMT "%s: IRQ %d\n",
                    reply_q->name,  ((ioc->msix_enable) ? "PCI-MSI-X enabled" :
@@ -1897,12 +1977,7 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc)
        return 0;
 
  out_fail:
-       if (ioc->chip_phys)
-               iounmap(ioc->chip);
-       ioc->chip_phys = 0;
-       pci_release_selected_regions(ioc->pdev, ioc->bars);
-       pci_disable_pcie_error_reporting(pdev);
-       pci_disable_device(pdev);
+       mpt3sas_base_unmap_resources(ioc);
        return r;
 }
 
@@ -2291,6 +2366,99 @@ _base_display_intel_branding(struct MPT3SAS_ADAPTER *ioc)
 
 
 
+/**
+ * _base_display_dell_branding - Display branding string
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void
+_base_display_dell_branding(struct MPT3SAS_ADAPTER *ioc)
+{
+       if (ioc->pdev->subsystem_vendor != PCI_VENDOR_ID_DELL)
+               return;
+
+       switch (ioc->pdev->device) {
+       case MPI25_MFGPAGE_DEVID_SAS3008:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_DELL_12G_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_DELL_12G_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                          "Dell 12Gbps HBA: Subsystem ID: 0x%X\n", ioc->name,
+                          ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       default:
+               pr_info(MPT3SAS_FMT
+                       "Dell 12Gbps HBA: Subsystem ID: 0x%X\n", ioc->name,
+                       ioc->pdev->subsystem_device);
+               break;
+       }
+}
+
+/**
+ * _base_display_cisco_branding - Display branding string
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void
+_base_display_cisco_branding(struct MPT3SAS_ADAPTER *ioc)
+{
+       if (ioc->pdev->subsystem_vendor != PCI_VENDOR_ID_CISCO)
+               return;
+
+       switch (ioc->pdev->device) {
+       case MPI25_MFGPAGE_DEVID_SAS3008:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_CISCO_12G_8E_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_8E_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_8I_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_8I_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_AVILA_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                         "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                         ioc->name, ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       case MPI25_MFGPAGE_DEVID_SAS3108_1:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_CISCO_12G_AVILA_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                       MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                       MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                        "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                        ioc->name, ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       default:
+                pr_info(MPT3SAS_FMT
+                       "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                       ioc->name, ioc->pdev->subsystem_device);
+               break;
+       }
+}
+
 /**
  * _base_display_ioc_capabilities - Disply IOC's capabilities.
  * @ioc: per adapter object
@@ -2321,6 +2489,8 @@ _base_display_ioc_capabilities(struct MPT3SAS_ADAPTER *ioc)
            bios_version & 0x000000FF);
 
        _base_display_intel_branding(ioc);
+       _base_display_dell_branding(ioc);
+       _base_display_cisco_branding(ioc);
 
        pr_info(MPT3SAS_FMT "Protocol=(", ioc->name);
 
@@ -3138,6 +3308,9 @@ _base_wait_on_iocstate(struct MPT3SAS_ADAPTER *ioc, u32 ioc_state, int timeout,
  *
  * Notes: MPI2_HIS_IOC2SYS_DB_STATUS - set to one when IOC writes to doorbell.
  */
+static int
+_base_diag_reset(struct MPT3SAS_ADAPTER *ioc, int sleep_flag);
+
 static int
 _base_wait_for_doorbell_int(struct MPT3SAS_ADAPTER *ioc, int timeout,
        int sleep_flag)
@@ -3680,6 +3853,64 @@ _base_get_port_facts(struct MPT3SAS_ADAPTER *ioc, int port, int sleep_flag)
        return 0;
 }
 
+/**
+ * _base_wait_for_iocstate - Wait until the card is in READY or OPERATIONAL
+ * @ioc: per adapter object
+ * @timeout:
+ * @sleep_flag: CAN_SLEEP or NO_SLEEP
+ *
+ * Returns 0 for success, non-zero for failure.
+ */
+static int
+_base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout,
+       int sleep_flag)
+{
+       u32 ioc_state;
+       int rc;
+
+       dinitprintk(ioc, printk(MPT3SAS_FMT "%s\n", ioc->name,
+           __func__));
+
+       if (ioc->pci_error_recovery) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: host in pci error recovery\n", ioc->name, __func__));
+               return -EFAULT;
+       }
+
+       ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
+       dhsprintk(ioc, printk(MPT3SAS_FMT "%s: ioc_state(0x%08x)\n",
+           ioc->name, __func__, ioc_state));
+
+       if (((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_READY) ||
+           (ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_OPERATIONAL)
+               return 0;
+
+       if (ioc_state & MPI2_DOORBELL_USED) {
+               dhsprintk(ioc, printk(MPT3SAS_FMT
+                   "unexpected doorbell active!\n", ioc->name));
+               goto issue_diag_reset;
+       }
+
+       if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
+               mpt3sas_base_fault_info(ioc, ioc_state &
+                   MPI2_DOORBELL_DATA_MASK);
+               goto issue_diag_reset;
+       }
+
+       ioc_state = _base_wait_on_iocstate(ioc, MPI2_IOC_STATE_READY,
+           timeout, sleep_flag);
+       if (ioc_state) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: failed going to ready state (ioc_state=0x%x)\n",
+                   ioc->name, __func__, ioc_state));
+               return -EFAULT;
+       }
+
+ issue_diag_reset:
+       rc = _base_diag_reset(ioc, sleep_flag);
+       return rc;
+}
+
 /**
  * _base_get_ioc_facts - obtain ioc facts reply and save in ioc
  * @ioc: per adapter object
@@ -3698,6 +3929,13 @@ _base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
        dinitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
 
+       r = _base_wait_for_iocstate(ioc, 10, sleep_flag);
+       if (r) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: failed getting to correct state\n",
+                   ioc->name, __func__));
+               return r;
+       }
        mpi_reply_sz = sizeof(Mpi2IOCFactsReply_t);
        mpi_request_sz = sizeof(Mpi2IOCFactsRequest_t);
        memset(&mpi_request, 0, mpi_request_sz);
@@ -3783,7 +4021,7 @@ _base_send_ioc_init(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
        mpi_request.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
        mpi_request.VF_ID = 0; /* TODO */
        mpi_request.VP_ID = 0;
-       mpi_request.MsgVersion = cpu_to_le16(MPI2_VERSION);
+       mpi_request.MsgVersion = cpu_to_le16(MPI25_VERSION);
        mpi_request.HeaderVersion = cpu_to_le16(MPI2_HEADER_VERSION);
 
        if (_base_is_controller_msix_enabled(ioc))
@@ -4524,8 +4762,15 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
 
        /* initialize reply post host index */
        list_for_each_entry(reply_q, &ioc->reply_queue_list, list) {
-               writel(reply_q->msix_index << MPI2_RPHI_MSIX_INDEX_SHIFT,
-                   &ioc->chip->ReplyPostHostIndex);
+               if (ioc->msix96_vector)
+                       writel((reply_q->msix_index & 7)<<
+                          MPI2_RPHI_MSIX_INDEX_SHIFT,
+                          ioc->replyPostRegisterIndex[reply_q->msix_index/8]);
+               else
+                       writel(reply_q->msix_index <<
+                               MPI2_RPHI_MSIX_INDEX_SHIFT,
+                               &ioc->chip->ReplyPostHostIndex);
+
                if (!_base_is_controller_msix_enabled(ioc))
                        goto skip_init_reply_post_host_index;
        }
@@ -4564,8 +4809,6 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
 void
 mpt3sas_base_free_resources(struct MPT3SAS_ADAPTER *ioc)
 {
-       struct pci_dev *pdev = ioc->pdev;
-
        dexitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
 
@@ -4576,18 +4819,7 @@ mpt3sas_base_free_resources(struct MPT3SAS_ADAPTER *ioc)
                ioc->shost_recovery = 0;
        }
 
-       _base_free_irq(ioc);
-       _base_disable_msix(ioc);
-
-       if (ioc->chip_phys && ioc->chip)
-               iounmap(ioc->chip);
-       ioc->chip_phys = 0;
-
-       if (pci_is_enabled(pdev)) {
-               pci_release_selected_regions(ioc->pdev, ioc->bars);
-               pci_disable_pcie_error_reporting(pdev);
-               pci_disable_device(pdev);
-       }
+       mpt3sas_base_unmap_resources(ioc);
        return;
 }
 
@@ -4602,6 +4834,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
 {
        int r, i;
        int cpu_id, last_cpu_id = 0;
+       u8 revision;
 
        dinitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
@@ -4621,6 +4854,20 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
                goto out_free_resources;
        }
 
+       /* Check whether the controller revision is C0 or above.
+        * only C0 and above revision controllers support 96 MSI-X vectors.
+        */
+       revision = ioc->pdev->revision;
+
+       if ((ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3004 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3008 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_1 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_2 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_5 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_6) &&
+            (revision >= 0x02))
+               ioc->msix96_vector = 1;
+
        ioc->rdpq_array_enable_assigned = 0;
        ioc->dma_mask = 0;
        r = mpt3sas_base_map_resources(ioc);
@@ -4643,7 +4890,6 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
        ioc->build_sg_scmd = &_base_build_sg_scmd_ieee;
        ioc->build_sg = &_base_build_sg_ieee;
        ioc->build_zero_len_sge = &_base_build_zero_len_sge_ieee;
-       ioc->mpi25 = 1;
        ioc->sge_size_ieee = sizeof(Mpi2IeeeSgeSimple64_t);
 
        /*
index afa8816..f0e462b 100644 (file)
@@ -71,8 +71,8 @@
 #define MPT3SAS_DRIVER_NAME            "mpt3sas"
 #define MPT3SAS_AUTHOR "Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>"
 #define MPT3SAS_DESCRIPTION    "LSI MPT Fusion SAS 3.0 Device Driver"
-#define MPT3SAS_DRIVER_VERSION         "04.100.00.00"
-#define MPT3SAS_MAJOR_VERSION          4
+#define MPT3SAS_DRIVER_VERSION         "09.100.00.00"
+#define MPT3SAS_MAJOR_VERSION          9
 #define MPT3SAS_MINOR_VERSION          100
 #define MPT3SAS_BUILD_VERSION          0
 #define MPT3SAS_RELEASE_VERSION        00
 #define MPT3SAS_INTEL_RS3FC044_SSDID   0x3523
 #define MPT3SAS_INTEL_RS3UC080_SSDID    0x3524
 
+/*
+ * Dell HBA branding
+ */
+#define MPT3SAS_DELL_12G_HBA_BRANDING       \
+       "Dell 12Gbps HBA"
+
+/*
+ * Dell HBA SSDIDs
+ */
+#define MPT3SAS_DELL_12G_HBA_SSDID     0x1F46
+
+/*
+ * Cisco HBA branding
+ */
+#define MPT3SAS_CISCO_12G_8E_HBA_BRANDING              \
+               "Cisco 9300-8E 12G SAS HBA"
+#define MPT3SAS_CISCO_12G_8I_HBA_BRANDING              \
+               "Cisco 9300-8i 12G SAS HBA"
+#define MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING   \
+               "Cisco 12G Modular SAS Pass through Controller"
+#define MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_BRANDING                \
+               "UCS C3X60 12G SAS Pass through Controller"
+/*
+ * Cisco HBA SSSDIDs
+ */
+#define MPT3SAS_CISCO_12G_8E_HBA_SSDID  0x14C
+#define MPT3SAS_CISCO_12G_8I_HBA_SSDID  0x154
+#define MPT3SAS_CISCO_12G_AVILA_HBA_SSDID  0x155
+#define MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_SSDID  0x156
+
 /*
  * status bits for ioc->diag_buffer_status
  */
 #define MPT3_DIAG_BUFFER_IS_RELEASED   (0x02)
 #define MPT3_DIAG_BUFFER_IS_DIAG_RESET (0x04)
 
+/*
+ * Combined Reply Queue constants,
+ * There are twelve Supplemental Reply Post Host Index Registers
+ * and each register is at offset 0x10 bytes from the previous one.
+ */
+#define MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT 12
+#define MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET (0x10)
 
 /* OEM Identifiers */
 #define MFG10_OEM_ID_INVALID                   (0x00000000)
 #define MFG10_GF0_SSD_DATA_SCRUB_DISABLE       (0x00000008)
 #define MFG10_GF0_SINGLE_DRIVE_R0              (0x00000010)
 
+#define VIRTUAL_IO_FAILED_RETRY                        (0x32010081)
+
 /* OEM Specific Flags will come from OEM specific header files */
 struct Mpi2ManufacturingPage10_t {
        MPI2_CONFIG_PAGE_HEADER Header;         /* 00h */
@@ -294,7 +333,8 @@ struct _internal_cmd {
  * @responding: used in _scsih_sas_device_mark_responding
  * @fast_path: fast path feature enable bit
  * @pfa_led_on: flag for PFA LED status
- *
+ * @pend_sas_rphy_add: flag to check if device is in sas_rphy_add()
+ *     addition routine.
  */
 struct _sas_device {
        struct list_head list;
@@ -315,6 +355,9 @@ struct _sas_device {
        u8      responding;
        u8      fast_path;
        u8      pfa_led_on;
+       u8      pend_sas_rphy_add;
+       u8      enclosure_level;
+       u8      connector_name[4];
 };
 
 /**
@@ -728,7 +771,8 @@ typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc);
  *                             is assigned only ones
  * @reply_queue_count: number of reply queue's
  * @reply_queue_list: link list contaning the reply queue info
- * @reply_post_host_index: head index in the pool where FW completes IO
+ * @msix96_vector: 96 MSI-X vector support
+ * @replyPostRegisterIndex: index of next position in Reply Desc Post Queue
  * @delayed_tr_list: target reset link list
  * @delayed_tr_volume_list: volume target reset link list
  * @@temp_sensors_count: flag to carry the number of temperature sensors
@@ -814,7 +858,6 @@ struct MPT3SAS_ADAPTER {
        MPT_BUILD_SG_SCMD build_sg_scmd;
        MPT_BUILD_SG    build_sg;
        MPT_BUILD_ZERO_LEN_SGE build_zero_len_sge;
-       u8              mpi25;
        u16             sge_size_ieee;
 
        /* function ptr for MPI sg elements only */
@@ -937,6 +980,10 @@ struct MPT3SAS_ADAPTER {
        u8              reply_queue_count;
        struct list_head reply_queue_list;
 
+       u8              msix96_vector;
+       /* reply post register index */
+       resource_size_t **replyPostRegisterIndex;
+
        struct list_head delayed_tr_list;
        struct list_head delayed_tr_volume_list;
        u8              temp_sensors_count;
index 5a97e32..8ccef38 100644 (file)
@@ -585,6 +585,22 @@ _scsih_sas_device_remove(struct MPT3SAS_ADAPTER *ioc,
 
        if (!sas_device)
                return;
+       pr_info(MPT3SAS_FMT
+           "removing handle(0x%04x), sas_addr(0x%016llx)\n",
+           ioc->name, sas_device->handle,
+           (unsigned long long) sas_device->sas_address);
+
+       if (sas_device->enclosure_handle != 0)
+               pr_info(MPT3SAS_FMT
+                  "removing enclosure logical id(0x%016llx), slot(%d)\n",
+                  ioc->name, (unsigned long long)
+                  sas_device->enclosure_logical_id, sas_device->slot);
+
+       if (sas_device->connector_name[0] != '\0')
+               pr_info(MPT3SAS_FMT
+                  "removing enclosure level(0x%04x), connector name( %s)\n",
+                  ioc->name, sas_device->enclosure_level,
+                  sas_device->connector_name);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_del(&sas_device->list);
@@ -663,6 +679,18 @@ _scsih_sas_device_add(struct MPT3SAS_ADAPTER *ioc,
                ioc->name, __func__, sas_device->handle,
                (unsigned long long)sas_device->sas_address));
 
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure logical id(0x%016llx), slot( %d)\n",
+                   ioc->name, __func__, (unsigned long long)
+                   sas_device->enclosure_logical_id, sas_device->slot));
+
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure level(0x%04x), connector name( %s)\n",
+                   ioc->name, __func__,
+                   sas_device->enclosure_level, sas_device->connector_name));
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_add_tail(&sas_device->list, &ioc->sas_device_list);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -704,6 +732,18 @@ _scsih_sas_device_init_add(struct MPT3SAS_ADAPTER *ioc,
                __func__, sas_device->handle,
                (unsigned long long)sas_device->sas_address));
 
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure logical id(0x%016llx), slot( %d)\n",
+                   ioc->name, __func__, (unsigned long long)
+                   sas_device->enclosure_logical_id, sas_device->slot));
+
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure level(0x%04x), connector name( %s)\n",
+                   ioc->name, __func__, sas_device->enclosure_level,
+                   sas_device->connector_name));
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_add_tail(&sas_device->list, &ioc->sas_device_init_list);
        _scsih_determine_boot_device(ioc, sas_device, 0);
@@ -1772,10 +1812,16 @@ _scsih_slave_configure(struct scsi_device *sdev)
            "sas_addr(0x%016llx), phy(%d), device_name(0x%016llx)\n",
            ds, handle, (unsigned long long)sas_device->sas_address,
            sas_device->phy, (unsigned long long)sas_device->device_name);
-       sdev_printk(KERN_INFO, sdev,
-               "%s: enclosure_logical_id(0x%016llx), slot(%d)\n",
-               ds, (unsigned long long)
-           sas_device->enclosure_logical_id, sas_device->slot);
+       if (sas_device->enclosure_handle != 0)
+               sdev_printk(KERN_INFO, sdev,
+                    "%s: enclosure_logical_id(0x%016llx), slot(%d)\n",
+                    ds, (unsigned long long)
+                    sas_device->enclosure_logical_id, sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               sdev_printk(KERN_INFO, sdev,
+                    "%s: enclosure level(0x%04x), connector name( %s)\n",
+                    ds, sas_device->enclosure_level,
+                    sas_device->connector_name);
 
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -2189,10 +2235,17 @@ _scsih_tm_display_info(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                            sas_device->handle,
                            (unsigned long long)sas_device->sas_address,
                            sas_device->phy);
-                       starget_printk(KERN_INFO, starget,
-                           "enclosure_logical_id(0x%016llx), slot(%d)\n",
-                          (unsigned long long)sas_device->enclosure_logical_id,
-                           sas_device->slot);
+                       if (sas_device->enclosure_handle != 0)
+                               starget_printk(KERN_INFO, starget,
+                                "enclosure_logical_id(0x%016llx), slot(%d)\n",
+                                (unsigned long long)
+                                sas_device->enclosure_logical_id,
+                                sas_device->slot);
+                       if (sas_device->connector_name)
+                               starget_printk(KERN_INFO, starget,
+                               "enclosure level(0x%04x),connector name(%s)\n",
+                                sas_device->enclosure_level,
+                                sas_device->connector_name);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -2551,6 +2604,75 @@ _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
        }
 }
 
+/**
+ * _scsih_internal_device_block - block the sdev device
+ * @sdev: per device object
+ * @sas_device_priv_data : per device driver private data
+ *
+ * make sure device is blocked without error, if not
+ * print an error
+ */
+static void
+_scsih_internal_device_block(struct scsi_device *sdev,
+                       struct MPT3SAS_DEVICE *sas_device_priv_data)
+{
+       int r = 0;
+
+       sdev_printk(KERN_INFO, sdev, "device_block, handle(0x%04x)\n",
+           sas_device_priv_data->sas_target->handle);
+       sas_device_priv_data->block = 1;
+
+       r = scsi_internal_device_block(sdev);
+       if (r == -EINVAL)
+               sdev_printk(KERN_WARNING, sdev,
+                   "device_block failed with return(%d) for handle(0x%04x)\n",
+                   sas_device_priv_data->sas_target->handle, r);
+}
+
+/**
+ * _scsih_internal_device_unblock - unblock the sdev device
+ * @sdev: per device object
+ * @sas_device_priv_data : per device driver private data
+ * make sure device is unblocked without error, if not retry
+ * by blocking and then unblocking
+ */
+
+static void
+_scsih_internal_device_unblock(struct scsi_device *sdev,
+                       struct MPT3SAS_DEVICE *sas_device_priv_data)
+{
+       int r = 0;
+
+       sdev_printk(KERN_WARNING, sdev, "device_unblock and setting to running, "
+           "handle(0x%04x)\n", sas_device_priv_data->sas_target->handle);
+       sas_device_priv_data->block = 0;
+       r = scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+       if (r == -EINVAL) {
+               /* The device has been set to SDEV_RUNNING by SD layer during
+                * device addition but the request queue is still stopped by
+                * our earlier block call. We need to perform a block again
+                * to get the device to SDEV_BLOCK and then to SDEV_RUNNING */
+
+               sdev_printk(KERN_WARNING, sdev,
+                   "device_unblock failed with return(%d) for handle(0x%04x) "
+                   "performing a block followed by an unblock\n",
+                   sas_device_priv_data->sas_target->handle, r);
+               sas_device_priv_data->block = 1;
+               r = scsi_internal_device_block(sdev);
+               if (r)
+                       sdev_printk(KERN_WARNING, sdev, "retried device_block "
+                           "failed with return(%d) for handle(0x%04x)\n",
+                           sas_device_priv_data->sas_target->handle, r);
+
+               sas_device_priv_data->block = 0;
+               r = scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+               if (r)
+                       sdev_printk(KERN_WARNING, sdev, "retried device_unblock"
+                           " failed with return(%d) for handle(0x%04x)\n",
+                           sas_device_priv_data->sas_target->handle, r);
+       }
+}
+
 /**
  * _scsih_ublock_io_all_device - unblock every device
  * @ioc: per adapter object
@@ -2570,11 +2692,10 @@ _scsih_ublock_io_all_device(struct MPT3SAS_ADAPTER *ioc)
                if (!sas_device_priv_data->block)
                        continue;
 
-               sas_device_priv_data->block = 0;
                dewtprintk(ioc, sdev_printk(KERN_INFO, sdev,
                        "device_running, handle(0x%04x)\n",
                    sas_device_priv_data->sas_target->handle));
-               scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+               _scsih_internal_device_unblock(sdev, sas_device_priv_data);
        }
 }
 
@@ -2599,10 +2720,9 @@ _scsih_ublock_io_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address)
                if (sas_device_priv_data->sas_target->sas_address
                    != sas_address)
                        continue;
-               if (sas_device_priv_data->block) {
-                       sas_device_priv_data->block = 0;
-                       scsi_internal_device_unblock(sdev, SDEV_RUNNING);
-               }
+               if (sas_device_priv_data->block)
+                       _scsih_internal_device_unblock(sdev,
+                               sas_device_priv_data);
        }
 }
 
@@ -2625,10 +2745,7 @@ _scsih_block_io_all_device(struct MPT3SAS_ADAPTER *ioc)
                        continue;
                if (sas_device_priv_data->block)
                        continue;
-               sas_device_priv_data->block = 1;
-               scsi_internal_device_block(sdev);
-               sdev_printk(KERN_INFO, sdev, "device_blocked, handle(0x%04x)\n",
-                   sas_device_priv_data->sas_target->handle);
+               _scsih_internal_device_block(sdev, sas_device_priv_data);
        }
 }
 
@@ -2644,6 +2761,11 @@ _scsih_block_io_device(struct MPT3SAS_ADAPTER *ioc, u16 handle)
 {
        struct MPT3SAS_DEVICE *sas_device_priv_data;
        struct scsi_device *sdev;
+       struct _sas_device *sas_device;
+
+       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       if (!sas_device)
+               return;
 
        shost_for_each_device(sdev, ioc->shost) {
                sas_device_priv_data = sdev->hostdata;
@@ -2653,10 +2775,9 @@ _scsih_block_io_device(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                        continue;
                if (sas_device_priv_data->block)
                        continue;
-               sas_device_priv_data->block = 1;
-               scsi_internal_device_block(sdev);
-               sdev_printk(KERN_INFO, sdev,
-                       "device_blocked, handle(0x%04x)\n", handle);
+               if (sas_device->pend_sas_rphy_add)
+                       continue;
+               _scsih_internal_device_block(sdev, sas_device_priv_data);
        }
 }
 
@@ -2806,6 +2927,18 @@ _scsih_tm_tr_send(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                        "setting delete flag: handle(0x%04x), sas_addr(0x%016llx)\n",
                        ioc->name, handle,
                    (unsigned long long)sas_address));
+               if (sas_device->enclosure_handle != 0)
+                       dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                        "setting delete flag:enclosure logical id(0x%016llx),"
+                        " slot(%d)\n", ioc->name, (unsigned long long)
+                         sas_device->enclosure_logical_id,
+                         sas_device->slot));
+               if (sas_device->connector_name)
+                       dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                        "setting delete flag: enclosure level(0x%04x),"
+                        " connector name( %s)\n", ioc->name,
+                         sas_device->enclosure_level,
+                         sas_device->connector_name));
                _scsih_ublock_io_device(ioc, sas_address);
                sas_target_priv_data->handle = MPT3SAS_INVALID_DEVICE_HANDLE;
        }
@@ -3821,10 +3954,19 @@ _scsih_scsi_ioc_info(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                                "\tsas_address(0x%016llx), phy(%d)\n",
                                ioc->name, (unsigned long long)
                            sas_device->sas_address, sas_device->phy);
-                       pr_warn(MPT3SAS_FMT
-                           "\tenclosure_logical_id(0x%016llx), slot(%d)\n",
-                           ioc->name, (unsigned long long)
-                           sas_device->enclosure_logical_id, sas_device->slot);
+                       if (sas_device->enclosure_handle != 0)
+                               pr_warn(MPT3SAS_FMT
+                                 "\tenclosure_logical_id(0x%016llx),"
+                                 "slot(%d)\n", ioc->name,
+                                 (unsigned long long)
+                                 sas_device->enclosure_logical_id,
+                                 sas_device->slot);
+                       if (sas_device->connector_name[0])
+                               pr_warn(MPT3SAS_FMT
+                                 "\tenclosure level(0x%04x),"
+                                 " connector name( %s)\n", ioc->name,
+                                 sas_device->enclosure_level,
+                                 sas_device->connector_name);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -3999,7 +4141,16 @@ _scsih_smart_predicted_fault(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
                return;
        }
-       starget_printk(KERN_WARNING, starget, "predicted fault\n");
+       if (sas_device->enclosure_handle != 0)
+               starget_printk(KERN_INFO, starget, "predicted fault, "
+                       "enclosure logical id(0x%016llx), slot(%d)\n",
+                       (unsigned long long)sas_device->enclosure_logical_id,
+                       sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               starget_printk(KERN_WARNING, starget, "predicted fault, "
+                       "enclosure level(0x%04x), connector name( %s)\n",
+                       sas_device->enclosure_level,
+                       sas_device->connector_name);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
        if (ioc->pdev->subsystem_vendor == PCI_VENDOR_ID_IBM)
@@ -4119,8 +4270,15 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
                        _scsih_smart_predicted_fault(ioc,
                            le16_to_cpu(mpi_reply->DevHandle));
                mpt3sas_trigger_scsi(ioc, data.skey, data.asc, data.ascq);
-       }
 
+#ifdef CONFIG_SCSI_MPT3SAS_LOGGING
+               if (!(ioc->logging_level & MPT_DEBUG_REPLY) &&
+                    ((scmd->sense_buffer[2] == UNIT_ATTENTION) ||
+                    (scmd->sense_buffer[2] == MEDIUM_ERROR) ||
+                    (scmd->sense_buffer[2] == HARDWARE_ERROR)))
+                       _scsih_scsi_ioc_info(ioc, scmd, mpi_reply, smid);
+#endif
+       }
        switch (ioc_status) {
        case MPI2_IOCSTATUS_BUSY:
        case MPI2_IOCSTATUS_INSUFFICIENT_RESOURCES:
@@ -4146,6 +4304,9 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
                                scmd->device->expecting_cc_ua = 1;
                        }
                        break;
+               } else if (log_info == VIRTUAL_IO_FAILED_RETRY) {
+                       scmd->result = DID_RESET << 16;
+                       break;
                }
                scmd->result = DID_SOFT_ERROR << 16;
                break;
@@ -4788,6 +4949,16 @@ _scsih_check_device(struct MPT3SAS_ADAPTER *ioc,
                        sas_device->handle, handle);
                sas_target_priv_data->handle = handle;
                sas_device->handle = handle;
+               if (sas_device_pg0.Flags &
+                    MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+                       sas_device->enclosure_level =
+                               le16_to_cpu(sas_device_pg0.EnclosureLevel);
+                       memcpy(&sas_device->connector_name[0],
+                               &sas_device_pg0.ConnectorName[0], 4);
+               } else {
+                       sas_device->enclosure_level = 0;
+                       sas_device->connector_name[0] = '\0';
+               }
        }
 
        /* check if device is present */
@@ -4894,14 +5065,24 @@ _scsih_add_device(struct MPT3SAS_ADAPTER *ioc, u16 handle, u8 phy_num,
                    ioc->name, __FILE__, __LINE__, __func__);
        sas_device->enclosure_handle =
            le16_to_cpu(sas_device_pg0.EnclosureHandle);
-       sas_device->slot =
-           le16_to_cpu(sas_device_pg0.Slot);
+       if (sas_device->enclosure_handle != 0)
+               sas_device->slot =
+                   le16_to_cpu(sas_device_pg0.Slot);
        sas_device->device_info = device_info;
        sas_device->sas_address = sas_address;
        sas_device->phy = sas_device_pg0.PhyNum;
        sas_device->fast_path = (le16_to_cpu(sas_device_pg0.Flags) &
            MPI25_SAS_DEVICE0_FLAGS_FAST_PATH_CAPABLE) ? 1 : 0;
 
+       if (sas_device_pg0.Flags & MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+               sas_device->enclosure_level =
+                       le16_to_cpu(sas_device_pg0.EnclosureLevel);
+               memcpy(&sas_device->connector_name[0],
+                       &sas_device_pg0.ConnectorName[0], 4);
+       } else {
+               sas_device->enclosure_level = 0;
+               sas_device->connector_name[0] = '\0';
+       }
        /* get enclosure_logical_id */
        if (sas_device->enclosure_handle && !(mpt3sas_config_get_enclosure_pg0(
           ioc, &mpi_reply, &enclosure_pg0, MPI2_SAS_ENCLOS_PGAD_FORM_HANDLE,
@@ -4943,6 +5124,18 @@ _scsih_remove_device(struct MPT3SAS_ADAPTER *ioc,
                ioc->name, __func__,
            sas_device->handle, (unsigned long long)
            sas_device->sas_address));
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enter: enclosure logical id(0x%016llx), slot(%d)\n",
+                   ioc->name, __func__,
+                   (unsigned long long)sas_device->enclosure_logical_id,
+                   sas_device->slot));
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                 "%s: enter: enclosure level(0x%04x), connector name( %s)\n",
+                 ioc->name, __func__,
+                 sas_device->enclosure_level,
+                 sas_device->connector_name));
 
        if (sas_device->starget && sas_device->starget->hostdata) {
                sas_target_priv_data = sas_device->starget->hostdata;
@@ -4959,12 +5152,34 @@ _scsih_remove_device(struct MPT3SAS_ADAPTER *ioc,
                "removing handle(0x%04x), sas_addr(0x%016llx)\n",
                ioc->name, sas_device->handle,
            (unsigned long long) sas_device->sas_address);
+       if (sas_device->enclosure_handle != 0)
+               pr_info(MPT3SAS_FMT
+                 "removing : enclosure logical id(0x%016llx), slot(%d)\n",
+                 ioc->name,
+                 (unsigned long long)sas_device->enclosure_logical_id,
+                 sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               pr_info(MPT3SAS_FMT
+                 "removing enclosure level(0x%04x), connector name( %s)\n",
+                 ioc->name, sas_device->enclosure_level,
+                 sas_device->connector_name);
 
        dewtprintk(ioc, pr_info(MPT3SAS_FMT
                "%s: exit: handle(0x%04x), sas_addr(0x%016llx)\n",
                ioc->name, __func__,
-           sas_device->handle, (unsigned long long)
-           sas_device->sas_address));
+               sas_device->handle, (unsigned long long)
+               sas_device->sas_address));
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: exit: enclosure logical id(0x%016llx), slot(%d)\n",
+                   ioc->name, __func__,
+                   (unsigned long long)sas_device->enclosure_logical_id,
+                   sas_device->slot));
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: exit: enclosure level(0x%04x), connector name(%s)\n",
+                   ioc->name, __func__, sas_device->enclosure_level,
+                   sas_device->connector_name));
 
        kfree(sas_device);
 }
@@ -6357,9 +6572,7 @@ _scsih_prep_device_scan(struct MPT3SAS_ADAPTER *ioc)
 /**
  * _scsih_mark_responding_sas_device - mark a sas_devices as responding
  * @ioc: per adapter object
- * @sas_address: sas address
- * @slot: enclosure slot id
- * @handle: device handle
+ * @sas_device_pg0: SAS Device page 0
  *
  * After host reset, find out whether devices are still responding.
  * Used in _scsih_remove_unresponsive_sas_devices.
@@ -6367,8 +6580,8 @@ _scsih_prep_device_scan(struct MPT3SAS_ADAPTER *ioc)
  * Return nothing.
  */
 static void
-_scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
-       u16 slot, u16 handle)
+_scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc,
+Mpi2SasDevicePage0_t *sas_device_pg0)
 {
        struct MPT3SAS_TARGET *sas_target_priv_data = NULL;
        struct scsi_target *starget;
@@ -6377,8 +6590,8 @@ _scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_for_each_entry(sas_device, &ioc->sas_device_list, list) {
-               if (sas_device->sas_address == sas_address &&
-                   sas_device->slot == slot) {
+               if ((sas_device->sas_address == sas_device_pg0->SASAddress) &&
+                       (sas_device->slot == sas_device_pg0->Slot)) {
                        sas_device->responding = 1;
                        starget = sas_device->starget;
                        if (starget && starget->hostdata) {
@@ -6387,22 +6600,40 @@ _scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
                                sas_target_priv_data->deleted = 0;
                        } else
                                sas_target_priv_data = NULL;
-                       if (starget)
+                       if (starget) {
                                starget_printk(KERN_INFO, starget,
-                                   "handle(0x%04x), sas_addr(0x%016llx), "
-                                   "enclosure logical id(0x%016llx), "
-                                   "slot(%d)\n", handle,
-                                   (unsigned long long)sas_device->sas_address,
+                                   "handle(0x%04x), sas_addr(0x%016llx)\n",
+                                   sas_device_pg0->DevHandle,
                                    (unsigned long long)
-                                   sas_device->enclosure_logical_id,
-                                   sas_device->slot);
-                       if (sas_device->handle == handle)
+                                   sas_device->sas_address);
+
+                               if (sas_device->enclosure_handle != 0)
+                                       starget_printk(KERN_INFO, starget,
+                                        "enclosure logical id(0x%016llx),"
+                                        " slot(%d)\n",
+                                        (unsigned long long)
+                                        sas_device->enclosure_logical_id,
+                                        sas_device->slot);
+                       }
+                       if (sas_device_pg0->Flags &
+                             MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+                               sas_device->enclosure_level =
+                                  le16_to_cpu(sas_device_pg0->EnclosureLevel);
+                               memcpy(&sas_device->connector_name[0],
+                                       &sas_device_pg0->ConnectorName[0], 4);
+                       } else {
+                               sas_device->enclosure_level = 0;
+                               sas_device->connector_name[0] = '\0';
+                       }
+
+                       if (sas_device->handle == sas_device_pg0->DevHandle)
                                goto out;
                        pr_info("\thandle changed from(0x%04x)!!!\n",
                            sas_device->handle);
-                       sas_device->handle = handle;
+                       sas_device->handle = sas_device_pg0->DevHandle;
                        if (sas_target_priv_data)
-                               sas_target_priv_data->handle = handle;
+                               sas_target_priv_data->handle =
+                                       sas_device_pg0->DevHandle;
                        goto out;
                }
        }
@@ -6441,13 +6672,15 @@ _scsih_search_responding_sas_devices(struct MPT3SAS_ADAPTER *ioc)
                    MPI2_IOCSTATUS_MASK;
                if (ioc_status != MPI2_IOCSTATUS_SUCCESS)
                        break;
-               handle = le16_to_cpu(sas_device_pg0.DevHandle);
+               handle = sas_device_pg0.DevHandle =
+                               le16_to_cpu(sas_device_pg0.DevHandle);
                device_info = le32_to_cpu(sas_device_pg0.DeviceInfo);
                if (!(_scsih_is_end_device(device_info)))
                        continue;
-               _scsih_mark_responding_sas_device(ioc,
-                   le64_to_cpu(sas_device_pg0.SASAddress),
-                   le16_to_cpu(sas_device_pg0.Slot), handle);
+               sas_device_pg0.SASAddress =
+                               le64_to_cpu(sas_device_pg0.SASAddress);
+               sas_device_pg0.Slot = le16_to_cpu(sas_device_pg0.Slot);
+               _scsih_mark_responding_sas_device(ioc, &sas_device_pg0);
        }
 
  out:
@@ -7854,8 +8087,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        /* event thread */
        snprintf(ioc->firmware_event_name, sizeof(ioc->firmware_event_name),
            "fw_event%d", ioc->id);
-       ioc->firmware_event_thread = create_singlethread_workqueue(
-           ioc->firmware_event_name);
+       ioc->firmware_event_thread = alloc_ordered_workqueue(
+           ioc->firmware_event_name, WQ_MEM_RECLAIM);
        if (!ioc->firmware_event_thread) {
                pr_err(MPT3SAS_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
index efb98af..70fd019 100644 (file)
@@ -649,6 +649,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle,
        unsigned long flags;
        struct _sas_node *sas_node;
        struct sas_rphy *rphy;
+       struct _sas_device *sas_device = NULL;
        int i;
        struct sas_port *port;
 
@@ -731,10 +732,27 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle,
                    mpt3sas_port->remote_identify.device_type);
 
        rphy->identify = mpt3sas_port->remote_identify;
+
+       if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) {
+               sas_device = mpt3sas_scsih_sas_device_find_by_sas_address(ioc,
+                                   mpt3sas_port->remote_identify.sas_address);
+               if (!sas_device) {
+                       dfailprintk(ioc, printk(MPT3SAS_FMT
+                               "failure at %s:%d/%s()!\n",
+                               ioc->name, __FILE__, __LINE__, __func__));
+                       goto out_fail;
+               }
+               sas_device->pend_sas_rphy_add = 1;
+       }
+
        if ((sas_rphy_add(rphy))) {
                pr_err(MPT3SAS_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
        }
+
+       if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE)
+               sas_device->pend_sas_rphy_add = 0;
+
        if ((ioc->logging_level & MPT_DEBUG_TRANSPORT))
                dev_printk(KERN_INFO, &rphy->dev,
                        "add: handle(0x%04x), sas_addr(0x%016llx)\n",
@@ -1946,7 +1964,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        } else {
                dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio),
                    blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
-               if (!dma_addr_out) {
+               if (pci_dma_mapping_error(ioc->pdev, dma_addr_out)) {
                        pr_info(MPT3SAS_FMT "%s(): DMA Addr out = NULL\n",
                            ioc->name, __func__);
                        rc = -ENOMEM;
@@ -1968,7 +1986,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        } else {
                dma_addr_in =  pci_map_single(ioc->pdev, bio_data(rsp->bio),
                    blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
-               if (!dma_addr_in) {
+               if (pci_dma_mapping_error(ioc->pdev, dma_addr_in)) {
                        pr_info(MPT3SAS_FMT "%s(): DMA Addr in = NULL\n",
                            ioc->name, __func__);
                        rc = -ENOMEM;
index 39306b1..04e67a1 100644 (file)
@@ -2642,6 +2642,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
                ts->resp = SAS_TASK_COMPLETE;
                ts->stat = SAS_OPEN_REJECT;
                ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+               break;
        default:
                PM8001_IO_DBG(pm8001_ha,
                        pm8001_printk("Unknown status 0x%x\n", status));
index 0e1628f..9a389f1 100644 (file)
@@ -2337,6 +2337,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
                ts->resp = SAS_TASK_COMPLETE;
                ts->stat = SAS_OPEN_REJECT;
                ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+               break;
        default:
                PM8001_IO_DBG(pm8001_ha,
                        pm8001_printk("Unknown status 0x%x\n", status));
index 33f60c9..a0f732b 100644 (file)
@@ -32,10 +32,10 @@ config SCSI_QLA_FC
        They are also included in the linux-firmware tree as well.
 
 config TCM_QLA2XXX
-       tristate "TCM_QLA2XXX fabric module for Qlogic 2xxx series target mode HBAs"
+       tristate "TCM_QLA2XXX fabric module for QLogic 24xx+ series target mode HBAs"
        depends on SCSI_QLA_FC && TARGET_CORE
        depends on LIBFC
        select BTREE
        default n
        ---help---
-       Say Y here to enable the TCM_QLA2XXX fabric module for Qlogic 2xxx series target mode HBAs
+       Say Y here to enable the TCM_QLA2XXX fabric module for QLogic 24xx+ series target mode HBAs
index 7ed7bae..ac65cb7 100644 (file)
@@ -1359,9 +1359,7 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess)
        struct qla_hw_data *ha = tgt->ha;
        scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev);
        struct se_session *se_sess;
-       struct se_node_acl *se_nacl;
        struct tcm_qla2xxx_lport *lport;
-       struct tcm_qla2xxx_nacl *nacl;
 
        BUG_ON(in_interrupt());
 
@@ -1371,8 +1369,6 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess)
                dump_stack();
                return;
        }
-       se_nacl = se_sess->se_node_acl;
-       nacl = container_of(se_nacl, struct tcm_qla2xxx_nacl, se_node_acl);
 
        lport = vha->vha_tgt.target_lport_ptr;
        if (!lport) {
@@ -1680,7 +1676,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha,
                        (struct tcm_qla2xxx_lport *)target_lport_ptr;
        struct tcm_qla2xxx_lport *base_lport =
                        (struct tcm_qla2xxx_lport *)base_vha->vha_tgt.target_lport_ptr;
-       struct tcm_qla2xxx_tpg *base_tpg;
        struct fc_vport_identifiers vport_id;
 
        if (!qla_tgt_mode_enabled(base_vha)) {
@@ -1693,7 +1688,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha,
                pr_err("qla2xxx base_lport or tpg_1 not available\n");
                return -EPERM;
        }
-       base_tpg = base_lport->tpg_1;
 
        memset(&vport_id, 0, sizeof(vport_id));
        vport_id.port_name = npiv_wwpn;
@@ -1810,6 +1804,11 @@ static const struct target_core_fabric_ops tcm_qla2xxx_ops = {
        .module                         = THIS_MODULE,
        .name                           = "qla2xxx",
        .node_acl_size                  = sizeof(struct tcm_qla2xxx_nacl),
+       /*
+        * XXX: Limit assumes single page per scatter-gather-list entry.
+        * Current maximum is ~4.9 MB per se_cmd->t_data_sg with PAGE_SIZE=4096
+        */
+       .max_data_sg_nents              = 1200,
        .get_fabric_name                = tcm_qla2xxx_get_fabric_name,
        .tpg_get_wwn                    = tcm_qla2xxx_get_fabric_wwn,
        .tpg_get_tag                    = tcm_qla2xxx_get_tag,
@@ -1958,7 +1957,7 @@ static void __exit tcm_qla2xxx_exit(void)
        tcm_qla2xxx_deregister_configfs();
 }
 
-MODULE_DESCRIPTION("TCM QLA2XXX series NPIV enabled fabric driver");
+MODULE_DESCRIPTION("TCM QLA24XX+ series NPIV enabled fabric driver");
 MODULE_LICENSE("GPL");
 module_init(tcm_qla2xxx_init);
 module_exit(tcm_qla2xxx_exit);
index 2ff0922..c126966 100644 (file)
@@ -5,6 +5,8 @@
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/errno.h>
+#include <asm/unaligned.h>
 #include <scsi/scsi_common.h>
 
 /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI.
@@ -176,3 +178,110 @@ bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len,
        return true;
 }
 EXPORT_SYMBOL(scsi_normalize_sense);
+
+/**
+ * scsi_sense_desc_find - search for a given descriptor type in        descriptor sense data format.
+ * @sense_buffer:      byte array of descriptor format sense data
+ * @sb_len:            number of valid bytes in sense_buffer
+ * @desc_type:         value of descriptor type to find
+ *                     (e.g. 0 -> information)
+ *
+ * Notes:
+ *     only valid when sense data is in descriptor format
+ *
+ * Return value:
+ *     pointer to start of (first) descriptor if found else NULL
+ */
+const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
+                               int desc_type)
+{
+       int add_sen_len, add_len, desc_len, k;
+       const u8 * descp;
+
+       if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7])))
+               return NULL;
+       if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73))
+               return NULL;
+       add_sen_len = (add_sen_len < (sb_len - 8)) ?
+                       add_sen_len : (sb_len - 8);
+       descp = &sense_buffer[8];
+       for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) {
+               descp += desc_len;
+               add_len = (k < (add_sen_len - 1)) ? descp[1]: -1;
+               desc_len = add_len + 2;
+               if (descp[0] == desc_type)
+                       return descp;
+               if (add_len < 0) // short descriptor ??
+                       break;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(scsi_sense_desc_find);
+
+/**
+ * scsi_build_sense_buffer - build sense data in a buffer
+ * @desc:      Sense format (non zero == descriptor format,
+ *              0 == fixed format)
+ * @buf:       Where to build sense data
+ * @key:       Sense key
+ * @asc:       Additional sense code
+ * @ascq:      Additional sense code qualifier
+ *
+ **/
+void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq)
+{
+       if (desc) {
+               buf[0] = 0x72;  /* descriptor, current */
+               buf[1] = key;
+               buf[2] = asc;
+               buf[3] = ascq;
+               buf[7] = 0;
+       } else {
+               buf[0] = 0x70;  /* fixed, current */
+               buf[2] = key;
+               buf[7] = 0xa;
+               buf[12] = asc;
+               buf[13] = ascq;
+       }
+}
+EXPORT_SYMBOL(scsi_build_sense_buffer);
+
+/**
+ * scsi_set_sense_information - set the information field in a
+ *             formatted sense data buffer
+ * @buf:       Where to build sense data
+ * @buf_len:    buffer length
+ * @info:      64-bit information value to be set
+ *
+ * Return value:
+ *     0 on success or EINVAL for invalid sense buffer length
+ **/
+int scsi_set_sense_information(u8 *buf, int buf_len, u64 info)
+{
+       if ((buf[0] & 0x7f) == 0x72) {
+               u8 *ucp, len;
+
+               len = buf[7];
+               ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0);
+               if (!ucp) {
+                       buf[7] = len + 0xc;
+                       ucp = buf + 8 + len;
+               }
+
+               if (buf_len < len + 0xc)
+                       /* Not enough room for info */
+                       return -EINVAL;
+
+               ucp[0] = 0;
+               ucp[1] = 0xa;
+               ucp[2] = 0x80; /* Valid bit */
+               ucp[3] = 0;
+               put_unaligned_be64(info, &ucp[4]);
+       } else if ((buf[0] & 0x7f) == 0x70) {
+               buf[0] |= 0x80;
+               put_unaligned_be64(info, &buf[3]);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(scsi_set_sense_information);
index 30268bb..dfcc45b 100644 (file)
@@ -25,6 +25,9 @@
  *        module options to "modprobe scsi_debug num_tgts=2" [20021221]
  */
 
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
 #include <linux/module.h>
 
 #include <linux/kernel.h>
@@ -201,7 +204,6 @@ static const char *scsi_debug_version_date = "20141022";
 /* If REPORT LUNS has luns >= 256 it can choose "flat space" (value 1)
  * or "peripheral device" addressing (value 0) */
 #define SAM2_LUN_ADDRESS_METHOD 0
-#define SAM2_WLUN_REPORT_LUNS 0xc101
 
 /* SCSI_DEBUG_CANQUEUE is the maximum number of commands that can be queued
  * (for response) at one time. Can be reduced by max_queue option. Command
@@ -698,7 +700,7 @@ static void sdebug_max_tgts_luns(void)
                else
                        hpnt->max_id = scsi_debug_num_tgts;
                /* scsi_debug_max_luns; */
-               hpnt->max_lun = SAM2_WLUN_REPORT_LUNS;
+               hpnt->max_lun = SCSI_W_LUN_REPORT_LUNS + 1;
        }
        spin_unlock(&sdebug_host_list_lock);
 }
@@ -1288,7 +1290,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
        arr = kzalloc(SDEBUG_MAX_INQ_ARR_SZ, GFP_ATOMIC);
        if (! arr)
                return DID_REQUEUE << 16;
-       have_wlun = (scp->device->lun == SAM2_WLUN_REPORT_LUNS);
+       have_wlun = (scp->device->lun == SCSI_W_LUN_REPORT_LUNS);
        if (have_wlun)
                pq_pdt = 0x1e;  /* present, wlun */
        else if (scsi_debug_no_lun_0 && (0 == devip->lun))
@@ -1427,12 +1429,11 @@ static int resp_requests(struct scsi_cmnd * scp,
        unsigned char * sbuff;
        unsigned char *cmd = scp->cmnd;
        unsigned char arr[SCSI_SENSE_BUFFERSIZE];
-       bool dsense, want_dsense;
+       bool dsense;
        int len = 18;
 
        memset(arr, 0, sizeof(arr));
        dsense = !!(cmd[1] & 1);
-       want_dsense = dsense || scsi_debug_dsense;
        sbuff = scp->sense_buffer;
        if ((iec_m_pg[2] & 0x4) && (6 == (iec_m_pg[3] & 0xf))) {
                if (dsense) {
@@ -2446,8 +2447,7 @@ static int dif_verify(struct sd_dif_tuple *sdt, const void *data,
        __be16 csum = dif_compute_csum(data, scsi_debug_sector_size);
 
        if (sdt->guard_tag != csum) {
-               pr_err("%s: GUARD check failed on sector %lu rcvd 0x%04x, data 0x%04x\n",
-                       __func__,
+               pr_err("GUARD check failed on sector %lu rcvd 0x%04x, data 0x%04x\n",
                        (unsigned long)sector,
                        be16_to_cpu(sdt->guard_tag),
                        be16_to_cpu(csum));
@@ -2455,14 +2455,14 @@ static int dif_verify(struct sd_dif_tuple *sdt, const void *data,
        }
        if (scsi_debug_dif == SD_DIF_TYPE1_PROTECTION &&
            be32_to_cpu(sdt->ref_tag) != (sector & 0xffffffff)) {
-               pr_err("%s: REF check failed on sector %lu\n",
-                       __func__, (unsigned long)sector);
+               pr_err("REF check failed on sector %lu\n",
+                       (unsigned long)sector);
                return 0x03;
        }
        if (scsi_debug_dif == SD_DIF_TYPE2_PROTECTION &&
            be32_to_cpu(sdt->ref_tag) != ei_lba) {
-               pr_err("%s: REF check failed on sector %lu\n",
-                       __func__, (unsigned long)sector);
+               pr_err("REF check failed on sector %lu\n",
+                       (unsigned long)sector);
                return 0x03;
        }
        return 0;
@@ -2680,7 +2680,7 @@ resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
        return 0;
 }
 
-void dump_sector(unsigned char *buf, int len)
+static void dump_sector(unsigned char *buf, int len)
 {
        int i, j, n;
 
@@ -3365,8 +3365,8 @@ static int resp_report_luns(struct scsi_cmnd * scp,
                one_lun[i].scsi_lun[1] = lun & 0xff;
        }
        if (want_wlun) {
-               one_lun[i].scsi_lun[0] = (SAM2_WLUN_REPORT_LUNS >> 8) & 0xff;
-               one_lun[i].scsi_lun[1] = SAM2_WLUN_REPORT_LUNS & 0xff;
+               one_lun[i].scsi_lun[0] = (SCSI_W_LUN_REPORT_LUNS >> 8) & 0xff;
+               one_lun[i].scsi_lun[1] = SCSI_W_LUN_REPORT_LUNS & 0xff;
                i++;
        }
        alloc_len = (unsigned char *)(one_lun + i) - arr;
@@ -3449,7 +3449,7 @@ static void sdebug_q_cmd_complete(unsigned long indx)
        atomic_inc(&sdebug_completions);
        qa_indx = indx;
        if ((qa_indx < 0) || (qa_indx >= SCSI_DEBUG_CANQUEUE)) {
-               pr_err("%s: wild qa_indx=%d\n", __func__, qa_indx);
+               pr_err("wild qa_indx=%d\n", qa_indx);
                return;
        }
        spin_lock_irqsave(&queued_arr_lock, iflags);
@@ -3457,21 +3457,21 @@ static void sdebug_q_cmd_complete(unsigned long indx)
        scp = sqcp->a_cmnd;
        if (NULL == scp) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: scp is NULL\n", __func__);
+               pr_err("scp is NULL\n");
                return;
        }
        devip = (struct sdebug_dev_info *)scp->device->hostdata;
        if (devip)
                atomic_dec(&devip->num_in_q);
        else
-               pr_err("%s: devip=NULL\n", __func__);
+               pr_err("devip=NULL\n");
        if (atomic_read(&retired_max_queue) > 0)
                retiring = 1;
 
        sqcp->a_cmnd = NULL;
        if (!test_and_clear_bit(qa_indx, queued_in_use_bm)) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: Unexpected completion\n", __func__);
+               pr_err("Unexpected completion\n");
                return;
        }
 
@@ -3481,7 +3481,7 @@ static void sdebug_q_cmd_complete(unsigned long indx)
                retval = atomic_read(&retired_max_queue);
                if (qa_indx >= retval) {
                        spin_unlock_irqrestore(&queued_arr_lock, iflags);
-                       pr_err("%s: index %d too large\n", __func__, retval);
+                       pr_err("index %d too large\n", retval);
                        return;
                }
                k = find_last_bit(queued_in_use_bm, retval);
@@ -3509,7 +3509,7 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
        atomic_inc(&sdebug_completions);
        qa_indx = sd_hrtp->qa_indx;
        if ((qa_indx < 0) || (qa_indx >= SCSI_DEBUG_CANQUEUE)) {
-               pr_err("%s: wild qa_indx=%d\n", __func__, qa_indx);
+               pr_err("wild qa_indx=%d\n", qa_indx);
                goto the_end;
        }
        spin_lock_irqsave(&queued_arr_lock, iflags);
@@ -3517,21 +3517,21 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
        scp = sqcp->a_cmnd;
        if (NULL == scp) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: scp is NULL\n", __func__);
+               pr_err("scp is NULL\n");
                goto the_end;
        }
        devip = (struct sdebug_dev_info *)scp->device->hostdata;
        if (devip)
                atomic_dec(&devip->num_in_q);
        else
-               pr_err("%s: devip=NULL\n", __func__);
+               pr_err("devip=NULL\n");
        if (atomic_read(&retired_max_queue) > 0)
                retiring = 1;
 
        sqcp->a_cmnd = NULL;
        if (!test_and_clear_bit(qa_indx, queued_in_use_bm)) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: Unexpected completion\n", __func__);
+               pr_err("Unexpected completion\n");
                goto the_end;
        }
 
@@ -3541,7 +3541,7 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
                retval = atomic_read(&retired_max_queue);
                if (qa_indx >= retval) {
                        spin_unlock_irqrestore(&queued_arr_lock, iflags);
-                       pr_err("%s: index %d too large\n", __func__, retval);
+                       pr_err("index %d too large\n", retval);
                        goto the_end;
                }
                k = find_last_bit(queued_in_use_bm, retval);
@@ -3580,7 +3580,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
                return devip;
        sdbg_host = *(struct sdebug_host_info **)shost_priv(sdev->host);
        if (!sdbg_host) {
-               pr_err("%s: Host info NULL\n", __func__);
+               pr_err("Host info NULL\n");
                return NULL;
         }
        list_for_each_entry(devip, &sdbg_host->dev_info_list, dev_list) {
@@ -3596,8 +3596,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
        if (!open_devip) { /* try and make a new one */
                open_devip = sdebug_device_create(sdbg_host, GFP_ATOMIC);
                if (!open_devip) {
-                       printk(KERN_ERR "%s: out of memory at line %d\n",
-                               __func__, __LINE__);
+                       pr_err("out of memory at line %d\n", __LINE__);
                        return NULL;
                }
        }
@@ -3615,7 +3614,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
 static int scsi_debug_slave_alloc(struct scsi_device *sdp)
 {
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %llu>\n",
+               pr_info("slave_alloc <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        queue_flag_set_unlocked(QUEUE_FLAG_BIDI, sdp->request_queue);
        return 0;
@@ -3626,7 +3625,7 @@ static int scsi_debug_slave_configure(struct scsi_device *sdp)
        struct sdebug_dev_info *devip;
 
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_configure <%u %u %u %llu>\n",
+               pr_info("slave_configure <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        if (sdp->host->max_cmd_len != SCSI_DEBUG_MAX_CMD_LEN)
                sdp->host->max_cmd_len = SCSI_DEBUG_MAX_CMD_LEN;
@@ -3646,7 +3645,7 @@ static void scsi_debug_slave_destroy(struct scsi_device *sdp)
                (struct sdebug_dev_info *)sdp->hostdata;
 
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_destroy <%u %u %u %llu>\n",
+               pr_info("slave_destroy <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        if (devip) {
                /* make this slot available for re-use */
@@ -3897,8 +3896,7 @@ static void __init sdebug_build_parts(unsigned char *ramp,
                return;
        if (scsi_debug_num_parts > SDEBUG_MAX_PARTS) {
                scsi_debug_num_parts = SDEBUG_MAX_PARTS;
-               pr_warn("%s: reducing partitions to %d\n", __func__,
-                       SDEBUG_MAX_PARTS);
+               pr_warn("reducing partitions to %d\n", SDEBUG_MAX_PARTS);
        }
        num_sectors = (int)sdebug_store_sectors;
        sectors_per_part = (num_sectors - sdebug_sectors_per)
@@ -3942,14 +3940,20 @@ schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
        unsigned long iflags;
        int k, num_in_q, qdepth, inject;
        struct sdebug_queued_cmd *sqcp = NULL;
-       struct scsi_device *sdp = cmnd->device;
+       struct scsi_device *sdp;
+
+       /* this should never happen */
+       if (WARN_ON(!cmnd))
+               return SCSI_MLQUEUE_HOST_BUSY;
 
-       if (NULL == cmnd || NULL == devip) {
-               pr_warn("%s: called with NULL cmnd or devip pointer\n",
-                       __func__);
+       if (NULL == devip) {
+               pr_warn("called devip == NULL\n");
                /* no particularly good error to report back */
                return SCSI_MLQUEUE_HOST_BUSY;
        }
+
+       sdp = cmnd->device;
+
        if ((scsi_result) && (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts))
                sdev_printk(KERN_INFO, sdp, "%s: non-zero result=0x%x\n",
                            __func__, scsi_result);
@@ -4383,8 +4387,7 @@ static ssize_t fake_rw_store(struct device_driver *ddp, const char *buf,
 
                                fake_storep = vmalloc(sz);
                                if (NULL == fake_storep) {
-                                       pr_err("%s: out of memory, 9\n",
-                                              __func__);
+                                       pr_err("out of memory, 9\n");
                                        return -ENOMEM;
                                }
                                memset(fake_storep, 0, sz);
@@ -4784,8 +4787,7 @@ static int __init scsi_debug_init(void)
        atomic_set(&retired_max_queue, 0);
 
        if (scsi_debug_ndelay >= 1000000000) {
-               pr_warn("%s: ndelay must be less than 1 second, ignored\n",
-                       __func__);
+               pr_warn("ndelay must be less than 1 second, ignored\n");
                scsi_debug_ndelay = 0;
        } else if (scsi_debug_ndelay > 0)
                scsi_debug_delay = DELAY_OVERRIDDEN;
@@ -4797,8 +4799,7 @@ static int __init scsi_debug_init(void)
        case 4096:
                break;
        default:
-               pr_err("%s: invalid sector_size %d\n", __func__,
-                      scsi_debug_sector_size);
+               pr_err("invalid sector_size %d\n", scsi_debug_sector_size);
                return -EINVAL;
        }
 
@@ -4811,29 +4812,28 @@ static int __init scsi_debug_init(void)
                break;
 
        default:
-               pr_err("%s: dif must be 0, 1, 2 or 3\n", __func__);
+               pr_err("dif must be 0, 1, 2 or 3\n");
                return -EINVAL;
        }
 
        if (scsi_debug_guard > 1) {
-               pr_err("%s: guard must be 0 or 1\n", __func__);
+               pr_err("guard must be 0 or 1\n");
                return -EINVAL;
        }
 
        if (scsi_debug_ato > 1) {
-               pr_err("%s: ato must be 0 or 1\n", __func__);
+               pr_err("ato must be 0 or 1\n");
                return -EINVAL;
        }
 
        if (scsi_debug_physblk_exp > 15) {
-               pr_err("%s: invalid physblk_exp %u\n", __func__,
-                      scsi_debug_physblk_exp);
+               pr_err("invalid physblk_exp %u\n", scsi_debug_physblk_exp);
                return -EINVAL;
        }
 
        if (scsi_debug_lowest_aligned > 0x3fff) {
-               pr_err("%s: lowest_aligned too big: %u\n", __func__,
-                      scsi_debug_lowest_aligned);
+               pr_err("lowest_aligned too big: %u\n",
+                       scsi_debug_lowest_aligned);
                return -EINVAL;
        }
 
@@ -4863,7 +4863,7 @@ static int __init scsi_debug_init(void)
        if (0 == scsi_debug_fake_rw) {
                fake_storep = vmalloc(sz);
                if (NULL == fake_storep) {
-                       pr_err("%s: out of memory, 1\n", __func__);
+                       pr_err("out of memory, 1\n");
                        return -ENOMEM;
                }
                memset(fake_storep, 0, sz);
@@ -4877,11 +4877,10 @@ static int __init scsi_debug_init(void)
                dif_size = sdebug_store_sectors * sizeof(struct sd_dif_tuple);
                dif_storep = vmalloc(dif_size);
 
-               pr_err("%s: dif_storep %u bytes @ %p\n", __func__, dif_size,
-                       dif_storep);
+               pr_err("dif_storep %u bytes @ %p\n", dif_size, dif_storep);
 
                if (dif_storep == NULL) {
-                       pr_err("%s: out of mem. (DIX)\n", __func__);
+                       pr_err("out of mem. (DIX)\n");
                        ret = -ENOMEM;
                        goto free_vm;
                }
@@ -4903,18 +4902,17 @@ static int __init scsi_debug_init(void)
                if (scsi_debug_unmap_alignment &&
                    scsi_debug_unmap_granularity <=
                    scsi_debug_unmap_alignment) {
-                       pr_err("%s: ERR: unmap_granularity <= unmap_alignment\n",
-                              __func__);
+                       pr_err("ERR: unmap_granularity <= unmap_alignment\n");
                        return -EINVAL;
                }
 
                map_size = lba_to_map_index(sdebug_store_sectors - 1) + 1;
                map_storep = vmalloc(BITS_TO_LONGS(map_size) * sizeof(long));
 
-               pr_info("%s: %lu provisioning blocks\n", __func__, map_size);
+               pr_info("%lu provisioning blocks\n", map_size);
 
                if (map_storep == NULL) {
-                       pr_err("%s: out of mem. (MAP)\n", __func__);
+                       pr_err("out of mem. (MAP)\n");
                        ret = -ENOMEM;
                        goto free_vm;
                }
@@ -4928,18 +4926,18 @@ static int __init scsi_debug_init(void)
 
        pseudo_primary = root_device_register("pseudo_0");
        if (IS_ERR(pseudo_primary)) {
-               pr_warn("%s: root_device_register() error\n", __func__);
+               pr_warn("root_device_register() error\n");
                ret = PTR_ERR(pseudo_primary);
                goto free_vm;
        }
        ret = bus_register(&pseudo_lld_bus);
        if (ret < 0) {
-               pr_warn("%s: bus_register error: %d\n", __func__, ret);
+               pr_warn("bus_register error: %d\n", ret);
                goto dev_unreg;
        }
        ret = driver_register(&sdebug_driverfs_driver);
        if (ret < 0) {
-               pr_warn("%s: driver_register error: %d\n", __func__, ret);
+               pr_warn("driver_register error: %d\n", ret);
                goto bus_unreg;
        }
 
@@ -4948,16 +4946,14 @@ static int __init scsi_debug_init(void)
 
         for (k = 0; k < host_to_add; k++) {
                 if (sdebug_add_adapter()) {
-                       pr_err("%s: sdebug_add_adapter failed k=%d\n",
-                               __func__, k);
+                       pr_err("sdebug_add_adapter failed k=%d\n", k);
                         break;
                 }
         }
 
-       if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) {
-               pr_info("%s: built %d host(s)\n", __func__,
-                       scsi_debug_add_host);
-       }
+       if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
+               pr_info("built %d host(s)\n", scsi_debug_add_host);
+
        return 0;
 
 bus_unreg:
@@ -4965,10 +4961,8 @@ bus_unreg:
 dev_unreg:
        root_device_unregister(pseudo_primary);
 free_vm:
-       if (map_storep)
-               vfree(map_storep);
-       if (dif_storep)
-               vfree(dif_storep);
+       vfree(map_storep);
+       vfree(dif_storep);
        vfree(fake_storep);
 
        return ret;
@@ -4986,9 +4980,7 @@ static void __exit scsi_debug_exit(void)
        bus_unregister(&pseudo_lld_bus);
        root_device_unregister(pseudo_primary);
 
-       if (dif_storep)
-               vfree(dif_storep);
-
+       vfree(dif_storep);
        vfree(fake_storep);
 }
 
@@ -5012,8 +5004,7 @@ static int sdebug_add_adapter(void)
 
         sdbg_host = kzalloc(sizeof(*sdbg_host),GFP_KERNEL);
         if (NULL == sdbg_host) {
-                printk(KERN_ERR "%s: out of memory at line %d\n",
-                       __func__, __LINE__);
+               pr_err("out of memory at line %d\n", __LINE__);
                 return -ENOMEM;
         }
 
@@ -5023,8 +5014,7 @@ static int sdebug_add_adapter(void)
         for (k = 0; k < devs_per_host; k++) {
                sdbg_devinfo = sdebug_device_create(sdbg_host, GFP_KERNEL);
                if (!sdbg_devinfo) {
-                        printk(KERN_ERR "%s: out of memory at line %d\n",
-                               __func__, __LINE__);
+                       pr_err("out of memory at line %d\n", __LINE__);
                         error = -ENOMEM;
                        goto clean;
                 }
@@ -5178,7 +5168,7 @@ scsi_debug_queuecommand(struct scsi_cmnd *scp)
                }
                sdev_printk(KERN_INFO, sdp, "%s: cmd %s\n", my_name, b);
        }
-       has_wlun_rl = (sdp->lun == SAM2_WLUN_REPORT_LUNS);
+       has_wlun_rl = (sdp->lun == SCSI_W_LUN_REPORT_LUNS);
        if ((sdp->lun >= scsi_debug_max_luns) && !has_wlun_rl)
                return schedule_resp(scp, NULL, errsts_no_connect, 0);
 
@@ -5338,7 +5328,7 @@ static int sdebug_driver_probe(struct device * dev)
                sdebug_driver_template.use_clustering = ENABLE_CLUSTERING;
        hpnt = scsi_host_alloc(&sdebug_driver_template, sizeof(sdbg_host));
        if (NULL == hpnt) {
-               pr_err("%s: scsi_host_alloc failed\n", __func__);
+               pr_err("scsi_host_alloc failed\n");
                error = -ENODEV;
                return error;
        }
@@ -5349,7 +5339,8 @@ static int sdebug_driver_probe(struct device * dev)
                hpnt->max_id = scsi_debug_num_tgts + 1;
        else
                hpnt->max_id = scsi_debug_num_tgts;
-       hpnt->max_lun = SAM2_WLUN_REPORT_LUNS;  /* = scsi_debug_max_luns; */
+       /* = scsi_debug_max_luns; */
+       hpnt->max_lun = SCSI_W_LUN_REPORT_LUNS + 1;
 
        host_prot = 0;
 
@@ -5381,7 +5372,7 @@ static int sdebug_driver_probe(struct device * dev)
 
        scsi_host_set_prot(hpnt, host_prot);
 
-       printk(KERN_INFO "scsi_debug: host protection%s%s%s%s%s%s%s\n",
+       pr_info("host protection%s%s%s%s%s%s%s\n",
               (host_prot & SHOST_DIF_TYPE1_PROTECTION) ? " DIF1" : "",
               (host_prot & SHOST_DIF_TYPE2_PROTECTION) ? " DIF2" : "",
               (host_prot & SHOST_DIF_TYPE3_PROTECTION) ? " DIF3" : "",
@@ -5409,7 +5400,7 @@ static int sdebug_driver_probe(struct device * dev)
 
         error = scsi_add_host(hpnt, &sdbg_host->dev);
         if (error) {
-                printk(KERN_ERR "%s: scsi_add_host failed\n", __func__);
+               pr_err("scsi_add_host failed\n");
                 error = -ENODEV;
                scsi_host_put(hpnt);
         } else
@@ -5426,8 +5417,7 @@ static int sdebug_driver_remove(struct device * dev)
        sdbg_host = to_sdebug_host(dev);
 
        if (!sdbg_host) {
-               printk(KERN_ERR "%s: Unable to locate host info\n",
-                      __func__);
+               pr_err("Unable to locate host info\n");
                return -ENODEV;
        }
 
diff --git a/drivers/scsi/scsi_dh.c b/drivers/scsi/scsi_dh.c
new file mode 100644 (file)
index 0000000..edb044a
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * SCSI device handler infrastruture.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ *      Authors:
+ *               Chandra Seetharaman <sekharan@us.ibm.com>
+ *               Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <scsi/scsi_dh.h>
+#include "scsi_priv.h"
+
+static DEFINE_SPINLOCK(list_lock);
+static LIST_HEAD(scsi_dh_list);
+
+struct scsi_dh_blist {
+       const char *vendor;
+       const char *model;
+       const char *driver;
+};
+
+static const struct scsi_dh_blist scsi_dh_blist[] = {
+       {"DGC", "RAID",                 "clariion" },
+       {"DGC", "DISK",                 "clariion" },
+       {"DGC", "VRAID",                "clariion" },
+
+       {"COMPAQ", "MSA1000 VOLUME",    "hp_sw" },
+       {"COMPAQ", "HSV110",            "hp_sw" },
+       {"HP", "HSV100",                "hp_sw"},
+       {"DEC", "HSG80",                "hp_sw"},
+
+       {"IBM", "1722",                 "rdac", },
+       {"IBM", "1724",                 "rdac", },
+       {"IBM", "1726",                 "rdac", },
+       {"IBM", "1742",                 "rdac", },
+       {"IBM", "1745",                 "rdac", },
+       {"IBM", "1746",                 "rdac", },
+       {"IBM", "1813",                 "rdac", },
+       {"IBM", "1814",                 "rdac", },
+       {"IBM", "1815",                 "rdac", },
+       {"IBM", "1818",                 "rdac", },
+       {"IBM", "3526",                 "rdac", },
+       {"SGI", "TP9",                  "rdac", },
+       {"SGI", "IS",                   "rdac", },
+       {"STK", "OPENstorage D280",     "rdac", },
+       {"STK", "FLEXLINE 380",         "rdac", },
+       {"SUN", "CSM",                  "rdac", },
+       {"SUN", "LCSM100",              "rdac", },
+       {"SUN", "STK6580_6780",         "rdac", },
+       {"SUN", "SUN_6180",             "rdac", },
+       {"SUN", "ArrayStorage",         "rdac", },
+       {"DELL", "MD3",                 "rdac", },
+       {"NETAPP", "INF-01-00",         "rdac", },
+       {"LSI", "INF-01-00",            "rdac", },
+       {"ENGENIO", "INF-01-00",        "rdac", },
+       {NULL, NULL,                    NULL },
+};
+
+static const char *
+scsi_dh_find_driver(struct scsi_device *sdev)
+{
+       const struct scsi_dh_blist *b;
+
+       if (scsi_device_tpgs(sdev))
+               return "alua";
+
+       for (b = scsi_dh_blist; b->vendor; b++) {
+               if (!strncmp(sdev->vendor, b->vendor, strlen(b->vendor)) &&
+                   !strncmp(sdev->model, b->model, strlen(b->model))) {
+                       return b->driver;
+               }
+       }
+       return NULL;
+}
+
+
+static struct scsi_device_handler *__scsi_dh_lookup(const char *name)
+{
+       struct scsi_device_handler *tmp, *found = NULL;
+
+       spin_lock(&list_lock);
+       list_for_each_entry(tmp, &scsi_dh_list, list) {
+               if (!strncmp(tmp->name, name, strlen(tmp->name))) {
+                       found = tmp;
+                       break;
+               }
+       }
+       spin_unlock(&list_lock);
+       return found;
+}
+
+static struct scsi_device_handler *scsi_dh_lookup(const char *name)
+{
+       struct scsi_device_handler *dh;
+
+       dh = __scsi_dh_lookup(name);
+       if (!dh) {
+               request_module(name);
+               dh = __scsi_dh_lookup(name);
+       }
+
+       return dh;
+}
+
+/*
+ * scsi_dh_handler_attach - Attach a device handler to a device
+ * @sdev - SCSI device the device handler should attach to
+ * @scsi_dh - The device handler to attach
+ */
+static int scsi_dh_handler_attach(struct scsi_device *sdev,
+                                 struct scsi_device_handler *scsi_dh)
+{
+       int error;
+
+       if (!try_module_get(scsi_dh->module))
+               return -EINVAL;
+
+       error = scsi_dh->attach(sdev);
+       if (error) {
+               sdev_printk(KERN_ERR, sdev, "%s: Attach failed (%d)\n",
+                           scsi_dh->name, error);
+               module_put(scsi_dh->module);
+       } else
+               sdev->handler = scsi_dh;
+
+       return error;
+}
+
+/*
+ * scsi_dh_handler_detach - Detach a device handler from a device
+ * @sdev - SCSI device the device handler should be detached from
+ */
+static void scsi_dh_handler_detach(struct scsi_device *sdev)
+{
+       sdev->handler->detach(sdev);
+       sdev_printk(KERN_NOTICE, sdev, "%s: Detached\n", sdev->handler->name);
+       module_put(sdev->handler->module);
+}
+
+/*
+ * Functions for sysfs attribute 'dh_state'
+ */
+static ssize_t
+store_dh_state(struct device *dev, struct device_attribute *attr,
+              const char *buf, size_t count)
+{
+       struct scsi_device *sdev = to_scsi_device(dev);
+       struct scsi_device_handler *scsi_dh;
+       int err = -EINVAL;
+
+       if (sdev->sdev_state == SDEV_CANCEL ||
+           sdev->sdev_state == SDEV_DEL)
+               return -ENODEV;
+
+       if (!sdev->handler) {
+               /*
+                * Attach to a device handler
+                */
+               scsi_dh = scsi_dh_lookup(buf);
+               if (!scsi_dh)
+                       return err;
+               err = scsi_dh_handler_attach(sdev, scsi_dh);
+       } else {
+               if (!strncmp(buf, "detach", 6)) {
+                       /*
+                        * Detach from a device handler
+                        */
+                       sdev_printk(KERN_WARNING, sdev,
+                                   "can't detach handler %s.\n",
+                                   sdev->handler->name);
+                       err = -EINVAL;
+               } else if (!strncmp(buf, "activate", 8)) {
+                       /*
+                        * Activate a device handler
+                        */
+                       if (sdev->handler->activate)
+                               err = sdev->handler->activate(sdev, NULL, NULL);
+                       else
+                               err = 0;
+               }
+       }
+
+       return err<0?err:count;
+}
+
+static ssize_t
+show_dh_state(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct scsi_device *sdev = to_scsi_device(dev);
+
+       if (!sdev->handler)
+               return snprintf(buf, 20, "detached\n");
+
+       return snprintf(buf, 20, "%s\n", sdev->handler->name);
+}
+
+static struct device_attribute scsi_dh_state_attr =
+       __ATTR(dh_state, S_IRUGO | S_IWUSR, show_dh_state,
+              store_dh_state);
+
+int scsi_dh_add_device(struct scsi_device *sdev)
+{
+       struct scsi_device_handler *devinfo = NULL;
+       const char *drv;
+       int err;
+
+       err = device_create_file(&sdev->sdev_gendev, &scsi_dh_state_attr);
+       if (err)
+               return err;
+
+       drv = scsi_dh_find_driver(sdev);
+       if (drv)
+               devinfo = scsi_dh_lookup(drv);
+       if (devinfo)
+               err = scsi_dh_handler_attach(sdev, devinfo);
+       return err;
+}
+
+void scsi_dh_remove_device(struct scsi_device *sdev)
+{
+       if (sdev->handler)
+               scsi_dh_handler_detach(sdev);
+       device_remove_file(&sdev->sdev_gendev, &scsi_dh_state_attr);
+}
+
+/*
+ * scsi_register_device_handler - register a device handler personality
+ *      module.
+ * @scsi_dh - device handler to be registered.
+ *
+ * Returns 0 on success, -EBUSY if handler already registered.
+ */
+int scsi_register_device_handler(struct scsi_device_handler *scsi_dh)
+{
+       if (__scsi_dh_lookup(scsi_dh->name))
+               return -EBUSY;
+
+       if (!scsi_dh->attach || !scsi_dh->detach)
+               return -EINVAL;
+
+       spin_lock(&list_lock);
+       list_add(&scsi_dh->list, &scsi_dh_list);
+       spin_unlock(&list_lock);
+
+       printk(KERN_INFO "%s: device handler registered\n", scsi_dh->name);
+
+       return SCSI_DH_OK;
+}
+EXPORT_SYMBOL_GPL(scsi_register_device_handler);
+
+/*
+ * scsi_unregister_device_handler - register a device handler personality
+ *      module.
+ * @scsi_dh - device handler to be unregistered.
+ *
+ * Returns 0 on success, -ENODEV if handler not registered.
+ */
+int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh)
+{
+       if (!__scsi_dh_lookup(scsi_dh->name))
+               return -ENODEV;
+
+       spin_lock(&list_lock);
+       list_del(&scsi_dh->list);
+       spin_unlock(&list_lock);
+       printk(KERN_INFO "%s: device handler unregistered\n", scsi_dh->name);
+
+       return SCSI_DH_OK;
+}
+EXPORT_SYMBOL_GPL(scsi_unregister_device_handler);
+
+static struct scsi_device *get_sdev_from_queue(struct request_queue *q)
+{
+       struct scsi_device *sdev;
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       sdev = q->queuedata;
+       if (!sdev || !get_device(&sdev->sdev_gendev))
+               sdev = NULL;
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       return sdev;
+}
+
+/*
+ * scsi_dh_activate - activate the path associated with the scsi_device
+ *      corresponding to the given request queue.
+ *     Returns immediately without waiting for activation to be completed.
+ * @q    - Request queue that is associated with the scsi_device to be
+ *         activated.
+ * @fn   - Function to be called upon completion of the activation.
+ *         Function fn is called with data (below) and the error code.
+ *         Function fn may be called from the same calling context. So,
+ *         do not hold the lock in the caller which may be needed in fn.
+ * @data - data passed to the function fn upon completion.
+ *
+ */
+int scsi_dh_activate(struct request_queue *q, activate_complete fn, void *data)
+{
+       struct scsi_device *sdev;
+       int err = SCSI_DH_NOSYS;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev) {
+               if (fn)
+                       fn(data, err);
+               return err;
+       }
+
+       if (!sdev->handler)
+               goto out_fn;
+       err = SCSI_DH_NOTCONN;
+       if (sdev->sdev_state == SDEV_CANCEL ||
+           sdev->sdev_state == SDEV_DEL)
+               goto out_fn;
+
+       err = SCSI_DH_DEV_OFFLINED;
+       if (sdev->sdev_state == SDEV_OFFLINE)
+               goto out_fn;
+
+       if (sdev->handler->activate)
+               err = sdev->handler->activate(sdev, fn, data);
+
+out_put_device:
+       put_device(&sdev->sdev_gendev);
+       return err;
+
+out_fn:
+       if (fn)
+               fn(data, err);
+       goto out_put_device;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_activate);
+
+/*
+ * scsi_dh_set_params - set the parameters for the device as per the
+ *      string specified in params.
+ * @q - Request queue that is associated with the scsi_device for
+ *      which the parameters to be set.
+ * @params - parameters in the following format
+ *      "no_of_params\0param1\0param2\0param3\0...\0"
+ *      for example, string for 2 parameters with value 10 and 21
+ *      is specified as "2\010\021\0".
+ */
+int scsi_dh_set_params(struct request_queue *q, const char *params)
+{
+       struct scsi_device *sdev;
+       int err = -SCSI_DH_NOSYS;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return err;
+
+       if (sdev->handler && sdev->handler->set_params)
+               err = sdev->handler->set_params(sdev, params);
+       put_device(&sdev->sdev_gendev);
+       return err;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_set_params);
+
+/*
+ * scsi_dh_attach - Attach device handler
+ * @q - Request queue that is associated with the scsi_device
+ *      the handler should be attached to
+ * @name - name of the handler to attach
+ */
+int scsi_dh_attach(struct request_queue *q, const char *name)
+{
+       struct scsi_device *sdev;
+       struct scsi_device_handler *scsi_dh;
+       int err = 0;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return -ENODEV;
+
+       scsi_dh = scsi_dh_lookup(name);
+       if (!scsi_dh) {
+               err = -EINVAL;
+               goto out_put_device;
+       }
+
+       if (sdev->handler) {
+               if (sdev->handler != scsi_dh)
+                       err = -EBUSY;
+               goto out_put_device;
+       }
+
+       err = scsi_dh_handler_attach(sdev, scsi_dh);
+
+out_put_device:
+       put_device(&sdev->sdev_gendev);
+       return err;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_attach);
+
+/*
+ * scsi_dh_attached_handler_name - Get attached device handler's name
+ * @q - Request queue that is associated with the scsi_device
+ *      that may have a device handler attached
+ * @gfp - the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns name of attached handler, NULL if no handler is attached.
+ * Caller must take care to free the returned string.
+ */
+const char *scsi_dh_attached_handler_name(struct request_queue *q, gfp_t gfp)
+{
+       struct scsi_device *sdev;
+       const char *handler_name = NULL;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return NULL;
+
+       if (sdev->handler)
+               handler_name = kstrdup(sdev->handler->name, gfp);
+       put_device(&sdev->sdev_gendev);
+       return handler_name;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_attached_handler_name);
index afd34a6..66a96cd 100644 (file)
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
+#include <scsi/scsi_common.h>
 #include <scsi/scsi_transport.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_ioctl.h>
+#include <scsi/scsi_dh.h>
 #include <scsi/sg.h>
 
 #include "scsi_priv.h"
@@ -463,11 +465,10 @@ static int scsi_check_sense(struct scsi_cmnd *scmd)
        if (scsi_sense_is_deferred(&sshdr))
                return NEEDS_RETRY;
 
-       if (sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh &&
-                       sdev->scsi_dh_data->scsi_dh->check_sense) {
+       if (sdev->handler && sdev->handler->check_sense) {
                int rc;
 
-               rc = sdev->scsi_dh_data->scsi_dh->check_sense(sdev, &sshdr);
+               rc = sdev->handler->check_sense(sdev, &sshdr);
                if (rc != SCSI_RETURN_NOT_HANDLED)
                        return rc;
                /* handler does not care. Drop down to default handling */
@@ -2178,8 +2179,17 @@ int scsi_error_handler(void *data)
         * We never actually get interrupted because kthread_run
         * disables signal delivery for the created thread.
         */
-       while (!kthread_should_stop()) {
+       while (true) {
+               /*
+                * The sequence in kthread_stop() sets the stop flag first
+                * then wakes the process.  To avoid missed wakeups, the task
+                * should always be in a non running state before the stop
+                * flag is checked
+                */
                set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop())
+                       break;
+
                if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
                    shost->host_failed != atomic_read(&shost->host_busy)) {
                        SCSI_LOG_ERROR_RECOVERY(1,
@@ -2415,45 +2425,6 @@ bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd,
 }
 EXPORT_SYMBOL(scsi_command_normalize_sense);
 
-/**
- * scsi_sense_desc_find - search for a given descriptor type in        descriptor sense data format.
- * @sense_buffer:      byte array of descriptor format sense data
- * @sb_len:            number of valid bytes in sense_buffer
- * @desc_type:         value of descriptor type to find
- *                     (e.g. 0 -> information)
- *
- * Notes:
- *     only valid when sense data is in descriptor format
- *
- * Return value:
- *     pointer to start of (first) descriptor if found else NULL
- */
-const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
-                               int desc_type)
-{
-       int add_sen_len, add_len, desc_len, k;
-       const u8 * descp;
-
-       if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7])))
-               return NULL;
-       if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73))
-               return NULL;
-       add_sen_len = (add_sen_len < (sb_len - 8)) ?
-                       add_sen_len : (sb_len - 8);
-       descp = &sense_buffer[8];
-       for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) {
-               descp += desc_len;
-               add_len = (k < (add_sen_len - 1)) ? descp[1]: -1;
-               desc_len = add_len + 2;
-               if (descp[0] == desc_type)
-                       return descp;
-               if (add_len < 0) // short descriptor ??
-                       break;
-       }
-       return NULL;
-}
-EXPORT_SYMBOL(scsi_sense_desc_find);
-
 /**
  * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format)
  * @sense_buffer:      byte array of sense data
@@ -2503,31 +2474,3 @@ int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
        }
 }
 EXPORT_SYMBOL(scsi_get_sense_info_fld);
-
-/**
- * scsi_build_sense_buffer - build sense data in a buffer
- * @desc:      Sense format (non zero == descriptor format,
- *             0 == fixed format)
- * @buf:       Where to build sense data
- * @key:       Sense key
- * @asc:       Additional sense code
- * @ascq:      Additional sense code qualifier
- *
- **/
-void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq)
-{
-       if (desc) {
-               buf[0] = 0x72;  /* descriptor, current */
-               buf[1] = key;
-               buf[2] = asc;
-               buf[3] = ascq;
-               buf[7] = 0;
-       } else {
-               buf[0] = 0x70;  /* fixed, current */
-               buf[2] = key;
-               buf[7] = 0xa;
-               buf[12] = asc;
-               buf[13] = ascq;
-       }
-}
-EXPORT_SYMBOL(scsi_build_sense_buffer);
index 882864f..cbfc599 100644 (file)
@@ -31,6 +31,7 @@
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_host.h>
+#include <scsi/scsi_dh.h>
 
 #include <trace/events/scsi.h>
 
@@ -1248,9 +1249,8 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
 {
        struct scsi_cmnd *cmd = req->special;
 
-       if (unlikely(sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh
-                        && sdev->scsi_dh_data->scsi_dh->prep_fn)) {
-               int ret = sdev->scsi_dh_data->scsi_dh->prep_fn(sdev, req);
+       if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
+               int ret = sdev->handler->prep_fn(sdev, req);
                if (ret != BLKPREP_OK)
                        return ret;
        }
index e3902fc..644bb73 100644 (file)
@@ -170,6 +170,15 @@ static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
 extern struct async_domain scsi_sd_pm_domain;
 extern struct async_domain scsi_sd_probe_domain;
 
+/* scsi_dh.c */
+#ifdef CONFIG_SCSI_DH
+int scsi_dh_add_device(struct scsi_device *sdev);
+void scsi_dh_remove_device(struct scsi_device *sdev);
+#else
+static inline int scsi_dh_add_device(struct scsi_device *sdev) { return 0; }
+static inline void scsi_dh_remove_device(struct scsi_device *sdev) { }
+#endif
+
 /* 
  * internal scsi timeout functions: for use by mid-layer and transport
  * classes.
index 9ad4116..b333389 100644 (file)
@@ -1030,11 +1030,20 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev)
                                "failed to add device: %d\n", error);
                return error;
        }
+
+       error = scsi_dh_add_device(sdev);
+       if (error) {
+               sdev_printk(KERN_INFO, sdev,
+                               "failed to add device handler: %d\n", error);
+               return error;
+       }
+
        device_enable_async_suspend(&sdev->sdev_dev);
        error = device_add(&sdev->sdev_dev);
        if (error) {
                sdev_printk(KERN_INFO, sdev,
                                "failed to add class device: %d\n", error);
+               scsi_dh_remove_device(sdev);
                device_del(&sdev->sdev_gendev);
                return error;
        }
@@ -1074,6 +1083,7 @@ void __scsi_remove_device(struct scsi_device *sdev)
                bsg_unregister_queue(sdev->request_queue);
                device_unregister(&sdev->sdev_dev);
                transport_remove_device(dev);
+               scsi_dh_remove_device(sdev);
                device_del(dev);
        } else
                put_device(&sdev->sdev_dev);
index 9a05819..30d26e3 100644 (file)
@@ -1222,13 +1222,6 @@ show_sas_rphy_enclosure_identifier(struct device *dev,
        u64 identifier;
        int error;
 
-       /*
-        * Only devices behind an expander are supported, because the
-        * enclosure identifier is a SMP feature.
-        */
-       if (scsi_is_sas_phy_local(phy))
-               return -EINVAL;
-
        error = i->f->get_enclosure_identifier(rphy, &identifier);
        if (error)
                return error;
@@ -1248,9 +1241,6 @@ show_sas_rphy_bay_identifier(struct device *dev,
        struct sas_internal *i = to_sas_internal(shost->transportt);
        int val;
 
-       if (scsi_is_sas_phy_local(phy))
-               return -EINVAL;
-
        val = i->f->get_bay_identifier(rphy);
        if (val < 0)
                return val;
index fad22ca..9dc8687 100644 (file)
@@ -377,7 +377,6 @@ static int map_data_for_request(struct vscsifrnt_info *info,
        unsigned int data_len = scsi_bufflen(sc);
        unsigned int data_grants = 0, seg_grants = 0;
        struct scatterlist *sg;
-       unsigned long mfn;
        struct scsiif_request_segment *seg;
 
        ring_req->nr_segments = 0;
@@ -420,9 +419,9 @@ static int map_data_for_request(struct vscsifrnt_info *info,
                        ref = gnttab_claim_grant_reference(&gref_head);
                        BUG_ON(ref == -ENOSPC);
 
-                       mfn = pfn_to_mfn(page_to_pfn(page));
                        gnttab_grant_foreign_access_ref(ref,
-                               info->dev->otherend_id, mfn, 1);
+                               info->dev->otherend_id,
+                               xen_page_to_gfn(page), 1);
                        shadow->gref[ref_cnt] = ref;
                        ring_req->seg[ref_cnt].gref   = ref;
                        ring_req->seg[ref_cnt].offset = (uint16_t)off;
@@ -454,9 +453,10 @@ static int map_data_for_request(struct vscsifrnt_info *info,
                        ref = gnttab_claim_grant_reference(&gref_head);
                        BUG_ON(ref == -ENOSPC);
 
-                       mfn = pfn_to_mfn(page_to_pfn(page));
                        gnttab_grant_foreign_access_ref(ref,
-                               info->dev->otherend_id, mfn, grant_ro);
+                               info->dev->otherend_id,
+                               xen_page_to_gfn(page),
+                               grant_ro);
 
                        shadow->gref[ref_cnt] = ref;
                        seg->gref   = ref;
index 327adcf..a6155c9 100644 (file)
@@ -96,6 +96,7 @@ static const struct {
  * @smd:               handle to qcom_smd
  * @of_node:           of_node handle for information related to this edge
  * @edge_id:           identifier of this edge
+ * @remote_pid:                identifier of remote processor
  * @irq:               interrupt for signals on this edge
  * @ipc_regmap:                regmap handle holding the outgoing ipc register
  * @ipc_offset:                offset within @ipc_regmap of the register for ipc
@@ -111,6 +112,7 @@ struct qcom_smd_edge {
        struct qcom_smd *smd;
        struct device_node *of_node;
        unsigned edge_id;
+       unsigned remote_pid;
 
        int irq;
 
@@ -310,7 +312,7 @@ static void qcom_smd_channel_reset(struct qcom_smd_channel *channel)
        SET_TX_CHANNEL_INFO(channel, fHEAD, 0);
        SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
        SET_TX_CHANNEL_INFO(channel, fSTATE, 1);
-       SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+       SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
        SET_TX_CHANNEL_INFO(channel, head, 0);
        SET_TX_CHANNEL_INFO(channel, tail, 0);
 
@@ -572,7 +574,7 @@ static irqreturn_t qcom_smd_edge_intr(int irq, void *data)
         * have to scan if the amount of available space in smem have changed
         * since last scan.
         */
-       available = qcom_smem_get_free_space(edge->edge_id);
+       available = qcom_smem_get_free_space(edge->remote_pid);
        if (available != edge->smem_available) {
                edge->smem_available = available;
                edge->need_rescan = true;
@@ -681,7 +683,7 @@ int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len)
                        goto out;
                }
 
-               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
+               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
 
                ret = wait_event_interruptible(channel->fblockread_event,
                                       qcom_smd_get_tx_avail(channel) >= tlen ||
@@ -689,7 +691,7 @@ int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len)
                if (ret)
                        goto out;
 
-               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
        }
 
        SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
@@ -976,7 +978,8 @@ static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *ed
        spin_lock_init(&channel->recv_lock);
        init_waitqueue_head(&channel->fblockread_event);
 
-       ret = qcom_smem_get(edge->edge_id, smem_info_item, (void **)&info, &info_size);
+       ret = qcom_smem_get(edge->remote_pid, smem_info_item, (void **)&info,
+                           &info_size);
        if (ret)
                goto free_name_and_channel;
 
@@ -997,7 +1000,8 @@ static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *ed
                goto free_name_and_channel;
        }
 
-       ret = qcom_smem_get(edge->edge_id, smem_fifo_item, &fifo_base, &fifo_size);
+       ret = qcom_smem_get(edge->remote_pid, smem_fifo_item, &fifo_base,
+                           &fifo_size);
        if (ret)
                goto free_name_and_channel;
 
@@ -1041,7 +1045,7 @@ static void qcom_discover_channels(struct qcom_smd_edge *edge)
        int i;
 
        for (tbl = 0; tbl < SMD_ALLOC_TBL_COUNT; tbl++) {
-               ret = qcom_smem_get(edge->edge_id,
+               ret = qcom_smem_get(edge->remote_pid,
                                    smem_items[tbl].alloc_tbl_id,
                                    (void **)&alloc_tbl,
                                    NULL);
@@ -1184,6 +1188,10 @@ static int qcom_smd_parse_edge(struct device *dev,
                return -EINVAL;
        }
 
+       edge->remote_pid = QCOM_SMEM_HOST_ANY;
+       key = "qcom,remote-pid";
+       of_property_read_u32(node, key, &edge->remote_pid);
+
        syscon_np = of_parse_phandle(node, "qcom,ipc", 0);
        if (!syscon_np) {
                dev_err(dev, "no qcom,ipc node\n");
index 7c2c324..5236518 100644 (file)
@@ -258,10 +258,6 @@ static int qcom_smem_alloc_private(struct qcom_smem *smem,
        size_t alloc_size;
        void *p;
 
-       /* We're not going to find it if there's no matching partition */
-       if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
-               return -ENOENT;
-
        phdr = smem->partitions[host];
 
        p = (void *)phdr + sizeof(*phdr);
@@ -371,8 +367,9 @@ int qcom_smem_alloc(unsigned host, unsigned item, size_t size)
        if (ret)
                return ret;
 
-       ret = qcom_smem_alloc_private(__smem, host, item, size);
-       if (ret == -ENOENT)
+       if (host < SMEM_HOST_COUNT && __smem->partitions[host])
+               ret = qcom_smem_alloc_private(__smem, host, item, size);
+       else
                ret = qcom_smem_alloc_global(__smem, item, size);
 
        hwspin_unlock_irqrestore(__smem->hwlock, &flags);
@@ -428,10 +425,6 @@ static int qcom_smem_get_private(struct qcom_smem *smem,
        struct smem_private_entry *hdr;
        void *p;
 
-       /* We're not going to find it if there's no matching partition */
-       if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
-               return -ENOENT;
-
        phdr = smem->partitions[host];
 
        p = (void *)phdr + sizeof(*phdr);
@@ -484,8 +477,9 @@ int qcom_smem_get(unsigned host, unsigned item, void **ptr, size_t *size)
        if (ret)
                return ret;
 
-       ret = qcom_smem_get_private(__smem, host, item, ptr, size);
-       if (ret == -ENOENT)
+       if (host < SMEM_HOST_COUNT && __smem->partitions[host])
+               ret = qcom_smem_get_private(__smem, host, item, ptr, size);
+       else
                ret = qcom_smem_get_global(__smem, item, ptr, size);
 
        hwspin_unlock_irqrestore(__smem->hwlock, &flags);
index eec878e..217aa53 100644 (file)
@@ -997,7 +997,7 @@ static void ion_vm_close(struct vm_area_struct *vma)
        mutex_unlock(&buffer->lock);
 }
 
-static struct vm_operations_struct ion_vma_ops = {
+static const struct vm_operations_struct ion_vma_ops = {
        .open = ion_vm_open,
        .close = ion_vm_close,
        .fault = ion_vm_fault,
index 81df77b..9c41652 100644 (file)
@@ -91,7 +91,7 @@ static const struct board_staging_dev armadillo800eva_devices[] __initconst = {
                .pdev           = &lcdc0_device,
                .clocks         = lcdc0_clocks,
                .nclocks        = ARRAY_SIZE(lcdc0_clocks),
-               .domain         = "a4lc",
+               .domain         = "/system-controller@e6180000/pm-domains/c5/a4lc@1"
        },
 };
 
index 29d456e..3eb5eb8 100644 (file)
@@ -135,6 +135,40 @@ int __init board_staging_register_clock(const struct board_staging_clk *bsc)
        return error;
 }
 
+#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
+static int board_staging_add_dev_domain(struct platform_device *pdev,
+                                       const char *domain)
+{
+       struct of_phandle_args pd_args;
+       struct generic_pm_domain *pd;
+       struct device_node *np;
+
+       np = of_find_node_by_path(domain);
+       if (!np) {
+               pr_err("Cannot find domain node %s\n", domain);
+               return -ENOENT;
+       }
+
+       pd_args.np = np;
+       pd_args.args_count = 0;
+       pd = of_genpd_get_from_provider(&pd_args);
+       if (IS_ERR(pd)) {
+               pr_err("Cannot find genpd %s (%ld)\n", domain, PTR_ERR(pd));
+               return PTR_ERR(pd);
+
+       }
+       pr_debug("Found genpd %s for device %s\n", pd->name, pdev->name);
+
+       return pm_genpd_add_device(pd, &pdev->dev);
+}
+#else
+static inline int board_staging_add_dev_domain(struct platform_device *pdev,
+                                              const char *domain)
+{
+       return 0;
+}
+#endif
+
 int __init board_staging_register_device(const struct board_staging_dev *dev)
 {
        struct platform_device *pdev = dev->pdev;
@@ -161,7 +195,7 @@ int __init board_staging_register_device(const struct board_staging_dev *dev)
        }
 
        if (dev->domain)
-               __pm_genpd_name_add_device(dev->domain, &pdev->dev, NULL);
+               board_staging_add_dev_domain(pdev, dev->domain);
 
        return error;
 }
index fd54d09..0e8a451 100644 (file)
@@ -2156,7 +2156,7 @@ static void comedi_vm_close(struct vm_area_struct *area)
        comedi_buf_map_put(bm);
 }
 
-static struct vm_operations_struct comedi_vm_ops = {
+static const struct vm_operations_struct comedi_vm_ops = {
        .open = comedi_vm_open,
        .close = comedi_vm_close,
 };
index cf5fe9b..d7f6235 100644 (file)
@@ -24,6 +24,8 @@ if STAGING_RDMA
 
 source "drivers/staging/rdma/amso1100/Kconfig"
 
+source "drivers/staging/rdma/ehca/Kconfig"
+
 source "drivers/staging/rdma/hfi1/Kconfig"
 
 source "drivers/staging/rdma/ipath/Kconfig"
index cbd915a..139d78e 100644 (file)
@@ -1,4 +1,5 @@
 # Entries for RDMA_STAGING tree
 obj-$(CONFIG_INFINIBAND_AMSO1100)      += amso1100/
+obj-$(CONFIG_INFINIBAND_EHCA)  += ehca/
 obj-$(CONFIG_INFINIBAND_HFI1)  += hfi1/
 obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
diff --git a/drivers/staging/rdma/ehca/Kconfig b/drivers/staging/rdma/ehca/Kconfig
new file mode 100644 (file)
index 0000000..3fadd2a
--- /dev/null
@@ -0,0 +1,10 @@
+config INFINIBAND_EHCA
+       tristate "eHCA support"
+       depends on IBMEBUS
+       ---help---
+       This driver supports the deprecated IBM pSeries eHCA InfiniBand
+       adapter.
+
+       To compile the driver as a module, choose M here. The module
+       will be called ib_ehca.
+
diff --git a/drivers/staging/rdma/ehca/Makefile b/drivers/staging/rdma/ehca/Makefile
new file mode 100644 (file)
index 0000000..74d284e
--- /dev/null
@@ -0,0 +1,16 @@
+#  Authors: Heiko J Schick <schickhj@de.ibm.com>
+#           Christoph Raisch <raisch@de.ibm.com>
+#           Joachim Fenkes <fenkes@de.ibm.com>
+#
+#  Copyright (c) 2005 IBM Corporation
+#
+#  All rights reserved.
+#
+#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
+
+obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
+
+ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
+               ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
+               ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
+
diff --git a/drivers/staging/rdma/ehca/TODO b/drivers/staging/rdma/ehca/TODO
new file mode 100644 (file)
index 0000000..199a4a6
--- /dev/null
@@ -0,0 +1,4 @@
+9/2015
+
+The ehca driver has been deprecated and moved to drivers/staging/rdma.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/ehca/ehca_av.c b/drivers/staging/rdma/ehca/ehca_av.c
new file mode 100644 (file)
index 0000000..4659263
--- /dev/null
@@ -0,0 +1,277 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  address vector functions
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *av_cache;
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+                 enum ib_rate path_rate, u32 *ipd)
+{
+       int path = ib_rate_to_mult(path_rate);
+       int link, ret;
+       struct ib_port_attr pa;
+
+       if (path_rate == IB_RATE_PORT_CURRENT) {
+               *ipd = 0;
+               return 0;
+       }
+
+       if (unlikely(path < 0)) {
+               ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
+                        path_rate);
+               return -EINVAL;
+       }
+
+       ret = ehca_query_port(&shca->ib_device, port, &pa);
+       if (unlikely(ret < 0)) {
+               ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
+               return ret;
+       }
+
+       link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
+
+       if (path >= link)
+               /* no need to throttle if path faster than link */
+               *ipd = 0;
+       else
+               /* IPD = round((link / path) - 1) */
+               *ipd = ((link + (path >> 1)) / path) - 1;
+
+       return 0;
+}
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+       int ret;
+       struct ehca_av *av;
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+
+       av = kmem_cache_alloc(av_cache, GFP_KERNEL);
+       if (!av) {
+               ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
+                        pd, ah_attr);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       av->av.sl = ah_attr->sl;
+       av->av.dlid = ah_attr->dlid;
+       av->av.slid_path_bits = ah_attr->src_path_bits;
+
+       if (ehca_static_rate < 0) {
+               u32 ipd;
+               if (ehca_calc_ipd(shca, ah_attr->port_num,
+                                 ah_attr->static_rate, &ipd)) {
+                       ret = -EINVAL;
+                       goto create_ah_exit1;
+               }
+               av->av.ipd = ipd;
+       } else
+               av->av.ipd = ehca_static_rate;
+
+       av->av.lnh = ah_attr->ah_flags;
+       av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
+                                           ah_attr->grh.traffic_class);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+                                           ah_attr->grh.flow_label);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+                                           ah_attr->grh.hop_limit);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
+       /* set sgid in grh.word_1 */
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               int rc;
+               struct ib_port_attr port_attr;
+               union ib_gid gid;
+               memset(&port_attr, 0, sizeof(port_attr));
+               rc = ehca_query_port(pd->device, ah_attr->port_num,
+                                    &port_attr);
+               if (rc) { /* invalid port number */
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Invalid port number "
+                                "ehca_query_port() returned %x "
+                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
+                       goto create_ah_exit1;
+               }
+               memset(&gid, 0, sizeof(gid));
+               rc = ehca_query_gid(pd->device,
+                                   ah_attr->port_num,
+                                   ah_attr->grh.sgid_index, &gid);
+               if (rc) {
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Failed to retrieve sgid "
+                                "ehca_query_gid() returned %x "
+                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
+                       goto create_ah_exit1;
+               }
+               memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
+       }
+       av->av.pmtu = shca->max_mtu;
+
+       /* dgid comes in grh.word_3 */
+       memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
+              sizeof(ah_attr->grh.dgid));
+
+       return &av->ib_ah;
+
+create_ah_exit1:
+       kmem_cache_free(av_cache, av);
+
+       return ERR_PTR(ret);
+}
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       struct ehca_av *av;
+       struct ehca_ud_av new_ehca_av;
+       struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
+                                             ib_device);
+
+       memset(&new_ehca_av, 0, sizeof(new_ehca_av));
+       new_ehca_av.sl = ah_attr->sl;
+       new_ehca_av.dlid = ah_attr->dlid;
+       new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
+       new_ehca_av.ipd = ah_attr->static_rate;
+       new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
+                                        (ah_attr->ah_flags & IB_AH_GRH) > 0);
+       new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
+                                               ah_attr->grh.traffic_class);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+                                                ah_attr->grh.flow_label);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+                                                ah_attr->grh.hop_limit);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
+
+       /* set sgid in grh.word_1 */
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               int rc;
+               struct ib_port_attr port_attr;
+               union ib_gid gid;
+               memset(&port_attr, 0, sizeof(port_attr));
+               rc = ehca_query_port(ah->device, ah_attr->port_num,
+                                    &port_attr);
+               if (rc) { /* invalid port number */
+                       ehca_err(ah->device, "Invalid port number "
+                                "ehca_query_port() returned %x "
+                                "ah=%p ah_attr=%p port_num=%x",
+                                rc, ah, ah_attr, ah_attr->port_num);
+                       return -EINVAL;
+               }
+               memset(&gid, 0, sizeof(gid));
+               rc = ehca_query_gid(ah->device,
+                                   ah_attr->port_num,
+                                   ah_attr->grh.sgid_index, &gid);
+               if (rc) {
+                       ehca_err(ah->device, "Failed to retrieve sgid "
+                                "ehca_query_gid() returned %x "
+                                "ah=%p ah_attr=%p port_num=%x "
+                                "sgid_index=%x",
+                                rc, ah, ah_attr, ah_attr->port_num,
+                                ah_attr->grh.sgid_index);
+                       return -EINVAL;
+               }
+               memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
+       }
+
+       new_ehca_av.pmtu = shca->max_mtu;
+
+       memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
+              sizeof(ah_attr->grh.dgid));
+
+       av = container_of(ah, struct ehca_av, ib_ah);
+       av->av = new_ehca_av;
+
+       return 0;
+}
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
+
+       memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
+              sizeof(ah_attr->grh.dgid));
+       ah_attr->sl = av->av.sl;
+
+       ah_attr->dlid = av->av.dlid;
+
+       ah_attr->src_path_bits = av->av.slid_path_bits;
+       ah_attr->static_rate = av->av.ipd;
+       ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
+       ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
+                                                   av->av.grh.word_0);
+       ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
+                                               av->av.grh.word_0);
+       ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
+                                                av->av.grh.word_0);
+
+       return 0;
+}
+
+int ehca_destroy_ah(struct ib_ah *ah)
+{
+       kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
+
+       return 0;
+}
+
+int ehca_init_av_cache(void)
+{
+       av_cache = kmem_cache_create("ehca_cache_av",
+                                  sizeof(struct ehca_av), 0,
+                                  SLAB_HWCACHE_ALIGN,
+                                  NULL);
+       if (!av_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_av_cache(void)
+{
+       if (av_cache)
+               kmem_cache_destroy(av_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h
new file mode 100644 (file)
index 0000000..bd45e0f
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Struct definition for eHCA internal structures
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_H__
+#define __EHCA_CLASSES_H__
+
+struct ehca_module;
+struct ehca_qp;
+struct ehca_cq;
+struct ehca_eq;
+struct ehca_mr;
+struct ehca_mw;
+struct ehca_pd;
+struct ehca_av;
+
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#ifdef CONFIG_PPC64
+#include "ehca_classes_pSeries.h"
+#endif
+#include "ipz_pt_fn.h"
+#include "ehca_qes.h"
+#include "ehca_irq.h"
+
+#define EHCA_EQE_CACHE_SIZE 20
+#define EHCA_MAX_NUM_QUEUES 0xffff
+
+struct ehca_eqe_cache_entry {
+       struct ehca_eqe *eqe;
+       struct ehca_cq *cq;
+};
+
+struct ehca_eq {
+       u32 length;
+       struct ipz_queue ipz_queue;
+       struct ipz_eq_handle ipz_eq_handle;
+       struct work_struct work;
+       struct h_galpas galpas;
+       int is_initialized;
+       struct ehca_pfeq pf;
+       spinlock_t spinlock;
+       struct tasklet_struct interrupt_task;
+       u32 ist;
+       spinlock_t irq_spinlock;
+       struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
+};
+
+struct ehca_sma_attr {
+       u16 lid, lmc, sm_sl, sm_lid;
+       u16 pkey_tbl_len, pkeys[16];
+};
+
+struct ehca_sport {
+       struct ib_cq *ibcq_aqp1;
+       struct ib_qp *ibqp_sqp[2];
+       /* lock to serialze modify_qp() calls for sqp in normal
+        * and irq path (when event PORT_ACTIVE is received first time)
+        */
+       spinlock_t mod_sqp_lock;
+       enum ib_port_state port_state;
+       struct ehca_sma_attr saved_attr;
+       u32 pma_qp_nr;
+};
+
+#define HCA_CAP_MR_PGSIZE_4K  0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M  0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
+
+struct ehca_shca {
+       struct ib_device ib_device;
+       struct platform_device *ofdev;
+       u8 num_ports;
+       int hw_level;
+       struct list_head shca_list;
+       struct ipz_adapter_handle ipz_hca_handle;
+       struct ehca_sport sport[2];
+       struct ehca_eq eq;
+       struct ehca_eq neq;
+       struct ehca_mr *maxmr;
+       struct ehca_pd *pd;
+       struct h_galpas galpas;
+       struct mutex modify_mutex;
+       u64 hca_cap;
+       /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
+       u32 hca_cap_mr_pgsize;
+       int max_mtu;
+       int max_num_qps;
+       int max_num_cqs;
+       atomic_t num_cqs;
+       atomic_t num_qps;
+};
+
+struct ehca_pd {
+       struct ib_pd ib_pd;
+       struct ipz_pd fw_pd;
+       /* small queue mgmt */
+       struct mutex lock;
+       struct list_head free[2];
+       struct list_head full[2];
+};
+
+enum ehca_ext_qp_type {
+       EQPT_NORMAL    = 0,
+       EQPT_LLQP      = 1,
+       EQPT_SRQBASE   = 2,
+       EQPT_SRQ       = 3,
+};
+
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+       int mask;
+       struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
+#define QMAP_IDX_MASK 0xFFFFULL
+
+/* struct for tracking if cqes have been reported to the application */
+struct ehca_qmap_entry {
+       u16 app_wr_id;
+       u8 reported;
+       u8 cqe_req;
+};
+
+struct ehca_queue_map {
+       struct ehca_qmap_entry *map;
+       unsigned int entries;
+       unsigned int tail;
+       unsigned int left_to_poll;
+       unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
+};
+
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+       unsigned int temp = cur_index + 1;
+       return (temp == limit) ? 0 : temp;
+}
+
+struct ehca_qp {
+       union {
+               struct ib_qp ib_qp;
+               struct ib_srq ib_srq;
+       };
+       u32 qp_type;
+       enum ehca_ext_qp_type ext_type;
+       enum ib_qp_state state;
+       struct ipz_queue ipz_squeue;
+       struct ehca_queue_map sq_map;
+       struct ipz_queue ipz_rqueue;
+       struct ehca_queue_map rq_map;
+       struct h_galpas galpas;
+       u32 qkey;
+       u32 real_qp_num;
+       u32 token;
+       spinlock_t spinlock_s;
+       spinlock_t spinlock_r;
+       u32 sq_max_inline_data_size;
+       struct ipz_qp_handle ipz_qp_handle;
+       struct ehca_pfqp pf;
+       struct ib_qp_init_attr init_attr;
+       struct ehca_cq *send_cq;
+       struct ehca_cq *recv_cq;
+       unsigned int sqerr_purgeflag;
+       struct hlist_node list_entries;
+       /* array to cache modify_qp()'s parms for GSI/SMI qp */
+       struct ehca_mod_qp_parm *mod_qp_parm;
+       int mod_qp_parm_idx;
+       /* mmap counter for resources mapped into user space */
+       u32 mm_count_squeue;
+       u32 mm_count_rqueue;
+       u32 mm_count_galpa;
+       /* unsolicited ack circumvention */
+       int unsol_ack_circ;
+       int mtu_shift;
+       u32 message_count;
+       u32 packet_count;
+       atomic_t nr_events; /* events seen */
+       wait_queue_head_t wait_completion;
+       int mig_armed;
+       struct list_head sq_err_node;
+       struct list_head rq_err_node;
+};
+
+#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
+#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
+
+/* must be power of 2 */
+#define QP_HASHTAB_LEN 8
+
+struct ehca_cq {
+       struct ib_cq ib_cq;
+       struct ipz_queue ipz_queue;
+       struct h_galpas galpas;
+       spinlock_t spinlock;
+       u32 cq_number;
+       u32 token;
+       u32 nr_of_entries;
+       struct ipz_cq_handle ipz_cq_handle;
+       struct ehca_pfcq pf;
+       spinlock_t cb_lock;
+       struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
+       struct list_head entry;
+       u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
+       atomic_t nr_events; /* #events seen */
+       wait_queue_head_t wait_completion;
+       spinlock_t task_lock;
+       /* mmap counter for resources mapped into user space */
+       u32 mm_count_queue;
+       u32 mm_count_galpa;
+       struct list_head sqp_err_list;
+       struct list_head rqp_err_list;
+};
+
+enum ehca_mr_flag {
+       EHCA_MR_FLAG_FMR = 0x80000000,   /* FMR, created with ehca_alloc_fmr */
+       EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
+};
+
+struct ehca_mr {
+       union {
+               struct ib_mr ib_mr;     /* must always be first in ehca_mr */
+               struct ib_fmr ib_fmr;   /* must always be first in ehca_mr */
+       } ib;
+       struct ib_umem *umem;
+       spinlock_t mrlock;
+
+       enum ehca_mr_flag flags;
+       u32 num_kpages;         /* number of kernel pages */
+       u32 num_hwpages;        /* number of hw pages to form MR */
+       u64 hwpage_size;        /* hw page size used for this MR */
+       int acl;                /* ACL (stored here for usage in reregister) */
+       u64 *start;             /* virtual start address (stored here for */
+                               /* usage in reregister) */
+       u64 size;               /* size (stored here for usage in reregister) */
+       u32 fmr_page_size;      /* page size for FMR */
+       u32 fmr_max_pages;      /* max pages for FMR */
+       u32 fmr_max_maps;       /* max outstanding maps for FMR */
+       u32 fmr_map_cnt;        /* map counter for FMR */
+       /* fw specific data */
+       struct ipz_mrmw_handle ipz_mr_handle;   /* MR handle for h-calls */
+       struct h_galpas galpas;
+};
+
+struct ehca_mw {
+       struct ib_mw ib_mw;     /* gen2 mw, must always be first in ehca_mw */
+       spinlock_t mwlock;
+
+       u8 never_bound;         /* indication MW was never bound */
+       struct ipz_mrmw_handle ipz_mw_handle;   /* MW handle for h-calls */
+       struct h_galpas galpas;
+};
+
+enum ehca_mr_pgi_type {
+       EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
+                                 * ehca_rereg_phys_mr,
+                                 * ehca_reg_internal_maxmr */
+       EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
+       EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
+};
+
+struct ehca_mr_pginfo {
+       enum ehca_mr_pgi_type type;
+       u64 num_kpages;
+       u64 kpage_cnt;
+       u64 hwpage_size;     /* hw page size used for this MR */
+       u64 num_hwpages;     /* number of hw pages */
+       u64 hwpage_cnt;      /* counter for hw pages */
+       u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
+
+       union {
+               struct { /* type EHCA_MR_PGI_PHYS section */
+                       int num_phys_buf;
+                       struct ib_phys_buf *phys_buf_array;
+                       u64 next_buf;
+               } phy;
+               struct { /* type EHCA_MR_PGI_USER section */
+                       struct ib_umem *region;
+                       struct scatterlist *next_sg;
+                       u64 next_nmap;
+               } usr;
+               struct { /* type EHCA_MR_PGI_FMR section */
+                       u64 fmr_pgsize;
+                       u64 *page_list;
+                       u64 next_listelem;
+               } fmr;
+       } u;
+};
+
+/* output parameters for MR/FMR hipz calls */
+struct ehca_mr_hipzout_parms {
+       struct ipz_mrmw_handle handle;
+       u32 lkey;
+       u32 rkey;
+       u64 len;
+       u64 vaddr;
+       u32 acl;
+};
+
+/* output parameters for MW hipz calls */
+struct ehca_mw_hipzout_parms {
+       struct ipz_mrmw_handle handle;
+       u32 rkey;
+};
+
+struct ehca_av {
+       struct ib_ah ib_ah;
+       struct ehca_ud_av av;
+};
+
+struct ehca_ucontext {
+       struct ib_ucontext ib_ucontext;
+};
+
+int ehca_init_pd_cache(void);
+void ehca_cleanup_pd_cache(void);
+int ehca_init_cq_cache(void);
+void ehca_cleanup_cq_cache(void);
+int ehca_init_qp_cache(void);
+void ehca_cleanup_qp_cache(void);
+int ehca_init_av_cache(void);
+void ehca_cleanup_av_cache(void);
+int ehca_init_mrmw_cache(void);
+void ehca_cleanup_mrmw_cache(void);
+int ehca_init_small_qp_cache(void);
+void ehca_cleanup_small_qp_cache(void);
+
+extern rwlock_t ehca_qp_idr_lock;
+extern rwlock_t ehca_cq_idr_lock;
+extern struct idr ehca_qp_idr;
+extern struct idr ehca_cq_idr;
+extern spinlock_t shca_list_lock;
+
+extern int ehca_static_rate;
+extern int ehca_port_act_time;
+extern bool ehca_use_hp_mr;
+extern bool ehca_scaling_code;
+extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
+extern int ehca_max_cq;
+extern int ehca_max_qp;
+
+struct ipzu_queue_resp {
+       u32 qe_size;      /* queue entry size */
+       u32 act_nr_of_sg;
+       u32 queue_length; /* queue length allocated in bytes */
+       u32 pagesize;
+       u32 toggle_state;
+       u32 offset; /* save offset within a page for small_qp */
+};
+
+struct ehca_create_cq_resp {
+       u32 cq_number;
+       u32 token;
+       struct ipzu_queue_resp ipz_queue;
+       u32 fw_handle_ofs;
+       u32 dummy;
+};
+
+struct ehca_create_qp_resp {
+       u32 qp_num;
+       u32 token;
+       u32 qp_type;
+       u32 ext_type;
+       u32 qkey;
+       /* qp_num assigned by ehca: sqp0/1 may have got different numbers */
+       u32 real_qp_num;
+       u32 fw_handle_ofs;
+       u32 dummy;
+       struct ipzu_queue_resp ipz_squeue;
+       struct ipzu_queue_resp ipz_rqueue;
+};
+
+struct ehca_alloc_cq_parms {
+       u32 nr_cqe;
+       u32 act_nr_of_entries;
+       u32 act_pages;
+       struct ipz_eq_handle eq_handle;
+};
+
+enum ehca_service_type {
+       ST_RC  = 0,
+       ST_UC  = 1,
+       ST_RD  = 2,
+       ST_UD  = 3,
+};
+
+enum ehca_ll_comp_flags {
+       LLQP_SEND_COMP = 0x20,
+       LLQP_RECV_COMP = 0x40,
+       LLQP_COMP_MASK = 0x60,
+};
+
+struct ehca_alloc_queue_parms {
+       /* input parameters */
+       int max_wr;
+       int max_sge;
+       int page_size;
+       int is_small;
+
+       /* output parameters */
+       u16 act_nr_wqes;
+       u8  act_nr_sges;
+       u32 queue_size; /* bytes for small queues, pages otherwise */
+};
+
+struct ehca_alloc_qp_parms {
+       struct ehca_alloc_queue_parms squeue;
+       struct ehca_alloc_queue_parms rqueue;
+
+       /* input parameters */
+       enum ehca_service_type servicetype;
+       int qp_storage;
+       int sigtype;
+       enum ehca_ext_qp_type ext_type;
+       enum ehca_ll_comp_flags ll_comp_flags;
+       int ud_av_l_key_ctl;
+
+       u32 token;
+       struct ipz_eq_handle eq_handle;
+       struct ipz_pd pd;
+       struct ipz_cq_handle send_cq_handle, recv_cq_handle;
+
+       u32 srq_qpn, srq_token, srq_limit;
+
+       /* output parameters */
+       u32 real_qp_num;
+       struct ipz_qp_handle qp_handle;
+       struct h_galpas galpas;
+};
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_classes_pSeries.h b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h
new file mode 100644 (file)
index 0000000..689c357
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  pSeries interface definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_PSERIES_H__
+#define __EHCA_CLASSES_PSERIES_H__
+
+#include "hcp_phyp.h"
+#include "ipz_pt_fn.h"
+
+
+struct ehca_pfqp {
+       struct ipz_qpt sqpt;
+       struct ipz_qpt rqpt;
+};
+
+struct ehca_pfcq {
+       struct ipz_qpt qpt;
+       u32 cqnr;
+};
+
+struct ehca_pfeq {
+       struct ipz_qpt qpt;
+       struct h_galpa galpa;
+       u32 eqnr;
+};
+
+struct ipz_adapter_handle {
+       u64 handle;
+};
+
+struct ipz_cq_handle {
+       u64 handle;
+};
+
+struct ipz_eq_handle {
+       u64 handle;
+};
+
+struct ipz_qp_handle {
+       u64 handle;
+};
+struct ipz_mrmw_handle {
+       u64 handle;
+};
+
+struct ipz_pd {
+       u32 value;
+};
+
+struct hcp_modify_qp_control_block {
+       u32 qkey;                      /* 00 */
+       u32 rdd;                       /* reliable datagram domain */
+       u32 send_psn;                  /* 02 */
+       u32 receive_psn;               /* 03 */
+       u32 prim_phys_port;            /* 04 */
+       u32 alt_phys_port;             /* 05 */
+       u32 prim_p_key_idx;            /* 06 */
+       u32 alt_p_key_idx;             /* 07 */
+       u32 rdma_atomic_ctrl;          /* 08 */
+       u32 qp_state;                  /* 09 */
+       u32 reserved_10;               /* 10 */
+       u32 rdma_nr_atomic_resp_res;   /* 11 */
+       u32 path_migration_state;      /* 12 */
+       u32 rdma_atomic_outst_dest_qp; /* 13 */
+       u32 dest_qp_nr;                /* 14 */
+       u32 min_rnr_nak_timer_field;   /* 15 */
+       u32 service_level;             /* 16 */
+       u32 send_grh_flag;             /* 17 */
+       u32 retry_count;               /* 18 */
+       u32 timeout;                   /* 19 */
+       u32 path_mtu;                  /* 20 */
+       u32 max_static_rate;           /* 21 */
+       u32 dlid;                      /* 22 */
+       u32 rnr_retry_count;           /* 23 */
+       u32 source_path_bits;          /* 24 */
+       u32 traffic_class;             /* 25 */
+       u32 hop_limit;                 /* 26 */
+       u32 source_gid_idx;            /* 27 */
+       u32 flow_label;                /* 28 */
+       u32 reserved_29;               /* 29 */
+       union {                        /* 30 */
+               u64 dw[2];
+               u8 byte[16];
+       } dest_gid;
+       u32 service_level_al;          /* 34 */
+       u32 send_grh_flag_al;          /* 35 */
+       u32 retry_count_al;            /* 36 */
+       u32 timeout_al;                /* 37 */
+       u32 max_static_rate_al;        /* 38 */
+       u32 dlid_al;                   /* 39 */
+       u32 rnr_retry_count_al;        /* 40 */
+       u32 source_path_bits_al;       /* 41 */
+       u32 traffic_class_al;          /* 42 */
+       u32 hop_limit_al;              /* 43 */
+       u32 source_gid_idx_al;         /* 44 */
+       u32 flow_label_al;             /* 45 */
+       u32 reserved_46;               /* 46 */
+       u32 reserved_47;               /* 47 */
+       union {                        /* 48 */
+               u64 dw[2];
+               u8 byte[16];
+       } dest_gid_al;
+       u32 max_nr_outst_send_wr;      /* 52 */
+       u32 max_nr_outst_recv_wr;      /* 53 */
+       u32 disable_ete_credit_check;  /* 54 */
+       u32 qp_number;                 /* 55 */
+       u64 send_queue_handle;         /* 56 */
+       u64 recv_queue_handle;         /* 58 */
+       u32 actual_nr_sges_in_sq_wqe;  /* 60 */
+       u32 actual_nr_sges_in_rq_wqe;  /* 61 */
+       u32 qp_enable;                 /* 62 */
+       u32 curr_srq_limit;            /* 63 */
+       u64 qp_aff_asyn_ev_log_reg;    /* 64 */
+       u64 shared_rq_hndl;            /* 66 */
+       u64 trigg_doorbell_qp_hndl;    /* 68 */
+       u32 reserved_70_127[58];       /* 70 */
+};
+
+#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
+#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
+#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
+#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
+#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
+#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
+#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
+#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
+#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
+#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
+#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
+#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
+#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
+#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
+#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
+#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
+#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
+#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
+#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
+#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
+#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
+#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
+#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
+#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
+#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
+#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
+#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
+#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
+#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
+#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
+#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
+#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
+#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
+#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
+#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
+#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
+#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
+#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
+#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
+#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
+#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
+#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
+#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
+#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
+#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
+#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
+#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
+#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
+
+#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/staging/rdma/ehca/ehca_cq.c b/drivers/staging/rdma/ehca/ehca_cq.c
new file mode 100644 (file)
index 0000000..9b68b17
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Completion queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *cq_cache;
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
+{
+       unsigned int qp_num = qp->real_qp_num;
+       unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+       hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+
+       ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
+                cq->cq_number, qp_num);
+
+       return 0;
+}
+
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
+{
+       int ret = -EINVAL;
+       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+       struct hlist_node *iter;
+       struct ehca_qp *qp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+       hlist_for_each(iter, &cq->qp_hashtab[key]) {
+               qp = hlist_entry(iter, struct ehca_qp, list_entries);
+               if (qp->real_qp_num == real_qp_num) {
+                       hlist_del(iter);
+                       ehca_dbg(cq->ib_cq.device,
+                                "removed qp from cq .cq_num=%x real_qp_num=%x",
+                                cq->cq_number, real_qp_num);
+                       ret = 0;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+       if (ret)
+               ehca_err(cq->ib_cq.device,
+                        "qp not found cq_num=%x real_qp_num=%x",
+                        cq->cq_number, real_qp_num);
+
+       return ret;
+}
+
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
+{
+       struct ehca_qp *ret = NULL;
+       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+       struct hlist_node *iter;
+       struct ehca_qp *qp;
+       hlist_for_each(iter, &cq->qp_hashtab[key]) {
+               qp = hlist_entry(iter, struct ehca_qp, list_entries);
+               if (qp->real_qp_num == real_qp_num) {
+                       ret = qp;
+                       break;
+               }
+       }
+       return ret;
+}
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+                            const struct ib_cq_init_attr *attr,
+                            struct ib_ucontext *context,
+                            struct ib_udata *udata)
+{
+       int cqe = attr->cqe;
+       static const u32 additional_cqe = 20;
+       struct ib_cq *cq;
+       struct ehca_cq *my_cq;
+       struct ehca_shca *shca =
+               container_of(device, struct ehca_shca, ib_device);
+       struct ipz_adapter_handle adapter_handle;
+       struct ehca_alloc_cq_parms param; /* h_call's out parameters */
+       struct h_galpa gal;
+       void *vpage;
+       u32 counter;
+       u64 rpage, cqx_fec, h_ret;
+       int ipz_rc, i;
+       unsigned long flags;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
+               return ERR_PTR(-EINVAL);
+
+       if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
+               ehca_err(device, "Unable to create CQ, max number of %i "
+                       "CQs reached.", shca->max_num_cqs);
+               ehca_err(device, "To increase the maximum number of CQs "
+                       "use the number_of_cqs module parameter.\n");
+               return ERR_PTR(-ENOSPC);
+       }
+
+       my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
+       if (!my_cq) {
+               ehca_err(device, "Out of memory for ehca_cq struct device=%p",
+                        device);
+               atomic_dec(&shca->num_cqs);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
+
+       spin_lock_init(&my_cq->spinlock);
+       spin_lock_init(&my_cq->cb_lock);
+       spin_lock_init(&my_cq->task_lock);
+       atomic_set(&my_cq->nr_events, 0);
+       init_waitqueue_head(&my_cq->wait_completion);
+
+       cq = &my_cq->ib_cq;
+
+       adapter_handle = shca->ipz_hca_handle;
+       param.eq_handle = shca->eq.ipz_eq_handle;
+
+       idr_preload(GFP_KERNEL);
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+       idr_preload_end();
+
+       if (my_cq->token < 0) {
+               cq = ERR_PTR(-ENOMEM);
+               ehca_err(device, "Can't allocate new idr entry. device=%p",
+                        device);
+               goto create_cq_exit1;
+       }
+
+       /*
+        * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
+        * for receiving errors CQEs.
+        */
+       param.nr_cqe = cqe + additional_cqe;
+       h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(device, "hipz_h_alloc_resource_cq() failed "
+                        "h_ret=%lli device=%p", h_ret, device);
+               cq = ERR_PTR(ehca2ib_return_code(h_ret));
+               goto create_cq_exit2;
+       }
+
+       ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
+                               EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
+       if (!ipz_rc) {
+               ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
+                        ipz_rc, device);
+               cq = ERR_PTR(-EINVAL);
+               goto create_cq_exit3;
+       }
+
+       for (counter = 0; counter < param.act_pages; counter++) {
+               vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+               if (!vpage) {
+                       ehca_err(device, "ipz_qpageit_get_inc() "
+                                "returns NULL device=%p", device);
+                       cq = ERR_PTR(-EAGAIN);
+                       goto create_cq_exit4;
+               }
+               rpage = __pa(vpage);
+
+               h_ret = hipz_h_register_rpage_cq(adapter_handle,
+                                                my_cq->ipz_cq_handle,
+                                                &my_cq->pf,
+                                                0,
+                                                0,
+                                                rpage,
+                                                1,
+                                                my_cq->galpas.
+                                                kernel);
+
+               if (h_ret < H_SUCCESS) {
+                       ehca_err(device, "hipz_h_register_rpage_cq() failed "
+                                "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
+                                "act_pages=%i", my_cq, my_cq->cq_number,
+                                h_ret, counter, param.act_pages);
+                       cq = ERR_PTR(-EINVAL);
+                       goto create_cq_exit4;
+               }
+
+               if (counter == (param.act_pages - 1)) {
+                       vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+                       if ((h_ret != H_SUCCESS) || vpage) {
+                               ehca_err(device, "Registration of pages not "
+                                        "complete ehca_cq=%p cq_num=%x "
+                                        "h_ret=%lli", my_cq, my_cq->cq_number,
+                                        h_ret);
+                               cq = ERR_PTR(-EAGAIN);
+                               goto create_cq_exit4;
+                       }
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED) {
+                               ehca_err(device, "Registration of page failed "
+                                        "ehca_cq=%p cq_num=%x h_ret=%lli "
+                                        "counter=%i act_pages=%i",
+                                        my_cq, my_cq->cq_number,
+                                        h_ret, counter, param.act_pages);
+                               cq = ERR_PTR(-ENOMEM);
+                               goto create_cq_exit4;
+                       }
+               }
+       }
+
+       ipz_qeit_reset(&my_cq->ipz_queue);
+
+       gal = my_cq->galpas.kernel;
+       cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
+       ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
+                my_cq, my_cq->cq_number, cqx_fec);
+
+       my_cq->ib_cq.cqe = my_cq->nr_of_entries =
+               param.act_nr_of_entries - additional_cqe;
+       my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
+
+       for (i = 0; i < QP_HASHTAB_LEN; i++)
+               INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+
+       INIT_LIST_HEAD(&my_cq->sqp_err_list);
+       INIT_LIST_HEAD(&my_cq->rqp_err_list);
+
+       if (context) {
+               struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+               struct ehca_create_cq_resp resp;
+               memset(&resp, 0, sizeof(resp));
+               resp.cq_number = my_cq->cq_number;
+               resp.token = my_cq->token;
+               resp.ipz_queue.qe_size = ipz_queue->qe_size;
+               resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
+               resp.ipz_queue.queue_length = ipz_queue->queue_length;
+               resp.ipz_queue.pagesize = ipz_queue->pagesize;
+               resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+               resp.fw_handle_ofs = (u32)
+                       (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
+               if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+                       ehca_err(device, "Copy to udata failed.");
+                       cq = ERR_PTR(-EFAULT);
+                       goto create_cq_exit4;
+               }
+       }
+
+       return cq;
+
+create_cq_exit4:
+       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+
+create_cq_exit3:
+       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+       if (h_ret != H_SUCCESS)
+               ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
+                        "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
+
+create_cq_exit2:
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       idr_remove(&ehca_cq_idr, my_cq->token);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+create_cq_exit1:
+       kmem_cache_free(cq_cache, my_cq);
+
+       atomic_dec(&shca->num_cqs);
+       return cq;
+}
+
+int ehca_destroy_cq(struct ib_cq *cq)
+{
+       u64 h_ret;
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int cq_num = my_cq->cq_number;
+       struct ib_device *device = cq->device;
+       struct ehca_shca *shca = container_of(device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       unsigned long flags;
+
+       if (cq->uobject) {
+               if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
+                       ehca_err(device, "Resources still referenced in "
+                                "user space cq_num=%x", my_cq->cq_number);
+                       return -EINVAL;
+               }
+       }
+
+       /*
+        * remove the CQ from the idr first to make sure
+        * no more interrupt tasklets will touch this CQ
+        */
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       idr_remove(&ehca_cq_idr, my_cq->token);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+       /* now wait until all pending events have completed */
+       wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
+
+       /* nobody's using our CQ any longer -- we can destroy it */
+       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
+       if (h_ret == H_R_STATE) {
+               /* cq in err: read err data and destroy it forcibly */
+               ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
+                        "state. Try to delete it forcibly.",
+                        my_cq, cq_num, my_cq->ipz_cq_handle.handle);
+               ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
+               h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+               if (h_ret == H_SUCCESS)
+                       ehca_dbg(device, "cq_num=%x deleted successfully.",
+                                cq_num);
+       }
+       if (h_ret != H_SUCCESS) {
+               ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
+                        "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
+               return ehca2ib_return_code(h_ret);
+       }
+       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+       kmem_cache_free(cq_cache, my_cq);
+
+       atomic_dec(&shca->num_cqs);
+       return 0;
+}
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+       /* TODO: proper resize needs to be done */
+       ehca_err(cq->device, "not implemented yet");
+
+       return -EFAULT;
+}
+
+int ehca_init_cq_cache(void)
+{
+       cq_cache = kmem_cache_create("ehca_cache_cq",
+                                    sizeof(struct ehca_cq), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!cq_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_cq_cache(void)
+{
+       if (cq_cache)
+               kmem_cache_destroy(cq_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_eq.c b/drivers/staging/rdma/ehca/ehca_eq.c
new file mode 100644 (file)
index 0000000..90da674
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Event queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_qes.h"
+#include "hcp_if.h"
+#include "ipz_pt_fn.h"
+
+int ehca_create_eq(struct ehca_shca *shca,
+                  struct ehca_eq *eq,
+                  const enum ehca_eq_type type, const u32 length)
+{
+       int ret;
+       u64 h_ret;
+       u32 nr_pages;
+       u32 i;
+       void *vpage;
+       struct ib_device *ib_dev = &shca->ib_device;
+
+       spin_lock_init(&eq->spinlock);
+       spin_lock_init(&eq->irq_spinlock);
+       eq->is_initialized = 0;
+
+       if (type != EHCA_EQ && type != EHCA_NEQ) {
+               ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
+               return -EINVAL;
+       }
+       if (!length) {
+               ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
+               return -EINVAL;
+       }
+
+       h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
+                                        &eq->pf,
+                                        type,
+                                        length,
+                                        &eq->ipz_eq_handle,
+                                        &eq->length,
+                                        &nr_pages, &eq->ist);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
+               return -EINVAL;
+       }
+
+       ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
+                            EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
+       if (!ret) {
+               ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
+               goto create_eq_exit1;
+       }
+
+       for (i = 0; i < nr_pages; i++) {
+               u64 rpage;
+
+               vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+               if (!vpage)
+                       goto create_eq_exit2;
+
+               rpage = __pa(vpage);
+               h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
+                                                eq->ipz_eq_handle,
+                                                &eq->pf,
+                                                0, 0, rpage, 1);
+
+               if (i == (nr_pages - 1)) {
+                       /* last page */
+                       vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+                       if (h_ret != H_SUCCESS || vpage)
+                               goto create_eq_exit2;
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED)
+                               goto create_eq_exit2;
+               }
+       }
+
+       ipz_qeit_reset(&eq->ipz_queue);
+
+       /* register interrupt handlers and initialize work queues */
+       if (type == EHCA_EQ) {
+               tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
+
+               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
+                                         0, "ehca_eq",
+                                         (void *)shca);
+               if (ret < 0)
+                       ehca_err(ib_dev, "Can't map interrupt handler.");
+       } else if (type == EHCA_NEQ) {
+               tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
+
+               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
+                                         0, "ehca_neq",
+                                         (void *)shca);
+               if (ret < 0)
+                       ehca_err(ib_dev, "Can't map interrupt handler.");
+       }
+
+       eq->is_initialized = 1;
+
+       return 0;
+
+create_eq_exit2:
+       ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+create_eq_exit1:
+       hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+       return -EINVAL;
+}
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+       unsigned long flags;
+       void *eqe;
+
+       spin_lock_irqsave(&eq->spinlock, flags);
+       eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
+       spin_unlock_irqrestore(&eq->spinlock, flags);
+
+       return eqe;
+}
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+       unsigned long flags;
+       u64 h_ret;
+
+       ibmebus_free_irq(eq->ist, (void *)shca);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       eq->is_initialized = 0;
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       tasklet_kill(&eq->interrupt_task);
+
+       h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't free EQ resources.");
+               return -EINVAL;
+       }
+       ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_hca.c b/drivers/staging/rdma/ehca/ehca_hca.c
new file mode 100644 (file)
index 0000000..e8b1bb6
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HCA query functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/gfp.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static unsigned int limit_uint(unsigned int value)
+{
+       return min_t(unsigned int, value, INT_MAX);
+}
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                     struct ib_udata *uhw)
+{
+       int i, ret = 0;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_hca *rblock;
+
+       static const u32 cap_mapping[] = {
+               IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
+               IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
+               IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
+               IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
+               IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
+               IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
+               IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
+               IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
+               IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
+               IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
+               IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
+       };
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query device properties");
+               ret = -EINVAL;
+               goto query_device1;
+       }
+
+       memset(props, 0, sizeof(struct ib_device_attr));
+       props->page_size_cap   = shca->hca_cap_mr_pgsize;
+       props->fw_ver          = rblock->hw_ver;
+       props->max_mr_size     = rblock->max_mr_size;
+       props->vendor_id       = rblock->vendor_id >> 8;
+       props->vendor_part_id  = rblock->vendor_part_id >> 16;
+       props->hw_ver          = rblock->hw_ver;
+       props->max_qp          = limit_uint(rblock->max_qp);
+       props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
+       props->max_sge         = limit_uint(rblock->max_sge);
+       props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
+       props->max_cq          = limit_uint(rblock->max_cq);
+       props->max_cqe         = limit_uint(rblock->max_cqe);
+       props->max_mr          = limit_uint(rblock->max_mr);
+       props->max_mw          = limit_uint(rblock->max_mw);
+       props->max_pd          = limit_uint(rblock->max_pd);
+       props->max_ah          = limit_uint(rblock->max_ah);
+       props->max_ee          = limit_uint(rblock->max_rd_ee_context);
+       props->max_rdd         = limit_uint(rblock->max_rd_domain);
+       props->max_fmr         = limit_uint(rblock->max_mr);
+       props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
+       props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
+       props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+       props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
+       props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
+
+       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+               props->max_srq         = limit_uint(props->max_qp);
+               props->max_srq_wr      = limit_uint(props->max_qp_wr);
+               props->max_srq_sge     = 3;
+       }
+
+       props->max_pkeys           = 16;
+       /* Some FW versions say 0 here; insert sensible value in that case */
+       props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
+               min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
+       props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
+       props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
+       props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
+       props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
+       props->max_total_mcast_qp_attach
+               = limit_uint(rblock->max_total_mcast_qp_attach);
+
+       /* translate device capabilities */
+       props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
+               IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
+       for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
+               if (rblock->hca_cap_indicators & cap_mapping[i + 1])
+                       props->device_cap_flags |= cap_mapping[i];
+
+query_device1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+{
+       switch (fw_mtu) {
+       case 0x1:
+               return IB_MTU_256;
+       case 0x2:
+               return IB_MTU_512;
+       case 0x3:
+               return IB_MTU_1024;
+       case 0x4:
+               return IB_MTU_2048;
+       case 0x5:
+               return IB_MTU_4096;
+       default:
+               ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
+                        fw_mtu);
+               return 0;
+       }
+}
+
+static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+{
+       switch (vl_cap) {
+       case 0x1:
+               return 1;
+       case 0x2:
+               return 2;
+       case 0x3:
+               return 4;
+       case 0x4:
+               return 8;
+       case 0x5:
+               return 15;
+       default:
+               ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
+                        vl_cap);
+               return 0;
+       }
+}
+
+int ehca_query_port(struct ib_device *ibdev,
+                   u8 port, struct ib_port_attr *props)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_port *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_port1;
+       }
+
+       memset(props, 0, sizeof(struct ib_port_attr));
+
+       props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
+       props->port_cap_flags  = rblock->capability_mask;
+       props->gid_tbl_len     = rblock->gid_tbl_len;
+       if (rblock->max_msg_sz)
+               props->max_msg_sz      = rblock->max_msg_sz;
+       else
+               props->max_msg_sz      = 0x1 << 31;
+       props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
+       props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
+       props->pkey_tbl_len    = rblock->pkey_tbl_len;
+       props->lid             = rblock->lid;
+       props->sm_lid          = rblock->sm_lid;
+       props->lmc             = rblock->lmc;
+       props->sm_sl           = rblock->sm_sl;
+       props->subnet_timeout  = rblock->subnet_timeout;
+       props->init_type_reply = rblock->init_type_reply;
+       props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
+
+       if (rblock->state && rblock->phys_width) {
+               props->phys_state      = rblock->phys_pstate;
+               props->state           = rblock->phys_state;
+               props->active_width    = rblock->phys_width;
+               props->active_speed    = rblock->phys_speed;
+       } else {
+               /* old firmware releases don't report physical
+                * port info, so use default values
+                */
+               props->phys_state      = 5;
+               props->state           = rblock->state;
+               props->active_width    = IB_WIDTH_12X;
+               props->active_speed    = IB_SPEED_SDR;
+       }
+
+query_port1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_sma_attr(struct ehca_shca *shca,
+                       u8 port, struct ehca_sma_attr *attr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct hipz_query_port *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_sma_attr1;
+       }
+
+       memset(attr, 0, sizeof(struct ehca_sma_attr));
+
+       attr->lid    = rblock->lid;
+       attr->lmc    = rblock->lmc;
+       attr->sm_sl  = rblock->sm_sl;
+       attr->sm_lid = rblock->sm_lid;
+
+       attr->pkey_tbl_len = rblock->pkey_tbl_len;
+       memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
+
+query_sma_attr1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca;
+       struct hipz_query_port *rblock;
+
+       shca = container_of(ibdev, struct ehca_shca, ib_device);
+       if (index > 16) {
+               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+               return -EINVAL;
+       }
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_pkey1;
+       }
+
+       memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
+
+query_pkey1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port,
+                  int index, union ib_gid *gid)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_port *rblock;
+
+       if (index < 0 || index > 255) {
+               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+               return -EINVAL;
+       }
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_gid1;
+       }
+
+       memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
+       memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
+
+query_gid1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+static const u32 allowed_port_caps = (
+       IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
+       IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
+       IB_PORT_VENDOR_CLASS_SUP);
+
+int ehca_modify_port(struct ib_device *ibdev,
+                    u8 port, int port_modify_mask,
+                    struct ib_port_modify *props)
+{
+       int ret = 0;
+       struct ehca_shca *shca;
+       struct hipz_query_port *rblock;
+       u32 cap;
+       u64 hret;
+
+       shca = container_of(ibdev, struct ehca_shca, ib_device);
+       if ((props->set_port_cap_mask | props->clr_port_cap_mask)
+           & ~allowed_port_caps) {
+               ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
+                        "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
+                        props->clr_port_cap_mask, allowed_port_caps);
+               return -EINVAL;
+       }
+
+       if (mutex_lock_interruptible(&shca->modify_mutex))
+               return -ERESTARTSYS;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+               ret = -ENOMEM;
+               goto modify_port1;
+       }
+
+       hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (hret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto modify_port2;
+       }
+
+       cap = (rblock->capability_mask | props->set_port_cap_mask)
+               & ~props->clr_port_cap_mask;
+
+       hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
+                                 cap, props->init_type, port_modify_mask);
+       if (hret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
+                        hret);
+               ret = -EINVAL;
+       }
+
+modify_port2:
+       ehca_free_fw_ctrlblock(rblock);
+
+modify_port1:
+       mutex_unlock(&shca->modify_mutex);
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.c b/drivers/staging/rdma/ehca/ehca_irq.c
new file mode 100644 (file)
index 0000000..8615d7c
--- /dev/null
@@ -0,0 +1,870 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Functions for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
+#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
+#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
+#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
+#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
+
+#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
+#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
+#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
+#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
+#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
+
+#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
+#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
+
+static void queue_comp_task(struct ehca_cq *__cq);
+
+static struct ehca_comp_pool *pool;
+
+static inline void comp_event_callback(struct ehca_cq *cq)
+{
+       if (!cq->ib_cq.comp_handler)
+               return;
+
+       spin_lock(&cq->cb_lock);
+       cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
+       spin_unlock(&cq->cb_lock);
+
+       return;
+}
+
+static void print_error_data(struct ehca_shca *shca, void *data,
+                            u64 *rblock, int length)
+{
+       u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+       u64 resource = rblock[1];
+
+       switch (type) {
+       case 0x1: /* Queue Pair */
+       {
+               struct ehca_qp *qp = (struct ehca_qp *)data;
+
+               /* only print error data if AER is set */
+               if (rblock[6] == 0)
+                       return;
+
+               ehca_err(&shca->ib_device,
+                        "QP 0x%x (resource=%llx) has errors.",
+                        qp->ib_qp.qp_num, resource);
+               break;
+       }
+       case 0x4: /* Completion Queue */
+       {
+               struct ehca_cq *cq = (struct ehca_cq *)data;
+
+               ehca_err(&shca->ib_device,
+                        "CQ 0x%x (resource=%llx) has errors.",
+                        cq->cq_number, resource);
+               break;
+       }
+       default:
+               ehca_err(&shca->ib_device,
+                        "Unknown error type: %llx on %s.",
+                        type, shca->ib_device.name);
+               break;
+       }
+
+       ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
+       ehca_err(&shca->ib_device, "EHCA ----- error data begin "
+                "---------------------------------------------------");
+       ehca_dmp(rblock, length, "resource=%llx", resource);
+       ehca_err(&shca->ib_device, "EHCA ----- error data end "
+                "----------------------------------------------------");
+
+       return;
+}
+
+int ehca_error_data(struct ehca_shca *shca, void *data,
+                   u64 resource)
+{
+
+       unsigned long ret;
+       u64 *rblock;
+       unsigned long block_count;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
+               ret = -ENOMEM;
+               goto error_data1;
+       }
+
+       /* rblock must be 4K aligned and should be 4K large */
+       ret = hipz_h_error_data(shca->ipz_hca_handle,
+                               resource,
+                               rblock,
+                               &block_count);
+
+       if (ret == H_R_STATE)
+               ehca_err(&shca->ib_device,
+                        "No error data is available: %llx.", resource);
+       else if (ret == H_SUCCESS) {
+               int length;
+
+               length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
+
+               if (length > EHCA_PAGESIZE)
+                       length = EHCA_PAGESIZE;
+
+               print_error_data(shca, data, rblock, length);
+       } else
+               ehca_err(&shca->ib_device,
+                        "Error data could not be fetched: %llx", resource);
+
+       ehca_free_fw_ctrlblock(rblock);
+
+error_data1:
+       return ret;
+
+}
+
+static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
+                             enum ib_event_type event_type)
+{
+       struct ib_event event;
+
+       /* PATH_MIG without the QP ever having been armed is false alarm */
+       if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
+               return;
+
+       event.device = &shca->ib_device;
+       event.event = event_type;
+
+       if (qp->ext_type == EQPT_SRQ) {
+               if (!qp->ib_srq.event_handler)
+                       return;
+
+               event.element.srq = &qp->ib_srq;
+               qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
+       } else {
+               if (!qp->ib_qp.event_handler)
+                       return;
+
+               event.element.qp = &qp->ib_qp;
+               qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+       }
+}
+
+static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
+                             enum ib_event_type event_type, int fatal)
+{
+       struct ehca_qp *qp;
+       u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
+
+       read_lock(&ehca_qp_idr_lock);
+       qp = idr_find(&ehca_qp_idr, token);
+       if (qp)
+               atomic_inc(&qp->nr_events);
+       read_unlock(&ehca_qp_idr_lock);
+
+       if (!qp)
+               return;
+
+       if (fatal)
+               ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
+
+       dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
+                         IB_EVENT_SRQ_ERR : event_type);
+
+       /*
+        * eHCA only processes one WQE at a time for SRQ base QPs,
+        * so the last WQE has been processed as soon as the QP enters
+        * error state.
+        */
+       if (fatal && qp->ext_type == EQPT_SRQBASE)
+               dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
+
+       if (atomic_dec_and_test(&qp->nr_events))
+               wake_up(&qp->wait_completion);
+       return;
+}
+
+static void cq_event_callback(struct ehca_shca *shca,
+                             u64 eqe)
+{
+       struct ehca_cq *cq;
+       u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
+
+       read_lock(&ehca_cq_idr_lock);
+       cq = idr_find(&ehca_cq_idr, token);
+       if (cq)
+               atomic_inc(&cq->nr_events);
+       read_unlock(&ehca_cq_idr_lock);
+
+       if (!cq)
+               return;
+
+       ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
+
+       if (atomic_dec_and_test(&cq->nr_events))
+               wake_up(&cq->wait_completion);
+
+       return;
+}
+
+static void parse_identifier(struct ehca_shca *shca, u64 eqe)
+{
+       u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
+
+       switch (identifier) {
+       case 0x02: /* path migrated */
+               qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
+               break;
+       case 0x03: /* communication established */
+               qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
+               break;
+       case 0x04: /* send queue drained */
+               qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
+               break;
+       case 0x05: /* QP error */
+       case 0x06: /* QP error */
+               qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
+               break;
+       case 0x07: /* CQ error */
+       case 0x08: /* CQ error */
+               cq_event_callback(shca, eqe);
+               break;
+       case 0x09: /* MRMWPTE error */
+               ehca_err(&shca->ib_device, "MRMWPTE error.");
+               break;
+       case 0x0A: /* port event */
+               ehca_err(&shca->ib_device, "Port event.");
+               break;
+       case 0x0B: /* MR access error */
+               ehca_err(&shca->ib_device, "MR access error.");
+               break;
+       case 0x0C: /* EQ error */
+               ehca_err(&shca->ib_device, "EQ error.");
+               break;
+       case 0x0D: /* P/Q_Key mismatch */
+               ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
+               break;
+       case 0x10: /* sampling complete */
+               ehca_err(&shca->ib_device, "Sampling complete.");
+               break;
+       case 0x11: /* unaffiliated access error */
+               ehca_err(&shca->ib_device, "Unaffiliated access error.");
+               break;
+       case 0x12: /* path migrating */
+               ehca_err(&shca->ib_device, "Path migrating.");
+               break;
+       case 0x13: /* interface trace stopped */
+               ehca_err(&shca->ib_device, "Interface trace stopped.");
+               break;
+       case 0x14: /* first error capture info available */
+               ehca_info(&shca->ib_device, "First error capture available");
+               break;
+       case 0x15: /* SRQ limit reached */
+               qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
+                        identifier, shca->ib_device.name);
+               break;
+       }
+
+       return;
+}
+
+static void dispatch_port_event(struct ehca_shca *shca, int port_num,
+                               enum ib_event_type type, const char *msg)
+{
+       struct ib_event event;
+
+       ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
+       event.device = &shca->ib_device;
+       event.event = type;
+       event.element.port_num = port_num;
+       ib_dispatch_event(&event);
+}
+
+static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
+{
+       struct ehca_sma_attr  new_attr;
+       struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
+
+       ehca_query_sma_attr(shca, port_num, &new_attr);
+
+       if (new_attr.sm_sl  != old_attr->sm_sl ||
+           new_attr.sm_lid != old_attr->sm_lid)
+               dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
+                                   "SM changed");
+
+       if (new_attr.lid != old_attr->lid ||
+           new_attr.lmc != old_attr->lmc)
+               dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
+                                   "LID changed");
+
+       if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
+           memcmp(new_attr.pkeys, old_attr->pkeys,
+                  sizeof(u16) * new_attr.pkey_tbl_len))
+               dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
+                                   "P_Key changed");
+
+       *old_attr = new_attr;
+}
+
+/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
+static int replay_modify_qp(struct ehca_sport *sport)
+{
+       int aqp1_destroyed;
+       unsigned long flags;
+
+       spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+
+       aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
+
+       if (sport->ibqp_sqp[IB_QPT_SMI])
+               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+       if (!aqp1_destroyed)
+               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+
+       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+       return aqp1_destroyed;
+}
+
+static void parse_ec(struct ehca_shca *shca, u64 eqe)
+{
+       u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+       u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+       u8 spec_event;
+       struct ehca_sport *sport = &shca->sport[port - 1];
+
+       switch (ec) {
+       case 0x30: /* port availability change */
+               if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
+                       /* only replay modify_qp calls in autodetect mode;
+                        * if AQP1 was destroyed, the port is already down
+                        * again and we can drop the event.
+                        */
+                       if (ehca_nr_ports < 0)
+                               if (replay_modify_qp(sport))
+                                       break;
+
+                       sport->port_state = IB_PORT_ACTIVE;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+                                           "is active");
+                       ehca_query_sma_attr(shca, port, &sport->saved_attr);
+               } else {
+                       sport->port_state = IB_PORT_DOWN;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+                                           "is inactive");
+               }
+               break;
+       case 0x31:
+               /* port configuration change
+                * disruptive change is caused by
+                * LID, PKEY or SM change
+                */
+               if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
+                       ehca_warn(&shca->ib_device, "disruptive port "
+                                 "%d configuration change", port);
+
+                       sport->port_state = IB_PORT_DOWN;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+                                           "is inactive");
+
+                       sport->port_state = IB_PORT_ACTIVE;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+                                           "is active");
+                       ehca_query_sma_attr(shca, port,
+                                           &sport->saved_attr);
+               } else
+                       notify_port_conf_change(shca, port);
+               break;
+       case 0x32: /* adapter malfunction */
+               ehca_err(&shca->ib_device, "Adapter malfunction.");
+               break;
+       case 0x33:  /* trace stopped */
+               ehca_err(&shca->ib_device, "Traced stopped.");
+               break;
+       case 0x34: /* util async event */
+               spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+               if (spec_event == 0x80) /* client reregister required */
+                       dispatch_port_event(shca, port,
+                                           IB_EVENT_CLIENT_REREGISTER,
+                                           "client reregister req.");
+               else
+                       ehca_warn(&shca->ib_device, "Unknown util async "
+                                 "event %x on port %x", spec_event, port);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
+                        ec, shca->ib_device.name);
+               break;
+       }
+
+       return;
+}
+
+static inline void reset_eq_pending(struct ehca_cq *cq)
+{
+       u64 CQx_EP;
+       struct h_galpa gal = cq->galpas.kernel;
+
+       hipz_galpa_store_cq(gal, cqx_ep, 0x0);
+       CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
+
+       return;
+}
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+       tasklet_hi_schedule(&shca->neq.interrupt_task);
+
+       return IRQ_HANDLED;
+}
+
+void ehca_tasklet_neq(unsigned long data)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)data;
+       struct ehca_eqe *eqe;
+       u64 ret;
+
+       eqe = ehca_poll_eq(shca, &shca->neq);
+
+       while (eqe) {
+               if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
+                       parse_ec(shca, eqe->entry);
+
+               eqe = ehca_poll_eq(shca, &shca->neq);
+       }
+
+       ret = hipz_h_reset_event(shca->ipz_hca_handle,
+                                shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
+
+       if (ret != H_SUCCESS)
+               ehca_err(&shca->ib_device, "Can't clear notification events.");
+
+       return;
+}
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+       tasklet_hi_schedule(&shca->eq.interrupt_task);
+
+       return IRQ_HANDLED;
+}
+
+
+static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
+{
+       u64 eqe_value;
+       u32 token;
+       struct ehca_cq *cq;
+
+       eqe_value = eqe->entry;
+       ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
+       if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+               ehca_dbg(&shca->ib_device, "Got completion event");
+               token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+               read_lock(&ehca_cq_idr_lock);
+               cq = idr_find(&ehca_cq_idr, token);
+               if (cq)
+                       atomic_inc(&cq->nr_events);
+               read_unlock(&ehca_cq_idr_lock);
+               if (cq == NULL) {
+                       ehca_err(&shca->ib_device,
+                                "Invalid eqe for non-existing cq token=%x",
+                                token);
+                       return;
+               }
+               reset_eq_pending(cq);
+               if (ehca_scaling_code)
+                       queue_comp_task(cq);
+               else {
+                       comp_event_callback(cq);
+                       if (atomic_dec_and_test(&cq->nr_events))
+                               wake_up(&cq->wait_completion);
+               }
+       } else {
+               ehca_dbg(&shca->ib_device, "Got non completion event");
+               parse_identifier(shca, eqe_value);
+       }
+}
+
+void ehca_process_eq(struct ehca_shca *shca, int is_irq)
+{
+       struct ehca_eq *eq = &shca->eq;
+       struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
+       u64 eqe_value, ret;
+       int eqe_cnt, i;
+       int eq_empty = 0;
+
+       spin_lock(&eq->irq_spinlock);
+       if (is_irq) {
+               const int max_query_cnt = 100;
+               int query_cnt = 0;
+               int int_state = 1;
+               do {
+                       int_state = hipz_h_query_int_state(
+                               shca->ipz_hca_handle, eq->ist);
+                       query_cnt++;
+                       iosync();
+               } while (int_state && query_cnt < max_query_cnt);
+               if (unlikely((query_cnt == max_query_cnt)))
+                       ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
+                                int_state, query_cnt);
+       }
+
+       /* read out all eqes */
+       eqe_cnt = 0;
+       do {
+               u32 token;
+               eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
+               if (!eqe_cache[eqe_cnt].eqe)
+                       break;
+               eqe_value = eqe_cache[eqe_cnt].eqe->entry;
+               if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+                       token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+                       read_lock(&ehca_cq_idr_lock);
+                       eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
+                       if (eqe_cache[eqe_cnt].cq)
+                               atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
+                       read_unlock(&ehca_cq_idr_lock);
+                       if (!eqe_cache[eqe_cnt].cq) {
+                               ehca_err(&shca->ib_device,
+                                        "Invalid eqe for non-existing cq "
+                                        "token=%x", token);
+                               continue;
+                       }
+               } else
+                       eqe_cache[eqe_cnt].cq = NULL;
+               eqe_cnt++;
+       } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
+       if (!eqe_cnt) {
+               if (is_irq)
+                       ehca_dbg(&shca->ib_device,
+                                "No eqe found for irq event");
+               goto unlock_irq_spinlock;
+       } else if (!is_irq) {
+               ret = hipz_h_eoi(eq->ist);
+               if (ret != H_SUCCESS)
+                       ehca_err(&shca->ib_device,
+                                "bad return code EOI -rc = %lld\n", ret);
+               ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+       }
+       if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
+               ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
+       /* enable irq for new packets */
+       for (i = 0; i < eqe_cnt; i++) {
+               if (eq->eqe_cache[i].cq)
+                       reset_eq_pending(eq->eqe_cache[i].cq);
+       }
+       /* check eq */
+       spin_lock(&eq->spinlock);
+       eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
+       spin_unlock(&eq->spinlock);
+       /* call completion handler for cached eqes */
+       for (i = 0; i < eqe_cnt; i++)
+               if (eq->eqe_cache[i].cq) {
+                       if (ehca_scaling_code)
+                               queue_comp_task(eq->eqe_cache[i].cq);
+                       else {
+                               struct ehca_cq *cq = eq->eqe_cache[i].cq;
+                               comp_event_callback(cq);
+                               if (atomic_dec_and_test(&cq->nr_events))
+                                       wake_up(&cq->wait_completion);
+                       }
+               } else {
+                       ehca_dbg(&shca->ib_device, "Got non completion event");
+                       parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
+               }
+       /* poll eq if not empty */
+       if (eq_empty)
+               goto unlock_irq_spinlock;
+       do {
+               struct ehca_eqe *eqe;
+               eqe = ehca_poll_eq(shca, &shca->eq);
+               if (!eqe)
+                       break;
+               process_eqe(shca, eqe);
+       } while (1);
+
+unlock_irq_spinlock:
+       spin_unlock(&eq->irq_spinlock);
+}
+
+void ehca_tasklet_eq(unsigned long data)
+{
+       ehca_process_eq((struct ehca_shca*)data, 1);
+}
+
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
+{
+       int cpu;
+       unsigned long flags;
+
+       WARN_ON_ONCE(!in_interrupt());
+       if (ehca_debug_level >= 3)
+               ehca_dmp(cpu_online_mask, cpumask_size(), "");
+
+       spin_lock_irqsave(&pool->last_cpu_lock, flags);
+       do {
+               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = cpumask_first(cpu_online_mask);
+               pool->last_cpu = cpu;
+       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
+       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
+
+       return cpu;
+}
+
+static void __queue_comp_task(struct ehca_cq *__cq,
+                             struct ehca_cpu_comp_task *cct,
+                             struct task_struct *thread)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cct->task_lock, flags);
+       spin_lock(&__cq->task_lock);
+
+       if (__cq->nr_callbacks == 0) {
+               __cq->nr_callbacks++;
+               list_add_tail(&__cq->entry, &cct->cq_list);
+               cct->cq_jobs++;
+               wake_up_process(thread);
+       } else
+               __cq->nr_callbacks++;
+
+       spin_unlock(&__cq->task_lock);
+       spin_unlock_irqrestore(&cct->task_lock, flags);
+}
+
+static void queue_comp_task(struct ehca_cq *__cq)
+{
+       int cpu_id;
+       struct ehca_cpu_comp_task *cct;
+       struct task_struct *thread;
+       int cq_jobs;
+       unsigned long flags;
+
+       cpu_id = find_next_online_cpu(pool);
+       BUG_ON(!cpu_online(cpu_id));
+
+       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+       BUG_ON(!cct || !thread);
+
+       spin_lock_irqsave(&cct->task_lock, flags);
+       cq_jobs = cct->cq_jobs;
+       spin_unlock_irqrestore(&cct->task_lock, flags);
+       if (cq_jobs > 0) {
+               cpu_id = find_next_online_cpu(pool);
+               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+               BUG_ON(!cct || !thread);
+       }
+       __queue_comp_task(__cq, cct, thread);
+}
+
+static void run_comp_task(struct ehca_cpu_comp_task *cct)
+{
+       struct ehca_cq *cq;
+
+       while (!list_empty(&cct->cq_list)) {
+               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
+               spin_unlock_irq(&cct->task_lock);
+
+               comp_event_callback(cq);
+               if (atomic_dec_and_test(&cq->nr_events))
+                       wake_up(&cq->wait_completion);
+
+               spin_lock_irq(&cct->task_lock);
+               spin_lock(&cq->task_lock);
+               cq->nr_callbacks--;
+               if (!cq->nr_callbacks) {
+                       list_del_init(cct->cq_list.next);
+                       cct->cq_jobs--;
+               }
+               spin_unlock(&cq->task_lock);
+       }
+}
+
+static void comp_task_park(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       struct ehca_cpu_comp_task *target;
+       struct task_struct *thread;
+       struct ehca_cq *cq, *tmp;
+       LIST_HEAD(list);
+
+       spin_lock_irq(&cct->task_lock);
+       cct->cq_jobs = 0;
+       cct->active = 0;
+       list_splice_init(&cct->cq_list, &list);
+       spin_unlock_irq(&cct->task_lock);
+
+       cpu = find_next_online_cpu(pool);
+       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+       spin_lock_irq(&target->task_lock);
+       list_for_each_entry_safe(cq, tmp, &list, entry) {
+               list_del(&cq->entry);
+               __queue_comp_task(cq, target, thread);
+       }
+       spin_unlock_irq(&target->task_lock);
+}
+
+static void comp_task_stop(unsigned int cpu, bool online)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+       spin_lock_irq(&cct->task_lock);
+       cct->cq_jobs = 0;
+       cct->active = 0;
+       WARN_ON(!list_empty(&cct->cq_list));
+       spin_unlock_irq(&cct->task_lock);
+}
+
+static int comp_task_should_run(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+       return cct->cq_jobs;
+}
+
+static void comp_task(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+       int cql_empty;
+
+       spin_lock_irq(&cct->task_lock);
+       cql_empty = list_empty(&cct->cq_list);
+       if (!cql_empty) {
+               __set_current_state(TASK_RUNNING);
+               run_comp_task(cct);
+       }
+       spin_unlock_irq(&cct->task_lock);
+}
+
+static struct smp_hotplug_thread comp_pool_threads = {
+       .thread_should_run      = comp_task_should_run,
+       .thread_fn              = comp_task,
+       .thread_comm            = "ehca_comp/%u",
+       .cleanup                = comp_task_stop,
+       .park                   = comp_task_park,
+};
+
+int ehca_create_comp_pool(void)
+{
+       int cpu, ret = -ENOMEM;
+
+       if (!ehca_scaling_code)
+               return 0;
+
+       pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
+       if (pool == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&pool->last_cpu_lock);
+       pool->last_cpu = cpumask_any(cpu_online_mask);
+
+       pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
+       if (!pool->cpu_comp_tasks)
+               goto out_pool;
+
+       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+       if (!pool->cpu_comp_threads)
+               goto out_tasks;
+
+       for_each_present_cpu(cpu) {
+               struct ehca_cpu_comp_task *cct;
+
+               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+               spin_lock_init(&cct->task_lock);
+               INIT_LIST_HEAD(&cct->cq_list);
+       }
+
+       comp_pool_threads.store = pool->cpu_comp_threads;
+       ret = smpboot_register_percpu_thread(&comp_pool_threads);
+       if (ret)
+               goto out_threads;
+
+       pr_info("eHCA scaling code enabled\n");
+       return ret;
+
+out_threads:
+       free_percpu(pool->cpu_comp_threads);
+out_tasks:
+       free_percpu(pool->cpu_comp_tasks);
+out_pool:
+       kfree(pool);
+       return ret;
+}
+
+void ehca_destroy_comp_pool(void)
+{
+       if (!ehca_scaling_code)
+               return;
+
+       smpboot_unregister_percpu_thread(&comp_pool_threads);
+
+       free_percpu(pool->cpu_comp_threads);
+       free_percpu(pool->cpu_comp_tasks);
+       kfree(pool);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.h b/drivers/staging/rdma/ehca/ehca_irq.h
new file mode 100644 (file)
index 0000000..5370199
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions and structs for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IRQ_H
+#define __EHCA_IRQ_H
+
+
+struct ehca_shca;
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+
+int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
+void ehca_tasklet_neq(unsigned long data);
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
+void ehca_tasklet_eq(unsigned long data);
+void ehca_process_eq(struct ehca_shca *shca, int is_irq);
+
+struct ehca_cpu_comp_task {
+       struct list_head cq_list;
+       spinlock_t task_lock;
+       int cq_jobs;
+       int active;
+};
+
+struct ehca_comp_pool {
+       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+       struct task_struct * __percpu *cpu_comp_threads;
+       int last_cpu;
+       spinlock_t last_cpu_lock;
+};
+
+int ehca_create_comp_pool(void);
+void ehca_destroy_comp_pool(void);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h
new file mode 100644 (file)
index 0000000..80e6a3d
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions for internal functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Dietmar Decker <ddecker@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IVERBS_H__
+#define __EHCA_IVERBS_H__
+
+#include "ehca_classes.h"
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                     struct ib_udata *uhw);
+
+int ehca_query_port(struct ib_device *ibdev, u8 port,
+                   struct ib_port_attr *props);
+
+enum rdma_protocol_type
+ehca_query_protocol(struct ib_device *device, u8 port_num);
+
+int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
+                       struct ehca_sma_attr *attr);
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
+                  union ib_gid *gid);
+
+int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
+                    struct ib_port_modify *props);
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+                           struct ib_ucontext *context,
+                           struct ib_udata *udata);
+
+int ehca_dealloc_pd(struct ib_pd *pd);
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_destroy_ah(struct ib_ah *ah);
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *phys_buf_array,
+                              int num_phys_buf,
+                              int mr_access_flags, u64 *iova_start);
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt, int mr_access_flags,
+                              struct ib_udata *udata);
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+                      int mr_rereg_mask,
+                      struct ib_pd *pd,
+                      struct ib_phys_buf *phys_buf_array,
+                      int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+int ehca_dereg_mr(struct ib_mr *mr);
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+                struct ib_mw_bind *mw_bind);
+
+int ehca_dealloc_mw(struct ib_mw *mw);
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+                             int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr);
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+                     u64 *page_list, int list_len, u64 iova);
+
+int ehca_unmap_fmr(struct list_head *fmr_list);
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr);
+
+enum ehca_eq_type {
+       EHCA_EQ = 0, /* Event Queue              */
+       EHCA_NEQ     /* Notification Event Queue */
+};
+
+int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
+                  enum ehca_eq_type type, const u32 length);
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+                            const struct ib_cq_init_attr *attr,
+                            struct ib_ucontext *context,
+                            struct ib_udata *udata);
+
+int ehca_destroy_cq(struct ib_cq *cq);
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata);
+
+int ehca_destroy_qp(struct ib_qp *qp);
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+                  struct ib_udata *udata);
+
+int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+                  struct ib_send_wr **bad_send_wr);
+
+int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+                  struct ib_recv_wr **bad_recv_wr);
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+                      struct ib_recv_wr *recv_wr,
+                      struct ib_recv_wr **bad_recv_wr);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+                              struct ib_srq_init_attr *init_attr,
+                              struct ib_udata *udata);
+
+int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+
+int ehca_destroy_srq(struct ib_srq *srq);
+
+u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
+                   struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+                                       struct ib_udata *udata);
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context);
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in, size_t in_mad_size,
+                    struct ib_mad_hdr *out, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index);
+
+void ehca_poll_eqs(unsigned long data);
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+                 enum ib_rate path_rate, u32 *ipd);
+
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
+
+#ifdef CONFIG_PPC_64K_PAGES
+void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+void ehca_free_fw_ctrlblock(void *ptr);
+#else
+#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
+#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
+#endif
+
+void ehca_recover_sqp(struct ib_qp *sqp);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c
new file mode 100644 (file)
index 0000000..8246418
--- /dev/null
@@ -0,0 +1,1123 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  module start stop, hca detection
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <linux/slab.h>
+#endif
+
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <rdma/ib_mad.h>
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+#define HCAD_VERSION "0029"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
+MODULE_VERSION(HCAD_VERSION);
+
+static bool ehca_open_aqp1    = 0;
+static int ehca_hw_level      = 0;
+static bool ehca_poll_all_eqs = 1;
+
+int ehca_debug_level   = 0;
+int ehca_nr_ports      = -1;
+bool ehca_use_hp_mr    = 0;
+int ehca_port_act_time = 30;
+int ehca_static_rate   = -1;
+bool ehca_scaling_code = 0;
+int ehca_lock_hcalls   = -1;
+int ehca_max_cq        = -1;
+int ehca_max_qp        = -1;
+
+module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
+module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
+module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
+module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
+module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
+module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
+module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
+module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
+module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
+module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
+module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
+
+MODULE_PARM_DESC(open_aqp1,
+                "Open AQP1 on startup (default: no)");
+MODULE_PARM_DESC(debug_level,
+                "Amount of debug output (0: none (default), 1: traces, "
+                "2: some dumps, 3: lots)");
+MODULE_PARM_DESC(hw_level,
+                "Hardware level (0: autosensing (default), "
+                "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+MODULE_PARM_DESC(nr_ports,
+                "number of connected ports (-1: autodetect (default), "
+                "1: port one only, 2: two ports)");
+MODULE_PARM_DESC(use_hp_mr,
+                "Use high performance MRs (default: no)");
+MODULE_PARM_DESC(port_act_time,
+                "Time to wait for port activation (default: 30 sec)");
+MODULE_PARM_DESC(poll_all_eqs,
+                "Poll all event queues periodically (default: yes)");
+MODULE_PARM_DESC(static_rate,
+                "Set permanent static rate (default: no static rate)");
+MODULE_PARM_DESC(scaling_code,
+                "Enable scaling code (default: no)");
+MODULE_PARM_DESC(lock_hcalls,
+                "Serialize all hCalls made by the driver "
+                "(default: autodetect)");
+MODULE_PARM_DESC(number_of_cqs,
+               "Max number of CQs which can be allocated "
+               "(default: autodetect)");
+MODULE_PARM_DESC(number_of_qps,
+               "Max number of QPs which can be allocated "
+               "(default: autodetect)");
+
+DEFINE_RWLOCK(ehca_qp_idr_lock);
+DEFINE_RWLOCK(ehca_cq_idr_lock);
+DEFINE_IDR(ehca_qp_idr);
+DEFINE_IDR(ehca_cq_idr);
+
+static LIST_HEAD(shca_list); /* list of all registered ehcas */
+DEFINE_SPINLOCK(shca_list_lock);
+
+static struct timer_list poll_eqs_timer;
+
+#ifdef CONFIG_PPC_64K_PAGES
+static struct kmem_cache *ctblk_cache;
+
+void *ehca_alloc_fw_ctrlblock(gfp_t flags)
+{
+       void *ret = kmem_cache_zalloc(ctblk_cache, flags);
+       if (!ret)
+               ehca_gen_err("Out of memory for ctblk");
+       return ret;
+}
+
+void ehca_free_fw_ctrlblock(void *ptr)
+{
+       if (ptr)
+               kmem_cache_free(ctblk_cache, ptr);
+
+}
+#endif
+
+int ehca2ib_return_code(u64 ehca_rc)
+{
+       switch (ehca_rc) {
+       case H_SUCCESS:
+               return 0;
+       case H_RESOURCE:             /* Resource in use */
+       case H_BUSY:
+               return -EBUSY;
+       case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
+       case H_CONSTRAINED:          /* resource constraint */
+       case H_NO_MEM:
+               return -ENOMEM;
+       default:
+               return -EINVAL;
+       }
+}
+
+static int ehca_create_slab_caches(void)
+{
+       int ret;
+
+       ret = ehca_init_pd_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create PD SLAB cache.");
+               return ret;
+       }
+
+       ret = ehca_init_cq_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create CQ SLAB cache.");
+               goto create_slab_caches2;
+       }
+
+       ret = ehca_init_qp_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create QP SLAB cache.");
+               goto create_slab_caches3;
+       }
+
+       ret = ehca_init_av_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create AV SLAB cache.");
+               goto create_slab_caches4;
+       }
+
+       ret = ehca_init_mrmw_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create MR&MW SLAB cache.");
+               goto create_slab_caches5;
+       }
+
+       ret = ehca_init_small_qp_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create small queue SLAB cache.");
+               goto create_slab_caches6;
+       }
+
+#ifdef CONFIG_PPC_64K_PAGES
+       ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
+                                       EHCA_PAGESIZE, H_CB_ALIGNMENT,
+                                       SLAB_HWCACHE_ALIGN,
+                                       NULL);
+       if (!ctblk_cache) {
+               ehca_gen_err("Cannot create ctblk SLAB cache.");
+               ehca_cleanup_small_qp_cache();
+               ret = -ENOMEM;
+               goto create_slab_caches6;
+       }
+#endif
+       return 0;
+
+create_slab_caches6:
+       ehca_cleanup_mrmw_cache();
+
+create_slab_caches5:
+       ehca_cleanup_av_cache();
+
+create_slab_caches4:
+       ehca_cleanup_qp_cache();
+
+create_slab_caches3:
+       ehca_cleanup_cq_cache();
+
+create_slab_caches2:
+       ehca_cleanup_pd_cache();
+
+       return ret;
+}
+
+static void ehca_destroy_slab_caches(void)
+{
+       ehca_cleanup_small_qp_cache();
+       ehca_cleanup_mrmw_cache();
+       ehca_cleanup_av_cache();
+       ehca_cleanup_qp_cache();
+       ehca_cleanup_cq_cache();
+       ehca_cleanup_pd_cache();
+#ifdef CONFIG_PPC_64K_PAGES
+       if (ctblk_cache)
+               kmem_cache_destroy(ctblk_cache);
+#endif
+}
+
+#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
+#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
+
+static struct cap_descr {
+       u64 mask;
+       char *descr;
+} hca_cap_descr[] = {
+       { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
+       { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
+       { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
+       { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
+       { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
+       { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
+       { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
+       { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
+       { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
+       { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
+       { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
+       { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
+       { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
+       { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
+       { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
+       { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
+       { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
+       { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
+};
+
+static int ehca_sense_attributes(struct ehca_shca *shca)
+{
+       int i, ret = 0;
+       u64 h_ret;
+       struct hipz_query_hca *rblock;
+       struct hipz_query_port *port;
+       const char *loc_code;
+
+       static const u32 pgsize_map[] = {
+               HCA_CAP_MR_PGSIZE_4K,  0x1000,
+               HCA_CAP_MR_PGSIZE_64K, 0x10000,
+               HCA_CAP_MR_PGSIZE_1M,  0x100000,
+               HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+       };
+
+       ehca_gen_dbg("Probing adapter %s...",
+                    shca->ofdev->dev.of_node->full_name);
+       loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
+                                  NULL);
+       if (loc_code)
+               ehca_gen_dbg(" ... location lode=%s", loc_code);
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_gen_err("Cannot allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_gen_err("Cannot query device properties. h_ret=%lli",
+                            h_ret);
+               ret = -EPERM;
+               goto sense_attributes1;
+       }
+
+       if (ehca_nr_ports == 1)
+               shca->num_ports = 1;
+       else
+               shca->num_ports = (u8)rblock->num_ports;
+
+       ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
+
+       if (ehca_hw_level == 0) {
+               u32 hcaaver;
+               u32 revid;
+
+               hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
+               revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
+
+               ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
+
+               if (hcaaver == 1) {
+                       if (revid <= 3)
+                               shca->hw_level = 0x10 | (revid + 1);
+                       else
+                               shca->hw_level = 0x14;
+               } else if (hcaaver == 2) {
+                       if (revid == 0)
+                               shca->hw_level = 0x21;
+                       else if (revid == 0x10)
+                               shca->hw_level = 0x22;
+                       else if (revid == 0x20 || revid == 0x21)
+                               shca->hw_level = 0x23;
+               }
+
+               if (!shca->hw_level) {
+                       ehca_gen_warn("unknown hardware version"
+                                     " - assuming default level");
+                       shca->hw_level = 0x22;
+               }
+       } else
+               shca->hw_level = ehca_hw_level;
+       ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
+
+       shca->hca_cap = rblock->hca_cap_indicators;
+       ehca_gen_dbg(" ... HCA capabilities:");
+       for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
+               if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
+                       ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
+
+       /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
+        * a firmware property, so it's valid across all adapters
+        */
+       if (ehca_lock_hcalls == -1)
+               ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
+                                       shca->hca_cap);
+
+       /* translate supported MR page sizes; always support 4K */
+       shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+       for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+               if (rblock->memory_page_size_supported & pgsize_map[i])
+                       shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+
+       /* Set maximum number of CQs and QPs to calculate EQ size */
+       if (shca->max_num_qps == -1)
+               shca->max_num_qps = min_t(int, rblock->max_qp,
+                                         EHCA_MAX_NUM_QUEUES);
+       else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+               ehca_gen_warn("The requested number of QPs is out of range "
+                             "(1 - %i) specified by HW. Value is set to %i",
+                             rblock->max_qp, rblock->max_qp);
+               shca->max_num_qps = rblock->max_qp;
+       }
+
+       if (shca->max_num_cqs == -1)
+               shca->max_num_cqs = min_t(int, rblock->max_cq,
+                                         EHCA_MAX_NUM_QUEUES);
+       else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+               ehca_gen_warn("The requested number of CQs is out of range "
+                             "(1 - %i) specified by HW. Value is set to %i",
+                             rblock->max_cq, rblock->max_cq);
+       }
+
+       /* query max MTU from first port -- it's the same for all ports */
+       port = (struct hipz_query_port *)rblock;
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
+       if (h_ret != H_SUCCESS) {
+               ehca_gen_err("Cannot query port properties. h_ret=%lli",
+                            h_ret);
+               ret = -EPERM;
+               goto sense_attributes1;
+       }
+
+       shca->max_mtu = port->max_mtu;
+
+sense_attributes1:
+       ehca_free_fw_ctrlblock(rblock);
+       return ret;
+}
+
+static int init_node_guid(struct ehca_shca *shca)
+{
+       int ret = 0;
+       struct hipz_query_hca *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query device properties");
+               ret = -EINVAL;
+               goto init_node_guid1;
+       }
+
+       memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
+
+init_node_guid1:
+       ehca_free_fw_ctrlblock(rblock);
+       return ret;
+}
+
+static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
+                              struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err;
+
+       err = ehca_query_port(ibdev, port_num, &attr);
+       if (err)
+               return err;
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+       return 0;
+}
+
+static int ehca_init_device(struct ehca_shca *shca)
+{
+       int ret;
+
+       ret = init_node_guid(shca);
+       if (ret)
+               return ret;
+
+       strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
+       shca->ib_device.owner               = THIS_MODULE;
+
+       shca->ib_device.uverbs_abi_ver      = 8;
+       shca->ib_device.uverbs_cmd_mask     =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+
+       shca->ib_device.node_type           = RDMA_NODE_IB_CA;
+       shca->ib_device.phys_port_cnt       = shca->num_ports;
+       shca->ib_device.num_comp_vectors    = 1;
+       shca->ib_device.dma_device          = &shca->ofdev->dev;
+       shca->ib_device.query_device        = ehca_query_device;
+       shca->ib_device.query_port          = ehca_query_port;
+       shca->ib_device.query_gid           = ehca_query_gid;
+       shca->ib_device.query_pkey          = ehca_query_pkey;
+       /* shca->in_device.modify_device    = ehca_modify_device    */
+       shca->ib_device.modify_port         = ehca_modify_port;
+       shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
+       shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
+       shca->ib_device.alloc_pd            = ehca_alloc_pd;
+       shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
+       shca->ib_device.create_ah           = ehca_create_ah;
+       /* shca->ib_device.modify_ah        = ehca_modify_ah;       */
+       shca->ib_device.query_ah            = ehca_query_ah;
+       shca->ib_device.destroy_ah          = ehca_destroy_ah;
+       shca->ib_device.create_qp           = ehca_create_qp;
+       shca->ib_device.modify_qp           = ehca_modify_qp;
+       shca->ib_device.query_qp            = ehca_query_qp;
+       shca->ib_device.destroy_qp          = ehca_destroy_qp;
+       shca->ib_device.post_send           = ehca_post_send;
+       shca->ib_device.post_recv           = ehca_post_recv;
+       shca->ib_device.create_cq           = ehca_create_cq;
+       shca->ib_device.destroy_cq          = ehca_destroy_cq;
+       shca->ib_device.resize_cq           = ehca_resize_cq;
+       shca->ib_device.poll_cq             = ehca_poll_cq;
+       /* shca->ib_device.peek_cq          = ehca_peek_cq;         */
+       shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
+       /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
+       shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
+       shca->ib_device.reg_phys_mr         = ehca_reg_phys_mr;
+       shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
+       shca->ib_device.query_mr            = ehca_query_mr;
+       shca->ib_device.dereg_mr            = ehca_dereg_mr;
+       shca->ib_device.rereg_phys_mr       = ehca_rereg_phys_mr;
+       shca->ib_device.alloc_mw            = ehca_alloc_mw;
+       shca->ib_device.bind_mw             = ehca_bind_mw;
+       shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
+       shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
+       shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
+       shca->ib_device.unmap_fmr           = ehca_unmap_fmr;
+       shca->ib_device.dealloc_fmr         = ehca_dealloc_fmr;
+       shca->ib_device.attach_mcast        = ehca_attach_mcast;
+       shca->ib_device.detach_mcast        = ehca_detach_mcast;
+       shca->ib_device.process_mad         = ehca_process_mad;
+       shca->ib_device.mmap                = ehca_mmap;
+       shca->ib_device.dma_ops             = &ehca_dma_mapping_ops;
+       shca->ib_device.get_port_immutable  = ehca_port_immutable;
+
+       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+               shca->ib_device.uverbs_cmd_mask |=
+                       (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+               shca->ib_device.create_srq          = ehca_create_srq;
+               shca->ib_device.modify_srq          = ehca_modify_srq;
+               shca->ib_device.query_srq           = ehca_query_srq;
+               shca->ib_device.destroy_srq         = ehca_destroy_srq;
+               shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
+       }
+
+       return ret;
+}
+
+static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
+{
+       struct ehca_sport *sport = &shca->sport[port - 1];
+       struct ib_cq *ibcq;
+       struct ib_qp *ibqp;
+       struct ib_qp_init_attr qp_init_attr;
+       struct ib_cq_init_attr cq_attr = {};
+       int ret;
+
+       if (sport->ibcq_aqp1) {
+               ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
+               return -EPERM;
+       }
+
+       cq_attr.cqe = 10;
+       ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+                           &cq_attr);
+       if (IS_ERR(ibcq)) {
+               ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
+               return PTR_ERR(ibcq);
+       }
+       sport->ibcq_aqp1 = ibcq;
+
+       if (sport->ibqp_sqp[IB_QPT_GSI]) {
+               ehca_err(&shca->ib_device, "AQP1 QP is already created.");
+               ret = -EPERM;
+               goto create_aqp1;
+       }
+
+       memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+       qp_init_attr.send_cq          = ibcq;
+       qp_init_attr.recv_cq          = ibcq;
+       qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
+       qp_init_attr.cap.max_send_wr  = 100;
+       qp_init_attr.cap.max_recv_wr  = 100;
+       qp_init_attr.cap.max_send_sge = 2;
+       qp_init_attr.cap.max_recv_sge = 1;
+       qp_init_attr.qp_type          = IB_QPT_GSI;
+       qp_init_attr.port_num         = port;
+       qp_init_attr.qp_context       = NULL;
+       qp_init_attr.event_handler    = NULL;
+       qp_init_attr.srq              = NULL;
+
+       ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
+       if (IS_ERR(ibqp)) {
+               ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
+               ret = PTR_ERR(ibqp);
+               goto create_aqp1;
+       }
+       sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
+
+       return 0;
+
+create_aqp1:
+       ib_destroy_cq(sport->ibcq_aqp1);
+       return ret;
+}
+
+static int ehca_destroy_aqp1(struct ehca_sport *sport)
+{
+       int ret;
+
+       ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
+       if (ret) {
+               ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
+               return ret;
+       }
+
+       ret = ib_destroy_cq(sport->ibcq_aqp1);
+       if (ret)
+               ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
+
+       return ret;
+}
+
+static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
+}
+
+static ssize_t ehca_store_debug_level(struct device_driver *ddp,
+                                     const char *buf, size_t count)
+{
+       int value = (*buf) - '0';
+       if (value >= 0 && value <= 9)
+               ehca_debug_level = value;
+       return 1;
+}
+
+static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
+                  ehca_show_debug_level, ehca_store_debug_level);
+
+static struct attribute *ehca_drv_attrs[] = {
+       &driver_attr_debug_level.attr,
+       NULL
+};
+
+static struct attribute_group ehca_drv_attr_grp = {
+       .attrs = ehca_drv_attrs
+};
+
+static const struct attribute_group *ehca_drv_attr_groups[] = {
+       &ehca_drv_attr_grp,
+       NULL,
+};
+
+#define EHCA_RESOURCE_ATTR(name)                                           \
+static ssize_t  ehca_show_##name(struct device *dev,                       \
+                                struct device_attribute *attr,            \
+                                char *buf)                                \
+{                                                                         \
+       struct ehca_shca *shca;                                            \
+       struct hipz_query_hca *rblock;                                     \
+       int data;                                                          \
+                                                                          \
+       shca = dev_get_drvdata(dev);                                       \
+                                                                          \
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);                      \
+       if (!rblock) {                                                     \
+               dev_err(dev, "Can't allocate rblock memory.\n");           \
+               return 0;                                                  \
+       }                                                                  \
+                                                                          \
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
+               dev_err(dev, "Can't query device properties\n");           \
+               ehca_free_fw_ctrlblock(rblock);                            \
+               return 0;                                                  \
+       }                                                                  \
+                                                                          \
+       data = rblock->name;                                               \
+       ehca_free_fw_ctrlblock(rblock);                                    \
+                                                                          \
+       if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))     \
+               return snprintf(buf, 256, "1\n");                          \
+       else                                                               \
+               return snprintf(buf, 256, "%d\n", data);                   \
+                                                                          \
+}                                                                         \
+static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
+
+EHCA_RESOURCE_ATTR(num_ports);
+EHCA_RESOURCE_ATTR(hw_ver);
+EHCA_RESOURCE_ATTR(max_eq);
+EHCA_RESOURCE_ATTR(cur_eq);
+EHCA_RESOURCE_ATTR(max_cq);
+EHCA_RESOURCE_ATTR(cur_cq);
+EHCA_RESOURCE_ATTR(max_qp);
+EHCA_RESOURCE_ATTR(cur_qp);
+EHCA_RESOURCE_ATTR(max_mr);
+EHCA_RESOURCE_ATTR(cur_mr);
+EHCA_RESOURCE_ATTR(max_mw);
+EHCA_RESOURCE_ATTR(cur_mw);
+EHCA_RESOURCE_ATTR(max_pd);
+EHCA_RESOURCE_ATTR(max_ah);
+
+static ssize_t ehca_show_adapter_handle(struct device *dev,
+                                       struct device_attribute *attr,
+                                       char *buf)
+{
+       struct ehca_shca *shca = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
+
+}
+static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+
+static struct attribute *ehca_dev_attrs[] = {
+       &dev_attr_adapter_handle.attr,
+       &dev_attr_num_ports.attr,
+       &dev_attr_hw_ver.attr,
+       &dev_attr_max_eq.attr,
+       &dev_attr_cur_eq.attr,
+       &dev_attr_max_cq.attr,
+       &dev_attr_cur_cq.attr,
+       &dev_attr_max_qp.attr,
+       &dev_attr_cur_qp.attr,
+       &dev_attr_max_mr.attr,
+       &dev_attr_cur_mr.attr,
+       &dev_attr_max_mw.attr,
+       &dev_attr_cur_mw.attr,
+       &dev_attr_max_pd.attr,
+       &dev_attr_max_ah.attr,
+       NULL
+};
+
+static struct attribute_group ehca_dev_attr_grp = {
+       .attrs = ehca_dev_attrs
+};
+
+static int ehca_probe(struct platform_device *dev)
+{
+       struct ehca_shca *shca;
+       const u64 *handle;
+       struct ib_pd *ibpd;
+       int ret, i, eq_size;
+       unsigned long flags;
+
+       handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
+       if (!handle) {
+               ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
+                            dev->dev.of_node->full_name);
+               return -ENODEV;
+       }
+
+       if (!(*handle)) {
+               ehca_gen_err("Wrong eHCA handle for adapter: %s.",
+                            dev->dev.of_node->full_name);
+               return -ENODEV;
+       }
+
+       shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
+       if (!shca) {
+               ehca_gen_err("Cannot allocate shca memory.");
+               return -ENOMEM;
+       }
+
+       mutex_init(&shca->modify_mutex);
+       atomic_set(&shca->num_cqs, 0);
+       atomic_set(&shca->num_qps, 0);
+       shca->max_num_qps = ehca_max_qp;
+       shca->max_num_cqs = ehca_max_cq;
+
+       for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+               spin_lock_init(&shca->sport[i].mod_sqp_lock);
+
+       shca->ofdev = dev;
+       shca->ipz_hca_handle.handle = *handle;
+       dev_set_drvdata(&dev->dev, shca);
+
+       ret = ehca_sense_attributes(shca);
+       if (ret < 0) {
+               ehca_gen_err("Cannot sense eHCA attributes.");
+               goto probe1;
+       }
+
+       ret = ehca_init_device(shca);
+       if (ret) {
+               ehca_gen_err("Cannot init ehca  device struct");
+               goto probe1;
+       }
+
+       eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
+       /* create event queues */
+       ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create EQ.");
+               goto probe1;
+       }
+
+       ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create NEQ.");
+               goto probe3;
+       }
+
+       /* create internal protection domain */
+       ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
+       if (IS_ERR(ibpd)) {
+               ehca_err(&shca->ib_device, "Cannot create internal PD.");
+               ret = PTR_ERR(ibpd);
+               goto probe4;
+       }
+
+       shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
+       shca->pd->ib_pd.device = &shca->ib_device;
+
+       /* create internal max MR */
+       ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
+
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
+                        ret);
+               goto probe5;
+       }
+
+       ret = ib_register_device(&shca->ib_device, NULL);
+       if (ret) {
+               ehca_err(&shca->ib_device,
+                        "ib_register_device() failed ret=%i", ret);
+               goto probe6;
+       }
+
+       /* create AQP1 for port 1 */
+       if (ehca_open_aqp1 == 1) {
+               shca->sport[0].port_state = IB_PORT_DOWN;
+               ret = ehca_create_aqp1(shca, 1);
+               if (ret) {
+                       ehca_err(&shca->ib_device,
+                                "Cannot create AQP1 for port 1.");
+                       goto probe7;
+               }
+       }
+
+       /* create AQP1 for port 2 */
+       if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
+               shca->sport[1].port_state = IB_PORT_DOWN;
+               ret = ehca_create_aqp1(shca, 2);
+               if (ret) {
+                       ehca_err(&shca->ib_device,
+                                "Cannot create AQP1 for port 2.");
+                       goto probe8;
+               }
+       }
+
+       ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+       if (ret) /* only complain; we can live without attributes */
+               ehca_err(&shca->ib_device,
+                        "Cannot create device attributes  ret=%d", ret);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       list_add(&shca->shca_list, &shca_list);
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       return 0;
+
+probe8:
+       ret = ehca_destroy_aqp1(&shca->sport[0]);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy AQP1 for port 1. ret=%i", ret);
+
+probe7:
+       ib_unregister_device(&shca->ib_device);
+
+probe6:
+       ret = ehca_dereg_internal_maxmr(shca);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal MR. ret=%x", ret);
+
+probe5:
+       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal PD. ret=%x", ret);
+
+probe4:
+       ret = ehca_destroy_eq(shca, &shca->neq);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy NEQ. ret=%x", ret);
+
+probe3:
+       ret = ehca_destroy_eq(shca, &shca->eq);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy EQ. ret=%x", ret);
+
+probe1:
+       ib_dealloc_device(&shca->ib_device);
+
+       return -EINVAL;
+}
+
+static int ehca_remove(struct platform_device *dev)
+{
+       struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
+       unsigned long flags;
+       int ret;
+
+       sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+
+       if (ehca_open_aqp1 == 1) {
+               int i;
+               for (i = 0; i < shca->num_ports; i++) {
+                       ret = ehca_destroy_aqp1(&shca->sport[i]);
+                       if (ret)
+                               ehca_err(&shca->ib_device,
+                                        "Cannot destroy AQP1 for port %x "
+                                        "ret=%i", ret, i);
+               }
+       }
+
+       ib_unregister_device(&shca->ib_device);
+
+       ret = ehca_dereg_internal_maxmr(shca);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal MR. ret=%i", ret);
+
+       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal PD. ret=%i", ret);
+
+       ret = ehca_destroy_eq(shca, &shca->eq);
+       if (ret)
+               ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
+
+       ret = ehca_destroy_eq(shca, &shca->neq);
+       if (ret)
+               ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
+
+       ib_dealloc_device(&shca->ib_device);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       list_del(&shca->shca_list);
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       return ret;
+}
+
+static struct of_device_id ehca_device_table[] =
+{
+       {
+               .name       = "lhca",
+               .compatible = "IBM,lhca",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
+
+static struct platform_driver ehca_driver = {
+       .probe       = ehca_probe,
+       .remove      = ehca_remove,
+       .driver = {
+               .name = "ehca",
+               .owner = THIS_MODULE,
+               .groups = ehca_drv_attr_groups,
+               .of_match_table = ehca_device_table,
+       },
+};
+
+void ehca_poll_eqs(unsigned long data)
+{
+       struct ehca_shca *shca;
+
+       spin_lock(&shca_list_lock);
+       list_for_each_entry(shca, &shca_list, shca_list) {
+               if (shca->eq.is_initialized) {
+                       /* call deadman proc only if eq ptr does not change */
+                       struct ehca_eq *eq = &shca->eq;
+                       int max = 3;
+                       volatile u64 q_ofs, q_ofs2;
+                       unsigned long flags;
+                       spin_lock_irqsave(&eq->spinlock, flags);
+                       q_ofs = eq->ipz_queue.current_q_offset;
+                       spin_unlock_irqrestore(&eq->spinlock, flags);
+                       do {
+                               spin_lock_irqsave(&eq->spinlock, flags);
+                               q_ofs2 = eq->ipz_queue.current_q_offset;
+                               spin_unlock_irqrestore(&eq->spinlock, flags);
+                               max--;
+                       } while (q_ofs == q_ofs2 && max > 0);
+                       if (q_ofs == q_ofs2)
+                               ehca_process_eq(shca, 0);
+               }
+       }
+       mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
+       spin_unlock(&shca_list_lock);
+}
+
+static int ehca_mem_notifier(struct notifier_block *nb,
+                            unsigned long action, void *data)
+{
+       static unsigned long ehca_dmem_warn_time;
+       unsigned long flags;
+
+       switch (action) {
+       case MEM_CANCEL_OFFLINE:
+       case MEM_CANCEL_ONLINE:
+       case MEM_ONLINE:
+       case MEM_OFFLINE:
+               return NOTIFY_OK;
+       case MEM_GOING_ONLINE:
+       case MEM_GOING_OFFLINE:
+               /* only ok if no hca is attached to the lpar */
+               spin_lock_irqsave(&shca_list_lock, flags);
+               if (list_empty(&shca_list)) {
+                       spin_unlock_irqrestore(&shca_list_lock, flags);
+                       return NOTIFY_OK;
+               } else {
+                       spin_unlock_irqrestore(&shca_list_lock, flags);
+                       if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+                                                  30 * 1000))
+                               ehca_gen_err("DMEM operations are not allowed"
+                                            "in conjunction with eHCA");
+                       return NOTIFY_BAD;
+               }
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+       .notifier_call = ehca_mem_notifier,
+};
+
+static int __init ehca_module_init(void)
+{
+       int ret;
+
+       printk(KERN_INFO "eHCA Infiniband Device Driver "
+              "(Version " HCAD_VERSION ")\n");
+
+       ret = ehca_create_comp_pool();
+       if (ret) {
+               ehca_gen_err("Cannot create comp pool.");
+               return ret;
+       }
+
+       ret = ehca_create_slab_caches();
+       if (ret) {
+               ehca_gen_err("Cannot create SLAB caches");
+               ret = -ENOMEM;
+               goto module_init1;
+       }
+
+       ret = ehca_create_busmap();
+       if (ret) {
+               ehca_gen_err("Cannot create busmap.");
+               goto module_init2;
+       }
+
+       ret = ibmebus_register_driver(&ehca_driver);
+       if (ret) {
+               ehca_gen_err("Cannot register eHCA device driver");
+               ret = -EINVAL;
+               goto module_init3;
+       }
+
+       ret = register_memory_notifier(&ehca_mem_nb);
+       if (ret) {
+               ehca_gen_err("Failed registering memory add/remove notifier");
+               goto module_init4;
+       }
+
+       if (ehca_poll_all_eqs != 1) {
+               ehca_gen_err("WARNING!!!");
+               ehca_gen_err("It is possible to lose interrupts.");
+       } else {
+               init_timer(&poll_eqs_timer);
+               poll_eqs_timer.function = ehca_poll_eqs;
+               poll_eqs_timer.expires = jiffies + HZ;
+               add_timer(&poll_eqs_timer);
+       }
+
+       return 0;
+
+module_init4:
+       ibmebus_unregister_driver(&ehca_driver);
+
+module_init3:
+       ehca_destroy_busmap();
+
+module_init2:
+       ehca_destroy_slab_caches();
+
+module_init1:
+       ehca_destroy_comp_pool();
+       return ret;
+};
+
+static void __exit ehca_module_exit(void)
+{
+       if (ehca_poll_all_eqs == 1)
+               del_timer_sync(&poll_eqs_timer);
+
+       ibmebus_unregister_driver(&ehca_driver);
+
+       unregister_memory_notifier(&ehca_mem_nb);
+
+       ehca_destroy_busmap();
+
+       ehca_destroy_slab_caches();
+
+       ehca_destroy_comp_pool();
+
+       idr_destroy(&ehca_cq_idr);
+       idr_destroy(&ehca_qp_idr);
+};
+
+module_init(ehca_module_init);
+module_exit(ehca_module_exit);
diff --git a/drivers/staging/rdma/ehca/ehca_mcast.c b/drivers/staging/rdma/ehca/ehca_mcast.c
new file mode 100644 (file)
index 0000000..cec1815
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  mcast  functions
+ *
+ *  Authors: Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define MAX_MC_LID 0xFFFE
+#define MIN_MC_LID 0xC000      /* Multicast limits */
+#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
+#define EHCA_VALID_MULTICAST_LID(lid) \
+       (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
+
+int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+                                             ib_device);
+       union ib_gid my_gid;
+       u64 subnet_prefix, interface_id, h_ret;
+
+       if (ibqp->qp_type != IB_QPT_UD) {
+               ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
+               return -EINVAL;
+       }
+
+       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+               ehca_err(ibqp->device, "invalid mulitcast gid");
+               return -EINVAL;
+       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+               return -EINVAL;
+       }
+
+       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+       interface_id = be64_to_cpu(my_gid.global.interface_id);
+       h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
+                                  my_qp->ipz_qp_handle,
+                                  my_qp->galpas.kernel,
+                                  lid, subnet_prefix, interface_id);
+       if (h_ret != H_SUCCESS)
+               ehca_err(ibqp->device,
+                        "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
+                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+       return ehca2ib_return_code(h_ret);
+}
+
+int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(ibqp->pd->device,
+                                             struct ehca_shca, ib_device);
+       union ib_gid my_gid;
+       u64 subnet_prefix, interface_id, h_ret;
+
+       if (ibqp->qp_type != IB_QPT_UD) {
+               ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
+               return -EINVAL;
+       }
+
+       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+               ehca_err(ibqp->device, "invalid mulitcast gid");
+               return -EINVAL;
+       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+               return -EINVAL;
+       }
+
+       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+       interface_id = be64_to_cpu(my_gid.global.interface_id);
+       h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
+                                  my_qp->ipz_qp_handle,
+                                  my_qp->galpas.kernel,
+                                  lid, subnet_prefix, interface_id);
+       if (h_ret != H_SUCCESS)
+               ehca_err(ibqp->device,
+                        "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
+                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+       return ehca2ib_return_code(h_ret);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c
new file mode 100644 (file)
index 0000000..f914b30
--- /dev/null
@@ -0,0 +1,2593 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "hcp_if.h"
+#include "hipz_hw.h"
+
+#define NUM_CHUNKS(length, chunk_size) \
+       (((length) + (chunk_size - 1)) / (chunk_size))
+
+/* max number of rpages (per hcall register_rpages) */
+#define MAX_RPAGES 512
+
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
+#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT     34
+#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+       u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+       struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+       struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
+static struct kmem_cache *mr_cache;
+static struct kmem_cache *mw_cache;
+
+enum ehca_mr_pgsize {
+       EHCA_MR_PGSIZE4K  = 0x1000L,
+       EHCA_MR_PGSIZE64K = 0x10000L,
+       EHCA_MR_PGSIZE1M  = 0x100000L,
+       EHCA_MR_PGSIZE16M = 0x1000000L
+};
+
+#define EHCA_MR_PGSHIFT4K  12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M  20
+#define EHCA_MR_PGSHIFT16M 24
+
+static u64 ehca_map_vaddr(void *caddr);
+
+static u32 ehca_encode_hwpage_size(u32 pgsize)
+{
+       int log = ilog2(pgsize);
+       WARN_ON(log < 12 || log > 24 || log & 3);
+       return (log - 12) / 4;
+}
+
+static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
+{
+       return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
+}
+
+static struct ehca_mr *ehca_mr_new(void)
+{
+       struct ehca_mr *me;
+
+       me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
+       if (me)
+               spin_lock_init(&me->mrlock);
+       else
+               ehca_gen_err("alloc failed");
+
+       return me;
+}
+
+static void ehca_mr_delete(struct ehca_mr *me)
+{
+       kmem_cache_free(mr_cache, me);
+}
+
+static struct ehca_mw *ehca_mw_new(void)
+{
+       struct ehca_mw *me;
+
+       me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
+       if (me)
+               spin_lock_init(&me->mwlock);
+       else
+               ehca_gen_err("alloc failed");
+
+       return me;
+}
+
+static void ehca_mw_delete(struct ehca_mw *me)
+{
+       kmem_cache_free(mw_cache, me);
+}
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+       struct ib_mr *ib_mr;
+       int ret;
+       struct ehca_mr *e_maxmr;
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+
+       if (shca->maxmr) {
+               e_maxmr = ehca_mr_new();
+               if (!e_maxmr) {
+                       ehca_err(&shca->ib_device, "out of memory");
+                       ib_mr = ERR_PTR(-ENOMEM);
+                       goto get_dma_mr_exit0;
+               }
+
+               ret = ehca_reg_maxmr(shca, e_maxmr,
+                                    (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
+                                    mr_access_flags, e_pd,
+                                    &e_maxmr->ib.ib_mr.lkey,
+                                    &e_maxmr->ib.ib_mr.rkey);
+               if (ret) {
+                       ehca_mr_delete(e_maxmr);
+                       ib_mr = ERR_PTR(ret);
+                       goto get_dma_mr_exit0;
+               }
+               ib_mr = &e_maxmr->ib.ib_mr;
+       } else {
+               ehca_err(&shca->ib_device, "no internal max-MR exist!");
+               ib_mr = ERR_PTR(-EINVAL);
+               goto get_dma_mr_exit0;
+       }
+
+get_dma_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
+                        PTR_ERR(ib_mr), pd, mr_access_flags);
+       return ib_mr;
+} /* end ehca_get_dma_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *phys_buf_array,
+                              int num_phys_buf,
+                              int mr_access_flags,
+                              u64 *iova_start)
+{
+       struct ib_mr *ib_mr;
+       int ret;
+       struct ehca_mr *e_mr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+       u64 size;
+
+       if ((num_phys_buf <= 0) || !phys_buf_array) {
+               ehca_err(pd->device, "bad input values: num_phys_buf=%x "
+                        "phys_buf_array=%p", num_phys_buf, phys_buf_array);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+
+       /* check physical buffer list and calculate size */
+       ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
+                                           iova_start, &size);
+       if (ret) {
+               ib_mr = ERR_PTR(ret);
+               goto reg_phys_mr_exit0;
+       }
+       if ((size == 0) ||
+           (((u64)iova_start + size) < (u64)iova_start)) {
+               ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
+                        size, iova_start);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(pd->device, "out of memory");
+               ib_mr = ERR_PTR(-ENOMEM);
+               goto reg_phys_mr_exit0;
+       }
+
+       /* register MR on HCA */
+       if (ehca_mr_is_maxmr(size, iova_start)) {
+               e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+               ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
+                                    e_pd, &e_mr->ib.ib_mr.lkey,
+                                    &e_mr->ib.ib_mr.rkey);
+               if (ret) {
+                       ib_mr = ERR_PTR(ret);
+                       goto reg_phys_mr_exit1;
+               }
+       } else {
+               struct ehca_mr_pginfo pginfo;
+               u32 num_kpages;
+               u32 num_hwpages;
+               u64 hw_pgsize;
+
+               num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
+                                       PAGE_SIZE);
+               /* for kernel space we try most possible pgsize */
+               hw_pgsize = ehca_get_max_hwpage_size(shca);
+               num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
+                                        hw_pgsize);
+               memset(&pginfo, 0, sizeof(pginfo));
+               pginfo.type = EHCA_MR_PGI_PHYS;
+               pginfo.num_kpages = num_kpages;
+               pginfo.hwpage_size = hw_pgsize;
+               pginfo.num_hwpages = num_hwpages;
+               pginfo.u.phy.num_phys_buf = num_phys_buf;
+               pginfo.u.phy.phys_buf_array = phys_buf_array;
+               pginfo.next_hwpage =
+                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+
+               ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+                                 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+                                 &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+               if (ret) {
+                       ib_mr = ERR_PTR(ret);
+                       goto reg_phys_mr_exit1;
+               }
+       }
+
+       /* successful registration of all pages */
+       return &e_mr->ib.ib_mr;
+
+reg_phys_mr_exit1:
+       ehca_mr_delete(e_mr);
+reg_phys_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
+                        "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
+                        PTR_ERR(ib_mr), pd, phys_buf_array,
+                        num_phys_buf, mr_access_flags, iova_start);
+       return ib_mr;
+} /* end ehca_reg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt, int mr_access_flags,
+                              struct ib_udata *udata)
+{
+       struct ib_mr *ib_mr;
+       struct ehca_mr *e_mr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_mr_pginfo pginfo;
+       int ret, page_shift;
+       u32 num_kpages;
+       u32 num_hwpages;
+       u64 hwpage_size;
+
+       if (!pd) {
+               ehca_gen_err("bad pd=%p", pd);
+               return ERR_PTR(-EFAULT);
+       }
+
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit0;
+       }
+
+       if (length == 0 || virt + length < virt) {
+               ehca_err(pd->device, "bad input values: length=%llx "
+                        "virt_base=%llx", length, virt);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(pd->device, "out of memory");
+               ib_mr = ERR_PTR(-ENOMEM);
+               goto reg_user_mr_exit0;
+       }
+
+       e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+                                mr_access_flags, 0);
+       if (IS_ERR(e_mr->umem)) {
+               ib_mr = (void *)e_mr->umem;
+               goto reg_user_mr_exit1;
+       }
+
+       if (e_mr->umem->page_size != PAGE_SIZE) {
+               ehca_err(pd->device, "page size not supported, "
+                        "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit2;
+       }
+
+       /* determine number of MR pages */
+       num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
+       /* select proper hw_pgsize */
+       page_shift = PAGE_SHIFT;
+       if (e_mr->umem->hugetlb) {
+               /* determine page_shift, clamp between 4K and 16M */
+               page_shift = (fls64(length - 1) + 3) & ~3;
+               page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+                                EHCA_MR_PGSHIFT16M);
+       }
+       hwpage_size = 1UL << page_shift;
+
+       /* now that we have the desired page size, shift until it's
+        * supported, too. 4K is always supported, so this terminates.
+        */
+       while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+               hwpage_size >>= 4;
+
+reg_user_mr_fallback:
+       num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
+       /* register MR on HCA */
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_USER;
+       pginfo.hwpage_size = hwpage_size;
+       pginfo.num_kpages = num_kpages;
+       pginfo.num_hwpages = num_hwpages;
+       pginfo.u.usr.region = e_mr->umem;
+       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
+       pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
+       ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+                         e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+                         &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+       if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+               ehca_warn(pd->device, "failed to register mr "
+                         "with hwpage_size=%llx", hwpage_size);
+               ehca_info(pd->device, "try to register mr with "
+                         "kpage_size=%lx", PAGE_SIZE);
+               /*
+                * this means kpages are not contiguous for a hw page
+                * try kernel page size as fallback solution
+                */
+               hwpage_size = PAGE_SIZE;
+               goto reg_user_mr_fallback;
+       }
+       if (ret) {
+               ib_mr = ERR_PTR(ret);
+               goto reg_user_mr_exit2;
+       }
+
+       /* successful registration of all pages */
+       return &e_mr->ib.ib_mr;
+
+reg_user_mr_exit2:
+       ib_umem_release(e_mr->umem);
+reg_user_mr_exit1:
+       ehca_mr_delete(e_mr);
+reg_user_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
+                        PTR_ERR(ib_mr), pd, mr_access_flags, udata);
+       return ib_mr;
+} /* end ehca_reg_user_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+                      int mr_rereg_mask,
+                      struct ib_pd *pd,
+                      struct ib_phys_buf *phys_buf_array,
+                      int num_phys_buf,
+                      int mr_access_flags,
+                      u64 *iova_start)
+{
+       int ret;
+
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+       u64 new_size;
+       u64 *new_start;
+       u32 new_acl;
+       struct ehca_pd *new_pd;
+       u32 tmp_lkey, tmp_rkey;
+       unsigned long sl_flags;
+       u32 num_kpages = 0;
+       u32 num_hwpages = 0;
+       struct ehca_mr_pginfo pginfo;
+
+       if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
+               /* TODO not supported, because PHYP rereg hCall needs pages */
+               ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
+                        "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       if (mr_rereg_mask & IB_MR_REREG_PD) {
+               if (!pd) {
+                       ehca_err(mr->device, "rereg with bad pd, pd=%p "
+                                "mr_rereg_mask=%x", pd, mr_rereg_mask);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+       }
+
+       if ((mr_rereg_mask &
+            ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
+           (mr_rereg_mask == 0)) {
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       /* check other parameters */
+       if (e_mr == shca->maxmr) {
+               /* should be impossible, however reject to be sure */
+               ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
+                        "shca->maxmr=%p mr->lkey=%x",
+                        mr, shca->maxmr, mr->lkey);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
+               if (e_mr->flags & EHCA_MR_FLAG_FMR) {
+                       ehca_err(mr->device, "not supported for FMR, mr=%p "
+                                "flags=%x", mr, e_mr->flags);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+               if (!phys_buf_array || num_phys_buf <= 0) {
+                       ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
+                                " phys_buf_array=%p num_phys_buf=%x",
+                                mr_rereg_mask, phys_buf_array, num_phys_buf);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+       }
+       if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&     /* change ACL */
+           (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+            ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
+                        "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       /* set requested values dependent on rereg request */
+       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+       new_start = e_mr->start;
+       new_size = e_mr->size;
+       new_acl = e_mr->acl;
+       new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
+
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+               u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
+
+               new_start = iova_start; /* change address */
+               /* check physical buffer list and calculate size */
+               ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
+                                                   num_phys_buf, iova_start,
+                                                   &new_size);
+               if (ret)
+                       goto rereg_phys_mr_exit1;
+               if ((new_size == 0) ||
+                   (((u64)iova_start + new_size) < (u64)iova_start)) {
+                       ehca_err(mr->device, "bad input values: new_size=%llx "
+                                "iova_start=%p", new_size, iova_start);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit1;
+               }
+               num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
+                                       new_size, PAGE_SIZE);
+               num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
+                                        new_size, hw_pgsize);
+               memset(&pginfo, 0, sizeof(pginfo));
+               pginfo.type = EHCA_MR_PGI_PHYS;
+               pginfo.num_kpages = num_kpages;
+               pginfo.hwpage_size = hw_pgsize;
+               pginfo.num_hwpages = num_hwpages;
+               pginfo.u.phy.num_phys_buf = num_phys_buf;
+               pginfo.u.phy.phys_buf_array = phys_buf_array;
+               pginfo.next_hwpage =
+                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+       }
+       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+               new_acl = mr_access_flags;
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               new_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+       ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
+                           new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+       if (ret)
+               goto rereg_phys_mr_exit1;
+
+       /* successful reregistration */
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               mr->pd = pd;
+       mr->lkey = tmp_lkey;
+       mr->rkey = tmp_rkey;
+
+rereg_phys_mr_exit1:
+       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+rereg_phys_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
+                        "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
+                        "iova_start=%p",
+                        ret, mr, mr_rereg_mask, pd, phys_buf_array,
+                        num_phys_buf, mr_access_flags, iova_start);
+       return ret;
+} /* end ehca_rereg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+       unsigned long sl_flags;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+               ret = -EINVAL;
+               goto query_mr_exit0;
+       }
+
+       memset(mr_attr, 0, sizeof(struct ib_mr_attr));
+       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+
+       h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
+                        "hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, mr, shca->ipz_hca_handle.handle,
+                        e_mr->ipz_mr_handle.handle, mr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto query_mr_exit1;
+       }
+       mr_attr->pd = mr->pd;
+       mr_attr->device_virt_addr = hipzout.vaddr;
+       mr_attr->size = hipzout.len;
+       mr_attr->lkey = hipzout.lkey;
+       mr_attr->rkey = hipzout.rkey;
+       ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
+
+query_mr_exit1:
+       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+query_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
+                        ret, mr, mr_attr);
+       return ret;
+} /* end ehca_query_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_mr(struct ib_mr *mr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+
+       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+               ret = -EINVAL;
+               goto dereg_mr_exit0;
+       } else if (e_mr == shca->maxmr) {
+               /* should be impossible, however reject to be sure */
+               ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
+                        "shca->maxmr=%p mr->lkey=%x",
+                        mr, shca->maxmr, mr->lkey);
+               ret = -EINVAL;
+               goto dereg_mr_exit0;
+       }
+
+       /* TODO: BUSY: MR still has bound window(s) */
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
+                        "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
+                        h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
+                        e_mr->ipz_mr_handle.handle, mr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto dereg_mr_exit0;
+       }
+
+       if (e_mr->umem)
+               ib_umem_release(e_mr->umem);
+
+       /* successful deregistration */
+       ehca_mr_delete(e_mr);
+
+dereg_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
+       return ret;
+} /* end ehca_dereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+       struct ib_mw *ib_mw;
+       u64 h_ret;
+       struct ehca_mw *e_mw;
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_mw_hipzout_parms hipzout;
+
+       if (type != IB_MW_TYPE_1)
+               return ERR_PTR(-EINVAL);
+
+       e_mw = ehca_mw_new();
+       if (!e_mw) {
+               ib_mw = ERR_PTR(-ENOMEM);
+               goto alloc_mw_exit0;
+       }
+
+       h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
+                                        e_pd->fw_pd, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
+                        "shca=%p hca_hndl=%llx mw=%p",
+                        h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
+               ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
+               goto alloc_mw_exit1;
+       }
+       /* successful MW allocation */
+       e_mw->ipz_mw_handle = hipzout.handle;
+       e_mw->ib_mw.rkey    = hipzout.rkey;
+       return &e_mw->ib_mw;
+
+alloc_mw_exit1:
+       ehca_mw_delete(e_mw);
+alloc_mw_exit0:
+       if (IS_ERR(ib_mw))
+               ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
+       return ib_mw;
+} /* end ehca_alloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_bind_mw(struct ib_qp *qp,
+                struct ib_mw *mw,
+                struct ib_mw_bind *mw_bind)
+{
+       /* TODO: not supported up to now */
+       ehca_gen_err("bind MW currently not supported by HCAD");
+
+       return -EPERM;
+} /* end ehca_bind_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_mw(struct ib_mw *mw)
+{
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mw->device, struct ehca_shca, ib_device);
+       struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
+
+       h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
+                        "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
+                        h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
+                        e_mw->ipz_mw_handle.handle);
+               return ehca2ib_return_code(h_ret);
+       }
+       /* successful deallocation */
+       ehca_mw_delete(e_mw);
+       return 0;
+} /* end ehca_dealloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+                             int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr)
+{
+       struct ib_fmr *ib_fmr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_mr *e_fmr;
+       int ret;
+       u32 tmp_lkey, tmp_rkey;
+       struct ehca_mr_pginfo pginfo;
+       u64 hw_pgsize;
+
+       /* check other parameters */
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+       if (mr_access_flags & IB_ACCESS_MW_BIND) {
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+       if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
+               ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
+                        "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
+                        fmr_attr->max_pages, fmr_attr->max_maps,
+                        fmr_attr->page_shift);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+
+       hw_pgsize = 1 << fmr_attr->page_shift;
+       if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
+               ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
+                        fmr_attr->page_shift);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+
+       e_fmr = ehca_mr_new();
+       if (!e_fmr) {
+               ib_fmr = ERR_PTR(-ENOMEM);
+               goto alloc_fmr_exit0;
+       }
+       e_fmr->flags |= EHCA_MR_FLAG_FMR;
+
+       /* register MR on HCA */
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.hwpage_size = hw_pgsize;
+       /*
+        * pginfo.num_hwpages==0, ie register_rpages() will not be called
+        * but deferred to map_phys_fmr()
+        */
+       ret = ehca_reg_mr(shca, e_fmr, NULL,
+                         fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+                         mr_access_flags, e_pd, &pginfo,
+                         &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+       if (ret) {
+               ib_fmr = ERR_PTR(ret);
+               goto alloc_fmr_exit1;
+       }
+
+       /* successful */
+       e_fmr->hwpage_size = hw_pgsize;
+       e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
+       e_fmr->fmr_max_pages = fmr_attr->max_pages;
+       e_fmr->fmr_max_maps = fmr_attr->max_maps;
+       e_fmr->fmr_map_cnt = 0;
+       return &e_fmr->ib.ib_fmr;
+
+alloc_fmr_exit1:
+       ehca_mr_delete(e_fmr);
+alloc_fmr_exit0:
+       return ib_fmr;
+} /* end ehca_alloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+                     u64 *page_list,
+                     int list_len,
+                     u64 iova)
+{
+       int ret;
+       struct ehca_shca *shca =
+               container_of(fmr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+       struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
+       struct ehca_mr_pginfo pginfo;
+       u32 tmp_lkey, tmp_rkey;
+
+       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+                        e_fmr, e_fmr->flags);
+               ret = -EINVAL;
+               goto map_phys_fmr_exit0;
+       }
+       ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
+       if (ret)
+               goto map_phys_fmr_exit0;
+       if (iova % e_fmr->fmr_page_size) {
+               /* only whole-numbered pages */
+               ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
+                        iova, e_fmr->fmr_page_size);
+               ret = -EINVAL;
+               goto map_phys_fmr_exit0;
+       }
+       if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
+               /* HCAD does not limit the maps, however trace this anyway */
+               ehca_info(fmr->device, "map limit exceeded, fmr=%p "
+                         "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
+                         fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
+       }
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_FMR;
+       pginfo.num_kpages = list_len;
+       pginfo.hwpage_size = e_fmr->hwpage_size;
+       pginfo.num_hwpages =
+               list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
+       pginfo.u.fmr.page_list = page_list;
+       pginfo.next_hwpage =
+               (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
+       pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
+
+       ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
+                           list_len * e_fmr->fmr_page_size,
+                           e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+       if (ret)
+               goto map_phys_fmr_exit0;
+
+       /* successful reregistration */
+       e_fmr->fmr_map_cnt++;
+       e_fmr->ib.ib_fmr.lkey = tmp_lkey;
+       e_fmr->ib.ib_fmr.rkey = tmp_rkey;
+       return 0;
+
+map_phys_fmr_exit0:
+       if (ret)
+               ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
+                        "iova=%llx", ret, fmr, page_list, list_len, iova);
+       return ret;
+} /* end ehca_map_phys_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_fmr(struct list_head *fmr_list)
+{
+       int ret = 0;
+       struct ib_fmr *ib_fmr;
+       struct ehca_shca *shca = NULL;
+       struct ehca_shca *prev_shca;
+       struct ehca_mr *e_fmr;
+       u32 num_fmr = 0;
+       u32 unmap_fmr_cnt = 0;
+
+       /* check all FMR belong to same SHCA, and check internal flag */
+       list_for_each_entry(ib_fmr, fmr_list, list) {
+               prev_shca = shca;
+               shca = container_of(ib_fmr->device, struct ehca_shca,
+                                   ib_device);
+               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+               if ((shca != prev_shca) && prev_shca) {
+                       ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
+                                "prev_shca=%p e_fmr=%p",
+                                shca, prev_shca, e_fmr);
+                       ret = -EINVAL;
+                       goto unmap_fmr_exit0;
+               }
+               if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+                       ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
+                                "e_fmr->flags=%x", e_fmr, e_fmr->flags);
+                       ret = -EINVAL;
+                       goto unmap_fmr_exit0;
+               }
+               num_fmr++;
+       }
+
+       /* loop over all FMRs to unmap */
+       list_for_each_entry(ib_fmr, fmr_list, list) {
+               unmap_fmr_cnt++;
+               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+               shca = container_of(ib_fmr->device, struct ehca_shca,
+                                   ib_device);
+               ret = ehca_unmap_one_fmr(shca, e_fmr);
+               if (ret) {
+                       /* unmap failed, stop unmapping of rest of FMRs */
+                       ehca_err(&shca->ib_device, "unmap of one FMR failed, "
+                                "stop rest, e_fmr=%p num_fmr=%x "
+                                "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
+                                unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
+                       goto unmap_fmr_exit0;
+               }
+       }
+
+unmap_fmr_exit0:
+       if (ret)
+               ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+                            ret, fmr_list, num_fmr, unmap_fmr_cnt);
+       return ret;
+} /* end ehca_unmap_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr)
+{
+       int ret;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(fmr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+
+       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+                        e_fmr, e_fmr->flags);
+               ret = -EINVAL;
+               goto free_fmr_exit0;
+       }
+
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
+                        "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle, fmr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto free_fmr_exit0;
+       }
+       /* successful deregistration */
+       ehca_mr_delete(e_fmr);
+       return 0;
+
+free_fmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
+       return ret;
+} /* end ehca_dealloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+                                  struct ehca_mr *e_mr,
+                                  struct ehca_mr_pginfo *pginfo);
+
+int ehca_reg_mr(struct ehca_shca *shca,
+               struct ehca_mr *e_mr,
+               u64 *iova_start,
+               u64 size,
+               int acl,
+               struct ehca_pd *e_pd,
+               struct ehca_mr_pginfo *pginfo,
+               u32 *lkey, /*OUT*/
+               u32 *rkey, /*OUT*/
+               enum ehca_reg_type reg_type)
+{
+       int ret;
+       u64 h_ret;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+       if (ehca_use_hp_mr == 1)
+               hipz_acl |= 0x00000001;
+
+       h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
+                                        (u64)iova_start, size, hipz_acl,
+                                        e_pd->fw_pd, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
+                        "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_reg_mr_exit0;
+       }
+
+       e_mr->ipz_mr_handle = hipzout.handle;
+
+       if (reg_type == EHCA_REG_BUSMAP_MR)
+               ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+       else if (reg_type == EHCA_REG_MR)
+               ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+       else
+               ret = -EINVAL;
+
+       if (ret)
+               goto ehca_reg_mr_exit1;
+
+       /* successful registration */
+       e_mr->num_kpages = pginfo->num_kpages;
+       e_mr->num_hwpages = pginfo->num_hwpages;
+       e_mr->hwpage_size = pginfo->hwpage_size;
+       e_mr->start = iova_start;
+       e_mr->size = size;
+       e_mr->acl = acl;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+
+ehca_reg_mr_exit1:
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
+                        "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
+                        h_ret, shca, e_mr, iova_start, size, acl, e_pd,
+                        hipzout.lkey, pginfo, pginfo->num_kpages,
+                        pginfo->num_hwpages, ret);
+               ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
+                        "not recoverable");
+       }
+ehca_reg_mr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+                        "num_kpages=%llx num_hwpages=%llx",
+                        ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
+                        pginfo->num_kpages, pginfo->num_hwpages);
+       return ret;
+} /* end ehca_reg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+                      struct ehca_mr *e_mr,
+                      struct ehca_mr_pginfo *pginfo)
+{
+       int ret = 0;
+       u64 h_ret;
+       u32 rnum;
+       u64 rpage;
+       u32 i;
+       u64 *kpage;
+
+       if (!pginfo->num_hwpages) /* in case of fmr */
+               return 0;
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               ret = -ENOMEM;
+               goto ehca_reg_mr_rpages_exit0;
+       }
+
+       /* max MAX_RPAGES ehca mr pages per register call */
+       for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
+
+               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+                       rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
+                       if (rnum == 0)
+                               rnum = MAX_RPAGES;      /* last shot is full */
+               } else
+                       rnum = MAX_RPAGES;
+
+               ret = ehca_set_pagebuf(pginfo, rnum, kpage);
+               if (ret) {
+                       ehca_err(&shca->ib_device, "ehca_set_pagebuf "
+                                "bad rc, ret=%i rnum=%x kpage=%p",
+                                ret, rnum, kpage);
+                       goto ehca_reg_mr_rpages_exit1;
+               }
+
+               if (rnum > 1) {
+                       rpage = __pa(kpage);
+                       if (!rpage) {
+                               ehca_err(&shca->ib_device, "kpage=%p i=%x",
+                                        kpage, i);
+                               ret = -EFAULT;
+                               goto ehca_reg_mr_rpages_exit1;
+                       }
+               } else
+                       rpage = *kpage;
+
+               h_ret = hipz_h_register_rpage_mr(
+                       shca->ipz_hca_handle, e_mr,
+                       ehca_encode_hwpage_size(pginfo->hwpage_size),
+                       0, rpage, rnum);
+
+               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+                       /*
+                        * check for 'registration complete'==H_SUCCESS
+                        * and for 'page registered'==H_PAGE_REGISTERED
+                        */
+                       if (h_ret != H_SUCCESS) {
+                               ehca_err(&shca->ib_device, "last "
+                                        "hipz_reg_rpage_mr failed, h_ret=%lli "
+                                        "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
+                                        " lkey=%x", h_ret, e_mr, i,
+                                        shca->ipz_hca_handle.handle,
+                                        e_mr->ipz_mr_handle.handle,
+                                        e_mr->ib.ib_mr.lkey);
+                               ret = ehca2ib_return_code(h_ret);
+                               break;
+                       } else
+                               ret = 0;
+               } else if (h_ret != H_PAGE_REGISTERED) {
+                       ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
+                                "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
+                                "mr_hndl=%llx", h_ret, e_mr, i,
+                                e_mr->ib.ib_mr.lkey,
+                                shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle);
+                       ret = ehca2ib_return_code(h_ret);
+                       break;
+               } else
+                       ret = 0;
+       } /* end for(i) */
+
+
+ehca_reg_mr_rpages_exit1:
+       ehca_free_fw_ctrlblock(kpage);
+ehca_reg_mr_rpages_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
+                        "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
+                        pginfo, pginfo->num_kpages, pginfo->num_hwpages);
+       return ret;
+} /* end ehca_reg_mr_rpages() */
+
+/*----------------------------------------------------------------------*/
+
+inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
+                               struct ehca_mr *e_mr,
+                               u64 *iova_start,
+                               u64 size,
+                               u32 acl,
+                               struct ehca_pd *e_pd,
+                               struct ehca_mr_pginfo *pginfo,
+                               u32 *lkey, /*OUT*/
+                               u32 *rkey) /*OUT*/
+{
+       int ret;
+       u64 h_ret;
+       u32 hipz_acl;
+       u64 *kpage;
+       u64 rpage;
+       struct ehca_mr_pginfo pginfo_save;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               ret = -ENOMEM;
+               goto ehca_rereg_mr_rereg1_exit0;
+       }
+
+       pginfo_save = *pginfo;
+       ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
+       if (ret) {
+               ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
+                        "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
+                        "kpage=%p", e_mr, pginfo, pginfo->type,
+                        pginfo->num_kpages, pginfo->num_hwpages, kpage);
+               goto ehca_rereg_mr_rereg1_exit1;
+       }
+       rpage = __pa(kpage);
+       if (!rpage) {
+               ehca_err(&shca->ib_device, "kpage=%p", kpage);
+               ret = -EFAULT;
+               goto ehca_rereg_mr_rereg1_exit1;
+       }
+       h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
+                                     (u64)iova_start, size, hipz_acl,
+                                     e_pd->fw_pd, rpage, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               /*
+                * reregistration unsuccessful, try it again with the 3 hCalls,
+                * e.g. this is required in case H_MR_CONDITION
+                * (MW bound or MR is shared)
+                */
+               ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
+                         "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
+               *pginfo = pginfo_save;
+               ret = -EAGAIN;
+       } else if ((u64 *)hipzout.vaddr != iova_start) {
+               ehca_err(&shca->ib_device, "PHYP changed iova_start in "
+                        "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
+                        "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
+                        hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
+                        e_mr->ib.ib_mr.lkey, hipzout.lkey);
+               ret = -EFAULT;
+       } else {
+               /*
+                * successful reregistration
+                * note: start and start_out are identical for eServer HCAs
+                */
+               e_mr->num_kpages = pginfo->num_kpages;
+               e_mr->num_hwpages = pginfo->num_hwpages;
+               e_mr->hwpage_size = pginfo->hwpage_size;
+               e_mr->start = iova_start;
+               e_mr->size = size;
+               e_mr->acl = acl;
+               *lkey = hipzout.lkey;
+               *rkey = hipzout.rkey;
+       }
+
+ehca_rereg_mr_rereg1_exit1:
+       ehca_free_fw_ctrlblock(kpage);
+ehca_rereg_mr_rereg1_exit0:
+       if ( ret && (ret != -EAGAIN) )
+               ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
+                        "pginfo=%p num_kpages=%llx num_hwpages=%llx",
+                        ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
+                        pginfo->num_hwpages);
+       return ret;
+} /* end ehca_rereg_mr_rereg1() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+                 struct ehca_mr *e_mr,
+                 u64 *iova_start,
+                 u64 size,
+                 int acl,
+                 struct ehca_pd *e_pd,
+                 struct ehca_mr_pginfo *pginfo,
+                 u32 *lkey,
+                 u32 *rkey)
+{
+       int ret = 0;
+       u64 h_ret;
+       int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
+       int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
+
+       /* first determine reregistration hCall(s) */
+       if ((pginfo->num_hwpages > MAX_RPAGES) ||
+           (e_mr->num_hwpages > MAX_RPAGES) ||
+           (pginfo->num_hwpages > e_mr->num_hwpages)) {
+               ehca_dbg(&shca->ib_device, "Rereg3 case, "
+                        "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
+                        pginfo->num_hwpages, e_mr->num_hwpages);
+               rereg_1_hcall = 0;
+               rereg_3_hcall = 1;
+       }
+
+       if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
+               rereg_1_hcall = 0;
+               rereg_3_hcall = 1;
+               e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
+               ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
+                        e_mr);
+       }
+
+       if (rereg_1_hcall) {
+               ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
+                                          acl, e_pd, pginfo, lkey, rkey);
+               if (ret) {
+                       if (ret == -EAGAIN)
+                               rereg_3_hcall = 1;
+                       else
+                               goto ehca_rereg_mr_exit0;
+               }
+       }
+
+       if (rereg_3_hcall) {
+               struct ehca_mr save_mr;
+
+               /* first deregister old MR */
+               h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+               if (h_ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+                                "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
+                                "mr->lkey=%x",
+                                h_ret, e_mr, shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle,
+                                e_mr->ib.ib_mr.lkey);
+                       ret = ehca2ib_return_code(h_ret);
+                       goto ehca_rereg_mr_exit0;
+               }
+               /* clean ehca_mr_t, without changing struct ib_mr and lock */
+               save_mr = *e_mr;
+               ehca_mr_deletenew(e_mr);
+
+               /* set some MR values */
+               e_mr->flags = save_mr.flags;
+               e_mr->hwpage_size = save_mr.hwpage_size;
+               e_mr->fmr_page_size = save_mr.fmr_page_size;
+               e_mr->fmr_max_pages = save_mr.fmr_max_pages;
+               e_mr->fmr_max_maps = save_mr.fmr_max_maps;
+               e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+               ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+                                 e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+               if (ret) {
+                       u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+                       memcpy(&e_mr->flags, &(save_mr.flags),
+                              sizeof(struct ehca_mr) - offset);
+                       goto ehca_rereg_mr_exit0;
+               }
+       }
+
+ehca_rereg_mr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+                        "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
+                        "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
+                        acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
+                        rereg_1_hcall, rereg_3_hcall);
+       return ret;
+} /* end ehca_rereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+                      struct ehca_mr *e_fmr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_pd *e_pd =
+               container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
+       struct ehca_mr save_fmr;
+       u32 tmp_lkey, tmp_rkey;
+       struct ehca_mr_pginfo pginfo;
+       struct ehca_mr_hipzout_parms hipzout;
+       struct ehca_mr save_mr;
+
+       if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
+               /*
+                * note: after using rereg hcall with len=0,
+                * rereg hcall must be used again for registering pages
+                */
+               h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
+                                             0, 0, e_pd->fw_pd, 0, &hipzout);
+               if (h_ret == H_SUCCESS) {
+                       /* successful reregistration */
+                       e_fmr->start = NULL;
+                       e_fmr->size = 0;
+                       tmp_lkey = hipzout.lkey;
+                       tmp_rkey = hipzout.rkey;
+                       return 0;
+               }
+               /*
+                * should not happen, because length checked above,
+                * FMRs are not shared and no MW bound to FMRs
+                */
+               ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
+                        "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
+                        "mr_hndl=%llx lkey=%x lkey_out=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle,
+                        e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
+               /* try free and rereg */
+       }
+
+       /* first free old FMR */
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+                        "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
+                        "lkey=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle,
+                        e_fmr->ib.ib_fmr.lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_unmap_one_fmr_exit0;
+       }
+       /* clean ehca_mr_t, without changing lock */
+       save_fmr = *e_fmr;
+       ehca_mr_deletenew(e_fmr);
+
+       /* set some MR values */
+       e_fmr->flags = save_fmr.flags;
+       e_fmr->hwpage_size = save_fmr.hwpage_size;
+       e_fmr->fmr_page_size = save_fmr.fmr_page_size;
+       e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
+       e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
+       e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
+       e_fmr->acl = save_fmr.acl;
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_FMR;
+       ret = ehca_reg_mr(shca, e_fmr, NULL,
+                         (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+                         e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+                         &tmp_rkey, EHCA_REG_MR);
+       if (ret) {
+               u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+               memcpy(&e_fmr->flags, &(save_mr.flags),
+                      sizeof(struct ehca_mr) - offset);
+       }
+
+ehca_unmap_one_fmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
+                        "fmr_max_pages=%x",
+                        ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
+       return ret;
+} /* end ehca_unmap_one_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_smr(struct ehca_shca *shca,
+                struct ehca_mr *e_origmr,
+                struct ehca_mr *e_newmr,
+                u64 *iova_start,
+                int acl,
+                struct ehca_pd *e_pd,
+                u32 *lkey, /*OUT*/
+                u32 *rkey) /*OUT*/
+{
+       int ret = 0;
+       u64 h_ret;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
+                                   &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+                        "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
+                        "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
+                        shca->ipz_hca_handle.handle,
+                        e_origmr->ipz_mr_handle.handle,
+                        e_origmr->ib.ib_mr.lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_reg_smr_exit0;
+       }
+       /* successful registration */
+       e_newmr->num_kpages = e_origmr->num_kpages;
+       e_newmr->num_hwpages = e_origmr->num_hwpages;
+       e_newmr->hwpage_size   = e_origmr->hwpage_size;
+       e_newmr->start = iova_start;
+       e_newmr->size = e_origmr->size;
+       e_newmr->acl = acl;
+       e_newmr->ipz_mr_handle = hipzout.handle;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+
+ehca_reg_smr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
+                        "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
+                        ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
+       return ret;
+} /* end ehca_reg_smr() */
+
+/*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+       unsigned long ret = idx;
+       ret |= dir << EHCA_DIR_INDEX_SHIFT;
+       ret |= top << EHCA_TOP_INDEX_SHIFT;
+       return __va(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+       ((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+                              struct ehca_shca *shca, struct ehca_mr *mr,
+                              struct ehca_mr_pginfo *pginfo)
+{
+       u64 h_ret = 0;
+       unsigned long page = 0;
+       u64 rpage = __pa(kpage);
+       int page_count;
+
+       void *sectbase = ehca_calc_sectbase(top, dir, idx);
+       if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+               ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
+                                          "hwpage_size does not fit to "
+                                          "section start address");
+       }
+       page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+       while (page < page_count) {
+               u64 rnum;
+               for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+                    rnum++) {
+                       void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+                       kpage[rnum] = __pa(pg);
+               }
+
+               h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+                       ehca_encode_hwpage_size(pginfo->hwpage_size),
+                       0, rpage, rnum);
+
+               if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+                       ehca_err(&shca->ib_device, "register_rpage_mr failed");
+                       return h_ret;
+               }
+       }
+       return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+                               struct ehca_shca *shca, struct ehca_mr *mr,
+                               struct ehca_mr_pginfo *pginfo)
+{
+       u64 hret = H_SUCCESS;
+       int idx;
+
+       for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+                       continue;
+
+               hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+                                          pginfo);
+               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+                               return hret;
+       }
+       return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+                                   struct ehca_mr *mr,
+                                   struct ehca_mr_pginfo *pginfo)
+{
+       u64 hret = H_SUCCESS;
+       int dir;
+
+       for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+                       continue;
+
+               hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+                               return hret;
+       }
+       return hret;
+}
+
+/* register internal max-MR to internal SHCA */
+int ehca_reg_internal_maxmr(
+       struct ehca_shca *shca,
+       struct ehca_pd *e_pd,
+       struct ehca_mr **e_maxmr)  /*OUT*/
+{
+       int ret;
+       struct ehca_mr *e_mr;
+       u64 *iova_start;
+       u64 size_maxmr;
+       struct ehca_mr_pginfo pginfo;
+       struct ib_phys_buf ib_pbuf;
+       u32 num_kpages;
+       u32 num_hwpages;
+       u64 hw_pgsize;
+
+       if (!ehca_bmap) {
+               ret = -EFAULT;
+               goto ehca_reg_internal_maxmr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(&shca->ib_device, "out of memory");
+               ret = -ENOMEM;
+               goto ehca_reg_internal_maxmr_exit0;
+       }
+       e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+       /* register internal max-MR on HCA */
+       size_maxmr = ehca_mr_len;
+       iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
+       ib_pbuf.addr = 0;
+       ib_pbuf.size = size_maxmr;
+       num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+                               PAGE_SIZE);
+       hw_pgsize = ehca_get_max_hwpage_size(shca);
+       num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
+                                hw_pgsize);
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_PHYS;
+       pginfo.num_kpages = num_kpages;
+       pginfo.num_hwpages = num_hwpages;
+       pginfo.hwpage_size = hw_pgsize;
+       pginfo.u.phy.num_phys_buf = 1;
+       pginfo.u.phy.phys_buf_array = &ib_pbuf;
+
+       ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+                         &pginfo, &e_mr->ib.ib_mr.lkey,
+                         &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+       if (ret) {
+               ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+                        "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+                        "num_hwpages=%x", e_mr, iova_start, size_maxmr,
+                        num_kpages, num_hwpages);
+               goto ehca_reg_internal_maxmr_exit1;
+       }
+
+       /* successful registration of all pages */
+       e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
+       e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
+       e_mr->ib.ib_mr.uobject = NULL;
+       atomic_inc(&(e_pd->ib_pd.usecnt));
+       atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
+       *e_maxmr = e_mr;
+       return 0;
+
+ehca_reg_internal_maxmr_exit1:
+       ehca_mr_delete(e_mr);
+ehca_reg_internal_maxmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
+                        ret, shca, e_pd, e_maxmr);
+       return ret;
+} /* end ehca_reg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+                  struct ehca_mr *e_newmr,
+                  u64 *iova_start,
+                  int acl,
+                  struct ehca_pd *e_pd,
+                  u32 *lkey,
+                  u32 *rkey)
+{
+       u64 h_ret;
+       struct ehca_mr *e_origmr = shca->maxmr;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
+                                   &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+                        "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, e_origmr, shca->ipz_hca_handle.handle,
+                        e_origmr->ipz_mr_handle.handle,
+                        e_origmr->ib.ib_mr.lkey);
+               return ehca2ib_return_code(h_ret);
+       }
+       /* successful registration */
+       e_newmr->num_kpages = e_origmr->num_kpages;
+       e_newmr->num_hwpages = e_origmr->num_hwpages;
+       e_newmr->hwpage_size = e_origmr->hwpage_size;
+       e_newmr->start = iova_start;
+       e_newmr->size = e_origmr->size;
+       e_newmr->acl = acl;
+       e_newmr->ipz_mr_handle = hipzout.handle;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+} /* end ehca_reg_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
+{
+       int ret;
+       struct ehca_mr *e_maxmr;
+       struct ib_pd *ib_pd;
+
+       if (!shca->maxmr) {
+               ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
+               ret = -EINVAL;
+               goto ehca_dereg_internal_maxmr_exit0;
+       }
+
+       e_maxmr = shca->maxmr;
+       ib_pd = e_maxmr->ib.ib_mr.pd;
+       shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
+
+       ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
+       if (ret) {
+               ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
+                        "ret=%i e_maxmr=%p shca=%p lkey=%x",
+                        ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
+               shca->maxmr = e_maxmr;
+               goto ehca_dereg_internal_maxmr_exit0;
+       }
+
+       atomic_dec(&ib_pd->usecnt);
+
+ehca_dereg_internal_maxmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
+                        ret, shca, shca->maxmr);
+       return ret;
+} /* end ehca_dereg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check physical buffer array of MR verbs for validness and
+ * calculates MR size
+ */
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+                                 int num_phys_buf,
+                                 u64 *iova_start,
+                                 u64 *size)
+{
+       struct ib_phys_buf *pbuf = phys_buf_array;
+       u64 size_count = 0;
+       u32 i;
+
+       if (num_phys_buf == 0) {
+               ehca_gen_err("bad phys buf array len, num_phys_buf=0");
+               return -EINVAL;
+       }
+       /* check first buffer */
+       if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
+               ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
+                            "pbuf->addr=%llx pbuf->size=%llx",
+                            iova_start, pbuf->addr, pbuf->size);
+               return -EINVAL;
+       }
+       if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
+           (num_phys_buf > 1)) {
+               ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
+                            "pbuf->size=%llx", pbuf->addr, pbuf->size);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < num_phys_buf; i++) {
+               if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
+                       ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
+                                    "pbuf->size=%llx",
+                                    i, pbuf->addr, pbuf->size);
+                       return -EINVAL;
+               }
+               if (((i > 0) && /* not 1st */
+                    (i < (num_phys_buf - 1)) &&        /* not last */
+                    (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
+                       ehca_gen_err("bad size, i=%x pbuf->size=%llx",
+                                    i, pbuf->size);
+                       return -EINVAL;
+               }
+               size_count += pbuf->size;
+               pbuf++;
+       }
+
+       *size = size_count;
+       return 0;
+} /* end ehca_mr_chk_buf_and_calc_size() */
+
+/*----------------------------------------------------------------------*/
+
+/* check page list of map FMR verb for validness */
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+                            u64 *page_list,
+                            int list_len)
+{
+       u32 i;
+       u64 *page;
+
+       if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
+               ehca_gen_err("bad list_len, list_len=%x "
+                            "e_fmr->fmr_max_pages=%x fmr=%p",
+                            list_len, e_fmr->fmr_max_pages, e_fmr);
+               return -EINVAL;
+       }
+
+       /* each page must be aligned */
+       page = page_list;
+       for (i = 0; i < list_len; i++) {
+               if (*page % e_fmr->fmr_page_size) {
+                       ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
+                                    "fmr_page_size=%x", i, *page, page, e_fmr,
+                                    e_fmr->fmr_page_size);
+                       return -EINVAL;
+               }
+               page++;
+       }
+
+       return 0;
+} /* end ehca_fmr_check_page_list() */
+
+/*----------------------------------------------------------------------*/
+
+/* PAGE_SIZE >= pginfo->hwpage_size */
+static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
+                                 u32 number,
+                                 u64 *kpage)
+{
+       int ret = 0;
+       u64 pgaddr;
+       u32 j = 0;
+       int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
+       struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+       while (*sg != NULL) {
+               pgaddr = page_to_pfn(sg_page(*sg))
+                       << PAGE_SHIFT;
+               *kpage = pgaddr + (pginfo->next_hwpage *
+                                  pginfo->hwpage_size);
+               if (!(*kpage)) {
+                       ehca_gen_err("pgaddr=%llx "
+                                    "sg_dma_address=%llx "
+                                    "entry=%llx next_hwpage=%llx",
+                                    pgaddr, (u64)sg_dma_address(*sg),
+                                    pginfo->u.usr.next_nmap,
+                                    pginfo->next_hwpage);
+                       return -EFAULT;
+               }
+               (pginfo->hwpage_cnt)++;
+               (pginfo->next_hwpage)++;
+               kpage++;
+               if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
+                       (pginfo->kpage_cnt)++;
+                       (pginfo->u.usr.next_nmap)++;
+                       pginfo->next_hwpage = 0;
+                       *sg = sg_next(*sg);
+               }
+               j++;
+               if (j >= number)
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * check given pages for contiguous layout
+ * last page addr is returned in prev_pgaddr for further check
+ */
+static int ehca_check_kpages_per_ate(struct scatterlist **sg,
+                                    int num_pages,
+                                    u64 *prev_pgaddr)
+{
+       for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
+               u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
+               if (ehca_debug_level >= 3)
+                       ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
+                                    *(u64 *)__va(pgaddr));
+               if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
+                       ehca_gen_err("uncontiguous page found pgaddr=%llx "
+                                    "prev_pgaddr=%llx entries_left_in_hwpage=%x",
+                                    pgaddr, *prev_pgaddr, num_pages);
+                       return -EINVAL;
+               }
+               *prev_pgaddr = pgaddr;
+       }
+       return 0;
+}
+
+/* PAGE_SIZE < pginfo->hwpage_size */
+static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
+                                 u32 number,
+                                 u64 *kpage)
+{
+       int ret = 0;
+       u64 pgaddr, prev_pgaddr;
+       u32 j = 0;
+       int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
+       int nr_kpages = kpages_per_hwpage;
+       struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+       while (*sg != NULL) {
+
+               if (nr_kpages == kpages_per_hwpage) {
+                       pgaddr = (page_to_pfn(sg_page(*sg))
+                                  << PAGE_SHIFT);
+                       *kpage = pgaddr;
+                       if (!(*kpage)) {
+                               ehca_gen_err("pgaddr=%llx entry=%llx",
+                                            pgaddr, pginfo->u.usr.next_nmap);
+                               ret = -EFAULT;
+                               return ret;
+                       }
+                       /*
+                        * The first page in a hwpage must be aligned;
+                        * the first MR page is exempt from this rule.
+                        */
+                       if (pgaddr & (pginfo->hwpage_size - 1)) {
+                               if (pginfo->hwpage_cnt) {
+                                       ehca_gen_err(
+                                               "invalid alignment "
+                                               "pgaddr=%llx entry=%llx "
+                                               "mr_pgsize=%llx",
+                                               pgaddr, pginfo->u.usr.next_nmap,
+                                               pginfo->hwpage_size);
+                                       ret = -EFAULT;
+                                       return ret;
+                               }
+                               /* first MR page */
+                               pginfo->kpage_cnt =
+                                       (pgaddr &
+                                        (pginfo->hwpage_size - 1)) >>
+                                       PAGE_SHIFT;
+                               nr_kpages -= pginfo->kpage_cnt;
+                               *kpage = pgaddr &
+                                        ~(pginfo->hwpage_size - 1);
+                       }
+                       if (ehca_debug_level >= 3) {
+                               u64 val = *(u64 *)__va(pgaddr);
+                               ehca_gen_dbg("kpage=%llx page=%llx "
+                                            "value=%016llx",
+                                            *kpage, pgaddr, val);
+                       }
+                       prev_pgaddr = pgaddr;
+                       *sg = sg_next(*sg);
+                       pginfo->kpage_cnt++;
+                       pginfo->u.usr.next_nmap++;
+                       nr_kpages--;
+                       if (!nr_kpages)
+                               goto next_kpage;
+                       continue;
+               }
+
+               ret = ehca_check_kpages_per_ate(sg, nr_kpages,
+                                               &prev_pgaddr);
+               if (ret)
+                       return ret;
+               pginfo->kpage_cnt += nr_kpages;
+               pginfo->u.usr.next_nmap += nr_kpages;
+
+next_kpage:
+               nr_kpages = kpages_per_hwpage;
+               (pginfo->hwpage_cnt)++;
+               kpage++;
+               j++;
+               if (j >= number)
+                       break;
+       }
+
+       return ret;
+}
+
+static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
+                                u32 number, u64 *kpage)
+{
+       int ret = 0;
+       struct ib_phys_buf *pbuf;
+       u64 num_hw, offs_hw;
+       u32 i = 0;
+
+       /* loop over desired phys_buf_array entries */
+       while (i < number) {
+               pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
+               num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
+                                    pbuf->size, pginfo->hwpage_size);
+               offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
+                       pginfo->hwpage_size;
+               while (pginfo->next_hwpage < offs_hw + num_hw) {
+                       /* sanity check */
+                       if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+                           (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+                               ehca_gen_err("kpage_cnt >= num_kpages, "
+                                            "kpage_cnt=%llx num_kpages=%llx "
+                                            "hwpage_cnt=%llx "
+                                            "num_hwpages=%llx i=%x",
+                                            pginfo->kpage_cnt,
+                                            pginfo->num_kpages,
+                                            pginfo->hwpage_cnt,
+                                            pginfo->num_hwpages, i);
+                               return -EFAULT;
+                       }
+                       *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
+                                (pginfo->next_hwpage * pginfo->hwpage_size);
+                       if ( !(*kpage) && pbuf->addr ) {
+                               ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
+                                            "next_hwpage=%llx", pbuf->addr,
+                                            pbuf->size, pginfo->next_hwpage);
+                               return -EFAULT;
+                       }
+                       (pginfo->hwpage_cnt)++;
+                       (pginfo->next_hwpage)++;
+                       if (PAGE_SIZE >= pginfo->hwpage_size) {
+                               if (pginfo->next_hwpage %
+                                   (PAGE_SIZE / pginfo->hwpage_size) == 0)
+                                       (pginfo->kpage_cnt)++;
+                       } else
+                               pginfo->kpage_cnt += pginfo->hwpage_size /
+                                       PAGE_SIZE;
+                       kpage++;
+                       i++;
+                       if (i >= number) break;
+               }
+               if (pginfo->next_hwpage >= offs_hw + num_hw) {
+                       (pginfo->u.phy.next_buf)++;
+                       pginfo->next_hwpage = 0;
+               }
+       }
+       return ret;
+}
+
+static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
+                               u32 number, u64 *kpage)
+{
+       int ret = 0;
+       u64 *fmrlist;
+       u32 i;
+
+       /* loop over desired page_list entries */
+       fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
+       for (i = 0; i < number; i++) {
+               *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
+                          pginfo->next_hwpage * pginfo->hwpage_size;
+               if ( !(*kpage) ) {
+                       ehca_gen_err("*fmrlist=%llx fmrlist=%p "
+                                    "next_listelem=%llx next_hwpage=%llx",
+                                    *fmrlist, fmrlist,
+                                    pginfo->u.fmr.next_listelem,
+                                    pginfo->next_hwpage);
+                       return -EFAULT;
+               }
+               (pginfo->hwpage_cnt)++;
+               if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
+                       if (pginfo->next_hwpage %
+                           (pginfo->u.fmr.fmr_pgsize /
+                            pginfo->hwpage_size) == 0) {
+                               (pginfo->kpage_cnt)++;
+                               (pginfo->u.fmr.next_listelem)++;
+                               fmrlist++;
+                               pginfo->next_hwpage = 0;
+                       } else
+                               (pginfo->next_hwpage)++;
+               } else {
+                       unsigned int cnt_per_hwpage = pginfo->hwpage_size /
+                               pginfo->u.fmr.fmr_pgsize;
+                       unsigned int j;
+                       u64 prev = *kpage;
+                       /* check if adrs are contiguous */
+                       for (j = 1; j < cnt_per_hwpage; j++) {
+                               u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
+                               if (prev + pginfo->u.fmr.fmr_pgsize != p) {
+                                       ehca_gen_err("uncontiguous fmr pages "
+                                                    "found prev=%llx p=%llx "
+                                                    "idx=%x", prev, p, i + j);
+                                       return -EINVAL;
+                               }
+                               prev = p;
+                       }
+                       pginfo->kpage_cnt += cnt_per_hwpage;
+                       pginfo->u.fmr.next_listelem += cnt_per_hwpage;
+                       fmrlist += cnt_per_hwpage;
+               }
+               kpage++;
+       }
+       return ret;
+}
+
+/* setup page buffer from page info */
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+                    u32 number,
+                    u64 *kpage)
+{
+       int ret;
+
+       switch (pginfo->type) {
+       case EHCA_MR_PGI_PHYS:
+               ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
+               break;
+       case EHCA_MR_PGI_USER:
+               ret = PAGE_SIZE >= pginfo->hwpage_size ?
+                       ehca_set_pagebuf_user1(pginfo, number, kpage) :
+                       ehca_set_pagebuf_user2(pginfo, number, kpage);
+               break;
+       case EHCA_MR_PGI_FMR:
+               ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
+               break;
+       default:
+               ehca_gen_err("bad pginfo->type=%x", pginfo->type);
+               ret = -EFAULT;
+               break;
+       }
+       return ret;
+} /* end ehca_set_pagebuf() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check MR if it is a max-MR, i.e. uses whole memory
+ * in case it's a max-MR 1 is returned, else 0
+ */
+int ehca_mr_is_maxmr(u64 size,
+                    u64 *iova_start)
+{
+       /* a MR is treated as max-MR only if it fits following: */
+       if ((size == ehca_mr_len) &&
+           (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
+               ehca_gen_dbg("this is a max-MR");
+               return 1;
+       } else
+               return 0;
+} /* end ehca_mr_is_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/* map access control for MR/MW. This routine is used for MR and MW. */
+void ehca_mrmw_map_acl(int ib_acl,
+                      u32 *hipz_acl)
+{
+       *hipz_acl = 0;
+       if (ib_acl & IB_ACCESS_REMOTE_READ)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
+       if (ib_acl & IB_ACCESS_REMOTE_WRITE)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
+       if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
+       if (ib_acl & IB_ACCESS_LOCAL_WRITE)
+               *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
+       if (ib_acl & IB_ACCESS_MW_BIND)
+               *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
+} /* end ehca_mrmw_map_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/* sets page size in hipz access control for MR/MW. */
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
+{
+       *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
+} /* end ehca_mrmw_set_pgsize_hipz_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * reverse map access control for MR/MW.
+ * This routine is used for MR and MW.
+ */
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+                              int *ib_acl) /*OUT*/
+{
+       *ib_acl = 0;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
+               *ib_acl |= IB_ACCESS_REMOTE_READ;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
+               *ib_acl |= IB_ACCESS_REMOTE_WRITE;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
+               *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
+               *ib_acl |= IB_ACCESS_LOCAL_WRITE;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
+               *ib_acl |= IB_ACCESS_MW_BIND;
+} /* end ehca_mrmw_reverse_map_acl() */
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * MR destructor and constructor
+ * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
+ * except struct ib_mr and spinlock
+ */
+void ehca_mr_deletenew(struct ehca_mr *mr)
+{
+       mr->flags = 0;
+       mr->num_kpages = 0;
+       mr->num_hwpages = 0;
+       mr->acl = 0;
+       mr->start = NULL;
+       mr->fmr_page_size = 0;
+       mr->fmr_max_pages = 0;
+       mr->fmr_max_maps = 0;
+       mr->fmr_map_cnt = 0;
+       memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
+       memset(&mr->galpas, 0, sizeof(mr->galpas));
+} /* end ehca_mr_deletenew() */
+
+int ehca_init_mrmw_cache(void)
+{
+       mr_cache = kmem_cache_create("ehca_cache_mr",
+                                    sizeof(struct ehca_mr), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!mr_cache)
+               return -ENOMEM;
+       mw_cache = kmem_cache_create("ehca_cache_mw",
+                                    sizeof(struct ehca_mw), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!mw_cache) {
+               kmem_cache_destroy(mr_cache);
+               mr_cache = NULL;
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+void ehca_cleanup_mrmw_cache(void)
+{
+       if (mr_cache)
+               kmem_cache_destroy(mr_cache);
+       if (mw_cache)
+               kmem_cache_destroy(mw_cache);
+}
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+                                    int dir)
+{
+       if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+               ehca_top_bmap->dir[dir] =
+                       kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+               if (!ehca_top_bmap->dir[dir])
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+       }
+       return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+       if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+               ehca_bmap->top[top] =
+                       kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+               if (!ehca_bmap->top[top])
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+       }
+       return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+       return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+       int top, dir;
+
+       if (!ehca_bmap)
+               return;
+
+       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]))
+                       continue;
+               for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+                       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+                               continue;
+
+                       kfree(ehca_bmap->top[top]->dir[dir]);
+               }
+
+               kfree(ehca_bmap->top[top]);
+       }
+
+       kfree(ehca_bmap);
+       ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long i, start_section, end_section;
+       int top, dir, idx;
+
+       if (!nr_pages)
+               return 0;
+
+       if (!ehca_bmap) {
+               ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+               if (!ehca_bmap)
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+       }
+
+       start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+       end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+       for (i = start_section; i < end_section; i++) {
+               int ret;
+               top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+               dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+               idx = i & EHCA_INDEX_MASK;
+
+               ret = ehca_init_bmap(ehca_bmap, top, dir);
+               if (ret) {
+                       ehca_destroy_busmap();
+                       return ret;
+               }
+               ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+               ehca_mr_len += EHCA_SECTSIZE;
+       }
+       return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+       int page_order;
+
+       if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+               return 0;
+
+       page_order = compound_order(pfn_to_page(pfn));
+       if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+               return 0;
+
+       return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+                                      unsigned long total_nr_pages, void *arg)
+{
+       int ret;
+       unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+       if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+               return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+       /* Given chunk is >= 16GB -> check for hugepages */
+       start_pfn = initial_pfn;
+       end_pfn = initial_pfn + total_nr_pages;
+       pfn = start_pfn;
+
+       while (pfn < end_pfn) {
+               if (ehca_is_hugepage(pfn)) {
+                       /* Add mem found in front of the hugepage */
+                       nr_pages = pfn - start_pfn;
+                       ret = ehca_update_busmap(start_pfn, nr_pages);
+                       if (ret)
+                               return ret;
+                       /* Skip the hugepage */
+                       pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+                       start_pfn = pfn;
+               } else
+                       pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+       }
+
+       /* Add mem found behind the hugepage(s)  */
+       nr_pages = pfn - start_pfn;
+       return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+       int ret;
+
+       ehca_mr_len = 0;
+       ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+                                  ehca_create_busmap_callback);
+       return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+                                  struct ehca_mr *e_mr,
+                                  struct ehca_mr_pginfo *pginfo)
+{
+       int top;
+       u64 hret, *kpage;
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               return -ENOMEM;
+       }
+       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]))
+                       continue;
+               hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+               if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+                       break;
+       }
+
+       ehca_free_fw_ctrlblock(kpage);
+
+       if (hret == H_SUCCESS)
+               return 0; /* Everything is fine */
+       else {
+               ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+                                "h_ret=%lli e_mr=%p top=%x lkey=%x "
+                                "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+                                e_mr->ib.ib_mr.lkey,
+                                shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle);
+               return ehca2ib_return_code(hret);
+       }
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+       int top, dir, idx;
+       unsigned long abs_addr, offset;
+       u64 entry;
+
+       if (!ehca_bmap)
+               return EHCA_INVAL_ADDR;
+
+       abs_addr = __pa(caddr);
+       top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+       if (!ehca_bmap_valid(ehca_bmap->top[top]))
+               return EHCA_INVAL_ADDR;
+
+       dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+               return EHCA_INVAL_ADDR;
+
+       idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+       entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+       if (ehca_bmap_valid(entry)) {
+               offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+               return entry | offset;
+       } else
+               return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+                              size_t size, enum dma_data_direction direction)
+{
+       if (cpu_addr)
+               return ehca_map_vaddr(cpu_addr);
+       else
+               return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+                                 enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+                            unsigned long offset, size_t size,
+                            enum dma_data_direction direction)
+{
+       u64 addr;
+
+       if (offset + size > PAGE_SIZE)
+               return EHCA_INVAL_ADDR;
+
+       addr = ehca_map_vaddr(page_address(page));
+       if (!ehca_dma_mapping_error(dev, addr))
+               addr += offset;
+
+       return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                          int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sgl, sg, nents, i) {
+               u64 addr;
+               addr = ehca_map_vaddr(sg_virt(sg));
+               if (ehca_dma_mapping_error(dev, addr))
+                       return 0;
+
+               sg->dma_address = addr;
+               sg->dma_length = sg->length;
+       }
+       return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+                             int nents, enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+                                        size_t size,
+                                        enum dma_data_direction dir)
+{
+       dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+                                           size_t size,
+                                           enum dma_data_direction dir)
+{
+       dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                    u64 *dma_handle, gfp_t flag)
+{
+       struct page *p;
+       void *addr = NULL;
+       u64 dma_addr;
+
+       p = alloc_pages(flag, get_order(size));
+       if (p) {
+               addr = page_address(p);
+               dma_addr = ehca_map_vaddr(addr);
+               if (ehca_dma_mapping_error(dev, dma_addr)) {
+                       free_pages((unsigned long)addr, get_order(size));
+                       return NULL;
+               }
+               if (dma_handle)
+                       *dma_handle = dma_addr;
+               return addr;
+       }
+       return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+                                  void *cpu_addr, u64 dma_handle)
+{
+       if (cpu_addr && size)
+               free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+       .mapping_error          = ehca_dma_mapping_error,
+       .map_single             = ehca_dma_map_single,
+       .unmap_single           = ehca_dma_unmap_single,
+       .map_page               = ehca_dma_map_page,
+       .unmap_page             = ehca_dma_unmap_page,
+       .map_sg                 = ehca_dma_map_sg,
+       .unmap_sg               = ehca_dma_unmap_sg,
+       .sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
+       .sync_single_for_device = ehca_dma_sync_single_for_device,
+       .alloc_coherent         = ehca_dma_alloc_coherent,
+       .free_coherent          = ehca_dma_free_coherent,
+};
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h
new file mode 100644 (file)
index 0000000..50d8b51
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW declarations and inline functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _EHCA_MRMW_H_
+#define _EHCA_MRMW_H_
+
+enum ehca_reg_type {
+       EHCA_REG_MR,
+       EHCA_REG_BUSMAP_MR
+};
+
+int ehca_reg_mr(struct ehca_shca *shca,
+               struct ehca_mr *e_mr,
+               u64 *iova_start,
+               u64 size,
+               int acl,
+               struct ehca_pd *e_pd,
+               struct ehca_mr_pginfo *pginfo,
+               u32 *lkey,
+               u32 *rkey,
+               enum ehca_reg_type reg_type);
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+                      struct ehca_mr *e_mr,
+                      struct ehca_mr_pginfo *pginfo);
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+                 struct ehca_mr *e_mr,
+                 u64 *iova_start,
+                 u64 size,
+                 int mr_access_flags,
+                 struct ehca_pd *e_pd,
+                 struct ehca_mr_pginfo *pginfo,
+                 u32 *lkey,
+                 u32 *rkey);
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+                      struct ehca_mr *e_fmr);
+
+int ehca_reg_smr(struct ehca_shca *shca,
+                struct ehca_mr *e_origmr,
+                struct ehca_mr *e_newmr,
+                u64 *iova_start,
+                int acl,
+                struct ehca_pd *e_pd,
+                u32 *lkey,
+                u32 *rkey);
+
+int ehca_reg_internal_maxmr(struct ehca_shca *shca,
+                           struct ehca_pd *e_pd,
+                           struct ehca_mr **maxmr);
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+                  struct ehca_mr *e_newmr,
+                  u64 *iova_start,
+                  int acl,
+                  struct ehca_pd *e_pd,
+                  u32 *lkey,
+                  u32 *rkey);
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
+
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+                                 int num_phys_buf,
+                                 u64 *iova_start,
+                                 u64 *size);
+
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+                            u64 *page_list,
+                            int list_len);
+
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+                    u32 number,
+                    u64 *kpage);
+
+int ehca_mr_is_maxmr(u64 size,
+                    u64 *iova_start);
+
+void ehca_mrmw_map_acl(int ib_acl,
+                      u32 *hipz_acl);
+
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
+
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+                              int *ib_acl);
+
+void ehca_mr_deletenew(struct ehca_mr *mr);
+
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_pd.c b/drivers/staging/rdma/ehca/ehca_pd.c
new file mode 100644 (file)
index 0000000..351577a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  PD functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+
+static struct kmem_cache *pd_cache;
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+                           struct ib_ucontext *context, struct ib_udata *udata)
+{
+       struct ehca_pd *pd;
+       int i;
+
+       pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
+       if (!pd) {
+               ehca_err(device, "device=%p context=%p out of memory",
+                        device, context);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       for (i = 0; i < 2; i++) {
+               INIT_LIST_HEAD(&pd->free[i]);
+               INIT_LIST_HEAD(&pd->full[i]);
+       }
+       mutex_init(&pd->lock);
+
+       /*
+        * Kernel PD: when device = -1, 0
+        * User   PD: when context != -1
+        */
+       if (!context) {
+               /*
+                * Kernel PDs after init reuses always
+                * the one created in ehca_shca_reopen()
+                */
+               struct ehca_shca *shca = container_of(device, struct ehca_shca,
+                                                     ib_device);
+               pd->fw_pd.value = shca->pd->fw_pd.value;
+       } else
+               pd->fw_pd.value = (u64)pd;
+
+       return &pd->ib_pd;
+}
+
+int ehca_dealloc_pd(struct ib_pd *pd)
+{
+       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+       int i, leftovers = 0;
+       struct ipz_small_queue_page *page, *tmp;
+
+       for (i = 0; i < 2; i++) {
+               list_splice(&my_pd->full[i], &my_pd->free[i]);
+               list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
+                       leftovers = 1;
+                       free_page(page->page);
+                       kmem_cache_free(small_qp_cache, page);
+               }
+       }
+
+       if (leftovers)
+               ehca_warn(pd->device,
+                         "Some small queue pages were not freed");
+
+       kmem_cache_free(pd_cache, my_pd);
+
+       return 0;
+}
+
+int ehca_init_pd_cache(void)
+{
+       pd_cache = kmem_cache_create("ehca_cache_pd",
+                                    sizeof(struct ehca_pd), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!pd_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_pd_cache(void)
+{
+       if (pd_cache)
+               kmem_cache_destroy(pd_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_qes.h b/drivers/staging/rdma/ehca/ehca_qes.h
new file mode 100644 (file)
index 0000000..90c4efa
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Hardware request structures
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _EHCA_QES_H_
+#define _EHCA_QES_H_
+
+#include "ehca_tools.h"
+
+/* virtual scatter gather entry to specify remote addresses with length */
+struct ehca_vsgentry {
+       u64 vaddr;
+       u32 lkey;
+       u32 length;
+};
+
+#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
+#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
+#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
+#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
+#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
+#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
+#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
+
+/*
+ * Unreliable Datagram Address Vector Format
+ * see IBTA Vol1 chapter 8.3 Global Routing Header
+ */
+struct ehca_ud_av {
+       u8 sl;
+       u8 lnh;
+       u16 dlid;
+       u8 reserved1;
+       u8 reserved2;
+       u8 reserved3;
+       u8 slid_path_bits;
+       u8 reserved4;
+       u8 ipd;
+       u8 reserved5;
+       u8 pmtu;
+       u32 reserved6;
+       u64 reserved7;
+       union {
+               struct {
+                       u64 word_0; /* always set to 6  */
+                       /*should be 0x1B for IB transport */
+                       u64 word_1;
+                       u64 word_2;
+                       u64 word_3;
+                       u64 word_4;
+               } grh;
+               struct {
+                       u32 wd_0;
+                       u32 wd_1;
+                       /* DWord_1 --> SGID */
+
+                       u32 sgid_wd3;
+                       u32 sgid_wd2;
+
+                       u32 sgid_wd1;
+                       u32 sgid_wd0;
+                       /* DWord_3 --> DGID */
+
+                       u32 dgid_wd3;
+                       u32 dgid_wd2;
+
+                       u32 dgid_wd1;
+                       u32 dgid_wd0;
+               } grh_l;
+       };
+};
+
+/* maximum number of sg entries allowed in a WQE */
+#define MAX_WQE_SG_ENTRIES 252
+
+#define WQE_OPTYPE_SEND             0x80
+#define WQE_OPTYPE_RDMAREAD         0x40
+#define WQE_OPTYPE_RDMAWRITE        0x20
+#define WQE_OPTYPE_CMPSWAP          0x10
+#define WQE_OPTYPE_FETCHADD         0x08
+#define WQE_OPTYPE_BIND             0x04
+
+#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
+#define WQE_WRFLAG_FENCE            0x40
+#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
+#define WQE_WRFLAG_SOLIC_EVENT      0x10
+
+#define WQEF_CACHE_HINT             0x80
+#define WQEF_CACHE_HINT_RD_WR       0x40
+#define WQEF_TIMED_WQE              0x20
+#define WQEF_PURGE                  0x08
+#define WQEF_HIGH_NIBBLE            0xF0
+
+#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
+#define MW_BIND_ACCESSCTRL_R_READ    0x20
+#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
+
+struct ehca_wqe {
+       u64 work_request_id;
+       u8 optype;
+       u8 wr_flag;
+       u16 pkeyi;
+       u8 wqef;
+       u8 nr_of_data_seg;
+       u16 wqe_provided_slid;
+       u32 destination_qp_number;
+       u32 resync_psn_sqp;
+       u32 local_ee_context_qkey;
+       u32 immediate_data;
+       union {
+               struct {
+                       u64 remote_virtual_address;
+                       u32 rkey;
+                       u32 reserved;
+                       u64 atomic_1st_op_dma_len;
+                       u64 atomic_2nd_op;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+
+               } nud;
+               struct {
+                       u64 ehca_ud_av_ptr;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 reserved3;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+               } ud_avp;
+               struct {
+                       struct ehca_ud_av ud_av;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
+                                                    2];
+               } ud_av;
+               struct {
+                       u64 reserved0;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 reserved3;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+               } all_rcv;
+
+               struct {
+                       u64 reserved;
+                       u32 rkey;
+                       u32 old_rkey;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 virtual_address;
+                       u32 reserved3;
+                       u32 length;
+                       u32 reserved4;
+                       u16 reserved5;
+                       u8 reserved6;
+                       u8 lr_ctl;
+                       u32 lkey;
+                       u32 reserved7;
+                       u64 reserved8;
+                       u64 reserved9;
+                       u64 reserved10;
+                       u64 reserved11;
+               } bind;
+               struct {
+                       u64 reserved12;
+                       u64 reserved13;
+                       u32 size;
+                       u32 start;
+               } inline_data;
+       } u;
+
+};
+
+#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
+#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
+#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
+#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
+#define WC_STATUS_ERROR_BIT 0x80000000
+#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
+#define WC_STATUS_PURGE_BIT 0x10
+#define WC_SEND_RECEIVE_BIT 0x80
+
+struct ehca_cqe {
+       u64 work_request_id;
+       u8 optype;
+       u8 w_completion_flags;
+       u16 reserved1;
+       u32 nr_bytes_transferred;
+       u32 immediate_data;
+       u32 local_qp_number;
+       u8 freed_resource_count;
+       u8 service_level;
+       u16 wqe_count;
+       u32 qp_token;
+       u32 qkey_ee_token;
+       u32 remote_qp_number;
+       u16 dlid;
+       u16 rlid;
+       u16 reserved2;
+       u16 pkey_index;
+       u32 cqe_timestamp;
+       u32 wqe_timestamp;
+       u8 wqe_timestamp_valid;
+       u8 reserved3;
+       u8 reserved4;
+       u8 cqe_flags;
+       u32 status;
+};
+
+struct ehca_eqe {
+       u64 entry;
+};
+
+struct ehca_mrte {
+       u64 starting_va;
+       u64 length; /* length of memory region in bytes*/
+       u32 pd;
+       u8 key_instance;
+       u8 pagesize;
+       u8 mr_control;
+       u8 local_remote_access_ctrl;
+       u8 reserved[0x20 - 0x18];
+       u64 at_pointer[4];
+};
+#endif /*_EHCA_QES_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_qp.c b/drivers/staging/rdma/ehca/ehca_qp.c
new file mode 100644 (file)
index 0000000..2e89356
--- /dev/null
@@ -0,0 +1,2257 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  QP functions
+ *
+ *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
+ *           Stefan Roscher <stefan.roscher@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+static struct kmem_cache *qp_cache;
+
+/*
+ * attributes not supported by query qp
+ */
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
+                                    IB_QP_EN_SQD_ASYNC_NOTIFY)
+
+/*
+ * ehca (internal) qp state values
+ */
+enum ehca_qp_state {
+       EHCA_QPS_RESET = 1,
+       EHCA_QPS_INIT = 2,
+       EHCA_QPS_RTR = 3,
+       EHCA_QPS_RTS = 5,
+       EHCA_QPS_SQD = 6,
+       EHCA_QPS_SQE = 8,
+       EHCA_QPS_ERR = 128
+};
+
+/*
+ * qp state transitions as defined by IB Arch Rel 1.1 page 431
+ */
+enum ib_qp_statetrans {
+       IB_QPST_ANY2RESET,
+       IB_QPST_ANY2ERR,
+       IB_QPST_RESET2INIT,
+       IB_QPST_INIT2RTR,
+       IB_QPST_INIT2INIT,
+       IB_QPST_RTR2RTS,
+       IB_QPST_RTS2SQD,
+       IB_QPST_RTS2RTS,
+       IB_QPST_SQD2RTS,
+       IB_QPST_SQE2RTS,
+       IB_QPST_SQD2SQD,
+       IB_QPST_MAX     /* nr of transitions, this must be last!!! */
+};
+
+/*
+ * ib2ehca_qp_state maps IB to ehca qp_state
+ * returns ehca qp state corresponding to given ib qp state
+ */
+static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
+{
+       switch (ib_qp_state) {
+       case IB_QPS_RESET:
+               return EHCA_QPS_RESET;
+       case IB_QPS_INIT:
+               return EHCA_QPS_INIT;
+       case IB_QPS_RTR:
+               return EHCA_QPS_RTR;
+       case IB_QPS_RTS:
+               return EHCA_QPS_RTS;
+       case IB_QPS_SQD:
+               return EHCA_QPS_SQD;
+       case IB_QPS_SQE:
+               return EHCA_QPS_SQE;
+       case IB_QPS_ERR:
+               return EHCA_QPS_ERR;
+       default:
+               ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
+               return -EINVAL;
+       }
+}
+
+/*
+ * ehca2ib_qp_state maps ehca to IB qp_state
+ * returns ib qp state corresponding to given ehca qp state
+ */
+static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
+                                               ehca_qp_state)
+{
+       switch (ehca_qp_state) {
+       case EHCA_QPS_RESET:
+               return IB_QPS_RESET;
+       case EHCA_QPS_INIT:
+               return IB_QPS_INIT;
+       case EHCA_QPS_RTR:
+               return IB_QPS_RTR;
+       case EHCA_QPS_RTS:
+               return IB_QPS_RTS;
+       case EHCA_QPS_SQD:
+               return IB_QPS_SQD;
+       case EHCA_QPS_SQE:
+               return IB_QPS_SQE;
+       case EHCA_QPS_ERR:
+               return IB_QPS_ERR;
+       default:
+               ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
+               return -EINVAL;
+       }
+}
+
+/*
+ * ehca_qp_type used as index for req_attr and opt_attr of
+ * struct ehca_modqp_statetrans
+ */
+enum ehca_qp_type {
+       QPT_RC = 0,
+       QPT_UC = 1,
+       QPT_UD = 2,
+       QPT_SQP = 3,
+       QPT_MAX
+};
+
+/*
+ * ib2ehcaqptype maps Ib to ehca qp_type
+ * returns ehca qp type corresponding to ib qp type
+ */
+static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
+{
+       switch (ibqptype) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               return QPT_SQP;
+       case IB_QPT_RC:
+               return QPT_RC;
+       case IB_QPT_UC:
+               return QPT_UC;
+       case IB_QPT_UD:
+               return QPT_UD;
+       default:
+               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+               return -EINVAL;
+       }
+}
+
+static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
+                                                        int ib_tostate)
+{
+       int index = -EINVAL;
+       switch (ib_tostate) {
+       case IB_QPS_RESET:
+               index = IB_QPST_ANY2RESET;
+               break;
+       case IB_QPS_INIT:
+               switch (ib_fromstate) {
+               case IB_QPS_RESET:
+                       index = IB_QPST_RESET2INIT;
+                       break;
+               case IB_QPS_INIT:
+                       index = IB_QPST_INIT2INIT;
+                       break;
+               }
+               break;
+       case IB_QPS_RTR:
+               if (ib_fromstate == IB_QPS_INIT)
+                       index = IB_QPST_INIT2RTR;
+               break;
+       case IB_QPS_RTS:
+               switch (ib_fromstate) {
+               case IB_QPS_RTR:
+                       index = IB_QPST_RTR2RTS;
+                       break;
+               case IB_QPS_RTS:
+                       index = IB_QPST_RTS2RTS;
+                       break;
+               case IB_QPS_SQD:
+                       index = IB_QPST_SQD2RTS;
+                       break;
+               case IB_QPS_SQE:
+                       index = IB_QPST_SQE2RTS;
+                       break;
+               }
+               break;
+       case IB_QPS_SQD:
+               if (ib_fromstate == IB_QPS_RTS)
+                       index = IB_QPST_RTS2SQD;
+               break;
+       case IB_QPS_SQE:
+               break;
+       case IB_QPS_ERR:
+               index = IB_QPST_ANY2ERR;
+               break;
+       default:
+               break;
+       }
+       return index;
+}
+
+/*
+ * ibqptype2servicetype returns hcp service type corresponding to given
+ * ib qp type used by create_qp()
+ */
+static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
+{
+       switch (ibqptype) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               return ST_UD;
+       case IB_QPT_RC:
+               return ST_RC;
+       case IB_QPT_UC:
+               return ST_UC;
+       case IB_QPT_UD:
+               return ST_UD;
+       case IB_QPT_RAW_IPV6:
+               return -EINVAL;
+       case IB_QPT_RAW_ETHERTYPE:
+               return -EINVAL;
+       default:
+               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+               return -EINVAL;
+       }
+}
+
+/*
+ * init userspace queue info from ipz_queue data
+ */
+static inline void queue2resp(struct ipzu_queue_resp *resp,
+                             struct ipz_queue *queue)
+{
+       resp->qe_size = queue->qe_size;
+       resp->act_nr_of_sg = queue->act_nr_of_sg;
+       resp->queue_length = queue->queue_length;
+       resp->pagesize = queue->pagesize;
+       resp->toggle_state = queue->toggle_state;
+       resp->offset = queue->offset;
+}
+
+/*
+ * init_qp_queue initializes/constructs r/squeue and registers queue pages.
+ */
+static inline int init_qp_queue(struct ehca_shca *shca,
+                               struct ehca_pd *pd,
+                               struct ehca_qp *my_qp,
+                               struct ipz_queue *queue,
+                               int q_type,
+                               u64 expected_hret,
+                               struct ehca_alloc_queue_parms *parms,
+                               int wqe_size)
+{
+       int ret, cnt, ipz_rc, nr_q_pages;
+       void *vpage;
+       u64 rpage, h_ret;
+       struct ib_device *ib_dev = &shca->ib_device;
+       struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
+
+       if (!parms->queue_size)
+               return 0;
+
+       if (parms->is_small) {
+               nr_q_pages = 1;
+               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+                                       128 << parms->page_size,
+                                       wqe_size, parms->act_nr_sges, 1);
+       } else {
+               nr_q_pages = parms->queue_size;
+               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+                                       EHCA_PAGESIZE, wqe_size,
+                                       parms->act_nr_sges, 0);
+       }
+
+       if (!ipz_rc) {
+               ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
+                        ipz_rc);
+               return -EBUSY;
+       }
+
+       /* register queue pages */
+       for (cnt = 0; cnt < nr_q_pages; cnt++) {
+               vpage = ipz_qpageit_get_inc(queue);
+               if (!vpage) {
+                       ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+                                "failed p_vpage= %p", vpage);
+                       ret = -EINVAL;
+                       goto init_qp_queue1;
+               }
+               rpage = __pa(vpage);
+
+               h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
+                                                my_qp->ipz_qp_handle,
+                                                NULL, 0, q_type,
+                                                rpage, parms->is_small ? 0 : 1,
+                                                my_qp->galpas.kernel);
+               if (cnt == (nr_q_pages - 1)) {  /* last page! */
+                       if (h_ret != expected_hret) {
+                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
+                                        "h_ret=%lli", h_ret);
+                               ret = ehca2ib_return_code(h_ret);
+                               goto init_qp_queue1;
+                       }
+                       vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
+                       if (vpage) {
+                               ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+                                        "should not succeed vpage=%p", vpage);
+                               ret = -EINVAL;
+                               goto init_qp_queue1;
+                       }
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED) {
+                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
+                                        "h_ret=%lli", h_ret);
+                               ret = ehca2ib_return_code(h_ret);
+                               goto init_qp_queue1;
+                       }
+               }
+       }
+
+       ipz_qeit_reset(queue);
+
+       return 0;
+
+init_qp_queue1:
+       ipz_queue_dtor(pd, queue);
+       return ret;
+}
+
+static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
+{
+       if (is_llqp)
+               return 128 << act_nr_sge;
+       else
+               return offsetof(struct ehca_wqe,
+                               u.nud.sg_list[act_nr_sge]);
+}
+
+static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
+                                      int req_nr_sge, int is_llqp)
+{
+       u32 wqe_size, q_size;
+       int act_nr_sge = req_nr_sge;
+
+       if (!is_llqp)
+               /* round up #SGEs so WQE size is a power of 2 */
+               for (act_nr_sge = 4; act_nr_sge <= 252;
+                    act_nr_sge = 4 + 2 * act_nr_sge)
+                       if (act_nr_sge >= req_nr_sge)
+                               break;
+
+       wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
+       q_size = wqe_size * (queue->max_wr + 1);
+
+       if (q_size <= 512)
+               queue->page_size = 2;
+       else if (q_size <= 1024)
+               queue->page_size = 3;
+       else
+               queue->page_size = 0;
+
+       queue->is_small = (queue->page_size != 0);
+}
+
+/* needs to be called with cq->spinlock held */
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
+{
+       struct list_head *list, *node;
+
+       /* TODO: support low latency QPs */
+       if (qp->ext_type == EQPT_LLQP)
+               return;
+
+       if (on_sq) {
+               list = &qp->send_cq->sqp_err_list;
+               node = &qp->sq_err_node;
+       } else {
+               list = &qp->recv_cq->rqp_err_list;
+               node = &qp->rq_err_node;
+       }
+
+       if (list_empty(node))
+               list_add_tail(node, list);
+
+       return;
+}
+
+static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+
+       if (!list_empty(node))
+               list_del_init(node);
+
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+}
+
+static void reset_queue_map(struct ehca_queue_map *qmap)
+{
+       int i;
+
+       qmap->tail = qmap->entries - 1;
+       qmap->left_to_poll = 0;
+       qmap->next_wqe_idx = 0;
+       for (i = 0; i < qmap->entries; i++) {
+               qmap->map[i].reported = 1;
+               qmap->map[i].cqe_req = 0;
+       }
+}
+
+/*
+ * Create an ib_qp struct that is either a QP or an SRQ, depending on
+ * the value of the is_srq parameter. If init_attr and srq_init_attr share
+ * fields, the field out of init_attr is used.
+ */
+static struct ehca_qp *internal_create_qp(
+       struct ib_pd *pd,
+       struct ib_qp_init_attr *init_attr,
+       struct ib_srq_init_attr *srq_init_attr,
+       struct ib_udata *udata, int is_srq)
+{
+       struct ehca_qp *my_qp, *my_srq = NULL;
+       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+       struct ib_ucontext *context = NULL;
+       u64 h_ret;
+       int is_llqp = 0, has_srq = 0, is_user = 0;
+       int qp_type, max_send_sge, max_recv_sge, ret;
+
+       /* h_call's out parameters */
+       struct ehca_alloc_qp_parms parms;
+       u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
+       unsigned long flags;
+
+       if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
+               ehca_err(pd->device, "Unable to create QP, max number of %i "
+                        "QPs reached.", shca->max_num_qps);
+               ehca_err(pd->device, "To increase the maximum number of QPs "
+                        "use the number_of_qps module parameter.\n");
+               return ERR_PTR(-ENOSPC);
+       }
+
+       if (init_attr->create_flags) {
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       memset(&parms, 0, sizeof(parms));
+       qp_type = init_attr->qp_type;
+
+       if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
+               init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
+               ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
+                        init_attr->sq_sig_type);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /* save LLQP info */
+       if (qp_type & 0x80) {
+               is_llqp = 1;
+               parms.ext_type = EQPT_LLQP;
+               parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
+       }
+       qp_type &= 0x1F;
+       init_attr->qp_type &= 0x1F;
+
+       /* handle SRQ base QPs */
+       if (init_attr->srq) {
+               my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+
+               if (qp_type == IB_QPT_UC) {
+                       ehca_err(pd->device, "UC with SRQ not supported");
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+
+               has_srq = 1;
+               parms.ext_type = EQPT_SRQBASE;
+               parms.srq_qpn = my_srq->real_qp_num;
+       }
+
+       if (is_llqp && has_srq) {
+               ehca_err(pd->device, "LLQPs can't have an SRQ");
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /* handle SRQs */
+       if (is_srq) {
+               parms.ext_type = EQPT_SRQ;
+               parms.srq_limit = srq_init_attr->attr.srq_limit;
+               if (init_attr->cap.max_recv_sge > 3) {
+                       ehca_err(pd->device, "no more than three SGEs "
+                                "supported for SRQ  pd=%p  max_sge=%x",
+                                pd, init_attr->cap.max_recv_sge);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       }
+
+       /* check QP type */
+       if (qp_type != IB_QPT_UD &&
+           qp_type != IB_QPT_UC &&
+           qp_type != IB_QPT_RC &&
+           qp_type != IB_QPT_SMI &&
+           qp_type != IB_QPT_GSI) {
+               ehca_err(pd->device, "wrong QP Type=%x", qp_type);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (is_llqp) {
+               switch (qp_type) {
+               case IB_QPT_RC:
+                       if ((init_attr->cap.max_send_wr > 255) ||
+                           (init_attr->cap.max_recv_wr > 255)) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of max_sq_wr=%x "
+                                        "or max_rq_wr=%x for RC LLQP",
+                                        init_attr->cap.max_send_wr,
+                                        init_attr->cap.max_recv_wr);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       }
+                       break;
+               case IB_QPT_UD:
+                       if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
+                               ehca_err(pd->device, "UD LLQP not supported "
+                                        "by this adapter");
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-ENOSYS);
+                       }
+                       if (!(init_attr->cap.max_send_sge <= 5
+                           && init_attr->cap.max_send_sge >= 1
+                           && init_attr->cap.max_recv_sge <= 5
+                           && init_attr->cap.max_recv_sge >= 1)) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of max_send_sge=%x "
+                                        "or max_recv_sge=%x for UD LLQP",
+                                        init_attr->cap.max_send_sge,
+                                        init_attr->cap.max_recv_sge);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       } else if (init_attr->cap.max_send_wr > 255) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of "
+                                        "max_send_wr=%x for UD QP_TYPE=%x",
+                                        init_attr->cap.max_send_wr, qp_type);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       }
+                       break;
+               default:
+                       ehca_err(pd->device, "unsupported LL QP Type=%x",
+                                qp_type);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       } else {
+               int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+                              || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+               if (init_attr->cap.max_send_sge > max_sge
+                   || init_attr->cap.max_recv_sge > max_sge) {
+                       ehca_err(pd->device, "Invalid number of SGEs requested "
+                                "send_sge=%x recv_sge=%x max_sge=%x",
+                                init_attr->cap.max_send_sge,
+                                init_attr->cap.max_recv_sge, max_sge);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       }
+
+       my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+       if (!my_qp) {
+               ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       if (pd->uobject && udata) {
+               is_user = 1;
+               context = pd->uobject->context;
+       }
+
+       atomic_set(&my_qp->nr_events, 0);
+       init_waitqueue_head(&my_qp->wait_completion);
+       spin_lock_init(&my_qp->spinlock_s);
+       spin_lock_init(&my_qp->spinlock_r);
+       my_qp->qp_type = qp_type;
+       my_qp->ext_type = parms.ext_type;
+       my_qp->state = IB_QPS_RESET;
+
+       if (init_attr->recv_cq)
+               my_qp->recv_cq =
+                       container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
+       if (init_attr->send_cq)
+               my_qp->send_cq =
+                       container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
+
+       idr_preload(GFP_KERNEL);
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+
+       ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
+       if (ret >= 0)
+               my_qp->token = ret;
+
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+       idr_preload_end();
+       if (ret < 0) {
+               if (ret == -ENOSPC) {
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Invalid number of qp");
+               } else {
+                       ret = -ENOMEM;
+                       ehca_err(pd->device, "Can't allocate new idr entry.");
+               }
+               goto create_qp_exit0;
+       }
+
+       if (has_srq)
+               parms.srq_token = my_qp->token;
+
+       parms.servicetype = ibqptype2servicetype(qp_type);
+       if (parms.servicetype < 0) {
+               ret = -EINVAL;
+               ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
+               goto create_qp_exit1;
+       }
+
+       /* Always signal by WQE so we can hide circ. WQEs */
+       parms.sigtype = HCALL_SIGT_BY_WQE;
+
+       /* UD_AV CIRCUMVENTION */
+       max_send_sge = init_attr->cap.max_send_sge;
+       max_recv_sge = init_attr->cap.max_recv_sge;
+       if (parms.servicetype == ST_UD && !is_llqp) {
+               max_send_sge += 2;
+               max_recv_sge += 2;
+       }
+
+       parms.token = my_qp->token;
+       parms.eq_handle = shca->eq.ipz_eq_handle;
+       parms.pd = my_pd->fw_pd;
+       if (my_qp->send_cq)
+               parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
+       if (my_qp->recv_cq)
+               parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
+
+       parms.squeue.max_wr = init_attr->cap.max_send_wr;
+       parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
+       parms.squeue.max_sge = max_send_sge;
+       parms.rqueue.max_sge = max_recv_sge;
+
+       /* RC QPs need one more SWQE for unsolicited ack circumvention */
+       if (qp_type == IB_QPT_RC)
+               parms.squeue.max_wr++;
+
+       if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
+               if (HAS_SQ(my_qp))
+                       ehca_determine_small_queue(
+                               &parms.squeue, max_send_sge, is_llqp);
+               if (HAS_RQ(my_qp))
+                       ehca_determine_small_queue(
+                               &parms.rqueue, max_recv_sge, is_llqp);
+               parms.qp_storage =
+                       (parms.squeue.is_small || parms.rqueue.is_small);
+       }
+
+       h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+                        h_ret);
+               ret = ehca2ib_return_code(h_ret);
+               goto create_qp_exit1;
+       }
+
+       ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
+       my_qp->ipz_qp_handle = parms.qp_handle;
+       my_qp->galpas = parms.galpas;
+
+       swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
+       rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
+
+       switch (qp_type) {
+       case IB_QPT_RC:
+               if (is_llqp) {
+                       parms.squeue.act_nr_sges = 1;
+                       parms.rqueue.act_nr_sges = 1;
+               }
+               /* hide the extra WQE */
+               parms.squeue.act_nr_wqes--;
+               break;
+       case IB_QPT_UD:
+       case IB_QPT_GSI:
+       case IB_QPT_SMI:
+               /* UD circumvention */
+               if (is_llqp) {
+                       parms.squeue.act_nr_sges = 1;
+                       parms.rqueue.act_nr_sges = 1;
+               } else {
+                       parms.squeue.act_nr_sges -= 2;
+                       parms.rqueue.act_nr_sges -= 2;
+               }
+
+               if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
+                       parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
+                       parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
+                       parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
+                       parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
+                       ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
+               }
+
+               break;
+
+       default:
+               break;
+       }
+
+       /* initialize r/squeue and register queue pages */
+       if (HAS_SQ(my_qp)) {
+               ret = init_qp_queue(
+                       shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
+                       HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
+                       &parms.squeue, swqe_size);
+               if (ret) {
+                       ehca_err(pd->device, "Couldn't initialize squeue "
+                                "and pages ret=%i", ret);
+                       goto create_qp_exit2;
+               }
+
+               if (!is_user) {
+                       my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+                               my_qp->ipz_squeue.qe_size;
+                       my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+                                                   sizeof(struct ehca_qmap_entry));
+                       if (!my_qp->sq_map.map) {
+                               ehca_err(pd->device, "Couldn't allocate squeue "
+                                        "map ret=%i", ret);
+                               goto create_qp_exit3;
+                       }
+                       INIT_LIST_HEAD(&my_qp->sq_err_node);
+                       /* to avoid the generation of bogus flush CQEs */
+                       reset_queue_map(&my_qp->sq_map);
+               }
+       }
+
+       if (HAS_RQ(my_qp)) {
+               ret = init_qp_queue(
+                       shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
+                       H_SUCCESS, &parms.rqueue, rwqe_size);
+               if (ret) {
+                       ehca_err(pd->device, "Couldn't initialize rqueue "
+                                "and pages ret=%i", ret);
+                       goto create_qp_exit4;
+               }
+               if (!is_user) {
+                       my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+                               my_qp->ipz_rqueue.qe_size;
+                       my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+                                                   sizeof(struct ehca_qmap_entry));
+                       if (!my_qp->rq_map.map) {
+                               ehca_err(pd->device, "Couldn't allocate squeue "
+                                        "map ret=%i", ret);
+                               goto create_qp_exit5;
+                       }
+                       INIT_LIST_HEAD(&my_qp->rq_err_node);
+                       /* to avoid the generation of bogus flush CQEs */
+                       reset_queue_map(&my_qp->rq_map);
+               }
+       } else if (init_attr->srq && !is_user) {
+               /* this is a base QP, use the queue map of the SRQ */
+               my_qp->rq_map = my_srq->rq_map;
+               INIT_LIST_HEAD(&my_qp->rq_err_node);
+
+               my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+       }
+
+       if (is_srq) {
+               my_qp->ib_srq.pd = &my_pd->ib_pd;
+               my_qp->ib_srq.device = my_pd->ib_pd.device;
+
+               my_qp->ib_srq.srq_context = init_attr->qp_context;
+               my_qp->ib_srq.event_handler = init_attr->event_handler;
+       } else {
+               my_qp->ib_qp.qp_num = ib_qp_num;
+               my_qp->ib_qp.pd = &my_pd->ib_pd;
+               my_qp->ib_qp.device = my_pd->ib_pd.device;
+
+               my_qp->ib_qp.recv_cq = init_attr->recv_cq;
+               my_qp->ib_qp.send_cq = init_attr->send_cq;
+
+               my_qp->ib_qp.qp_type = qp_type;
+               my_qp->ib_qp.srq = init_attr->srq;
+
+               my_qp->ib_qp.qp_context = init_attr->qp_context;
+               my_qp->ib_qp.event_handler = init_attr->event_handler;
+       }
+
+       init_attr->cap.max_inline_data = 0; /* not supported yet */
+       init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
+       init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
+       init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
+       init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
+       my_qp->init_attr = *init_attr;
+
+       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+               shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+                       &my_qp->ib_qp;
+               if (ehca_nr_ports < 0) {
+                       /* alloc array to cache subsequent modify qp parms
+                        * for autodetect mode
+                        */
+                       my_qp->mod_qp_parm =
+                               kzalloc(EHCA_MOD_QP_PARM_MAX *
+                                       sizeof(*my_qp->mod_qp_parm),
+                                       GFP_KERNEL);
+                       if (!my_qp->mod_qp_parm) {
+                               ehca_err(pd->device,
+                                        "Could not alloc mod_qp_parm");
+                               goto create_qp_exit5;
+                       }
+               }
+       }
+
+       /* NOTE: define_apq0() not supported yet */
+       if (qp_type == IB_QPT_GSI) {
+               h_ret = ehca_define_sqp(shca, my_qp, init_attr);
+               if (h_ret != H_SUCCESS) {
+                       kfree(my_qp->mod_qp_parm);
+                       my_qp->mod_qp_parm = NULL;
+                       /* the QP pointer is no longer valid */
+                       shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+                               NULL;
+                       ret = ehca2ib_return_code(h_ret);
+                       goto create_qp_exit6;
+               }
+       }
+
+       if (my_qp->send_cq) {
+               ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
+               if (ret) {
+                       ehca_err(pd->device,
+                                "Couldn't assign qp to send_cq ret=%i", ret);
+                       goto create_qp_exit7;
+               }
+       }
+
+       /* copy queues, galpa data to user space */
+       if (context && udata) {
+               struct ehca_create_qp_resp resp;
+               memset(&resp, 0, sizeof(resp));
+
+               resp.qp_num = my_qp->real_qp_num;
+               resp.token = my_qp->token;
+               resp.qp_type = my_qp->qp_type;
+               resp.ext_type = my_qp->ext_type;
+               resp.qkey = my_qp->qkey;
+               resp.real_qp_num = my_qp->real_qp_num;
+
+               if (HAS_SQ(my_qp))
+                       queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
+               if (HAS_RQ(my_qp))
+                       queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+               resp.fw_handle_ofs = (u32)
+                       (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
+
+               if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+                       ehca_err(pd->device, "Copy to udata failed");
+                       ret = -EINVAL;
+                       goto create_qp_exit8;
+               }
+       }
+
+       return my_qp;
+
+create_qp_exit8:
+       ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit7:
+       kfree(my_qp->mod_qp_parm);
+
+create_qp_exit6:
+       if (HAS_RQ(my_qp) && !is_user)
+               vfree(my_qp->rq_map.map);
+
+create_qp_exit5:
+       if (HAS_RQ(my_qp))
+               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+create_qp_exit4:
+       if (HAS_SQ(my_qp) && !is_user)
+               vfree(my_qp->sq_map.map);
+
+create_qp_exit3:
+       if (HAS_SQ(my_qp))
+               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+
+create_qp_exit2:
+       hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+
+create_qp_exit1:
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+       idr_remove(&ehca_qp_idr, my_qp->token);
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+create_qp_exit0:
+       kmem_cache_free(qp_cache, my_qp);
+       atomic_dec(&shca->num_qps);
+       return ERR_PTR(ret);
+}
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+                            struct ib_qp_init_attr *qp_init_attr,
+                            struct ib_udata *udata)
+{
+       struct ehca_qp *ret;
+
+       ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
+       return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+                              struct ib_uobject *uobject);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+                              struct ib_srq_init_attr *srq_init_attr,
+                              struct ib_udata *udata)
+{
+       struct ib_qp_init_attr qp_init_attr;
+       struct ehca_qp *my_qp;
+       struct ib_srq *ret;
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+       struct hcp_modify_qp_control_block *mqpcb;
+       u64 hret, update_mask;
+
+       if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+               return ERR_PTR(-ENOSYS);
+
+       /* For common attributes, internal_create_qp() takes its info
+        * out of qp_init_attr, so copy all common attrs there.
+        */
+       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+       qp_init_attr.event_handler = srq_init_attr->event_handler;
+       qp_init_attr.qp_context = srq_init_attr->srq_context;
+       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+       qp_init_attr.qp_type = IB_QPT_RC;
+       qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
+       qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
+
+       my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
+       if (IS_ERR(my_qp))
+               return (struct ib_srq *)my_qp;
+
+       /* copy back return values */
+       srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
+       srq_init_attr->attr.max_sge = 3;
+
+       /* drive SRQ into RTR state */
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!mqpcb) {
+               ehca_err(pd->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+               ret = ERR_PTR(-ENOMEM);
+               goto create_srq1;
+       }
+
+       mqpcb->qp_state = EHCA_QPS_INIT;
+       mqpcb->prim_phys_port = 1;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not modify SRQ to INIT "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       mqpcb->qp_enable = 1;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not enable SRQ "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       mqpcb->qp_state  = EHCA_QPS_RTR;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not modify SRQ to RTR "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return &my_qp->ib_srq;
+
+create_srq2:
+       ret = ERR_PTR(ehca2ib_return_code(hret));
+       ehca_free_fw_ctrlblock(mqpcb);
+
+create_srq1:
+       internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
+
+       return ret;
+}
+
+/*
+ * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts
+ * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe
+ * returns total number of bad wqes in bad_wqe_cnt
+ */
+static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
+                          int *bad_wqe_cnt)
+{
+       u64 h_ret;
+       struct ipz_queue *squeue;
+       void *bad_send_wqe_p, *bad_send_wqe_v;
+       u64 q_ofs;
+       struct ehca_wqe *wqe;
+       int qp_num = my_qp->ib_qp.qp_num;
+
+       /* get send wqe pointer */
+       h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+                                          my_qp->ipz_qp_handle, &my_qp->pf,
+                                          &bad_send_wqe_p, NULL, 2);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
+                        " ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, qp_num, h_ret);
+               return ehca2ib_return_code(h_ret);
+       }
+       bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
+       ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
+                qp_num, bad_send_wqe_p);
+       /* convert wqe pointer to vadr */
+       bad_send_wqe_v = __va((u64)bad_send_wqe_p);
+       if (ehca_debug_level >= 2)
+               ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
+       squeue = &my_qp->ipz_squeue;
+       if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
+               ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
+                        " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
+               return -EFAULT;
+       }
+
+       /* loop sets wqe's purge bit */
+       wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+       *bad_wqe_cnt = 0;
+       while (wqe->optype != 0xff && wqe->wqef != 0xff) {
+               if (ehca_debug_level >= 2)
+                       ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
+               wqe->nr_of_data_seg = 0; /* suppress data access */
+               wqe->wqef = WQEF_PURGE; /* WQE to be purged */
+               q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
+               wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+               *bad_wqe_cnt = (*bad_wqe_cnt)+1;
+       }
+       /*
+        * bad wqe will be reprocessed and ignored when pol_cq() is called,
+        *  i.e. nr of wqes with flush error status is one less
+        */
+       ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
+                qp_num, (*bad_wqe_cnt)-1);
+       wqe->wqef = 0;
+
+       return 0;
+}
+
+static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
+                         struct ehca_queue_map *qmap)
+{
+       void *wqe_v;
+       u64 q_ofs;
+       u32 wqe_idx;
+       unsigned int tail_idx;
+
+       /* convert real to abs address */
+       wqe_p = wqe_p & (~(1UL << 63));
+
+       wqe_v = __va(wqe_p);
+
+       if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
+               ehca_gen_err("Invalid offset for calculating left cqes "
+                               "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
+               return -EFAULT;
+       }
+
+       tail_idx = next_index(qmap->tail, qmap->entries);
+       wqe_idx = q_ofs / ipz_queue->qe_size;
+
+       /* check all processed wqes, whether a cqe is requested or not */
+       while (tail_idx != wqe_idx) {
+               if (qmap->map[tail_idx].cqe_req)
+                       qmap->left_to_poll++;
+               tail_idx = next_index(tail_idx, qmap->entries);
+       }
+       /* save index in queue, where we have to start flushing */
+       qmap->next_wqe_idx = wqe_idx;
+       return 0;
+}
+
+static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
+{
+       u64 h_ret;
+       void *send_wqe_p, *recv_wqe_p;
+       int ret;
+       unsigned long flags;
+       int qp_num = my_qp->ib_qp.qp_num;
+
+       /* this hcall is not supported on base QPs */
+       if (my_qp->ext_type != EQPT_SRQBASE) {
+               /* get send and receive wqe pointer */
+               h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle, &my_qp->pf,
+                               &send_wqe_p, &recv_wqe_p, 4);
+               if (h_ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device, "disable_and_get_wqe() "
+                                "failed ehca_qp=%p qp_num=%x h_ret=%lli",
+                                my_qp, qp_num, h_ret);
+                       return ehca2ib_return_code(h_ret);
+               }
+
+               /*
+                * acquire lock to ensure that nobody is polling the cq which
+                * could mean that the qmap->tail pointer is in an
+                * inconsistent state.
+                */
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
+                               &my_qp->sq_map);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+               if (ret)
+                       return ret;
+
+
+               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+               ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
+                               &my_qp->rq_map);
+               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+               if (ret)
+                       return ret;
+       } else {
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               my_qp->sq_map.left_to_poll = 0;
+               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+                                                       my_qp->sq_map.entries);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+               my_qp->rq_map.left_to_poll = 0;
+               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+                                                       my_qp->rq_map.entries);
+               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+       }
+
+       /* this assures flush cqes being generated only for pending wqes */
+       if ((my_qp->sq_map.left_to_poll == 0) &&
+                               (my_qp->rq_map.left_to_poll == 0)) {
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               ehca_add_to_err_list(my_qp, 1);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+               if (HAS_RQ(my_qp)) {
+                       spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+                       ehca_add_to_err_list(my_qp, 0);
+                       spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
+                                       flags);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * internal_modify_qp with circumvention to handle aqp0 properly
+ * smi_reset2init indicates if this is an internal reset-to-init-call for
+ * smi. This flag must always be zero if called from ehca_modify_qp()!
+ * This internal func was intorduced to avoid recursion of ehca_modify_qp()!
+ */
+static int internal_modify_qp(struct ib_qp *ibqp,
+                             struct ib_qp_attr *attr,
+                             int attr_mask, int smi_reset2init)
+{
+       enum ib_qp_state qp_cur_state, qp_new_state;
+       int cnt, qp_attr_idx, ret = 0;
+       enum ib_qp_statetrans statetrans;
+       struct hcp_modify_qp_control_block *mqpcb;
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca =
+               container_of(ibqp->pd->device, struct ehca_shca, ib_device);
+       u64 update_mask;
+       u64 h_ret;
+       int bad_wqe_cnt = 0;
+       int is_user = 0;
+       int squeue_locked = 0;
+       unsigned long flags = 0;
+
+       /* do query_qp to obtain current attr values */
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!mqpcb) {
+               ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               mqpcb, my_qp->galpas.kernel);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(ibqp->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, ibqp->qp_num, h_ret);
+               ret = ehca2ib_return_code(h_ret);
+               goto modify_qp_exit1;
+       }
+       if (ibqp->uobject)
+               is_user = 1;
+
+       qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+       if (qp_cur_state == -EINVAL) {  /* invalid qp state */
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        mqpcb->qp_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+       /*
+        * circumvention to set aqp0 initial state to init
+        * as expected by IB spec
+        */
+       if (smi_reset2init == 0 &&
+           ibqp->qp_type == IB_QPT_SMI &&
+           qp_cur_state == IB_QPS_RESET &&
+           (attr_mask & IB_QP_STATE) &&
+           attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
+               struct ib_qp_attr smiqp_attr = {
+                       .qp_state = IB_QPS_INIT,
+                       .port_num = my_qp->init_attr.port_num,
+                       .pkey_index = 0,
+                       .qkey = 0
+               };
+               int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
+                       IB_QP_PKEY_INDEX | IB_QP_QKEY;
+               int smirc = internal_modify_qp(
+                       ibqp, &smiqp_attr, smiqp_attr_mask, 1);
+               if (smirc) {
+                       ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
+                                "ehca_modify_qp() rc=%i", smirc);
+                       ret = H_PARAMETER;
+                       goto modify_qp_exit1;
+               }
+               qp_cur_state = IB_QPS_INIT;
+               ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
+       }
+       /* is transmitted current state  equal to "real" current state */
+       if ((attr_mask & IB_QP_CUR_STATE) &&
+           qp_cur_state != attr->cur_qp_state) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device,
+                        "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
+                        " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
+                        attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
+                "new qp_state=%x attribute_mask=%x",
+                my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
+
+       qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
+       if (!smi_reset2init &&
+           !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
+                               attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device,
+                        "Invalid qp transition new_state=%x cur_state=%x "
+                        "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
+                        qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
+               goto modify_qp_exit1;
+       }
+
+       mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
+       if (mqpcb->qp_state)
+               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       else {
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "Invalid new qp state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        qp_new_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       /* retrieve state transition struct to get req and opt attrs */
+       statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
+       if (statetrans < 0) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
+                        "new_qp_state=%x State_xsition=%x ehca_qp=%p "
+                        "qp_num=%x", qp_cur_state, qp_new_state,
+                        statetrans, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
+
+       if (qp_attr_idx < 0) {
+               ret = qp_attr_idx;
+               ehca_err(ibqp->device,
+                        "Invalid QP type=%x ehca_qp=%p qp_num=%x",
+                        ibqp->qp_type, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       ehca_dbg(ibqp->device,
+                "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
+                my_qp, ibqp->qp_num, statetrans);
+
+       /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
+        * in non-LL UD QPs.
+        */
+       if ((my_qp->qp_type == IB_QPT_UD) &&
+           (my_qp->ext_type != EQPT_LLQP) &&
+           (statetrans == IB_QPST_INIT2RTR) &&
+           (shca->hw_level >= 0x22)) {
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+               mqpcb->send_grh_flag = 1;
+       }
+
+       /* sqe -> rts: set purge bit of bad wqe before actual trans */
+       if ((my_qp->qp_type == IB_QPT_UD ||
+            my_qp->qp_type == IB_QPT_GSI ||
+            my_qp->qp_type == IB_QPT_SMI) &&
+           statetrans == IB_QPST_SQE2RTS) {
+               /* mark next free wqe if kernel */
+               if (!ibqp->uobject) {
+                       struct ehca_wqe *wqe;
+                       /* lock send queue */
+                       spin_lock_irqsave(&my_qp->spinlock_s, flags);
+                       squeue_locked = 1;
+                       /* mark next free wqe */
+                       wqe = (struct ehca_wqe *)
+                               ipz_qeit_get(&my_qp->ipz_squeue);
+                       wqe->optype = wqe->wqef = 0xff;
+                       ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
+                                ibqp->qp_num, wqe);
+               }
+               ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
+               if (ret) {
+                       ehca_err(ibqp->device, "prepare_sqe_rts() failed "
+                                "ehca_qp=%p qp_num=%x ret=%i",
+                                my_qp, ibqp->qp_num, ret);
+                       goto modify_qp_exit2;
+               }
+       }
+
+       /*
+        * enable RDMA_Atomic_Control if reset->init und reliable con
+        * this is necessary since gen2 does not provide that flag,
+        * but pHyp requires it
+        */
+       if (statetrans == IB_QPST_RESET2INIT &&
+           (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
+               mqpcb->rdma_atomic_ctrl = 3;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
+       }
+       /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
+       if (statetrans == IB_QPST_INIT2RTR &&
+           (ibqp->qp_type == IB_QPT_UC) &&
+           !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
+               mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX) {
+               if (attr->pkey_index >= 16) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
+                                attr->pkey_index, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->prim_p_key_idx = attr->pkey_index;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
+       }
+       if (attr_mask & IB_QP_PORT) {
+               struct ehca_sport *sport;
+               struct ehca_qp *aqp1;
+               if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid port=%x. "
+                                "ehca_qp=%p qp_num=%x num_ports=%x",
+                                attr->port_num, my_qp, ibqp->qp_num,
+                                shca->num_ports);
+                       goto modify_qp_exit2;
+               }
+               sport = &shca->sport[attr->port_num - 1];
+               if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+                       /* should not occur */
+                       ret = -EFAULT;
+                       ehca_err(ibqp->device, "AQP1 was not created for "
+                                "port=%x", attr->port_num);
+                       goto modify_qp_exit2;
+               }
+               aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+                                   struct ehca_qp, ib_qp);
+               if (ibqp->qp_type != IB_QPT_GSI &&
+                   ibqp->qp_type != IB_QPT_SMI &&
+                   aqp1->mod_qp_parm) {
+                       /*
+                        * firmware will reject this modify_qp() because
+                        * port is not activated/initialized fully
+                        */
+                       ret = -EFAULT;
+                       ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+                                 "either port is being activated (try again) "
+                                 "or cabling issue", attr->port_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->prim_phys_port = attr->port_num;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
+       }
+       if (attr_mask & IB_QP_QKEY) {
+               mqpcb->qkey = attr->qkey;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
+       }
+       if (attr_mask & IB_QP_AV) {
+               mqpcb->dlid = attr->ah_attr.dlid;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
+               mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
+               mqpcb->service_level = attr->ah_attr.sl;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
+
+               if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
+                                 attr->ah_attr.static_rate,
+                                 &mqpcb->max_static_rate)) {
+                       ret = -EINVAL;
+                       goto modify_qp_exit2;
+               }
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
+
+               /*
+                * Always supply the GRH flag, even if it's zero, to give the
+                * hypervisor a clear "yes" or "no" instead of a "perhaps"
+                */
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+               /*
+                * only if GRH is TRUE we might consider SOURCE_GID_IDX
+                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+                */
+               if (attr->ah_attr.ah_flags == IB_AH_GRH) {
+                       mqpcb->send_grh_flag = 1;
+
+                       mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
+
+                       for (cnt = 0; cnt < 16; cnt++)
+                               mqpcb->dest_gid.byte[cnt] =
+                                       attr->ah_attr.grh.dgid.raw[cnt];
+
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
+                       mqpcb->flow_label = attr->ah_attr.grh.flow_label;
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
+                       mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
+                       mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
+               }
+       }
+
+       if (attr_mask & IB_QP_PATH_MTU) {
+               /* store ld(MTU) */
+               my_qp->mtu_shift = attr->path_mtu + 7;
+               mqpcb->path_mtu = attr->path_mtu;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
+       }
+       if (attr_mask & IB_QP_TIMEOUT) {
+               mqpcb->timeout = attr->timeout;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
+       }
+       if (attr_mask & IB_QP_RETRY_CNT) {
+               mqpcb->retry_count = attr->retry_cnt;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
+       }
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               mqpcb->rnr_retry_count = attr->rnr_retry;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
+       }
+       if (attr_mask & IB_QP_RQ_PSN) {
+               mqpcb->receive_psn = attr->rq_psn;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
+       }
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+               mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
+                       attr->max_dest_rd_atomic : 2;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+       }
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+               mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
+                       attr->max_rd_atomic : 2;
+               update_mask |=
+                       EHCA_BMASK_SET
+                       (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
+       }
+       if (attr_mask & IB_QP_ALT_PATH) {
+               if (attr->alt_port_num < 1
+                   || attr->alt_port_num > shca->num_ports) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid alt_port=%x. "
+                                "ehca_qp=%p qp_num=%x num_ports=%x",
+                                attr->alt_port_num, my_qp, ibqp->qp_num,
+                                shca->num_ports);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->alt_phys_port = attr->alt_port_num;
+
+               if (attr->alt_pkey_index >= 16) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
+                                attr->pkey_index, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+               mqpcb->timeout_al = attr->alt_timeout;
+               mqpcb->dlid_al = attr->alt_ah_attr.dlid;
+               mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
+               mqpcb->service_level_al = attr->alt_ah_attr.sl;
+
+               if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
+                                 attr->alt_ah_attr.static_rate,
+                                 &mqpcb->max_static_rate_al)) {
+                       ret = -EINVAL;
+                       goto modify_qp_exit2;
+               }
+
+               /* OpenIB doesn't support alternate retry counts - copy them */
+               mqpcb->retry_count_al = mqpcb->retry_count;
+               mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+               /*
+                * Always supply the GRH flag, even if it's zero, to give the
+                * hypervisor a clear "yes" or "no" instead of a "perhaps"
+                */
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
+
+               /*
+                * only if GRH is TRUE we might consider SOURCE_GID_IDX
+                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+                */
+               if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
+                       mqpcb->send_grh_flag_al = 1;
+
+                       for (cnt = 0; cnt < 16; cnt++)
+                               mqpcb->dest_gid_al.byte[cnt] =
+                                       attr->alt_ah_attr.grh.dgid.raw[cnt];
+                       mqpcb->source_gid_idx_al =
+                               attr->alt_ah_attr.grh.sgid_index;
+                       mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
+                       mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
+                       mqpcb->traffic_class_al =
+                               attr->alt_ah_attr.grh.traffic_class;
+
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
+                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
+               }
+       }
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+               mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
+       }
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               mqpcb->send_psn = attr->sq_psn;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
+       }
+
+       if (attr_mask & IB_QP_DEST_QPN) {
+               mqpcb->dest_qp_nr = attr->dest_qp_num;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
+       }
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE) {
+               if (attr->path_mig_state != IB_MIG_REARM
+                   && attr->path_mig_state != IB_MIG_MIGRATED) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid mig_state=%x",
+                                attr->path_mig_state);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->path_migration_state = attr->path_mig_state + 1;
+               if (attr->path_mig_state == IB_MIG_REARM)
+                       my_qp->mig_armed = 1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
+       }
+
+       if (attr_mask & IB_QP_CAP) {
+               mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
+               mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
+               /* no support for max_send/recv_sge yet */
+       }
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
+
+       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                                my_qp->ipz_qp_handle,
+                                &my_qp->pf,
+                                update_mask,
+                                mqpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
+               goto modify_qp_exit2;
+       }
+
+       if ((my_qp->qp_type == IB_QPT_UD ||
+            my_qp->qp_type == IB_QPT_GSI ||
+            my_qp->qp_type == IB_QPT_SMI) &&
+           statetrans == IB_QPST_SQE2RTS) {
+               /* doorbell to reprocessing wqes */
+               iosync(); /* serialize GAL register access */
+               hipz_update_sqa(my_qp, bad_wqe_cnt-1);
+               ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
+       }
+
+       if (statetrans == IB_QPST_RESET2INIT ||
+           statetrans == IB_QPST_INIT2INIT) {
+               mqpcb->qp_enable = 1;
+               mqpcb->qp_state = EHCA_QPS_INIT;
+               update_mask = 0;
+               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+
+               h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                                        my_qp->ipz_qp_handle,
+                                        &my_qp->pf,
+                                        update_mask,
+                                        mqpcb,
+                                        my_qp->galpas.kernel);
+
+               if (h_ret != H_SUCCESS) {
+                       ret = ehca2ib_return_code(h_ret);
+                       ehca_err(ibqp->device, "ENABLE in context of "
+                                "RESET_2_INIT failed! Maybe you didn't get "
+                                "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
+                                h_ret, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+       }
+       if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
+           && !is_user) {
+               ret = check_for_left_cqes(my_qp, shca);
+               if (ret)
+                       goto modify_qp_exit2;
+       }
+
+       if (statetrans == IB_QPST_ANY2RESET) {
+               ipz_qeit_reset(&my_qp->ipz_rqueue);
+               ipz_qeit_reset(&my_qp->ipz_squeue);
+
+               if (qp_cur_state == IB_QPS_ERR && !is_user) {
+                       del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+                       if (HAS_RQ(my_qp))
+                               del_from_err_list(my_qp->recv_cq,
+                                                 &my_qp->rq_err_node);
+               }
+               if (!is_user)
+                       reset_queue_map(&my_qp->sq_map);
+
+               if (HAS_RQ(my_qp) && !is_user)
+                       reset_queue_map(&my_qp->rq_map);
+       }
+
+       if (attr_mask & IB_QP_QKEY)
+               my_qp->qkey = attr->qkey;
+
+modify_qp_exit2:
+       if (squeue_locked) { /* this means: sqe -> rts */
+               spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+               my_qp->sqerr_purgeflag = 1;
+       }
+
+modify_qp_exit1:
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return ret;
+}
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+                  struct ib_udata *udata)
+{
+       int ret = 0;
+
+       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+                                             ib_device);
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+
+       /* The if-block below caches qp_attr to be modified for GSI and SMI
+        * qps during the initialization by ib_mad. When the respective port
+        * is activated, ie we got an event PORT_ACTIVE, we'll replay the
+        * cached modify calls sequence, see ehca_recover_sqs() below.
+        * Why that is required:
+        * 1) If one port is connected, older code requires that port one
+        *    to be connected and module option nr_ports=1 to be given by
+        *    user, which is very inconvenient for end user.
+        * 2) Firmware accepts modify_qp() only if respective port has become
+        *    active. Older code had a wait loop of 30sec create_qp()/
+        *    define_aqp1(), which is not appropriate in practice. This
+        *    code now removes that wait loop, see define_aqp1(), and always
+        *    reports all ports to ib_mad resp. users. Only activated ports
+        *    will then usable for the users.
+        */
+       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+               int port = my_qp->init_attr.port_num;
+               struct ehca_sport *sport = &shca->sport[port - 1];
+               unsigned long flags;
+               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+               /* cache qp_attr only during init */
+               if (my_qp->mod_qp_parm) {
+                       struct ehca_mod_qp_parm *p;
+                       if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+                               ehca_err(&shca->ib_device,
+                                        "mod_qp_parm overflow state=%x port=%x"
+                                        " type=%x", attr->qp_state,
+                                        my_qp->init_attr.port_num,
+                                        ibqp->qp_type);
+                               spin_unlock_irqrestore(&sport->mod_sqp_lock,
+                                                      flags);
+                               return -EINVAL;
+                       }
+                       p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+                       p->mask = attr_mask;
+                       p->attr = *attr;
+                       my_qp->mod_qp_parm_idx++;
+                       ehca_dbg(&shca->ib_device,
+                                "Saved qp_attr for state=%x port=%x type=%x",
+                                attr->qp_state, my_qp->init_attr.port_num,
+                                ibqp->qp_type);
+                       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+                       goto out;
+               }
+               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+       }
+
+       ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
+
+out:
+       if ((ret == 0) && (attr_mask & IB_QP_STATE))
+               my_qp->state = attr->qp_state;
+
+       return ret;
+}
+
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+       struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+       int port = my_sqp->init_attr.port_num;
+       struct ib_qp_attr attr;
+       struct ehca_mod_qp_parm *qp_parm;
+       int i, qp_parm_idx, ret;
+       unsigned long flags, wr_cnt;
+
+       if (!my_sqp->mod_qp_parm)
+               return;
+       ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+       qp_parm = my_sqp->mod_qp_parm;
+       qp_parm_idx = my_sqp->mod_qp_parm_idx;
+       for (i = 0; i < qp_parm_idx; i++) {
+               attr = qp_parm[i].attr;
+               ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+               if (ret) {
+                       ehca_err(sqp->device, "Could not modify SQP port=%x "
+                                "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+                       goto free_qp_parm;
+               }
+               ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+                        port, sqp->qp_num, attr.qp_state);
+       }
+
+       /* re-trigger posted recv wrs */
+       wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
+               my_sqp->ipz_rqueue.qe_size;
+       if (wr_cnt) {
+               spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+               hipz_update_rqa(my_sqp, wr_cnt);
+               spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+               ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+                        port, sqp->qp_num, wr_cnt);
+       }
+
+free_qp_parm:
+       kfree(qp_parm);
+       /* this prevents subsequent calls to modify_qp() to cache qp_attr */
+       my_sqp->mod_qp_parm = NULL;
+}
+
+int ehca_query_qp(struct ib_qp *qp,
+                 struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       struct hcp_modify_qp_control_block *qpcb;
+       int cnt, ret = 0;
+       u64 h_ret;
+
+       if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
+               ehca_err(qp->device, "Invalid attribute mask "
+                        "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
+                        my_qp, qp->qp_num, qp_attr_mask);
+               return -EINVAL;
+       }
+
+       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!qpcb) {
+               ehca_err(qp->device, "Out of memory for qpcb "
+                        "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(adapter_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               qpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(qp->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, qp->qp_num, h_ret);
+               goto query_qp_exit1;
+       }
+
+       qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
+       qp_attr->qp_state = qp_attr->cur_qp_state;
+
+       if (qp_attr->cur_qp_state == -EINVAL) {
+               ret = -EINVAL;
+               ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        qpcb->qp_state, my_qp, qp->qp_num);
+               goto query_qp_exit1;
+       }
+
+       if (qp_attr->qp_state == IB_QPS_SQD)
+               qp_attr->sq_draining = 1;
+
+       qp_attr->qkey = qpcb->qkey;
+       qp_attr->path_mtu = qpcb->path_mtu;
+       qp_attr->path_mig_state = qpcb->path_migration_state - 1;
+       qp_attr->rq_psn = qpcb->receive_psn;
+       qp_attr->sq_psn = qpcb->send_psn;
+       qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
+       qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
+       qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
+       /* UD_AV CIRCUMVENTION */
+       if (my_qp->qp_type == IB_QPT_UD) {
+               qp_attr->cap.max_send_sge =
+                       qpcb->actual_nr_sges_in_sq_wqe - 2;
+               qp_attr->cap.max_recv_sge =
+                       qpcb->actual_nr_sges_in_rq_wqe - 2;
+       } else {
+               qp_attr->cap.max_send_sge =
+                       qpcb->actual_nr_sges_in_sq_wqe;
+               qp_attr->cap.max_recv_sge =
+                       qpcb->actual_nr_sges_in_rq_wqe;
+       }
+
+       qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+       qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+       qp_attr->pkey_index = qpcb->prim_p_key_idx;
+       qp_attr->port_num = qpcb->prim_phys_port;
+       qp_attr->timeout = qpcb->timeout;
+       qp_attr->retry_cnt = qpcb->retry_count;
+       qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+       qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+       qp_attr->alt_port_num = qpcb->alt_phys_port;
+       qp_attr->alt_timeout = qpcb->timeout_al;
+
+       qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
+       qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
+
+       /* primary av */
+       qp_attr->ah_attr.sl = qpcb->service_level;
+
+       if (qpcb->send_grh_flag) {
+               qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+       }
+
+       qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
+       qp_attr->ah_attr.dlid = qpcb->dlid;
+       qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
+       qp_attr->ah_attr.port_num = qp_attr->port_num;
+
+       /* primary GRH */
+       qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
+       qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
+       qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
+       qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
+
+       for (cnt = 0; cnt < 16; cnt++)
+               qp_attr->ah_attr.grh.dgid.raw[cnt] =
+                       qpcb->dest_gid.byte[cnt];
+
+       /* alternate AV */
+       qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
+       if (qpcb->send_grh_flag_al) {
+               qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
+       }
+
+       qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
+       qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
+       qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
+
+       /* alternate GRH */
+       qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
+       qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
+       qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
+       qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
+
+       for (cnt = 0; cnt < 16; cnt++)
+               qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
+                       qpcb->dest_gid_al.byte[cnt];
+
+       /* return init attributes given in ehca_create_qp */
+       if (qp_init_attr)
+               *qp_init_attr = my_qp->init_attr;
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
+
+query_qp_exit1:
+       ehca_free_fw_ctrlblock(qpcb);
+
+       return ret;
+}
+
+int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+       struct ehca_qp *my_qp =
+               container_of(ibsrq, struct ehca_qp, ib_srq);
+       struct ehca_shca *shca =
+               container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
+       struct hcp_modify_qp_control_block *mqpcb;
+       u64 update_mask;
+       u64 h_ret;
+       int ret = 0;
+
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!mqpcb) {
+               ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+               return -ENOMEM;
+       }
+
+       update_mask = 0;
+       if (attr_mask & IB_SRQ_LIMIT) {
+               attr_mask &= ~IB_SRQ_LIMIT;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+               mqpcb->curr_srq_limit = attr->srq_limit;
+               mqpcb->qp_aff_asyn_ev_log_reg =
+                       EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+       }
+
+       /* by now, all bits in attr_mask should have been cleared */
+       if (attr_mask) {
+               ehca_err(ibsrq->device, "invalid attribute mask bits set  "
+                        "attr_mask=%x", attr_mask);
+               ret = -EINVAL;
+               goto modify_srq_exit0;
+       }
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
+                                NULL, update_mask, mqpcb,
+                                my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x",
+                        h_ret, my_qp, my_qp->real_qp_num);
+       }
+
+modify_srq_exit0:
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return ret;
+}
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+       struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
+       struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       struct hcp_modify_qp_control_block *qpcb;
+       int ret = 0;
+       u64 h_ret;
+
+       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!qpcb) {
+               ehca_err(srq->device, "Out of memory for qpcb "
+                        "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
+                               NULL, qpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(srq->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, h_ret);
+               goto query_srq_exit1;
+       }
+
+       srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+       srq_attr->max_sge = 3;
+       srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+query_srq_exit1:
+       ehca_free_fw_ctrlblock(qpcb);
+
+       return ret;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+                              struct ib_uobject *uobject)
+{
+       struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
+       struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
+                                            ib_pd);
+       struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
+       u32 qp_num = my_qp->real_qp_num;
+       int ret;
+       u64 h_ret;
+       u8 port_num;
+       int is_user = 0;
+       enum ib_qp_type qp_type;
+       unsigned long flags;
+
+       if (uobject) {
+               is_user = 1;
+               if (my_qp->mm_count_galpa ||
+                   my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+                       ehca_err(dev, "Resources still referenced in "
+                                "user space qp_num=%x", qp_num);
+                       return -EINVAL;
+               }
+       }
+
+       if (my_qp->send_cq) {
+               ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
+               if (ret) {
+                       ehca_err(dev, "Couldn't unassign qp from "
+                                "send_cq ret=%i qp_num=%x cq_num=%x", ret,
+                                qp_num, my_qp->send_cq->cq_number);
+                       return ret;
+               }
+       }
+
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+       idr_remove(&ehca_qp_idr, my_qp->token);
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+       /*
+        * SRQs will never get into an error list and do not have a recv_cq,
+        * so we need to skip them here.
+        */
+       if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+               del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+       if (HAS_SQ(my_qp) && !is_user)
+               del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+       /* now wait until all pending events have completed */
+       wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+
+       h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
+               return ehca2ib_return_code(h_ret);
+       }
+
+       port_num = my_qp->init_attr.port_num;
+       qp_type  = my_qp->init_attr.qp_type;
+
+       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+               kfree(my_qp->mod_qp_parm);
+               my_qp->mod_qp_parm = NULL;
+               shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+       }
+
+       /* no support for IB_QPT_SMI yet */
+       if (qp_type == IB_QPT_GSI) {
+               struct ib_event event;
+               ehca_info(dev, "device %s: port %x is inactive.",
+                               shca->ib_device.name, port_num);
+               event.device = &shca->ib_device;
+               event.event = IB_EVENT_PORT_ERR;
+               event.element.port_num = port_num;
+               shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
+               ib_dispatch_event(&event);
+       }
+
+       if (HAS_RQ(my_qp)) {
+               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+               if (!is_user)
+                       vfree(my_qp->rq_map.map);
+       }
+       if (HAS_SQ(my_qp)) {
+               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+               if (!is_user)
+                       vfree(my_qp->sq_map.map);
+       }
+       kmem_cache_free(qp_cache, my_qp);
+       atomic_dec(&shca->num_qps);
+       return 0;
+}
+
+int ehca_destroy_qp(struct ib_qp *qp)
+{
+       return internal_destroy_qp(qp->device,
+                                  container_of(qp, struct ehca_qp, ib_qp),
+                                  qp->uobject);
+}
+
+int ehca_destroy_srq(struct ib_srq *srq)
+{
+       return internal_destroy_qp(srq->device,
+                                  container_of(srq, struct ehca_qp, ib_srq),
+                                  srq->uobject);
+}
+
+int ehca_init_qp_cache(void)
+{
+       qp_cache = kmem_cache_create("ehca_cache_qp",
+                                    sizeof(struct ehca_qp), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!qp_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_qp_cache(void)
+{
+       if (qp_cache)
+               kmem_cache_destroy(qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c
new file mode 100644 (file)
index 0000000..47f9498
--- /dev/null
@@ -0,0 +1,953 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  post_send/recv, poll_cq, req_notify
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
+static u64 replace_wr_id(u64 wr_id, u16 idx)
+{
+       u64 ret;
+
+       ret = wr_id & ~QMAP_IDX_MASK;
+       ret |= idx & QMAP_IDX_MASK;
+
+       return ret;
+}
+
+static u16 get_app_wr_id(u64 wr_id)
+{
+       return wr_id & QMAP_IDX_MASK;
+}
+
+static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+                                 struct ehca_wqe *wqe_p,
+                                 struct ib_recv_wr *recv_wr,
+                                 u32 rq_map_idx)
+{
+       u8 cnt_ds;
+       if (unlikely((recv_wr->num_sge < 0) ||
+                    (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
+               ehca_gen_err("Invalid number of WQE SGE. "
+                        "num_sqe=%x max_nr_of_sg=%x",
+                        recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
+               return -EINVAL; /* invalid SG list length */
+       }
+
+       /* clear wqe header until sglist */
+       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+       wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+       wqe_p->nr_of_data_seg = recv_wr->num_sge;
+
+       for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+               wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
+                       recv_wr->sg_list[cnt_ds].addr;
+               wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
+                       recv_wr->sg_list[cnt_ds].lkey;
+               wqe_p->u.all_rcv.sg_list[cnt_ds].length =
+                       recv_wr->sg_list[cnt_ds].length;
+       }
+
+       if (ehca_debug_level >= 3) {
+               ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
+                            ipz_rqueue);
+               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+       }
+
+       return 0;
+}
+
+#if defined(DEBUG_GSI_SEND_WR)
+
+/* need ib_mad struct */
+#include <rdma/ib_mad.h>
+
+static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
+{
+       int idx;
+       int j;
+       while (send_wr) {
+               struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
+               struct ib_sge *sge = send_wr->sg_list;
+               ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
+                            "send_flags=%x opcode=%x", idx, send_wr->wr_id,
+                            send_wr->num_sge, send_wr->send_flags,
+                            send_wr->opcode);
+               if (mad_hdr) {
+                       ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
+                                    "mgmt_class=%x class_version=%x method=%x "
+                                    "status=%x class_specific=%x tid=%lx "
+                                    "attr_id=%x resv=%x attr_mod=%x",
+                                    idx, mad_hdr->base_version,
+                                    mad_hdr->mgmt_class,
+                                    mad_hdr->class_version, mad_hdr->method,
+                                    mad_hdr->status, mad_hdr->class_specific,
+                                    mad_hdr->tid, mad_hdr->attr_id,
+                                    mad_hdr->resv,
+                                    mad_hdr->attr_mod);
+               }
+               for (j = 0; j < send_wr->num_sge; j++) {
+                       u8 *data = __va(sge->addr);
+                       ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
+                                    "lkey=%x",
+                                    idx, j, data, sge->length, sge->lkey);
+                       /* assume length is n*16 */
+                       ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
+                                idx, j);
+                       sge++;
+               } /* eof for j */
+               idx++;
+               send_wr = send_wr->next;
+       } /* eof while send_wr */
+}
+
+#endif /* DEBUG_GSI_SEND_WR */
+
+static inline int ehca_write_swqe(struct ehca_qp *qp,
+                                 struct ehca_wqe *wqe_p,
+                                 const struct ib_send_wr *send_wr,
+                                 u32 sq_map_idx,
+                                 int hidden)
+{
+       u32 idx;
+       u64 dma_length;
+       struct ehca_av *my_av;
+       u32 remote_qkey = send_wr->wr.ud.remote_qkey;
+       struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+
+       if (unlikely((send_wr->num_sge < 0) ||
+                    (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+               ehca_gen_err("Invalid number of WQE SGE. "
+                        "num_sqe=%x max_nr_of_sg=%x",
+                        send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
+               return -EINVAL; /* invalid SG list length */
+       }
+
+       /* clear wqe header until sglist */
+       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+       wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+
+       qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
+       qmap_entry->reported = 0;
+       qmap_entry->cqe_req = 0;
+
+       switch (send_wr->opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               wqe_p->optype = WQE_OPTYPE_SEND;
+               break;
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
+               break;
+       case IB_WR_RDMA_READ:
+               wqe_p->optype = WQE_OPTYPE_RDMAREAD;
+               break;
+       default:
+               ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
+               return -EINVAL; /* invalid opcode */
+       }
+
+       wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
+
+       wqe_p->wr_flag = 0;
+
+       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+           && !hidden) {
+               wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+               qmap_entry->cqe_req = 1;
+       }
+
+       if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
+           send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+               /* this might not work as long as HW does not support it */
+               wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
+               wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
+       }
+
+       wqe_p->nr_of_data_seg = send_wr->num_sge;
+
+       switch (qp->qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               /* no break is intential here */
+       case IB_QPT_UD:
+               /* IB 1.2 spec C10-15 compliance */
+               if (send_wr->wr.ud.remote_qkey & 0x80000000)
+                       remote_qkey = qp->qkey;
+
+               wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
+               wqe_p->local_ee_context_qkey = remote_qkey;
+               if (unlikely(!send_wr->wr.ud.ah)) {
+                       ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
+                       return -EINVAL;
+               }
+               if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+                       ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+                       return -EINVAL;
+               }
+               my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
+               wqe_p->u.ud_av.ud_av = my_av->av;
+
+               /*
+                * omitted check of IB_SEND_INLINE
+                * since HW does not support it
+                */
+               for (idx = 0; idx < send_wr->num_sge; idx++) {
+                       wqe_p->u.ud_av.sg_list[idx].vaddr =
+                               send_wr->sg_list[idx].addr;
+                       wqe_p->u.ud_av.sg_list[idx].lkey =
+                               send_wr->sg_list[idx].lkey;
+                       wqe_p->u.ud_av.sg_list[idx].length =
+                               send_wr->sg_list[idx].length;
+               } /* eof for idx */
+               if (qp->qp_type == IB_QPT_SMI ||
+                   qp->qp_type == IB_QPT_GSI)
+                       wqe_p->u.ud_av.ud_av.pmtu = 1;
+               if (qp->qp_type == IB_QPT_GSI) {
+                       wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
+#ifdef DEBUG_GSI_SEND_WR
+                       trace_send_wr_ud(send_wr);
+#endif /* DEBUG_GSI_SEND_WR */
+               }
+               break;
+
+       case IB_QPT_UC:
+               if (send_wr->send_flags & IB_SEND_FENCE)
+                       wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
+               /* no break is intentional here */
+       case IB_QPT_RC:
+               /* TODO: atomic not implemented */
+               wqe_p->u.nud.remote_virtual_address =
+                       send_wr->wr.rdma.remote_addr;
+               wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
+
+               /*
+                * omitted checking of IB_SEND_INLINE
+                * since HW does not support it
+                */
+               dma_length = 0;
+               for (idx = 0; idx < send_wr->num_sge; idx++) {
+                       wqe_p->u.nud.sg_list[idx].vaddr =
+                               send_wr->sg_list[idx].addr;
+                       wqe_p->u.nud.sg_list[idx].lkey =
+                               send_wr->sg_list[idx].lkey;
+                       wqe_p->u.nud.sg_list[idx].length =
+                               send_wr->sg_list[idx].length;
+                       dma_length += send_wr->sg_list[idx].length;
+               } /* eof idx */
+               wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
+
+               /* unsolicited ack circumvention */
+               if (send_wr->opcode == IB_WR_RDMA_READ) {
+                       /* on RDMA read, switch on and reset counters */
+                       qp->message_count = qp->packet_count = 0;
+                       qp->unsol_ack_circ = 1;
+               } else
+                       /* else estimate #packets */
+                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
+               break;
+
+       default:
+               ehca_gen_err("Invalid qptype=%x", qp->qp_type);
+               return -EINVAL;
+       }
+
+       if (ehca_debug_level >= 3) {
+               ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
+               ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
+       }
+       return 0;
+}
+
+/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
+static inline void map_ib_wc_status(u32 cqe_status,
+                                   enum ib_wc_status *wc_status)
+{
+       if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
+               switch (cqe_status & 0x3F) {
+               case 0x01:
+               case 0x21:
+                       *wc_status = IB_WC_LOC_LEN_ERR;
+                       break;
+               case 0x02:
+               case 0x22:
+                       *wc_status = IB_WC_LOC_QP_OP_ERR;
+                       break;
+               case 0x03:
+               case 0x23:
+                       *wc_status = IB_WC_LOC_EEC_OP_ERR;
+                       break;
+               case 0x04:
+               case 0x24:
+                       *wc_status = IB_WC_LOC_PROT_ERR;
+                       break;
+               case 0x05:
+               case 0x25:
+                       *wc_status = IB_WC_WR_FLUSH_ERR;
+                       break;
+               case 0x06:
+                       *wc_status = IB_WC_MW_BIND_ERR;
+                       break;
+               case 0x07: /* remote error - look into bits 20:24 */
+                       switch ((cqe_status
+                                & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
+                       case 0x0:
+                               /*
+                                * PSN Sequence Error!
+                                * couldn't find a matching status!
+                                */
+                               *wc_status = IB_WC_GENERAL_ERR;
+                               break;
+                       case 0x1:
+                               *wc_status = IB_WC_REM_INV_REQ_ERR;
+                               break;
+                       case 0x2:
+                               *wc_status = IB_WC_REM_ACCESS_ERR;
+                               break;
+                       case 0x3:
+                               *wc_status = IB_WC_REM_OP_ERR;
+                               break;
+                       case 0x4:
+                               *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
+                               break;
+                       }
+                       break;
+               case 0x08:
+                       *wc_status = IB_WC_RETRY_EXC_ERR;
+                       break;
+               case 0x09:
+                       *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
+                       break;
+               case 0x0A:
+               case 0x2D:
+                       *wc_status = IB_WC_REM_ABORT_ERR;
+                       break;
+               case 0x0B:
+               case 0x2E:
+                       *wc_status = IB_WC_INV_EECN_ERR;
+                       break;
+               case 0x0C:
+               case 0x2F:
+                       *wc_status = IB_WC_INV_EEC_STATE_ERR;
+                       break;
+               case 0x0D:
+                       *wc_status = IB_WC_BAD_RESP_ERR;
+                       break;
+               case 0x10:
+                       /* WQE purged */
+                       *wc_status = IB_WC_WR_FLUSH_ERR;
+                       break;
+               default:
+                       *wc_status = IB_WC_FATAL_ERR;
+
+               }
+       } else
+               *wc_status = IB_WC_SUCCESS;
+}
+
+static inline int post_one_send(struct ehca_qp *my_qp,
+                        struct ib_send_wr *cur_send_wr,
+                        int hidden)
+{
+       struct ehca_wqe *wqe_p;
+       int ret;
+       u32 sq_map_idx;
+       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+       /* get pointer next to free WQE */
+       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+       if (unlikely(!wqe_p)) {
+               /* too many posted work requests: queue overflow */
+               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -ENOMEM;
+       }
+
+       /*
+        * Get the index of the WQE in the send queue. The same index is used
+        * for writing into the sq_map.
+        */
+       sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
+
+       /* write a SEND WQE into the QUEUE */
+       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
+       /*
+        * if something failed,
+        * reset the free entry pointer to the start value
+        */
+       if (unlikely(ret)) {
+               my_qp->ipz_squeue.current_q_offset = start_offset;
+               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int ehca_post_send(struct ib_qp *qp,
+                  struct ib_send_wr *send_wr,
+                  struct ib_send_wr **bad_send_wr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+       int wqe_cnt = 0;
+       int ret = 0;
+       unsigned long flags;
+
+       /* Reject WR if QP is in RESET, INIT or RTR state */
+       if (unlikely(my_qp->state < IB_QPS_RTS)) {
+               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+                        my_qp->state, qp->qp_num);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* LOCK the QUEUE */
+       spin_lock_irqsave(&my_qp->spinlock_s, flags);
+
+       /* Send an empty extra RDMA read if:
+        *  1) there has been an RDMA read on this connection before
+        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+        *  3) we can be sure that any previous extra RDMA read has been
+        *     processed so we don't overflow the SQ
+        */
+       if (unlikely(my_qp->unsol_ack_circ &&
+                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+               /* insert an empty RDMA READ to fix up the remote QP state */
+               struct ib_send_wr circ_wr;
+               memset(&circ_wr, 0, sizeof(circ_wr));
+               circ_wr.opcode = IB_WR_RDMA_READ;
+               post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
+               wqe_cnt++;
+               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
+               my_qp->message_count = my_qp->packet_count = 0;
+       }
+
+       /* loop processes list of send reqs */
+       while (send_wr) {
+               ret = post_one_send(my_qp, send_wr, 0);
+               if (unlikely(ret)) {
+                       goto post_send_exit0;
+               }
+               wqe_cnt++;
+               send_wr = send_wr->next;
+       }
+
+post_send_exit0:
+       iosync(); /* serialize GAL register access */
+       hipz_update_sqa(my_qp, wqe_cnt);
+       if (unlikely(ret || ehca_debug_level >= 2))
+               ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+                        my_qp, qp->qp_num, wqe_cnt, ret);
+       my_qp->message_count += wqe_cnt;
+       spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+
+out:
+       if (ret)
+               *bad_send_wr = send_wr;
+       return ret;
+}
+
+static int internal_post_recv(struct ehca_qp *my_qp,
+                             struct ib_device *dev,
+                             struct ib_recv_wr *recv_wr,
+                             struct ib_recv_wr **bad_recv_wr)
+{
+       struct ehca_wqe *wqe_p;
+       int wqe_cnt = 0;
+       int ret = 0;
+       u32 rq_map_idx;
+       unsigned long flags;
+       struct ehca_qmap_entry *qmap_entry;
+
+       if (unlikely(!HAS_RQ(my_qp))) {
+               ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
+                        my_qp, my_qp->real_qp_num, my_qp->ext_type);
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /* LOCK the QUEUE */
+       spin_lock_irqsave(&my_qp->spinlock_r, flags);
+
+       /* loop processes list of recv reqs */
+       while (recv_wr) {
+               u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
+               /* get pointer next to free WQE */
+               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
+               if (unlikely(!wqe_p)) {
+                       /* too many posted work requests: queue overflow */
+                       ret = -ENOMEM;
+                       ehca_err(dev, "Too many posted WQEs "
+                               "qp_num=%x", my_qp->real_qp_num);
+                       goto post_recv_exit0;
+               }
+               /*
+                * Get the index of the WQE in the recv queue. The same index
+                * is used for writing into the rq_map.
+                */
+               rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
+
+               /* write a RECV WQE into the QUEUE */
+               ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
+                               rq_map_idx);
+               /*
+                * if something failed,
+                * reset the free entry pointer to the start value
+                */
+               if (unlikely(ret)) {
+                       my_qp->ipz_rqueue.current_q_offset = start_offset;
+                       ret = -EINVAL;
+                       ehca_err(dev, "Could not write WQE "
+                               "qp_num=%x", my_qp->real_qp_num);
+                       goto post_recv_exit0;
+               }
+
+               qmap_entry = &my_qp->rq_map.map[rq_map_idx];
+               qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
+               qmap_entry->reported = 0;
+               qmap_entry->cqe_req = 1;
+
+               wqe_cnt++;
+               recv_wr = recv_wr->next;
+       } /* eof for recv_wr */
+
+post_recv_exit0:
+       iosync(); /* serialize GAL register access */
+       hipz_update_rqa(my_qp, wqe_cnt);
+       if (unlikely(ret || ehca_debug_level >= 2))
+           ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+                    my_qp, my_qp->real_qp_num, wqe_cnt, ret);
+       spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
+
+out:
+       if (ret)
+               *bad_recv_wr = recv_wr;
+
+       return ret;
+}
+
+int ehca_post_recv(struct ib_qp *qp,
+                  struct ib_recv_wr *recv_wr,
+                  struct ib_recv_wr **bad_recv_wr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+       /* Reject WR if QP is in RESET state */
+       if (unlikely(my_qp->state == IB_QPS_RESET)) {
+               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+                        my_qp->state, qp->qp_num);
+               *bad_recv_wr = recv_wr;
+               return -EINVAL;
+       }
+
+       return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
+}
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+                      struct ib_recv_wr *recv_wr,
+                      struct ib_recv_wr **bad_recv_wr)
+{
+       return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
+                                 srq->device, recv_wr, bad_recv_wr);
+}
+
+/*
+ * ib_wc_opcode table converts ehca wc opcode to ib
+ * Since we use zero to indicate invalid opcode, the actual ib opcode must
+ * be decremented!!!
+ */
+static const u8 ib_wc_opcode[255] = {
+       [0x01] = IB_WC_RECV+1,
+       [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
+       [0x04] = IB_WC_BIND_MW+1,
+       [0x08] = IB_WC_FETCH_ADD+1,
+       [0x10] = IB_WC_COMP_SWAP+1,
+       [0x20] = IB_WC_RDMA_WRITE+1,
+       [0x40] = IB_WC_RDMA_READ+1,
+       [0x80] = IB_WC_SEND+1
+};
+
+/* internal function to poll one entry of cq */
+static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+{
+       int ret = 0, qmap_tail_idx;
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       struct ehca_cqe *cqe;
+       struct ehca_qp *my_qp;
+       struct ehca_qmap_entry *qmap_entry;
+       struct ehca_queue_map *qmap;
+       int cqe_count = 0, is_error;
+
+repoll:
+       cqe = (struct ehca_cqe *)
+               ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
+       if (!cqe) {
+               ret = -EAGAIN;
+               if (ehca_debug_level >= 3)
+                       ehca_dbg(cq->device, "Completion queue is empty  "
+                                "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
+               goto poll_cq_one_exit0;
+       }
+
+       /* prevents loads being reordered across this point */
+       rmb();
+
+       cqe_count++;
+       if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
+               struct ehca_qp *qp;
+               int purgeflag;
+               unsigned long flags;
+
+               qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
+               if (!qp) {
+                       ehca_err(cq->device, "cq_num=%x qp_num=%x "
+                                "could not find qp -> ignore cqe",
+                                my_cq->cq_number, cqe->local_qp_number);
+                       ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
+                                my_cq->cq_number, cqe->local_qp_number);
+                       /* ignore this purged cqe */
+                       goto repoll;
+               }
+               spin_lock_irqsave(&qp->spinlock_s, flags);
+               purgeflag = qp->sqerr_purgeflag;
+               spin_unlock_irqrestore(&qp->spinlock_s, flags);
+
+               if (purgeflag) {
+                       ehca_dbg(cq->device,
+                                "Got CQE with purged bit qp_num=%x src_qp=%x",
+                                cqe->local_qp_number, cqe->remote_qp_number);
+                       if (ehca_debug_level >= 2)
+                               ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
+                                        cqe->local_qp_number,
+                                        cqe->remote_qp_number);
+                       /*
+                        * ignore this to avoid double cqes of bad wqe
+                        * that caused sqe and turn off purge flag
+                        */
+                       qp->sqerr_purgeflag = 0;
+                       goto repoll;
+               }
+       }
+
+       is_error = cqe->status & WC_STATUS_ERROR_BIT;
+
+       /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
+       if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
+               ehca_dbg(cq->device,
+                        "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
+                        is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
+               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+                        my_cq, my_cq->cq_number);
+               ehca_dbg(cq->device,
+                        "ehca_cq=%p cq_num=%x -------------------------",
+                        my_cq, my_cq->cq_number);
+       }
+
+       read_lock(&ehca_qp_idr_lock);
+       my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
+       read_unlock(&ehca_qp_idr_lock);
+       if (!my_qp)
+               goto repoll;
+       wc->qp = &my_qp->ib_qp;
+
+       qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+       if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+               /* We got a send completion. */
+               qmap = &my_qp->sq_map;
+       else
+               /* We got a receive completion. */
+               qmap = &my_qp->rq_map;
+
+       /* advance the tail pointer */
+       qmap->tail = qmap_tail_idx;
+
+       if (is_error) {
+               /*
+                * set left_to_poll to 0 because in error state, we will not
+                * get any additional CQEs
+                */
+               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+                                                       my_qp->sq_map.entries);
+               my_qp->sq_map.left_to_poll = 0;
+               ehca_add_to_err_list(my_qp, 1);
+
+               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+                                                       my_qp->rq_map.entries);
+               my_qp->rq_map.left_to_poll = 0;
+               if (HAS_RQ(my_qp))
+                       ehca_add_to_err_list(my_qp, 0);
+       }
+
+       qmap_entry = &qmap->map[qmap_tail_idx];
+       if (qmap_entry->reported) {
+               ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+                               my_qp->real_qp_num);
+               /* found a double cqe, discard it and read next one */
+               goto repoll;
+       }
+
+       wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
+       qmap_entry->reported = 1;
+
+       /* if left_to_poll is decremented to 0, add the QP to the error list */
+       if (qmap->left_to_poll > 0) {
+               qmap->left_to_poll--;
+               if ((my_qp->sq_map.left_to_poll == 0) &&
+                               (my_qp->rq_map.left_to_poll == 0)) {
+                       ehca_add_to_err_list(my_qp, 1);
+                       if (HAS_RQ(my_qp))
+                               ehca_add_to_err_list(my_qp, 0);
+               }
+       }
+
+       /* eval ib_wc_opcode */
+       wc->opcode = ib_wc_opcode[cqe->optype]-1;
+       if (unlikely(wc->opcode == -1)) {
+               ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
+                        "ehca_cq=%p cq_num=%x",
+                        cqe->optype, cqe->status, my_cq, my_cq->cq_number);
+               /* dump cqe for other infos */
+               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+                        my_cq, my_cq->cq_number);
+               /* update also queue adder to throw away this entry!!! */
+               goto repoll;
+       }
+
+       /* eval ib_wc_status */
+       if (unlikely(is_error)) {
+               /* complete with errors */
+               map_ib_wc_status(cqe->status, &wc->status);
+               wc->vendor_err = wc->status;
+       } else
+               wc->status = IB_WC_SUCCESS;
+
+       wc->byte_len = cqe->nr_bytes_transferred;
+       wc->pkey_index = cqe->pkey_index;
+       wc->slid = cqe->rlid;
+       wc->dlid_path_bits = cqe->dlid;
+       wc->src_qp = cqe->remote_qp_number;
+       /*
+        * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+        * SW defines those in bits 1 and 0, so we can just shift and mask.
+        */
+       wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
+       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
+       wc->sl = cqe->service_level;
+
+poll_cq_one_exit0:
+       if (cqe_count > 0)
+               hipz_update_feca(my_cq, cqe_count);
+
+       return ret;
+}
+
+static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
+                              struct ib_wc *wc, int num_entries,
+                              struct ipz_queue *ipz_queue, int on_sq)
+{
+       int nr = 0;
+       struct ehca_wqe *wqe;
+       u64 offset;
+       struct ehca_queue_map *qmap;
+       struct ehca_qmap_entry *qmap_entry;
+
+       if (on_sq)
+               qmap = &my_qp->sq_map;
+       else
+               qmap = &my_qp->rq_map;
+
+       qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+       while ((nr < num_entries) && (qmap_entry->reported == 0)) {
+               /* generate flush CQE */
+
+               memset(wc, 0, sizeof(*wc));
+
+               offset = qmap->next_wqe_idx * ipz_queue->qe_size;
+               wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
+               if (!wqe) {
+                       ehca_err(cq->device, "Invalid wqe offset=%#llx on "
+                                "qp_num=%#x", offset, my_qp->real_qp_num);
+                       return nr;
+               }
+
+               wc->wr_id = replace_wr_id(wqe->work_request_id,
+                                         qmap_entry->app_wr_id);
+
+               if (on_sq) {
+                       switch (wqe->optype) {
+                       case WQE_OPTYPE_SEND:
+                               wc->opcode = IB_WC_SEND;
+                               break;
+                       case WQE_OPTYPE_RDMAWRITE:
+                               wc->opcode = IB_WC_RDMA_WRITE;
+                               break;
+                       case WQE_OPTYPE_RDMAREAD:
+                               wc->opcode = IB_WC_RDMA_READ;
+                               break;
+                       default:
+                               ehca_err(cq->device, "Invalid optype=%x",
+                                               wqe->optype);
+                               return nr;
+                       }
+               } else
+                       wc->opcode = IB_WC_RECV;
+
+               if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
+                       wc->ex.imm_data = wqe->immediate_data;
+                       wc->wc_flags |= IB_WC_WITH_IMM;
+               }
+
+               wc->status = IB_WC_WR_FLUSH_ERR;
+
+               wc->qp = &my_qp->ib_qp;
+
+               /* mark as reported and advance next_wqe pointer */
+               qmap_entry->reported = 1;
+               qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+                                               qmap->entries);
+               qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+               wc++; nr++;
+       }
+
+       return nr;
+
+}
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int nr;
+       struct ehca_qp *err_qp;
+       struct ib_wc *current_wc = wc;
+       int ret = 0;
+       unsigned long flags;
+       int entries_left = num_entries;
+
+       if (num_entries < 1) {
+               ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+                        "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
+               ret = -EINVAL;
+               goto poll_cq_exit0;
+       }
+
+       spin_lock_irqsave(&my_cq->spinlock, flags);
+
+       /* generate flush cqes for send queues */
+       list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
+               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+                               &err_qp->ipz_squeue, 1);
+               entries_left -= nr;
+               current_wc += nr;
+
+               if (entries_left == 0)
+                       break;
+       }
+
+       /* generate flush cqes for receive queues */
+       list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
+               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+                               &err_qp->ipz_rqueue, 0);
+               entries_left -= nr;
+               current_wc += nr;
+
+               if (entries_left == 0)
+                       break;
+       }
+
+       for (nr = 0; nr < entries_left; nr++) {
+               ret = ehca_poll_cq_one(cq, current_wc);
+               if (ret)
+                       break;
+               current_wc++;
+       } /* eof for nr */
+       entries_left -= nr;
+
+       spin_unlock_irqrestore(&my_cq->spinlock, flags);
+       if (ret == -EAGAIN  || !ret)
+               ret = num_entries - entries_left;
+
+poll_cq_exit0:
+       return ret;
+}
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
+{
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int ret = 0;
+
+       switch (notify_flags & IB_CQ_SOLICITED_MASK) {
+       case IB_CQ_SOLICITED:
+               hipz_set_cqx_n0(my_cq, 1);
+               break;
+       case IB_CQ_NEXT_COMP:
+               hipz_set_cqx_n1(my_cq, 1);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+               unsigned long spl_flags;
+               spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+               ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
+               spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+       }
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_sqp.c b/drivers/staging/rdma/ehca/ehca_sqp.c
new file mode 100644 (file)
index 0000000..376b031
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  SQP functions
+ *
+ *  Authors: Khadija Souissi <souissi@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define IB_MAD_STATUS_REDIRECT         cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD     cpu_to_be16(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO         cpu_to_be16(0x0001)
+
+/**
+ * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
+ * pair is created successfully, the corresponding port gets active.
+ *
+ * Define Special Queue pair 0 (SMI QP) is still not supported.
+ *
+ * @qp_init_attr: Queue pair init attributes with port and queue pair type
+ */
+
+u64 ehca_define_sqp(struct ehca_shca *shca,
+                   struct ehca_qp *ehca_qp,
+                   struct ib_qp_init_attr *qp_init_attr)
+{
+       u32 pma_qp_nr, bma_qp_nr;
+       u64 ret;
+       u8 port = qp_init_attr->port_num;
+       int counter;
+
+       shca->sport[port - 1].port_state = IB_PORT_DOWN;
+
+       switch (qp_init_attr->qp_type) {
+       case IB_QPT_SMI:
+               /* function not supported yet */
+               break;
+       case IB_QPT_GSI:
+               ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
+                                        ehca_qp->ipz_qp_handle,
+                                        ehca_qp->galpas.kernel,
+                                        (u32) qp_init_attr->port_num,
+                                        &pma_qp_nr, &bma_qp_nr);
+
+               if (ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device,
+                                "Can't define AQP1 for port %x. h_ret=%lli",
+                                port, ret);
+                       return ret;
+               }
+               shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+               ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+                        port, pma_qp_nr);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "invalid qp_type=%x",
+                        qp_init_attr->qp_type);
+               return H_PARAMETER;
+       }
+
+       if (ehca_nr_ports < 0) /* autodetect mode */
+               return H_SUCCESS;
+
+       for (counter = 0;
+            shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
+                    counter < ehca_port_act_time;
+            counter++) {
+               ehca_dbg(&shca->ib_device, "... wait until port %x is active",
+                        port);
+               msleep_interruptible(1000);
+       }
+
+       if (counter == ehca_port_act_time) {
+               ehca_err(&shca->ib_device, "Port %x is not active.", port);
+               return H_HARDWARE;
+       }
+
+       return H_SUCCESS;
+}
+
+struct ib_perf {
+       struct ib_mad_hdr mad_hdr;
+       u8 reserved[40];
+       u8 data[192];
+} __attribute__ ((packed));
+
+/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
+struct tcslfl {
+       u32 tc:8;
+       u32 sl:4;
+       u32 fl:20;
+} __attribute__ ((packed));
+
+/* IP Version/TC/FL packed into 32 bits, as in GRH */
+struct vertcfl {
+       u32 ver:4;
+       u32 tc:8;
+       u32 fl:20;
+} __attribute__ ((packed));
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+                            const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                            const struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+       const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
+       struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+       struct ib_class_port_info *poi =
+               (struct ib_class_port_info *)out_perf->data;
+       struct tcslfl *tcslfl =
+               (struct tcslfl *)&poi->redirect_tcslfl;
+       struct ehca_shca *shca =
+               container_of(ibdev, struct ehca_shca, ib_device);
+       struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+       ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+       *out_mad = *in_mad;
+
+       if (in_perf->mad_hdr.class_version != 1) {
+               ehca_warn(ibdev, "Unsupported class_version=%x",
+                         in_perf->mad_hdr.class_version);
+               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+               goto perf_reply;
+       }
+
+       switch (in_perf->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+       case IB_MGMT_METHOD_SET:
+               /* set class port info for redirection */
+               out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+               out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+               memset(poi, 0, sizeof(*poi));
+               poi->base_version = 1;
+               poi->class_version = 1;
+               poi->resp_time_value = 18;
+
+               /* copy local routing information from WC where applicable */
+               tcslfl->sl         = in_wc->sl;
+               poi->redirect_lid  =
+                       sport->saved_attr.lid | in_wc->dlid_path_bits;
+               poi->redirect_qp   = sport->pma_qp_nr;
+               poi->redirect_qkey = IB_QP1_QKEY;
+
+               ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
+                               &poi->redirect_pkey);
+
+               /* if request was globally routed, copy route info */
+               if (in_grh) {
+                       const struct vertcfl *vertcfl =
+                               (const struct vertcfl *)&in_grh->version_tclass_flow;
+                       memcpy(poi->redirect_gid, in_grh->dgid.raw,
+                              sizeof(poi->redirect_gid));
+                       tcslfl->tc        = vertcfl->tc;
+                       tcslfl->fl        = vertcfl->fl;
+               } else
+                       /* else only fill in default GID */
+                       ehca_query_gid(ibdev, port_num, 0,
+                                      (union ib_gid *)&poi->redirect_gid);
+
+               ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+                        sport->saved_attr.lid, sport->pma_qp_nr);
+               break;
+
+       case IB_MGMT_METHOD_GET_RESP:
+               return IB_MAD_RESULT_FAILURE;
+
+       default:
+               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+               break;
+       }
+
+perf_reply:
+       out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in, size_t in_mad_size,
+                    struct ib_mad_hdr *out, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index)
+{
+       int ret;
+       const struct ib_mad *in_mad = (const struct ib_mad *)in;
+       struct ib_mad *out_mad = (struct ib_mad *)out;
+
+       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+                        *out_mad_size != sizeof(*out_mad)))
+               return IB_MAD_RESULT_FAILURE;
+
+       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
+               return IB_MAD_RESULT_FAILURE;
+
+       /* accept only pma request */
+       if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+               return IB_MAD_RESULT_SUCCESS;
+
+       ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+       ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
+                               in_mad, out_mad);
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_tools.h b/drivers/staging/rdma/ehca/ehca_tools.h
new file mode 100644 (file)
index 0000000..d280b12
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  auxiliary functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef EHCA_TOOLS_H
+#define EHCA_TOOLS_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+
+#include <linux/atomic.h>
+#include <asm/ibmebus.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hvcall.h>
+
+extern int ehca_debug_level;
+
+#define ehca_dbg(ib_dev, format, arg...) \
+       do { \
+               if (unlikely(ehca_debug_level)) \
+                       dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
+                                  "PU%04x EHCA_DBG:%s " format "\n", \
+                                  raw_smp_processor_id(), __func__, \
+                                  ## arg); \
+       } while (0)
+
+#define ehca_info(ib_dev, format, arg...) \
+       dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
+                raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_warn(ib_dev, format, arg...) \
+       dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
+                raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_err(ib_dev, format, arg...) \
+       dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
+               raw_smp_processor_id(), __func__, ## arg)
+
+/* use this one only if no ib_dev available */
+#define ehca_gen_dbg(format, arg...) \
+       do { \
+               if (unlikely(ehca_debug_level)) \
+                       printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
+                              raw_smp_processor_id(), __func__, ## arg); \
+       } while (0)
+
+#define ehca_gen_warn(format, arg...) \
+       printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+              raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_gen_err(format, arg...) \
+       printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
+              raw_smp_processor_id(), __func__, ## arg)
+
+/**
+ * ehca_dmp - printk a memory block, whose length is n*8 bytes.
+ * Each line has the following layout:
+ * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
+ */
+#define ehca_dmp(adr, len, format, args...) \
+       do { \
+               unsigned int x; \
+               unsigned int l = (unsigned int)(len); \
+               unsigned char *deb = (unsigned char *)(adr); \
+               for (x = 0; x < l; x += 16) { \
+                       printk(KERN_INFO "EHCA_DMP:%s " format \
+                              " adr=%p ofs=%04x %016llx %016llx\n", \
+                              __func__, ##args, deb, x, \
+                              *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
+                       deb += 16; \
+               } \
+       } while (0)
+
+/* define a bitmask, little endian version */
+#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
+
+/* define a bitmask, the ibm way... */
+#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
+
+/* internal function, don't use */
+#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
+
+/* internal function, don't use */
+#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
+
+/**
+ * EHCA_BMASK_SET - return value shifted and masked by mask
+ * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
+ * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
+ * in variable
+ */
+#define EHCA_BMASK_SET(mask, value) \
+       ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
+
+/**
+ * EHCA_BMASK_GET - extract a parameter from value by mask
+ */
+#define EHCA_BMASK_GET(mask, value) \
+       (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
+
+/* Converts ehca to ib return code */
+int ehca2ib_return_code(u64 ehca_rc);
+
+#endif /* EHCA_TOOLS_H */
diff --git a/drivers/staging/rdma/ehca/ehca_uverbs.c b/drivers/staging/rdma/ehca/ehca_uverbs.c
new file mode 100644 (file)
index 0000000..1a1d5d9
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  userspace support verbs
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+                                       struct ib_udata *udata)
+{
+       struct ehca_ucontext *my_context;
+
+       my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
+       if (!my_context) {
+               ehca_err(device, "Out of memory device=%p", device);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       return &my_context->ib_ucontext;
+}
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context)
+{
+       kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
+       return 0;
+}
+
+static void ehca_mm_open(struct vm_area_struct *vma)
+{
+       u32 *count = (u32 *)vma->vm_private_data;
+       if (!count) {
+               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+               return;
+       }
+       (*count)++;
+       if (!(*count))
+               ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+                    vma->vm_start, vma->vm_end, *count);
+}
+
+static void ehca_mm_close(struct vm_area_struct *vma)
+{
+       u32 *count = (u32 *)vma->vm_private_data;
+       if (!count) {
+               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+               return;
+       }
+       (*count)--;
+       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+                    vma->vm_start, vma->vm_end, *count);
+}
+
+static const struct vm_operations_struct vm_ops = {
+       .open = ehca_mm_open,
+       .close = ehca_mm_close,
+};
+
+static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
+                       u32 *mm_count)
+{
+       int ret;
+       u64 vsize, physical;
+
+       vsize = vma->vm_end - vma->vm_start;
+       if (vsize < EHCA_PAGESIZE) {
+               ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
+               return -EINVAL;
+       }
+
+       physical = galpas->user.fw_handle;
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+       ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+       ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+                          vma->vm_page_prot);
+       if (unlikely(ret)) {
+               ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
+               return -ENOMEM;
+       }
+
+       vma->vm_private_data = mm_count;
+       (*mm_count)++;
+       vma->vm_ops = &vm_ops;
+
+       return 0;
+}
+
+static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
+                          u32 *mm_count)
+{
+       int ret;
+       u64 start, ofs;
+       struct page *page;
+
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       start = vma->vm_start;
+       for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
+               u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
+               page = virt_to_page(virt_addr);
+               ret = vm_insert_page(vma, start, page);
+               if (unlikely(ret)) {
+                       ehca_gen_err("vm_insert_page() failed rc=%i", ret);
+                       return ret;
+               }
+               start += PAGE_SIZE;
+       }
+       vma->vm_private_data = mm_count;
+       (*mm_count)++;
+       vma->vm_ops = &vm_ops;
+
+       return 0;
+}
+
+static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
+                       u32 rsrc_type)
+{
+       int ret;
+
+       switch (rsrc_type) {
+       case 0: /* galpa fw handle */
+               ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
+               ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_fw() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       case 1: /* cq queue_addr */
+               ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
+               ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_queue() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
+                        rsrc_type, cq->cq_number);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
+                       u32 rsrc_type)
+{
+       int ret;
+
+       switch (rsrc_type) {
+       case 0: /* galpa fw handle */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
+               ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "remap_pfn_range() failed ret=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return -ENOMEM;
+               }
+               break;
+
+       case 1: /* qp rqueue_addr */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
+               ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
+                                     &qp->mm_count_rqueue);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       case 2: /* qp squeue_addr */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
+               ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
+                                     &qp->mm_count_squeue);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
+                        rsrc_type, qp->ib_qp.qp_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       u64 fileoffset = vma->vm_pgoff;
+       u32 idr_handle = fileoffset & 0x1FFFFFF;
+       u32 q_type = (fileoffset >> 27) & 0x1;    /* CQ, QP,...        */
+       u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
+       u32 ret;
+       struct ehca_cq *cq;
+       struct ehca_qp *qp;
+       struct ib_uobject *uobject;
+
+       switch (q_type) {
+       case  0: /* CQ */
+               read_lock(&ehca_cq_idr_lock);
+               cq = idr_find(&ehca_cq_idr, idr_handle);
+               read_unlock(&ehca_cq_idr_lock);
+
+               /* make sure this mmap really belongs to the authorized user */
+               if (!cq)
+                       return -EINVAL;
+
+               if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
+                       return -EINVAL;
+
+               ret = ehca_mmap_cq(vma, cq, rsrc_type);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_cq() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       case 1: /* QP */
+               read_lock(&ehca_qp_idr_lock);
+               qp = idr_find(&ehca_qp_idr, idr_handle);
+               read_unlock(&ehca_qp_idr_lock);
+
+               /* make sure this mmap really belongs to the authorized user */
+               if (!qp)
+                       return -EINVAL;
+
+               uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
+               if (!uobject || uobject->context != context)
+                       return -EINVAL;
+
+               ret = ehca_mmap_qp(vma, qp, rsrc_type);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_qp() failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_gen_err("bad queue type %x", q_type);
+               return -EINVAL;
+       }
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.c b/drivers/staging/rdma/ehca/hcp_if.c
new file mode 100644 (file)
index 0000000..89517ff
--- /dev/null
@@ -0,0 +1,949 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <asm/hvcall.h>
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hcp_phyp.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
+#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
+#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
+#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
+#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
+#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
+#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
+#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
+#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
+#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
+#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
+
+#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
+#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
+#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
+
+#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
+#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
+#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
+
+#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
+#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
+
+#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
+
+#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
+#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
+#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
+
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
+
+static DEFINE_SPINLOCK(hcall_lock);
+
+static long ehca_plpar_hcall_norets(unsigned long opcode,
+                                   unsigned long arg1,
+                                   unsigned long arg2,
+                                   unsigned long arg3,
+                                   unsigned long arg4,
+                                   unsigned long arg5,
+                                   unsigned long arg6,
+                                   unsigned long arg7)
+{
+       long ret;
+       int i, sleep_msecs;
+       unsigned long flags = 0;
+
+       if (unlikely(ehca_debug_level >= 2))
+               ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
+                            opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+
+       for (i = 0; i < 5; i++) {
+               /* serialize hCalls to work around firmware issue */
+               if (ehca_lock_hcalls)
+                       spin_lock_irqsave(&hcall_lock, flags);
+
+               ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
+                                        arg5, arg6, arg7);
+
+               if (ehca_lock_hcalls)
+                       spin_unlock_irqrestore(&hcall_lock, flags);
+
+               if (H_IS_LONG_BUSY(ret)) {
+                       sleep_msecs = get_longbusy_msecs(ret);
+                       msleep_interruptible(sleep_msecs);
+                       continue;
+               }
+
+               if (ret < H_SUCCESS)
+                       ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+                                    opcode, ret, arg1, arg2, arg3,
+                                    arg4, arg5, arg6, arg7);
+               else
+                       if (unlikely(ehca_debug_level >= 2))
+                               ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
+
+               return ret;
+       }
+
+       return H_BUSY;
+}
+
+static long ehca_plpar_hcall9(unsigned long opcode,
+                             unsigned long *outs, /* array of 9 outputs */
+                             unsigned long arg1,
+                             unsigned long arg2,
+                             unsigned long arg3,
+                             unsigned long arg4,
+                             unsigned long arg5,
+                             unsigned long arg6,
+                             unsigned long arg7,
+                             unsigned long arg8,
+                             unsigned long arg9)
+{
+       long ret;
+       int i, sleep_msecs;
+       unsigned long flags = 0;
+
+       if (unlikely(ehca_debug_level >= 2))
+               ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+                            arg1, arg2, arg3, arg4, arg5,
+                            arg6, arg7, arg8, arg9);
+
+       for (i = 0; i < 5; i++) {
+               /* serialize hCalls to work around firmware issue */
+               if (ehca_lock_hcalls)
+                       spin_lock_irqsave(&hcall_lock, flags);
+
+               ret = plpar_hcall9(opcode, outs,
+                                  arg1, arg2, arg3, arg4, arg5,
+                                  arg6, arg7, arg8, arg9);
+
+               if (ehca_lock_hcalls)
+                       spin_unlock_irqrestore(&hcall_lock, flags);
+
+               if (H_IS_LONG_BUSY(ret)) {
+                       sleep_msecs = get_longbusy_msecs(ret);
+                       msleep_interruptible(sleep_msecs);
+                       continue;
+               }
+
+               if (ret < H_SUCCESS) {
+                       ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+                                    opcode, arg1, arg2, arg3, arg4, arg5,
+                                    arg6, arg7, arg8, arg9);
+                       ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+                                    ret, outs[0], outs[1], outs[2], outs[3],
+                                    outs[4], outs[5], outs[6], outs[7],
+                                    outs[8]);
+               } else if (unlikely(ehca_debug_level >= 2))
+                       ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+                                    ret, outs[0], outs[1], outs[2], outs[3],
+                                    outs[4], outs[5], outs[6], outs[7],
+                                    outs[8]);
+               return ret;
+       }
+
+       return H_BUSY;
+}
+
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u32 neq_control,
+                            const u32 number_of_entries,
+                            struct ipz_eq_handle *eq_handle,
+                            u32 *act_nr_of_entries,
+                            u32 *act_pages,
+                            u32 *eq_ist)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+       u64 allocate_controls;
+
+       /* resource type */
+       allocate_controls = 3ULL;
+
+       /* ISN is associated */
+       if (neq_control != 1)
+               allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
+       else /* notification event queue */
+               allocate_controls = (1ULL << 63) | allocate_controls;
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,  /* r4 */
+                               allocate_controls,      /* r5 */
+                               number_of_entries,      /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       eq_handle->handle = outs[0];
+       *act_nr_of_entries = (u32)outs[3];
+       *act_pages = (u32)outs[4];
+       *eq_ist = (u32)outs[5];
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resource - ret=%lli ", ret);
+
+       return ret;
+}
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+                      struct ipz_eq_handle eq_handle,
+                      const u64 event_mask)
+{
+       return ehca_plpar_hcall_norets(H_RESET_EVENTS,
+                                      adapter_handle.handle, /* r4 */
+                                      eq_handle.handle,      /* r5 */
+                                      event_mask,            /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_cq *cq,
+                            struct ehca_alloc_cq_parms *param)
+{
+       int rc;
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,   /* r4  */
+                               2,                       /* r5  */
+                               param->eq_handle.handle, /* r6  */
+                               cq->token,               /* r7  */
+                               param->nr_cqe,           /* r8  */
+                               0, 0, 0, 0);
+       cq->ipz_cq_handle.handle = outs[0];
+       param->act_nr_of_entries = (u32)outs[3];
+       param->act_pages = (u32)outs[4];
+
+       if (ret == H_SUCCESS) {
+               rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+               if (rc) {
+                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+                                    rc, outs[5]);
+
+                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                               adapter_handle.handle,     /* r4 */
+                                               cq->ipz_cq_handle.handle,  /* r5 */
+                                               0, 0, 0, 0, 0);
+                       ret = H_NO_MEM;
+               }
+       }
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_alloc_qp_parms *parms, int is_user)
+{
+       int rc;
+       u64 ret;
+       u64 allocate_controls, max_r10_reg, r11, r12;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       allocate_controls =
+               EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
+                                parms->squeue.page_size)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
+                                parms->rqueue.page_size)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
+                                !!(parms->ll_comp_flags & LLQP_RECV_COMP))
+               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
+                                !!(parms->ll_comp_flags & LLQP_SEND_COMP))
+               | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
+                                parms->ud_av_l_key_ctl)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
+
+       max_r10_reg =
+               EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
+                              parms->squeue.max_wr + 1)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
+                                parms->rqueue.max_wr + 1)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
+                                parms->squeue.max_sge)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
+                                parms->rqueue.max_sge);
+
+       r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
+
+       if (parms->ext_type == EQPT_SRQ)
+               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
+       else
+               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,             /* r4  */
+                               allocate_controls,                 /* r5  */
+                               parms->send_cq_handle.handle,
+                               parms->recv_cq_handle.handle,
+                               parms->eq_handle.handle,
+                               ((u64)parms->token << 32) | parms->pd.value,
+                               max_r10_reg, r11, r12);
+
+       parms->qp_handle.handle = outs[0];
+       parms->real_qp_num = (u32)outs[1];
+       parms->squeue.act_nr_wqes =
+               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
+       parms->rqueue.act_nr_wqes =
+               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
+       parms->squeue.act_nr_sges =
+               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
+       parms->rqueue.act_nr_sges =
+               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
+       parms->squeue.queue_size =
+               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
+       parms->rqueue.queue_size =
+               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+       if (ret == H_SUCCESS) {
+               rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+               if (rc) {
+                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+                                    rc, outs[6]);
+
+                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                               adapter_handle.handle,     /* r4 */
+                                               parms->qp_handle.handle,  /* r5 */
+                                               0, 0, 0, 0, 0);
+                       ret = H_NO_MEM;
+               }
+       }
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+                     const u8 port_id,
+                     struct hipz_query_port *query_port_response_block)
+{
+       u64 ret;
+       u64 r_cb = __pa(query_port_response_block);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("response block not page aligned");
+               return H_PARAMETER;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
+                                     adapter_handle.handle, /* r4 */
+                                     port_id,               /* r5 */
+                                     r_cb,                  /* r6 */
+                                     0, 0, 0, 0);
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(query_port_response_block, 64, "response_block");
+
+       return ret;
+}
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+                      const u8 port_id, const u32 port_cap,
+                      const u8 init_type, const int modify_mask)
+{
+       u64 port_attributes = port_cap;
+
+       if (modify_mask & IB_PORT_SHUTDOWN)
+               port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
+       if (modify_mask & IB_PORT_INIT_TYPE)
+               port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
+       if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
+               port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
+
+       return ehca_plpar_hcall_norets(H_MODIFY_PORT,
+                                      adapter_handle.handle, /* r4 */
+                                      port_id,               /* r5 */
+                                      port_attributes,       /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+                    struct hipz_query_hca *query_hca_rblock)
+{
+       u64 r_cb = __pa(query_hca_rblock);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("response_block=%p not page aligned",
+                            query_hca_rblock);
+               return H_PARAMETER;
+       }
+
+       return ehca_plpar_hcall_norets(H_QUERY_HCA,
+                                      adapter_handle.handle, /* r4 */
+                                      r_cb,                  /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+                         const u8 pagesize,
+                         const u8 queue_type,
+                         const u64 resource_handle,
+                         const u64 logical_address_of_page,
+                         u64 count)
+{
+       return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
+                                      adapter_handle.handle,      /* r4  */
+                                      (u64)queue_type | ((u64)pagesize) << 8,
+                                      /* r5  */
+                                      resource_handle,            /* r6  */
+                                      logical_address_of_page,    /* r7  */
+                                      count,                      /* r8  */
+                                      0, 0);
+}
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_eq_handle eq_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count)
+{
+       if (count != 1) {
+               ehca_gen_err("Ppage counter=%llx", count);
+               return H_PARAMETER;
+       }
+       return hipz_h_register_rpage(adapter_handle,
+                                    pagesize,
+                                    queue_type,
+                                    eq_handle.handle,
+                                    logical_address_of_page, count);
+}
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
+                          u32 ist)
+{
+       u64 ret;
+       ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
+                                     adapter_handle.handle, /* r4 */
+                                     ist,                   /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret != H_SUCCESS && ret != H_BUSY)
+               ehca_gen_err("Could not query interrupt state.");
+
+       return ret;
+}
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_cq_handle cq_handle,
+                            struct ehca_pfcq *pfcq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa gal)
+{
+       if (count != 1) {
+               ehca_gen_err("Page counter=%llx", count);
+               return H_PARAMETER;
+       }
+
+       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+                                    cq_handle.handle, logical_address_of_page,
+                                    count);
+}
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_qp_handle qp_handle,
+                            struct ehca_pfqp *pfqp,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa galpa)
+{
+       if (count > 1) {
+               ehca_gen_err("Page counter=%llx", count);
+               return H_PARAMETER;
+       }
+
+       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+                                    qp_handle.handle, logical_address_of_page,
+                                    count);
+}
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+                              const struct ipz_qp_handle qp_handle,
+                              struct ehca_pfqp *pfqp,
+                              void **log_addr_next_sq_wqe2processed,
+                              void **log_addr_next_rq_wqe2processed,
+                              int dis_and_get_function_code)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+                               adapter_handle.handle,     /* r4 */
+                               dis_and_get_function_code, /* r5 */
+                               qp_handle.handle,          /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       if (log_addr_next_sq_wqe2processed)
+               *log_addr_next_sq_wqe2processed = (void *)outs[0];
+       if (log_addr_next_rq_wqe2processed)
+               *log_addr_next_rq_wqe2processed = (void *)outs[1];
+
+       return ret;
+}
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+                    const struct ipz_qp_handle qp_handle,
+                    struct ehca_pfqp *pfqp,
+                    const u64 update_mask,
+                    struct hcp_modify_qp_control_block *mqpcb,
+                    struct h_galpa gal)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+       ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
+                               adapter_handle.handle, /* r4 */
+                               qp_handle.handle,      /* r5 */
+                               update_mask,           /* r6 */
+                               __pa(mqpcb),           /* r7 */
+                               0, 0, 0, 0, 0);
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Insufficient resources ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+                   const struct ipz_qp_handle qp_handle,
+                   struct ehca_pfqp *pfqp,
+                   struct hcp_modify_qp_control_block *qqpcb,
+                   struct h_galpa gal)
+{
+       return ehca_plpar_hcall_norets(H_QUERY_QP,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      __pa(qqpcb),           /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_qp *qp)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = hcp_galpas_dtor(&qp->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct qp->galpas");
+               return H_RESOURCE;
+       }
+       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+                               adapter_handle.handle,     /* r4 */
+                               /* function code */
+                               1,                         /* r5 */
+                               qp->ipz_qp_handle.handle,  /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       if (ret == H_HARDWARE)
+               ehca_gen_err("HCA not operational. ret=%lli", ret);
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     qp->ipz_qp_handle.handle,  /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("Resource still in use. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port)
+{
+       return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      port,                  /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port, u32 * pma_qp_nr,
+                      u32 * bma_qp_nr)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
+                               adapter_handle.handle, /* r4 */
+                               qp_handle.handle,      /* r5 */
+                               port,                  /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       *pma_qp_nr = (u32)outs[0];
+       *bma_qp_nr = (u32)outs[1];
+
+       if (ret == H_ALIAS_EXIST)
+               ehca_gen_err("AQP1 already exists. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id)
+{
+       u64 ret;
+
+       ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
+                                     adapter_handle.handle,  /* r4 */
+                                     qp_handle.handle,       /* r5 */
+                                     mcg_dlid,               /* r6 */
+                                     interface_id,           /* r7 */
+                                     subnet_prefix,          /* r8 */
+                                     0, 0);
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id)
+{
+       return ehca_plpar_hcall_norets(H_DETACH_MCQP,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      mcg_dlid,              /* r6 */
+                                      interface_id,          /* r7 */
+                                      subnet_prefix,         /* r8 */
+                                      0, 0);
+}
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_cq *cq,
+                     u8 force_flag)
+{
+       u64 ret;
+
+       ret = hcp_galpas_dtor(&cq->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct cp->galpas");
+               return H_RESOURCE;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     cq->ipz_cq_handle.handle,  /* r5 */
+                                     force_flag != 0 ? 1L : 0L, /* r6 */
+                                     0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
+
+       return ret;
+}
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_eq *eq)
+{
+       u64 ret;
+
+       ret = hcp_galpas_dtor(&eq->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct eq->galpas");
+               return H_RESOURCE;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     eq->ipz_eq_handle.handle,  /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("Resource in use. ret=%lli ", ret);
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u64 vaddr,
+                            const u64 length,
+                            const u32 access_ctrl,
+                            const struct ipz_pd pd,
+                            struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,            /* r4 */
+                               5,                                /* r5 */
+                               vaddr,                            /* r6 */
+                               length,                           /* r7 */
+                               (((u64)access_ctrl) << 32ULL),    /* r8 */
+                               pd.value,                         /* r9 */
+                               0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count)
+{
+       u64 ret;
+
+       if (unlikely(ehca_debug_level >= 3)) {
+               if (count > 1) {
+                       u64 *kpage;
+                       int i;
+                       kpage = __va(logical_address_of_page);
+                       for (i = 0; i < count; i++)
+                               ehca_gen_dbg("kpage[%d]=%p",
+                                            i, (void *)kpage[i]);
+               } else
+                       ehca_gen_dbg("kpage=%p",
+                                    (void *)logical_address_of_page);
+       }
+
+       if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
+               ehca_gen_err("logical_address_of_page not on a 4k boundary "
+                            "adapter_handle=%llx mr=%p mr_handle=%llx "
+                            "pagesize=%x queue_type=%x "
+                            "logical_address_of_page=%llx count=%llx",
+                            adapter_handle.handle, mr,
+                            mr->ipz_mr_handle.handle, pagesize, queue_type,
+                            logical_address_of_page, count);
+               ret = H_PARAMETER;
+       } else
+               ret = hipz_h_register_rpage(adapter_handle, pagesize,
+                                           queue_type,
+                                           mr->ipz_mr_handle.handle,
+                                           logical_address_of_page, count);
+       return ret;
+}
+
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mr *mr,
+                   struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
+                               adapter_handle.handle,     /* r4 */
+                               mr->ipz_mr_handle.handle,  /* r5 */
+                               0, 0, 0, 0, 0, 0, 0);
+       outparms->len = outs[0];
+       outparms->vaddr = outs[1];
+       outparms->acl  = outs[4] >> 32;
+       outparms->lkey = (u32)(outs[5] >> 32);
+       outparms->rkey = (u32)(outs[5] & (0xffffffff));
+
+       return ret;
+}
+
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mr *mr)
+{
+       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                      adapter_handle.handle,    /* r4 */
+                                      mr->ipz_mr_handle.handle, /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+                         const struct ehca_mr *mr,
+                         const u64 vaddr_in,
+                         const u64 length,
+                         const u32 access_ctrl,
+                         const struct ipz_pd pd,
+                         const u64 mr_addr_cb,
+                         struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
+                               adapter_handle.handle,    /* r4 */
+                               mr->ipz_mr_handle.handle, /* r5 */
+                               vaddr_in,                 /* r6 */
+                               length,                   /* r7 */
+                               /* r8 */
+                               ((((u64)access_ctrl) << 32ULL) | pd.value),
+                               mr_addr_cb,               /* r9 */
+                               0, 0, 0);
+       outparms->vaddr = outs[1];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+                       const struct ehca_mr *mr,
+                       const struct ehca_mr *orig_mr,
+                       const u64 vaddr_in,
+                       const u32 access_ctrl,
+                       const struct ipz_pd pd,
+                       struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
+                               adapter_handle.handle,            /* r4 */
+                               orig_mr->ipz_mr_handle.handle,    /* r5 */
+                               vaddr_in,                         /* r6 */
+                               (((u64)access_ctrl) << 32ULL),    /* r7 */
+                               pd.value,                         /* r8 */
+                               0, 0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mw *mw,
+                            const struct ipz_pd pd,
+                            struct ehca_mw_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,      /* r4 */
+                               6,                          /* r5 */
+                               pd.value,                   /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mw *mw,
+                   struct ehca_mw_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
+                               adapter_handle.handle,    /* r4 */
+                               mw->ipz_mw_handle.handle, /* r5 */
+                               0, 0, 0, 0, 0, 0, 0);
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mw *mw)
+{
+       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                      adapter_handle.handle,    /* r4 */
+                                      mw->ipz_mw_handle.handle, /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+                     const u64 ressource_handle,
+                     void *rblock,
+                     unsigned long *byte_count)
+{
+       u64 r_cb = __pa(rblock);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("rblock not page aligned.");
+               return H_PARAMETER;
+       }
+
+       return ehca_plpar_hcall_norets(H_ERROR_DATA,
+                                      adapter_handle.handle,
+                                      ressource_handle,
+                                      r_cb,
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_eoi(int irq)
+{
+       unsigned long xirr;
+
+       iosync();
+       xirr = (0xffULL << 24) | irq;
+
+       return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.h b/drivers/staging/rdma/ehca/hcp_if.h
new file mode 100644 (file)
index 0000000..a46e514
--- /dev/null
@@ -0,0 +1,265 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_IF_H__
+#define __HCP_IF_H__
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "hipz_hw.h"
+
+/*
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
+ * resources, create the empty EQPT (ring).
+ */
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u32 neq_control,
+                            const u32 number_of_entries,
+                            struct ipz_eq_handle *eq_handle,
+                            u32 * act_nr_of_entries,
+                            u32 * act_pages,
+                            u32 * eq_ist);
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+                      struct ipz_eq_handle eq_handle,
+                      const u64 event_mask);
+/*
+ * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize
+ * resources, create the empty CQPT (ring).
+ */
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_cq *cq,
+                            struct ehca_alloc_cq_parms *param);
+
+
+/*
+ * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
+ * initialize resources, create empty QPPTs (2 rings).
+ */
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_alloc_qp_parms *parms, int is_user);
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+                     const u8 port_id,
+                     struct hipz_query_port *query_port_response_block);
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+                      const u8 port_id, const u32 port_cap,
+                      const u8 init_type, const int modify_mask);
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+                    struct hipz_query_hca *query_hca_rblock);
+
+/*
+ * hipz_h_register_rpage internal function in hcp_if.h for all
+ * hcp_H_REGISTER_RPAGE calls.
+ */
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+                         const u8 pagesize,
+                         const u8 queue_type,
+                         const u64 resource_handle,
+                         const u64 logical_address_of_page,
+                         u64 count);
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_eq_handle eq_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count);
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle
+                          hcp_adapter_handle,
+                          u32 ist);
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_cq_handle cq_handle,
+                            struct ehca_pfcq *pfcq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa gal);
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_qp_handle qp_handle,
+                            struct ehca_pfqp *pfqp,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa galpa);
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+                              const struct ipz_qp_handle qp_handle,
+                              struct ehca_pfqp *pfqp,
+                              void **log_addr_next_sq_wqe_tb_processed,
+                              void **log_addr_next_rq_wqe_tb_processed,
+                              int dis_and_get_function_code);
+enum hcall_sigt {
+       HCALL_SIGT_NO_CQE = 0,
+       HCALL_SIGT_BY_WQE = 1,
+       HCALL_SIGT_EVERY = 2
+};
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+                    const struct ipz_qp_handle qp_handle,
+                    struct ehca_pfqp *pfqp,
+                    const u64 update_mask,
+                    struct hcp_modify_qp_control_block *mqpcb,
+                    struct h_galpa gal);
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+                   const struct ipz_qp_handle qp_handle,
+                   struct ehca_pfqp *pfqp,
+                   struct hcp_modify_qp_control_block *qqpcb,
+                   struct h_galpa gal);
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_qp *qp);
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port);
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port, u32 * pma_qp_nr,
+                      u32 * bma_qp_nr);
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_cq *cq,
+                     u8 force_flag);
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_eq *eq);
+
+/*
+ * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u64 vaddr,
+                            const u64 length,
+                            const u32 access_ctrl,
+                            const struct ipz_pd pd,
+                            struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count);
+
+/* hipz_h_query_mr queries MR in HW and FW */
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mr *mr,
+                   struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mr frees MR resources in HW and FW */
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mr *mr);
+
+/* hipz_h_reregister_pmr reregisters MR in HW and FW */
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+                         const struct ehca_mr *mr,
+                         const u64 vaddr_in,
+                         const u64 length,
+                         const u32 access_ctrl,
+                         const struct ipz_pd pd,
+                         const u64 mr_addr_cb,
+                         struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_smr register shared MR in HW and FW */
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+                       const struct ehca_mr *mr,
+                       const struct ehca_mr *orig_mr,
+                       const u64 vaddr_in,
+                       const u32 access_ctrl,
+                       const struct ipz_pd pd,
+                       struct ehca_mr_hipzout_parms *outparms);
+
+/*
+ * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mw *mw,
+                            const struct ipz_pd pd,
+                            struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_query_mw queries MW in HW and FW */
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mw *mw,
+                   struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mw frees MW resources in HW and FW */
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mw *mw);
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+                     const u64 ressource_handle,
+                     void *rblock,
+                     unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
+
+#endif /* __HCP_IF_H__ */
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.c b/drivers/staging/rdma/ehca/hcp_phyp.c
new file mode 100644 (file)
index 0000000..077376f
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *   load store abstraction for ehca register access with tracing
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+u64 hcall_map_page(u64 physaddr)
+{
+       return (u64)ioremap(physaddr, EHCA_PAGESIZE);
+}
+
+int hcall_unmap_page(u64 mapaddr)
+{
+       iounmap((volatile void __iomem *) mapaddr);
+       return 0;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+                   u64 paddr_kernel, u64 paddr_user)
+{
+       if (!is_user) {
+               galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
+               if (!galpas->kernel.fw_handle)
+                       return -ENOMEM;
+       } else
+               galpas->kernel.fw_handle = 0;
+
+       galpas->user.fw_handle = paddr_user;
+
+       return 0;
+}
+
+int hcp_galpas_dtor(struct h_galpas *galpas)
+{
+       if (galpas->kernel.fw_handle) {
+               int ret = hcall_unmap_page(galpas->kernel.fw_handle);
+               if (ret)
+                       return ret;
+       }
+
+       galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.h b/drivers/staging/rdma/ehca/hcp_phyp.h
new file mode 100644 (file)
index 0000000..d1b0299
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware calls
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_PHYP_H__
+#define __HCP_PHYP_H__
+
+
+/*
+ * eHCA page (mapped into memory)
+ * resource to access eHCA register pages in CPU address space
+*/
+struct h_galpa {
+       u64 fw_handle;
+       /* for pSeries this is a 64bit memory address where
+          I/O memory is mapped into CPU address space (kv) */
+};
+
+/*
+ * resource to access eHCA address space registers, all types
+ */
+struct h_galpas {
+       u32 pid;                /*PID of userspace galpa checking */
+       struct h_galpa user;    /* user space accessible resource,
+                                  set to 0 if unused */
+       struct h_galpa kernel;  /* kernel space accessible resource,
+                                  set to 0 if unused */
+};
+
+static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
+{
+       u64 addr = galpa.fw_handle + offset;
+       return *(volatile u64 __force *)addr;
+}
+
+static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+{
+       u64 addr = galpa.fw_handle + offset;
+       *(volatile u64 __force *)addr = value;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+                   u64 paddr_kernel, u64 paddr_user);
+
+int hcp_galpas_dtor(struct h_galpas *galpas);
+
+u64 hcall_map_page(u64 physaddr);
+
+int hcall_unmap_page(u64 mapaddr);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns.h b/drivers/staging/rdma/ehca/hipz_fns.h
new file mode 100644 (file)
index 0000000..9dac93d
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_H__
+#define __HIPZ_FNS_H__
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+#include "hipz_fns_core.h"
+
+#define hipz_galpa_store_eq(gal, offset, value) \
+       hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_eq(gal, offset) \
+       hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qped(gal, offset, value) \
+       hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_qped(gal, offset) \
+       hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
+
+#define hipz_galpa_store_mrmw(gal, offset, value) \
+       hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_mrmw(gal, offset) \
+       hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns_core.h b/drivers/staging/rdma/ehca/hipz_fns_core.h
new file mode 100644 (file)
index 0000000..868735f
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_CORE_H__
+#define __HIPZ_FNS_CORE_H__
+
+#include "hcp_phyp.h"
+#include "hipz_hw.h"
+
+#define hipz_galpa_store_cq(gal, offset, value) \
+       hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_cq(gal, offset) \
+       hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qp(gal, offset, value) \
+       hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
+#define hipz_galpa_load_qp(gal, offset) \
+       hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
+
+static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+       /*  ringing doorbell :-) */
+       hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
+                           EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
+}
+
+static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+       /*  ringing doorbell :-) */
+       hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
+                           EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
+}
+
+static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
+{
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
+                           EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
+}
+
+static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
+{
+       u64 cqx_n0_reg;
+
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
+                           EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
+                                          value));
+       cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
+}
+
+static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
+{
+       u64 cqx_n1_reg;
+
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
+                           EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
+       cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
+}
+
+#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/staging/rdma/ehca/hipz_hw.h b/drivers/staging/rdma/ehca/hipz_hw.h
new file mode 100644 (file)
index 0000000..bf996c7
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  eHCA register definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_HW_H__
+#define __HIPZ_HW_H__
+
+#include "ehca_tools.h"
+
+#define EHCA_MAX_MTU 4
+
+/* QP Table Entry Memory Map */
+struct hipz_qptemm {
+       u64 qpx_hcr;
+       u64 qpx_c;
+       u64 qpx_herr;
+       u64 qpx_aer;
+/* 0x20*/
+       u64 qpx_sqa;
+       u64 qpx_sqc;
+       u64 qpx_rqa;
+       u64 qpx_rqc;
+/* 0x40*/
+       u64 qpx_st;
+       u64 qpx_pmstate;
+       u64 qpx_pmfa;
+       u64 qpx_pkey;
+/* 0x60*/
+       u64 qpx_pkeya;
+       u64 qpx_pkeyb;
+       u64 qpx_pkeyc;
+       u64 qpx_pkeyd;
+/* 0x80*/
+       u64 qpx_qkey;
+       u64 qpx_dqp;
+       u64 qpx_dlidp;
+       u64 qpx_portp;
+/* 0xa0*/
+       u64 qpx_slidp;
+       u64 qpx_slidpp;
+       u64 qpx_dlida;
+       u64 qpx_porta;
+/* 0xc0*/
+       u64 qpx_slida;
+       u64 qpx_slidpa;
+       u64 qpx_slvl;
+       u64 qpx_ipd;
+/* 0xe0*/
+       u64 qpx_mtu;
+       u64 qpx_lato;
+       u64 qpx_rlimit;
+       u64 qpx_rnrlimit;
+/* 0x100*/
+       u64 qpx_t;
+       u64 qpx_sqhp;
+       u64 qpx_sqptp;
+       u64 qpx_nspsn;
+/* 0x120*/
+       u64 qpx_nspsnhwm;
+       u64 reserved1;
+       u64 qpx_sdsi;
+       u64 qpx_sdsbc;
+/* 0x140*/
+       u64 qpx_sqwsize;
+       u64 qpx_sqwts;
+       u64 qpx_lsn;
+       u64 qpx_nssn;
+/* 0x160 */
+       u64 qpx_mor;
+       u64 qpx_cor;
+       u64 qpx_sqsize;
+       u64 qpx_erc;
+/* 0x180*/
+       u64 qpx_rnrrc;
+       u64 qpx_ernrwt;
+       u64 qpx_rnrresp;
+       u64 qpx_lmsna;
+/* 0x1a0 */
+       u64 qpx_sqhpc;
+       u64 qpx_sqcptp;
+       u64 qpx_sigt;
+       u64 qpx_wqecnt;
+/* 0x1c0*/
+       u64 qpx_rqhp;
+       u64 qpx_rqptp;
+       u64 qpx_rqsize;
+       u64 qpx_nrr;
+/* 0x1e0*/
+       u64 qpx_rdmac;
+       u64 qpx_nrpsn;
+       u64 qpx_lapsn;
+       u64 qpx_lcr;
+/* 0x200*/
+       u64 qpx_rwc;
+       u64 qpx_rwva;
+       u64 qpx_rdsi;
+       u64 qpx_rdsbc;
+/* 0x220*/
+       u64 qpx_rqwsize;
+       u64 qpx_crmsn;
+       u64 qpx_rdd;
+       u64 qpx_larpsn;
+/* 0x240*/
+       u64 qpx_pd;
+       u64 qpx_scqn;
+       u64 qpx_rcqn;
+       u64 qpx_aeqn;
+/* 0x260*/
+       u64 qpx_aaelog;
+       u64 qpx_ram;
+       u64 qpx_rdmaqe0;
+       u64 qpx_rdmaqe1;
+/* 0x280*/
+       u64 qpx_rdmaqe2;
+       u64 qpx_rdmaqe3;
+       u64 qpx_nrpsnhwm;
+/* 0x298*/
+       u64 reserved[(0x400 - 0x298) / 8];
+/* 0x400 extended data */
+       u64 reserved_ext[(0x500 - 0x400) / 8];
+/* 0x500 */
+       u64 reserved2[(0x1000 - 0x500) / 8];
+/* 0x1000      */
+};
+
+#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
+
+#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
+
+/* MRMWPT Entry Memory Map */
+struct hipz_mrmwmm {
+       /* 0x00 */
+       u64 mrx_hcr;
+
+       u64 mrx_c;
+       u64 mrx_herr;
+       u64 mrx_aer;
+       /* 0x20 */
+       u64 mrx_pp;
+       u64 reserved1;
+       u64 reserved2;
+       u64 reserved3;
+       /* 0x40 */
+       u64 reserved4[(0x200 - 0x40) / 8];
+       /* 0x200 */
+       u64 mrx_ctl[64];
+
+};
+
+#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
+
+struct hipz_qpedmm {
+       /* 0x00 */
+       u64 reserved0[(0x400) / 8];
+       /* 0x400 */
+       u64 qpedx_phh;
+       u64 qpedx_ppsgp;
+       /* 0x410 */
+       u64 qpedx_ppsgu;
+       u64 qpedx_ppdgp;
+       /* 0x420 */
+       u64 qpedx_ppdgu;
+       u64 qpedx_aph;
+       /* 0x430 */
+       u64 qpedx_apsgp;
+       u64 qpedx_apsgu;
+       /* 0x440 */
+       u64 qpedx_apdgp;
+       u64 qpedx_apdgu;
+       /* 0x450 */
+       u64 qpedx_apav;
+       u64 qpedx_apsav;
+       /* 0x460  */
+       u64 qpedx_hcr;
+       u64 reserved1[4];
+       /* 0x488 */
+       u64 qpedx_rrl0;
+       /* 0x490 */
+       u64 qpedx_rrrkey0;
+       u64 qpedx_rrva0;
+       /* 0x4a0 */
+       u64 reserved2;
+       u64 qpedx_rrl1;
+       /* 0x4b0 */
+       u64 qpedx_rrrkey1;
+       u64 qpedx_rrva1;
+       /* 0x4c0 */
+       u64 reserved3;
+       u64 qpedx_rrl2;
+       /* 0x4d0 */
+       u64 qpedx_rrrkey2;
+       u64 qpedx_rrva2;
+       /* 0x4e0 */
+       u64 reserved4;
+       u64 qpedx_rrl3;
+       /* 0x4f0 */
+       u64 qpedx_rrrkey3;
+       u64 qpedx_rrva3;
+};
+
+#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
+
+/* CQ Table Entry Memory Map */
+struct hipz_cqtemm {
+       u64 cqx_hcr;
+       u64 cqx_c;
+       u64 cqx_herr;
+       u64 cqx_aer;
+/* 0x20  */
+       u64 cqx_ptp;
+       u64 cqx_tp;
+       u64 cqx_fec;
+       u64 cqx_feca;
+/* 0x40  */
+       u64 cqx_ep;
+       u64 cqx_eq;
+/* 0x50  */
+       u64 reserved1;
+       u64 cqx_n0;
+/* 0x60  */
+       u64 cqx_n1;
+       u64 reserved2[(0x1000 - 0x60) / 8];
+/* 0x1000 */
+};
+
+#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
+#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
+#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+
+#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
+
+/* EQ Table Entry Memory Map */
+struct hipz_eqtemm {
+       u64 eqx_hcr;
+       u64 eqx_c;
+
+       u64 eqx_herr;
+       u64 eqx_aer;
+/* 0x20 */
+       u64 eqx_ptp;
+       u64 eqx_tp;
+       u64 eqx_ssba;
+       u64 eqx_psba;
+
+/* 0x40 */
+       u64 eqx_cec;
+       u64 eqx_meql;
+       u64 eqx_xisbi;
+       u64 eqx_xisc;
+/* 0x60 */
+       u64 eqx_it;
+
+};
+
+#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
+
+/* access control defines for MR/MW */
+#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
+#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
+#define HIPZ_ACCESSCTRL_R_READ   0x00200000
+#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
+#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
+
+/* query hca response block */
+struct hipz_query_hca {
+       u32 cur_reliable_dg;
+       u32 cur_qp;
+       u32 cur_cq;
+       u32 cur_eq;
+       u32 cur_mr;
+       u32 cur_mw;
+       u32 cur_ee_context;
+       u32 cur_mcast_grp;
+       u32 cur_qp_attached_mcast_grp;
+       u32 reserved1;
+       u32 cur_ipv6_qp;
+       u32 cur_eth_qp;
+       u32 cur_hp_mr;
+       u32 reserved2[3];
+       u32 max_rd_domain;
+       u32 max_qp;
+       u32 max_cq;
+       u32 max_eq;
+       u32 max_mr;
+       u32 max_hp_mr;
+       u32 max_mw;
+       u32 max_mrwpte;
+       u32 max_special_mrwpte;
+       u32 max_rd_ee_context;
+       u32 max_mcast_grp;
+       u32 max_total_mcast_qp_attach;
+       u32 max_mcast_qp_attach;
+       u32 max_raw_ipv6_qp;
+       u32 max_raw_ethy_qp;
+       u32 internal_clock_frequency;
+       u32 max_pd;
+       u32 max_ah;
+       u32 max_cqe;
+       u32 max_wqes_wq;
+       u32 max_partitions;
+       u32 max_rr_ee_context;
+       u32 max_rr_qp;
+       u32 max_rr_hca;
+       u32 max_act_wqs_ee_context;
+       u32 max_act_wqs_qp;
+       u32 max_sge;
+       u32 max_sge_rd;
+       u32 memory_page_size_supported;
+       u64 max_mr_size;
+       u32 local_ca_ack_delay;
+       u32 num_ports;
+       u32 vendor_id;
+       u32 vendor_part_id;
+       u32 hw_ver;
+       u64 node_guid;
+       u64 hca_cap_indicators;
+       u32 data_counter_register_size;
+       u32 max_shared_rq;
+       u32 max_isns_eq;
+       u32 max_neq;
+} __attribute__ ((packed));
+
+#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
+#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
+#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
+#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
+#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
+#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
+#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
+#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
+#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
+#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
+#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
+#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
+#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
+#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
+#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
+#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
+#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
+#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
+
+/* query port response block */
+struct hipz_query_port {
+       u32 state;
+       u32 bad_pkey_cntr;
+       u32 lmc;
+       u32 lid;
+       u32 subnet_timeout;
+       u32 qkey_viol_cntr;
+       u32 sm_sl;
+       u32 sm_lid;
+       u32 capability_mask;
+       u32 init_type_reply;
+       u32 pkey_tbl_len;
+       u32 gid_tbl_len;
+       u64 gid_prefix;
+       u32 port_nr;
+       u16 pkey_entries[16];
+       u8  reserved1[32];
+       u32 trent_size;
+       u32 trbuf_size;
+       u64 max_msg_sz;
+       u32 max_mtu;
+       u32 vl_cap;
+       u32 phys_pstate;
+       u32 phys_state;
+       u32 phys_speed;
+       u32 phys_width;
+       u8  reserved2[1884];
+       u64 guid_entries[255];
+} __attribute__ ((packed));
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.c b/drivers/staging/rdma/ehca/ipz_pt_fn.c
new file mode 100644 (file)
index 0000000..7ffc748
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
+
+struct kmem_cache *small_qp_cache;
+
+void *ipz_qpageit_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       queue->current_q_offset += queue->pagesize;
+       if (queue->current_q_offset > queue->queue_length) {
+               queue->current_q_offset -= queue->pagesize;
+               ret = NULL;
+       }
+       if (((u64)ret) % queue->pagesize) {
+               ehca_gen_err("ERROR!! not at PAGE-Boundary");
+               return NULL;
+       }
+       return ret;
+}
+
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u64 last_entry_in_q = queue->queue_length - queue->qe_size;
+
+       queue->current_q_offset += queue->qe_size;
+       if (queue->current_q_offset > last_entry_in_q) {
+               queue->current_q_offset = 0;
+               queue->toggle_state = (~queue->toggle_state) & 1;
+       }
+
+       return ret;
+}
+
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
+{
+       int i;
+       for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
+               u64 page = __pa(queue->queue_pages[i]);
+               if (addr >= page && addr < page + queue->pagesize) {
+                       *q_offset = addr - page + i * queue->pagesize;
+                       return 0;
+               }
+       }
+       return -EINVAL;
+}
+
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large than eHCA pages (4K) !
+#endif
+
+/*
+ * allocate pages for queue:
+ * outer loop allocates whole kernel pages (page aligned) and
+ * inner loop divides a kernel page into smaller hca queue pages
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+       int k, f = 0;
+       u8 *kpage;
+
+       while (f < nr_of_pages) {
+               kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
+               if (!kpage)
+                       goto out;
+
+               for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+                       queue->queue_pages[f] = (struct ipz_page *)kpage;
+                       kpage += EHCA_PAGESIZE;
+                       f++;
+               }
+       }
+       return 1;
+
+out:
+       for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+            f += PAGES_PER_KPAGE)
+               free_page((unsigned long)(queue->queue_pages)[f]);
+       return 0;
+}
+
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+       int order = ilog2(queue->pagesize) - 9;
+       struct ipz_small_queue_page *page;
+       unsigned long bit;
+
+       mutex_lock(&pd->lock);
+
+       if (!list_empty(&pd->free[order]))
+               page = list_entry(pd->free[order].next,
+                                 struct ipz_small_queue_page, list);
+       else {
+               page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+               if (!page)
+                       goto out;
+
+               page->page = get_zeroed_page(GFP_KERNEL);
+               if (!page->page) {
+                       kmem_cache_free(small_qp_cache, page);
+                       goto out;
+               }
+
+               list_add(&page->list, &pd->free[order]);
+       }
+
+       bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+       __set_bit(bit, page->bitmap);
+       page->fill++;
+
+       if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+               list_move(&page->list, &pd->full[order]);
+
+       mutex_unlock(&pd->lock);
+
+       queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+       queue->small_page = page;
+       queue->offset = bit << (order + 9);
+       return 1;
+
+out:
+       ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+       mutex_unlock(&pd->lock);
+       return 0;
+}
+
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+       int order = ilog2(queue->pagesize) - 9;
+       struct ipz_small_queue_page *page = queue->small_page;
+       unsigned long bit;
+       int free_page = 0;
+
+       bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+               >> (order + 9);
+
+       mutex_lock(&pd->lock);
+
+       __clear_bit(bit, page->bitmap);
+       page->fill--;
+
+       if (page->fill == 0) {
+               list_del(&page->list);
+               free_page = 1;
+       }
+
+       if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
+               /* the page was full until we freed the chunk */
+               list_move_tail(&page->list, &pd->free[order]);
+
+       mutex_unlock(&pd->lock);
+
+       if (free_page) {
+               free_page(page->page);
+               kmem_cache_free(small_qp_cache, page);
+       }
+}
+
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+                  const u32 nr_of_pages, const u32 pagesize,
+                  const u32 qe_size, const u32 nr_of_sg,
+                  int is_small)
+{
+       if (pagesize > PAGE_SIZE) {
+               ehca_gen_err("FATAL ERROR: pagesize=%x "
+                            "is greater than kernel page size", pagesize);
+               return 0;
+       }
+
+       /* init queue fields */
+       queue->queue_length = nr_of_pages * pagesize;
+       queue->pagesize = pagesize;
+       queue->qe_size = qe_size;
+       queue->act_nr_of_sg = nr_of_sg;
+       queue->current_q_offset = 0;
+       queue->toggle_state = 1;
+       queue->small_page = NULL;
+
+       /* allocate queue page pointers */
+       queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
+                                    GFP_KERNEL | __GFP_NOWARN);
+       if (!queue->queue_pages) {
+               queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
+               if (!queue->queue_pages) {
+                       ehca_gen_err("Couldn't allocate queue page list");
+                       return 0;
+               }
+       }
+
+       /* allocate actual queue pages */
+       if (is_small) {
+               if (!alloc_small_queue_page(queue, pd))
+                       goto ipz_queue_ctor_exit0;
+       } else
+               if (!alloc_queue_pages(queue, nr_of_pages))
+                       goto ipz_queue_ctor_exit0;
+
+       return 1;
+
+ipz_queue_ctor_exit0:
+       ehca_gen_err("Couldn't alloc pages queue=%p "
+                "nr_of_pages=%x",  queue, nr_of_pages);
+       kvfree(queue->queue_pages);
+
+       return 0;
+}
+
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+       int i, nr_pages;
+
+       if (!queue || !queue->queue_pages) {
+               ehca_gen_dbg("queue or queue_pages is NULL");
+               return 0;
+       }
+
+       if (queue->small_page)
+               free_small_queue_page(queue, pd);
+       else {
+               nr_pages = queue->queue_length / queue->pagesize;
+               for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+                       free_page((unsigned long)queue->queue_pages[i]);
+       }
+
+       kvfree(queue->queue_pages);
+
+       return 1;
+}
+
+int ehca_init_small_qp_cache(void)
+{
+       small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+                                          sizeof(struct ipz_small_queue_page),
+                                          0, SLAB_HWCACHE_ALIGN, NULL);
+       if (!small_qp_cache)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void ehca_cleanup_small_qp_cache(void)
+{
+       kmem_cache_destroy(small_qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.h b/drivers/staging/rdma/ehca/ipz_pt_fn.h
new file mode 100644 (file)
index 0000000..a801274
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IPZ_PT_FN_H__
+#define __IPZ_PT_FN_H__
+
+#define EHCA_PAGESHIFT   12
+#define EHCA_PAGESIZE   4096UL
+#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
+#define EHCA_PT_ENTRIES 512UL
+
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+
+struct ehca_pd;
+struct ipz_small_queue_page;
+
+extern struct kmem_cache *small_qp_cache;
+
+/* struct generic ehca page */
+struct ipz_page {
+       u8 entries[EHCA_PAGESIZE];
+};
+
+#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
+
+struct ipz_small_queue_page {
+       unsigned long page;
+       unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
+       int fill;
+       void *mapped_addr;
+       u32 mmap_count;
+       struct list_head list;
+};
+
+/* struct generic queue in linux kernel virtual memory (kv) */
+struct ipz_queue {
+       u64 current_q_offset;   /* current queue entry */
+
+       struct ipz_page **queue_pages;  /* array of pages belonging to queue */
+       u32 qe_size;            /* queue entry size */
+       u32 act_nr_of_sg;
+       u32 queue_length;       /* queue length allocated in bytes */
+       u32 pagesize;
+       u32 toggle_state;       /* toggle flag - per page */
+       u32 offset; /* save offset within page for small_qp */
+       struct ipz_small_queue_page *small_page;
+};
+
+/*
+ * return current Queue Entry for a certain q_offset
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
+{
+       struct ipz_page *current_page;
+       if (q_offset >= queue->queue_length)
+               return NULL;
+       current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
+       return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
+}
+
+/*
+ * return current Queue Entry
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_get(struct ipz_queue *queue)
+{
+       return ipz_qeit_calc(queue, queue->current_q_offset);
+}
+
+/*
+ * return current Queue Page , increment Queue Page iterator from
+ * page to page in struct ipz_queue, last increment will return 0! and
+ * NOT wrap
+ * returns address (kv) of Queue Page
+ * warning don't use in parallel with ipz_QE_get_inc()
+ */
+void *ipz_qpageit_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       queue->current_q_offset += queue->qe_size;
+       if (queue->current_q_offset >= queue->queue_length) {
+               queue->current_q_offset = 0;
+               /* toggle the valid flag */
+               queue->toggle_state = (~queue->toggle_state) & 1;
+       }
+
+       return ret;
+}
+
+/*
+ * return a bool indicating whether current Queue Entry is valid
+ */
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+       struct ehca_cqe *cqe = ipz_qeit_get(queue);
+       return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
+}
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
+{
+       return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
+}
+
+/*
+ * returns and resets Queue Entry iterator
+ * returns address (kv) of first Queue Entry
+ */
+static inline void *ipz_qeit_reset(struct ipz_queue *queue)
+{
+       queue->current_q_offset = 0;
+       return ipz_qeit_get(queue);
+}
+
+/*
+ * return the q_offset corresponding to an absolute address
+ */
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
+
+/*
+ * return the next queue offset. don't modify the queue.
+ */
+static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
+{
+       offset += queue->qe_size;
+       if (offset >= queue->queue_length) offset = 0;
+       return offset;
+}
+
+/* struct generic page table */
+struct ipz_pt {
+       u64 entries[EHCA_PT_ENTRIES];
+};
+
+/* struct page table for a queue, only to be used in pf */
+struct ipz_qpt {
+       /* queue page tables (kv), use u64 because we know the element length */
+       u64 *qpts;
+       u32 n_qpts;
+       u32 n_ptes;       /*  number of page table entries */
+       u64 *current_pte_addr;
+};
+
+/*
+ * constructor for a ipz_queue_t, placement new for ipz_queue_t,
+ * new for all dependent datastructors
+ * all QP Tables are the same
+ * flow:
+ *    allocate+pin queue
+ * see ipz_qpt_ctor()
+ * returns true if ok, false if out of memory
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+                  const u32 nr_of_pages, const u32 pagesize,
+                  const u32 qe_size, const u32 nr_of_sg,
+                  int is_small);
+
+/*
+ * destructor for a ipz_queue_t
+ *  -# free queue
+ *  see ipz_queue_ctor()
+ *  returns true if ok, false if queue was NULL-ptr of free failed
+ */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
+
+/*
+ * constructor for a ipz_qpt_t,
+ * placement new for struct ipz_queue, new for all dependent datastructors
+ * all QP Tables are the same,
+ * flow:
+ * -# allocate+pin queue
+ * -# initialise ptcb
+ * -# allocate+pin PTs
+ * -# link PTs to a ring, according to HCA Arch, set bit62 id needed
+ * -# the ring must have room for exactly nr_of_PTEs
+ * see ipz_qpt_ctor()
+ */
+void ipz_qpt_ctor(struct ipz_qpt *qpt,
+                 const u32 nr_of_qes,
+                 const u32 pagesize,
+                 const u32 qe_size,
+                 const u8 lowbyte, const u8 toggle,
+                 u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ * fix EQ page problems
+ */
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Event Queue Entry, increment Queue Entry iterator
+ * by one step in struct ipz_queue if valid, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_queue_QPageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ */
+static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u32 qe = *(u8 *)ret;
+       if ((qe >> 7) != (queue->toggle_state & 1))
+               return NULL;
+       ipz_qeit_eq_get_inc(queue); /* this is a good one */
+       return ret;
+}
+
+static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u32 qe = *(u8 *)ret;
+       if ((qe >> 7) != (queue->toggle_state & 1))
+               return NULL;
+       return ret;
+}
+
+/* returns address (GX) of first queue entry */
+static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
+{
+       return be64_to_cpu(qpt->qpts[0]);
+}
+
+/* returns address (kv) of first page of queue page table */
+static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
+{
+       return qpt->qpts;
+}
+
+#endif                         /* __IPZ_PT_FN_H__ */
index fd09290..342a07c 100644 (file)
@@ -269,14 +269,14 @@ int iscsit_deaccess_np(struct iscsi_np *np, struct iscsi_portal_group *tpg,
 }
 
 bool iscsit_check_np_match(
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        struct iscsi_np *np,
        int network_transport)
 {
        struct sockaddr_in *sock_in, *sock_in_e;
        struct sockaddr_in6 *sock_in6, *sock_in6_e;
        bool ip_match = false;
-       u16 port;
+       u16 port, port_e;
 
        if (sockaddr->ss_family == AF_INET6) {
                sock_in6 = (struct sockaddr_in6 *)sockaddr;
@@ -288,6 +288,7 @@ bool iscsit_check_np_match(
                        ip_match = true;
 
                port = ntohs(sock_in6->sin6_port);
+               port_e = ntohs(sock_in6_e->sin6_port);
        } else {
                sock_in = (struct sockaddr_in *)sockaddr;
                sock_in_e = (struct sockaddr_in *)&np->np_sockaddr;
@@ -296,9 +297,10 @@ bool iscsit_check_np_match(
                        ip_match = true;
 
                port = ntohs(sock_in->sin_port);
+               port_e = ntohs(sock_in_e->sin_port);
        }
 
-       if (ip_match && (np->np_port == port) &&
+       if (ip_match && (port_e == port) &&
            (np->np_network_transport == network_transport))
                return true;
 
@@ -309,7 +311,7 @@ bool iscsit_check_np_match(
  * Called with mutex np_lock held
  */
 static struct iscsi_np *iscsit_get_np(
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
        struct iscsi_np *np;
@@ -340,12 +342,9 @@ static struct iscsi_np *iscsit_get_np(
 }
 
 struct iscsi_np *iscsit_add_np(
-       struct __kernel_sockaddr_storage *sockaddr,
-       char *ip_str,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
-       struct sockaddr_in *sock_in;
-       struct sockaddr_in6 *sock_in6;
        struct iscsi_np *np;
        int ret;
 
@@ -368,16 +367,6 @@ struct iscsi_np *iscsit_add_np(
        }
 
        np->np_flags |= NPF_IP_NETWORK;
-       if (sockaddr->ss_family == AF_INET6) {
-               sock_in6 = (struct sockaddr_in6 *)sockaddr;
-               snprintf(np->np_ip, IPV6_ADDRESS_SPACE, "%s", ip_str);
-               np->np_port = ntohs(sock_in6->sin6_port);
-       } else {
-               sock_in = (struct sockaddr_in *)sockaddr;
-               sprintf(np->np_ip, "%s", ip_str);
-               np->np_port = ntohs(sock_in->sin_port);
-       }
-
        np->np_network_transport = network_transport;
        spin_lock_init(&np->np_thread_lock);
        init_completion(&np->np_restart_comp);
@@ -411,8 +400,8 @@ struct iscsi_np *iscsit_add_np(
        list_add_tail(&np->np_list, &g_np_list);
        mutex_unlock(&np_lock);
 
-       pr_debug("CORE[0] - Added Network Portal: %s:%hu on %s\n",
-               np->np_ip, np->np_port, np->np_transport->name);
+       pr_debug("CORE[0] - Added Network Portal: %pISpc on %s\n",
+               &np->np_sockaddr, np->np_transport->name);
 
        return np;
 }
@@ -481,8 +470,8 @@ int iscsit_del_np(struct iscsi_np *np)
        list_del(&np->np_list);
        mutex_unlock(&np_lock);
 
-       pr_debug("CORE[0] - Removed Network Portal: %s:%hu on %s\n",
-               np->np_ip, np->np_port, np->np_transport->name);
+       pr_debug("CORE[0] - Removed Network Portal: %pISpc on %s\n",
+               &np->np_sockaddr, np->np_transport->name);
 
        iscsit_put_transport(np->np_transport);
        kfree(np);
@@ -1209,7 +1198,6 @@ static u32 iscsit_do_crypto_hash_sg(
        u8 *pad_bytes)
 {
        u32 data_crc;
-       u32 i;
        struct scatterlist *sg;
        unsigned int page_off;
 
@@ -1218,15 +1206,15 @@ static u32 iscsit_do_crypto_hash_sg(
        sg = cmd->first_data_sg;
        page_off = cmd->first_data_sg_off;
 
-       i = 0;
        while (data_length) {
-               u32 cur_len = min_t(u32, data_length, (sg[i].length - page_off));
+               u32 cur_len = min_t(u32, data_length, (sg->length - page_off));
 
-               crypto_hash_update(hash, &sg[i], cur_len);
+               crypto_hash_update(hash, sg, cur_len);
 
                data_length -= cur_len;
                page_off = 0;
-               i++;
+               /* iscsit_map_iovec has already checked for invalid sg pointers */
+               sg = sg_next(sg);
        }
 
        if (padding) {
@@ -2556,7 +2544,7 @@ static int iscsit_send_conn_drop_async_message(
        cmd->stat_sn            = conn->stat_sn++;
        hdr->statsn             = cpu_to_be32(cmd->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->async_event        = ISCSI_ASYNC_MSG_DROPPING_CONNECTION;
        hdr->param1             = cpu_to_be16(cmd->logout_cid);
        hdr->param2             = cpu_to_be16(conn->sess->sess_ops->DefaultTime2Wait);
@@ -2628,7 +2616,7 @@ iscsit_build_datain_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                hdr->statsn             = cpu_to_be32(0xFFFFFFFF);
 
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->datasn             = cpu_to_be32(datain->data_sn);
        hdr->offset             = cpu_to_be32(datain->offset);
 
@@ -2839,7 +2827,7 @@ iscsit_build_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Logout Response ITT: 0x%08x StatSN:"
                " 0x%08x Response: 0x%02x CID: %hu on CID: %hu\n",
@@ -2902,7 +2890,7 @@ iscsit_build_nopin_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                iscsit_increment_maxcmdsn(cmd, conn->sess);
 
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built NOPIN %s Response ITT: 0x%08x, TTT: 0x%08x,"
                " StatSN: 0x%08x, Length %u\n", (nopout_response) ?
@@ -3049,7 +3037,7 @@ static int iscsit_send_r2t(
        hdr->ttt                = cpu_to_be32(r2t->targ_xfer_tag);
        hdr->statsn             = cpu_to_be32(conn->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->r2tsn              = cpu_to_be32(r2t->r2t_sn);
        hdr->data_offset        = cpu_to_be32(r2t->offset);
        hdr->data_length        = cpu_to_be32(r2t->xfer_len);
@@ -3202,7 +3190,7 @@ void iscsit_build_rsp_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built SCSI Response, ITT: 0x%08x, StatSN: 0x%08x,"
                " Response: 0x%02x, SAM Status: 0x%02x, CID: %hu\n",
@@ -3321,7 +3309,7 @@ iscsit_build_task_mgt_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Task Management Response ITT: 0x%08x,"
                " StatSN: 0x%08x, Response: 0x%02x, CID: %hu\n",
@@ -3399,6 +3387,7 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
        int target_name_printed;
        unsigned char buf[ISCSI_IQN_LEN+12]; /* iqn + "TargetName=" + \0 */
        unsigned char *text_in = cmd->text_in_ptr, *text_ptr = NULL;
+       bool active;
 
        buffer_len = min(conn->conn_ops->MaxRecvDataSegmentLength,
                         SENDTARGETS_BUF_LIMIT);
@@ -3452,19 +3441,18 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                        }
 
                        spin_lock(&tpg->tpg_state_lock);
-                       if ((tpg->tpg_state == TPG_STATE_FREE) ||
-                           (tpg->tpg_state == TPG_STATE_INACTIVE)) {
-                               spin_unlock(&tpg->tpg_state_lock);
-                               continue;
-                       }
+                       active = (tpg->tpg_state == TPG_STATE_ACTIVE);
                        spin_unlock(&tpg->tpg_state_lock);
 
+                       if (!active && tpg->tpg_attrib.tpg_enabled_sendtargets)
+                               continue;
+
                        spin_lock(&tpg->tpg_np_lock);
                        list_for_each_entry(tpg_np, &tpg->tpg_gnp_list,
                                                tpg_np_list) {
                                struct iscsi_np *np = tpg_np->tpg_np;
                                bool inaddr_any = iscsit_check_inaddr_any(np);
-                               char *fmt_str;
+                               struct sockaddr_storage *sockaddr;
 
                                if (np->np_network_transport != network_transport)
                                        continue;
@@ -3492,15 +3480,15 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                                        }
                                }
 
-                               if (np->np_sockaddr.ss_family == AF_INET6)
-                                       fmt_str = "TargetAddress=[%s]:%hu,%hu";
+                               if (inaddr_any)
+                                       sockaddr = &conn->local_sockaddr;
                                else
-                                       fmt_str = "TargetAddress=%s:%hu,%hu";
+                                       sockaddr = &np->np_sockaddr;
 
-                               len = sprintf(buf, fmt_str,
-                                       inaddr_any ? conn->local_ip : np->np_ip,
-                                       np->np_port,
-                                       tpg->tpgt);
+                               len = sprintf(buf, "TargetAddress="
+                                             "%pISpc,%hu",
+                                             sockaddr,
+                                             tpg->tpgt);
                                len += 1;
 
                                if ((len + payload_len) > buffer_len) {
@@ -3576,7 +3564,7 @@ iscsit_build_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
         */
        cmd->maxcmdsn_inc = 0;
        hdr->exp_cmdsn = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Text Response: ITT: 0x%08x, TTT: 0x%08x, StatSN: 0x%08x,"
                " Length: %u, CID: %hu F: %d C: %d\n", cmd->init_task_tag,
@@ -3654,7 +3642,7 @@ iscsit_build_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
        cmd->stat_sn            = conn->stat_sn++;
        hdr->statsn             = cpu_to_be32(cmd->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
 }
 EXPORT_SYMBOL(iscsit_build_reject);
index 7d0f9c0..4cf2c0f 100644 (file)
@@ -10,10 +10,10 @@ extern int iscsit_access_np(struct iscsi_np *, struct iscsi_portal_group *);
 extern void iscsit_login_kref_put(struct kref *);
 extern int iscsit_deaccess_np(struct iscsi_np *, struct iscsi_portal_group *,
                                struct iscsi_tpg_np *);
-extern bool iscsit_check_np_match(struct __kernel_sockaddr_storage *,
+extern bool iscsit_check_np_match(struct sockaddr_storage *,
                                struct iscsi_np *, int);
-extern struct iscsi_np *iscsit_add_np(struct __kernel_sockaddr_storage *,
-                               char *, int);
+extern struct iscsi_np *iscsit_add_np(struct sockaddr_storage *,
+                               int);
 extern int iscsit_reset_np_thread(struct iscsi_np *, struct iscsi_tpg_np *,
                                struct iscsi_portal_group *, bool);
 extern int iscsit_del_np(struct iscsi_np *);
index c1898c8..c7461d7 100644 (file)
@@ -99,7 +99,7 @@ static ssize_t lio_target_np_store_sctp(
                 * Use existing np->np_sockaddr for SCTP network portal reference
                 */
                tpg_np_sctp = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                                       np->np_ip, tpg_np, ISCSI_SCTP_TCP);
+                                       tpg_np, ISCSI_SCTP_TCP);
                if (!tpg_np_sctp || IS_ERR(tpg_np_sctp))
                        goto out;
        } else {
@@ -177,7 +177,7 @@ static ssize_t lio_target_np_store_iser(
                }
 
                tpg_np_iser = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                               np->np_ip, tpg_np, ISCSI_INFINIBAND);
+                               tpg_np, ISCSI_INFINIBAND);
                if (IS_ERR(tpg_np_iser)) {
                        rc = PTR_ERR(tpg_np_iser);
                        goto out;
@@ -220,7 +220,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        struct iscsi_portal_group *tpg;
        struct iscsi_tpg_np *tpg_np;
        char *str, *str2, *ip_str, *port_str;
-       struct __kernel_sockaddr_storage sockaddr;
+       struct sockaddr_storage sockaddr;
        struct sockaddr_in *sock_in;
        struct sockaddr_in6 *sock_in6;
        unsigned long port;
@@ -235,7 +235,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        memset(buf, 0, MAX_PORTAL_LEN + 1);
        snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
 
-       memset(&sockaddr, 0, sizeof(struct __kernel_sockaddr_storage));
+       memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
 
        str = strstr(buf, "[");
        if (str) {
@@ -248,8 +248,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                        return ERR_PTR(-EINVAL);
                }
                str++; /* Skip over leading "[" */
-               *str2 = '\0'; /* Terminate the IPv6 address */
-               str2++; /* Skip over the "]" */
+               *str2 = '\0'; /* Terminate the unbracketed IPv6 address */
+               str2++; /* Skip over the \0 */
                port_str = strstr(str2, ":");
                if (!port_str) {
                        pr_err("Unable to locate \":port\""
@@ -267,7 +267,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                sock_in6 = (struct sockaddr_in6 *)&sockaddr;
                sock_in6->sin6_family = AF_INET6;
                sock_in6->sin6_port = htons((unsigned short)port);
-               ret = in6_pton(str, IPV6_ADDRESS_SPACE,
+               ret = in6_pton(str, -1,
                                (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
                if (ret <= 0) {
                        pr_err("in6_pton returned: %d\n", ret);
@@ -316,7 +316,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
         * sys/kernel/config/iscsi/$IQN/$TPG/np/$IP:$PORT/
         *
         */
-       tpg_np = iscsit_tpg_add_network_portal(tpg, &sockaddr, str, NULL,
+       tpg_np = iscsit_tpg_add_network_portal(tpg, &sockaddr, NULL,
                                ISCSI_TCP);
        if (IS_ERR(tpg_np)) {
                iscsit_put_tpg(tpg);
@@ -344,8 +344,8 @@ static void lio_target_call_delnpfromtpg(
 
        se_tpg = &tpg->tpg_se_tpg;
        pr_debug("LIO_Target_ConfigFS: DEREGISTER -> %s TPGT: %hu"
-               " PORTAL: %s:%hu\n", config_item_name(&se_tpg->se_tpg_wwn->wwn_group.cg_item),
-               tpg->tpgt, tpg_np->tpg_np->np_ip, tpg_np->tpg_np->np_port);
+               " PORTAL: %pISpc\n", config_item_name(&se_tpg->se_tpg_wwn->wwn_group.cg_item),
+               tpg->tpgt, &tpg_np->tpg_np->np_sockaddr);
 
        ret = iscsit_tpg_del_network_portal(tpg, tpg_np);
        if (ret < 0)
@@ -656,6 +656,7 @@ static ssize_t lio_target_nacl_show_info(
        struct iscsi_conn *conn;
        struct se_session *se_sess;
        ssize_t rb = 0;
+       u32 max_cmd_sn;
 
        spin_lock_bh(&se_nacl->nacl_sess_lock);
        se_sess = se_nacl->nacl_sess;
@@ -703,11 +704,12 @@ static ssize_t lio_target_nacl_show_info(
                                " Values]-----------------------\n");
                rb += sprintf(page+rb, "  CmdSN/WR  :  CmdSN/WC  :  ExpCmdSN"
                                "  :  MaxCmdSN  :     ITT    :     TTT\n");
+               max_cmd_sn = (u32) atomic_read(&sess->max_cmd_sn);
                rb += sprintf(page+rb, " 0x%08x   0x%08x   0x%08x   0x%08x"
                                "   0x%08x   0x%08x\n",
                        sess->cmdsn_window,
-                       (sess->max_cmd_sn - sess->exp_cmd_sn) + 1,
-                       sess->exp_cmd_sn, sess->max_cmd_sn,
+                       (max_cmd_sn - sess->exp_cmd_sn) + 1,
+                       sess->exp_cmd_sn, max_cmd_sn,
                        sess->init_task_tag, sess->targ_xfer_tag);
                rb += sprintf(page+rb, "----------------------[iSCSI"
                                " Connections]-------------------------\n");
@@ -751,7 +753,7 @@ static ssize_t lio_target_nacl_show_info(
                                break;
                        }
 
-                       rb += sprintf(page+rb, "   Address %s %s", conn->login_ip,
+                       rb += sprintf(page+rb, "   Address %pISc %s", &conn->login_sockaddr,
                                (conn->network_transport == ISCSI_TCP) ?
                                "TCP" : "SCTP");
                        rb += sprintf(page+rb, "  StatSN: 0x%08x\n",
@@ -1010,6 +1012,11 @@ TPG_ATTR(t10_pi, S_IRUGO | S_IWUSR);
  */
 DEF_TPG_ATTRIB(fabric_prot_type);
 TPG_ATTR(fabric_prot_type, S_IRUGO | S_IWUSR);
+/*
+ * Define iscsi_tpg_attrib_s_tpg_enabled_sendtargets
+ */
+DEF_TPG_ATTRIB(tpg_enabled_sendtargets);
+TPG_ATTR(tpg_enabled_sendtargets, S_IRUGO | S_IWUSR);
 
 static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = {
        &iscsi_tpg_attrib_authentication.attr,
@@ -1024,6 +1031,7 @@ static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = {
        &iscsi_tpg_attrib_default_erl.attr,
        &iscsi_tpg_attrib_t10_pi.attr,
        &iscsi_tpg_attrib_fabric_prot_type.attr,
+       &iscsi_tpg_attrib_tpg_enabled_sendtargets.attr,
        NULL,
 };
 
index 5fabcd3..0382fa2 100644 (file)
@@ -47,19 +47,19 @@ void iscsit_determine_maxcmdsn(struct iscsi_session *sess)
         * core_set_queue_depth_for_node().
         */
        sess->cmdsn_window = se_nacl->queue_depth;
-       sess->max_cmd_sn = (sess->max_cmd_sn + se_nacl->queue_depth) - 1;
+       atomic_add(se_nacl->queue_depth - 1, &sess->max_cmd_sn);
 }
 
 void iscsit_increment_maxcmdsn(struct iscsi_cmd *cmd, struct iscsi_session *sess)
 {
+       u32 max_cmd_sn;
+
        if (cmd->immediate_cmd || cmd->maxcmdsn_inc)
                return;
 
        cmd->maxcmdsn_inc = 1;
 
-       mutex_lock(&sess->cmdsn_mutex);
-       sess->max_cmd_sn += 1;
-       pr_debug("Updated MaxCmdSN to 0x%08x\n", sess->max_cmd_sn);
-       mutex_unlock(&sess->cmdsn_mutex);
+       max_cmd_sn = atomic_inc_return(&sess->max_cmd_sn);
+       pr_debug("Updated MaxCmdSN to 0x%08x\n", max_cmd_sn);
 }
 EXPORT_SYMBOL(iscsit_increment_maxcmdsn);
index 7e8f65e..96e78c8 100644 (file)
@@ -331,7 +331,7 @@ static int iscsi_login_zero_tsih_s1(
         * The FFP CmdSN window values will be allocated from the TPG's
         * Initiator Node's ACL once the login has been successfully completed.
         */
-       sess->max_cmd_sn        = be32_to_cpu(pdu->cmdsn);
+       atomic_set(&sess->max_cmd_sn, be32_to_cpu(pdu->cmdsn));
 
        sess->sess_ops = kzalloc(sizeof(struct iscsi_sess_ops), GFP_KERNEL);
        if (!sess->sess_ops) {
@@ -729,9 +729,9 @@ void iscsi_post_login_handler(
                        stop_timer = 1;
                }
 
-               pr_debug("iSCSI Login successful on CID: %hu from %s to"
-                       " %s:%hu,%hu\n", conn->cid, conn->login_ip,
-                       conn->local_ip, conn->local_port, tpg->tpgt);
+               pr_debug("iSCSI Login successful on CID: %hu from %pISpc to"
+                       " %pISpc,%hu\n", conn->cid, &conn->login_sockaddr,
+                       &conn->local_sockaddr, tpg->tpgt);
 
                list_add_tail(&conn->conn_list, &sess->sess_conn_list);
                atomic_inc(&sess->nconn);
@@ -776,8 +776,8 @@ void iscsi_post_login_handler(
        pr_debug("Moving to TARG_SESS_STATE_LOGGED_IN.\n");
        sess->session_state = TARG_SESS_STATE_LOGGED_IN;
 
-       pr_debug("iSCSI Login successful on CID: %hu from %s to %s:%hu,%hu\n",
-               conn->cid, conn->login_ip, conn->local_ip, conn->local_port,
+       pr_debug("iSCSI Login successful on CID: %hu from %pISpc to %pISpc,%hu\n",
+               conn->cid, &conn->login_sockaddr, &conn->local_sockaddr,
                tpg->tpgt);
 
        spin_lock_bh(&sess->conn_lock);
@@ -823,8 +823,8 @@ static void iscsi_handle_login_thread_timeout(unsigned long data)
        struct iscsi_np *np = (struct iscsi_np *) data;
 
        spin_lock_bh(&np->np_thread_lock);
-       pr_err("iSCSI Login timeout on Network Portal %s:%hu\n",
-                       np->np_ip, np->np_port);
+       pr_err("iSCSI Login timeout on Network Portal %pISpc\n",
+                       &np->np_sockaddr);
 
        if (np->np_login_timer_flags & ISCSI_TF_STOP) {
                spin_unlock_bh(&np->np_thread_lock);
@@ -877,7 +877,7 @@ static void iscsi_stop_login_thread_timer(struct iscsi_np *np)
 
 int iscsit_setup_np(
        struct iscsi_np *np,
-       struct __kernel_sockaddr_storage *sockaddr)
+       struct sockaddr_storage *sockaddr)
 {
        struct socket *sock = NULL;
        int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len;
@@ -916,7 +916,7 @@ int iscsit_setup_np(
         * in iscsi_target_configfs.c code..
         */
        memcpy(&np->np_sockaddr, sockaddr,
-                       sizeof(struct __kernel_sockaddr_storage));
+                       sizeof(struct sockaddr_storage));
 
        if (sockaddr->ss_family == AF_INET6)
                len = sizeof(struct sockaddr_in6);
@@ -975,7 +975,7 @@ fail:
 
 int iscsi_target_setup_login_socket(
        struct iscsi_np *np,
-       struct __kernel_sockaddr_storage *sockaddr)
+       struct sockaddr_storage *sockaddr)
 {
        struct iscsit_transport *t;
        int rc;
@@ -1015,44 +1015,42 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in6, &err, 1);
                if (!rc) {
-                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr))
-                               snprintf(conn->login_ip, sizeof(conn->login_ip), "[%pI6c]",
-                                       &sock_in6.sin6_addr.in6_u);
-                       else
-                               snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI4",
-                                       &sock_in6.sin6_addr.s6_addr32[3]);
-                       conn->login_port = ntohs(sock_in6.sin6_port);
+                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) {
+                               memcpy(&conn->login_sockaddr, &sock_in6, sizeof(sock_in6));
+                       } else {
+                               /* Pretend to be an ipv4 socket */
+                               sock_in.sin_family = AF_INET;
+                               sock_in.sin_port = sock_in6.sin6_port;
+                               memcpy(&sock_in.sin_addr, &sock_in6.sin6_addr.s6_addr32[3], 4);
+                               memcpy(&conn->login_sockaddr, &sock_in, sizeof(sock_in));
+                       }
                }
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in6, &err, 0);
                if (!rc) {
-                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr))
-                               snprintf(conn->local_ip, sizeof(conn->local_ip), "[%pI6c]",
-                                       &sock_in6.sin6_addr.in6_u);
-                       else
-                               snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI4",
-                                       &sock_in6.sin6_addr.s6_addr32[3]);
-                       conn->local_port = ntohs(sock_in6.sin6_port);
+                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) {
+                               memcpy(&conn->local_sockaddr, &sock_in6, sizeof(sock_in6));
+                       } else {
+                               /* Pretend to be an ipv4 socket */
+                               sock_in.sin_family = AF_INET;
+                               sock_in.sin_port = sock_in6.sin6_port;
+                               memcpy(&sock_in.sin_addr, &sock_in6.sin6_addr.s6_addr32[3], 4);
+                               memcpy(&conn->local_sockaddr, &sock_in, sizeof(sock_in));
+                       }
                }
        } else {
                memset(&sock_in, 0, sizeof(struct sockaddr_in));
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in, &err, 1);
-               if (!rc) {
-                       sprintf(conn->login_ip, "%pI4",
-                                       &sock_in.sin_addr.s_addr);
-                       conn->login_port = ntohs(sock_in.sin_port);
-               }
+               if (!rc)
+                       memcpy(&conn->login_sockaddr, &sock_in, sizeof(sock_in));
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in, &err, 0);
-               if (!rc) {
-                       sprintf(conn->local_ip, "%pI4",
-                                       &sock_in.sin_addr.s_addr);
-                       conn->local_port = ntohs(sock_in.sin_port);
-               }
+               if (!rc)
+                       memcpy(&conn->local_sockaddr, &sock_in, sizeof(sock_in));
        }
 
        return 0;
@@ -1302,8 +1300,8 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
        spin_lock_bh(&np->np_thread_lock);
        if (np->np_thread_state != ISCSI_NP_THREAD_ACTIVE) {
                spin_unlock_bh(&np->np_thread_lock);
-               pr_err("iSCSI Network Portal on %s:%hu currently not"
-                       " active.\n", np->np_ip, np->np_port);
+               pr_err("iSCSI Network Portal on %pISpc currently not"
+                       " active.\n", &np->np_sockaddr);
                iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
                                ISCSI_LOGIN_STATUS_SVC_UNAVAILABLE);
                goto new_sess_out;
@@ -1312,9 +1310,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
 
        conn->network_transport = np->np_network_transport;
 
-       pr_debug("Received iSCSI login request from %s on %s Network"
-               " Portal %s:%hu\n", conn->login_ip, np->np_transport->name,
-               conn->local_ip, conn->local_port);
+       pr_debug("Received iSCSI login request from %pISpc on %s Network"
+               " Portal %pISpc\n", &conn->login_sockaddr, np->np_transport->name,
+               &conn->local_sockaddr);
 
        pr_debug("Moving to TARG_CONN_STATE_IN_LOGIN.\n");
        conn->conn_state        = TARG_CONN_STATE_IN_LOGIN;
index 57aa0d0..b597aa2 100644 (file)
@@ -5,9 +5,9 @@ extern int iscsi_login_setup_crypto(struct iscsi_conn *);
 extern int iscsi_check_for_session_reinstatement(struct iscsi_conn *);
 extern int iscsi_login_post_auth_non_zero_tsih(struct iscsi_conn *, u16, u32);
 extern int iscsit_setup_np(struct iscsi_np *,
-                               struct __kernel_sockaddr_storage *);
+                               struct sockaddr_storage *);
 extern int iscsi_target_setup_login_socket(struct iscsi_np *,
-                               struct __kernel_sockaddr_storage *);
+                               struct sockaddr_storage *);
 extern int iscsit_accept_np(struct iscsi_np *, struct iscsi_conn *);
 extern int iscsit_get_login_rx(struct iscsi_conn *, struct iscsi_login *);
 extern int iscsit_put_login_tx(struct iscsi_conn *, struct iscsi_login *, u32);
index f9cde91..5c964c0 100644 (file)
@@ -341,7 +341,6 @@ static int iscsi_target_check_first_request(
 static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_login *login)
 {
        u32 padding = 0;
-       struct iscsi_session *sess = conn->sess;
        struct iscsi_login_rsp *login_rsp;
 
        login_rsp = (struct iscsi_login_rsp *) login->rsp;
@@ -353,7 +352,7 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
        login_rsp->itt                  = login->init_task_tag;
        login_rsp->statsn               = cpu_to_be32(conn->stat_sn++);
        login_rsp->exp_cmdsn            = cpu_to_be32(conn->sess->exp_cmd_sn);
-       login_rsp->max_cmdsn            = cpu_to_be32(conn->sess->max_cmd_sn);
+       login_rsp->max_cmdsn            = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Sending Login Response, Flags: 0x%02x, ITT: 0x%08x,"
                " ExpCmdSN; 0x%08x, MaxCmdSN: 0x%08x, StatSN: 0x%08x, Length:"
@@ -382,10 +381,6 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
                goto err;
 
        login->rsp_length               = 0;
-       mutex_lock(&sess->cmdsn_mutex);
-       login_rsp->exp_cmdsn            = cpu_to_be32(sess->exp_cmd_sn);
-       login_rsp->max_cmdsn            = cpu_to_be32(sess->max_cmd_sn);
-       mutex_unlock(&sess->cmdsn_mutex);
 
        return 0;
 
index 5e1349a..9dd94ff 100644 (file)
@@ -430,7 +430,7 @@ static ssize_t iscsi_stat_tgt_attr_show_attr_fail_intr_addr(
        int ret;
 
        spin_lock(&lstat->lock);
-       ret = snprintf(page, PAGE_SIZE, "%s\n", lstat->last_intr_fail_ip_addr);
+       ret = snprintf(page, PAGE_SIZE, "%pISc\n", &lstat->last_intr_fail_sockaddr);
        spin_unlock(&lstat->lock);
 
        return ret;
index cf59c39..11320df 100644 (file)
@@ -50,7 +50,7 @@ u8 iscsit_tmr_abort_task(
                pr_err("Unable to locate RefTaskTag: 0x%08x on CID:"
                        " %hu.\n", hdr->rtt, conn->cid);
                return (iscsi_sna_gte(be32_to_cpu(hdr->refcmdsn), conn->sess->exp_cmd_sn) &&
-                       iscsi_sna_lte(be32_to_cpu(hdr->refcmdsn), conn->sess->max_cmd_sn)) ?
+                       iscsi_sna_lte(be32_to_cpu(hdr->refcmdsn), (u32) atomic_read(&conn->sess->max_cmd_sn))) ?
                        ISCSI_TMF_RSP_COMPLETE : ISCSI_TMF_RSP_NO_TASK;
        }
        if (ref_cmd->cmd_sn != be32_to_cpu(hdr->refcmdsn)) {
index 968068f..23c95cd 100644 (file)
@@ -226,6 +226,7 @@ static void iscsit_set_default_tpg_attribs(struct iscsi_portal_group *tpg)
        a->default_erl = TA_DEFAULT_ERL;
        a->t10_pi = TA_DEFAULT_T10_PI;
        a->fabric_prot_type = TA_DEFAULT_FABRIC_PROT_TYPE;
+       a->tpg_enabled_sendtargets = TA_DEFAULT_TPG_ENABLED_SENDTARGETS;
 }
 
 int iscsit_tpg_add_portal_group(struct iscsi_tiqn *tiqn, struct iscsi_portal_group *tpg)
@@ -430,7 +431,7 @@ struct iscsi_tpg_np *iscsit_tpg_locate_child_np(
 
 static bool iscsit_tpg_check_network_portal(
        struct iscsi_tiqn *tiqn,
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
        struct iscsi_portal_group *tpg;
@@ -459,8 +460,7 @@ static bool iscsit_tpg_check_network_portal(
 
 struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
        struct iscsi_portal_group *tpg,
-       struct __kernel_sockaddr_storage *sockaddr,
-       char *ip_str,
+       struct sockaddr_storage *sockaddr,
        struct iscsi_tpg_np *tpg_np_parent,
        int network_transport)
 {
@@ -470,8 +470,8 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
        if (!tpg_np_parent) {
                if (iscsit_tpg_check_network_portal(tpg->tpg_tiqn, sockaddr,
                                network_transport)) {
-                       pr_err("Network Portal: %s already exists on a"
-                               " different TPG on %s\n", ip_str,
+                       pr_err("Network Portal: %pISc already exists on a"
+                               " different TPG on %s\n", sockaddr,
                                tpg->tpg_tiqn->tiqn);
                        return ERR_PTR(-EEXIST);
                }
@@ -484,7 +484,7 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
                return ERR_PTR(-ENOMEM);
        }
 
-       np = iscsit_add_np(sockaddr, ip_str, network_transport);
+       np = iscsit_add_np(sockaddr, network_transport);
        if (IS_ERR(np)) {
                kfree(tpg_np);
                return ERR_CAST(np);
@@ -514,8 +514,8 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
                spin_unlock(&tpg_np_parent->tpg_np_parent_lock);
        }
 
-       pr_debug("CORE[%s] - Added Network Portal: %s:%hu,%hu on %s\n",
-               tpg->tpg_tiqn->tiqn, np->np_ip, np->np_port, tpg->tpgt,
+       pr_debug("CORE[%s] - Added Network Portal: %pISpc,%hu on %s\n",
+               tpg->tpg_tiqn->tiqn, &np->np_sockaddr, tpg->tpgt,
                np->np_transport->name);
 
        return tpg_np;
@@ -528,8 +528,8 @@ static int iscsit_tpg_release_np(
 {
        iscsit_clear_tpg_np_login_thread(tpg_np, tpg, true);
 
-       pr_debug("CORE[%s] - Removed Network Portal: %s:%hu,%hu on %s\n",
-               tpg->tpg_tiqn->tiqn, np->np_ip, np->np_port, tpg->tpgt,
+       pr_debug("CORE[%s] - Removed Network Portal: %pISpc,%hu on %s\n",
+               tpg->tpg_tiqn->tiqn, &np->np_sockaddr, tpg->tpgt,
                np->np_transport->name);
 
        tpg_np->tpg_np = NULL;
@@ -892,3 +892,21 @@ int iscsit_ta_fabric_prot_type(
 
        return 0;
 }
+
+int iscsit_ta_tpg_enabled_sendtargets(
+       struct iscsi_portal_group *tpg,
+       u32 flag)
+{
+       struct iscsi_tpg_attrib *a = &tpg->tpg_attrib;
+
+       if ((flag != 0) && (flag != 1)) {
+               pr_err("Illegal value %d\n", flag);
+               return -EINVAL;
+       }
+
+       a->tpg_enabled_sendtargets = flag;
+       pr_debug("iSCSI_TPG[%hu] - TPG enabled bit required for SendTargets:"
+               " %s\n", tpg->tpgt, (a->tpg_enabled_sendtargets) ? "ON" : "OFF");
+
+       return 0;
+}
index 95ff5bd..9db32bd 100644 (file)
@@ -22,7 +22,7 @@ extern struct iscsi_node_attrib *iscsit_tpg_get_node_attrib(struct iscsi_session
 extern void iscsit_tpg_del_external_nps(struct iscsi_tpg_np *);
 extern struct iscsi_tpg_np *iscsit_tpg_locate_child_np(struct iscsi_tpg_np *, int);
 extern struct iscsi_tpg_np *iscsit_tpg_add_network_portal(struct iscsi_portal_group *,
-                       struct __kernel_sockaddr_storage *, char *, struct iscsi_tpg_np *,
+                       struct sockaddr_storage *, struct iscsi_tpg_np *,
                        int);
 extern int iscsit_tpg_del_network_portal(struct iscsi_portal_group *,
                        struct iscsi_tpg_np *);
@@ -40,5 +40,6 @@ extern int iscsit_ta_demo_mode_discovery(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_default_erl(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_t10_pi(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_fabric_prot_type(struct iscsi_portal_group *, u32);
+extern int iscsit_ta_tpg_enabled_sendtargets(struct iscsi_portal_group *, u32);
 
 #endif /* ISCSI_TARGET_TPG_H */
index a2bff07..428b0d9 100644 (file)
@@ -233,6 +233,7 @@ struct iscsi_r2t *iscsit_get_holder_for_r2tsn(
 
 static inline int iscsit_check_received_cmdsn(struct iscsi_session *sess, u32 cmdsn)
 {
+       u32 max_cmdsn;
        int ret;
 
        /*
@@ -241,10 +242,10 @@ static inline int iscsit_check_received_cmdsn(struct iscsi_session *sess, u32 cm
         * or order CmdSNs due to multiple connection sessions and/or
         * CRC failures.
         */
-       if (iscsi_sna_gt(cmdsn, sess->max_cmd_sn)) {
+       max_cmdsn = atomic_read(&sess->max_cmd_sn);
+       if (iscsi_sna_gt(cmdsn, max_cmdsn)) {
                pr_err("Received CmdSN: 0x%08x is greater than"
-                      " MaxCmdSN: 0x%08x, ignoring.\n", cmdsn,
-                      sess->max_cmd_sn);
+                      " MaxCmdSN: 0x%08x, ignoring.\n", cmdsn, max_cmdsn);
                ret = CMDSN_MAXCMDSN_OVERRUN;
 
        } else if (cmdsn == sess->exp_cmd_sn) {
@@ -1371,6 +1372,33 @@ int tx_data(
        return iscsit_do_tx_data(conn, &c);
 }
 
+static bool sockaddr_equal(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+       switch (x->ss_family) {
+       case AF_INET: {
+               struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+               struct sockaddr_in *siny = (struct sockaddr_in *)y;
+               if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+                       return false;
+               if (sinx->sin_port != siny->sin_port)
+                       return false;
+               break;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+               struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+               if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+                       return false;
+               if (sinx->sin6_port != siny->sin6_port)
+                       return false;
+               break;
+       }
+       default:
+               return false;
+       }
+       return true;
+}
+
 void iscsit_collect_login_stats(
        struct iscsi_conn *conn,
        u8 status_class,
@@ -1387,7 +1415,7 @@ void iscsit_collect_login_stats(
        ls = &tiqn->login_stats;
 
        spin_lock(&ls->lock);
-       if (!strcmp(conn->login_ip, ls->last_intr_fail_ip_addr) &&
+       if (sockaddr_equal(&conn->login_sockaddr, &ls->last_intr_fail_sockaddr) &&
            ((get_jiffies_64() - ls->last_fail_time) < 10)) {
                /* We already have the failure info for this login */
                spin_unlock(&ls->lock);
@@ -1427,8 +1455,7 @@ void iscsit_collect_login_stats(
 
                ls->last_intr_fail_ip_family = conn->login_family;
 
-               snprintf(ls->last_intr_fail_ip_addr, IPV6_ADDRESS_SPACE,
-                               "%s", conn->login_ip);
+               ls->last_intr_fail_sockaddr = conn->login_sockaddr;
                ls->last_fail_time = get_jiffies_64();
        }
 
index a556bde..5bc85ff 100644 (file)
@@ -526,7 +526,7 @@ static inline struct tcm_loop_tpg *tl_tpg(struct se_portal_group *se_tpg)
 static char *tcm_loop_get_endpoint_wwn(struct se_portal_group *se_tpg)
 {
        /*
-        * Return the passed NAA identifier for the SAS Target Port
+        * Return the passed NAA identifier for the Target Port
         */
        return &tl_tpg(se_tpg)->tl_hba->tl_wwn_address[0];
 }
@@ -845,7 +845,7 @@ static int tcm_loop_make_nexus(
                transport_free_session(tl_nexus->se_sess);
                goto out;
        }
-       /* Now, register the SAS I_T Nexus as active. */
+       /* Now, register the I_T Nexus as active. */
        transport_register_session(se_tpg, tl_nexus->se_sess->se_node_acl,
                        tl_nexus->se_sess, tl_nexus);
        tl_tpg->tl_nexus = tl_nexus;
@@ -884,7 +884,7 @@ static int tcm_loop_drop_nexus(
                " %s Initiator Port: %s\n", tcm_loop_dump_proto_id(tpg->tl_hba),
                tl_nexus->se_sess->se_node_acl->initiatorname);
        /*
-        * Release the SCSI I_T Nexus to the emulated SAS Target Port
+        * Release the SCSI I_T Nexus to the emulated Target Port
         */
        transport_deregister_session(tl_nexus->se_sess);
        tpg->tl_nexus = NULL;
@@ -1034,6 +1034,11 @@ static ssize_t tcm_loop_tpg_store_transport_status(
        }
        if (!strncmp(page, "offline", 7)) {
                tl_tpg->tl_transport_status = TCM_TRANSPORT_OFFLINE;
+               if (tl_tpg->tl_nexus) {
+                       struct se_session *tl_sess = tl_tpg->tl_nexus->se_sess;
+
+                       core_allocate_nexus_loss_ua(tl_sess->se_node_acl);
+               }
                return count;
        }
        return -EINVAL;
@@ -1077,7 +1082,7 @@ static struct se_portal_group *tcm_loop_make_naa_tpg(
        tl_tpg->tl_hba = tl_hba;
        tl_tpg->tl_tpgt = tpgt;
        /*
-        * Register the tl_tpg as a emulated SAS TCM Target Endpoint
+        * Register the tl_tpg as a emulated TCM Target Endpoint
         */
        ret = core_tpg_register(wwn, &tl_tpg->tl_se_tpg, tl_hba->tl_proto_id);
        if (ret < 0)
@@ -1102,11 +1107,11 @@ static void tcm_loop_drop_naa_tpg(
        tl_hba = tl_tpg->tl_hba;
        tpgt = tl_tpg->tl_tpgt;
        /*
-        * Release the I_T Nexus for the Virtual SAS link if present
+        * Release the I_T Nexus for the Virtual target link if present
         */
        tcm_loop_drop_nexus(tl_tpg);
        /*
-        * Deregister the tl_tpg as a emulated SAS TCM Target Endpoint
+        * Deregister the tl_tpg as a emulated TCM Target Endpoint
         */
        core_tpg_deregister(se_tpg);
 
@@ -1199,8 +1204,9 @@ static void tcm_loop_drop_scsi_hba(
                                struct tcm_loop_hba, tl_hba_wwn);
 
        pr_debug("TCM_Loop_ConfigFS: Deallocating emulated Target"
-               " SAS Address: %s at Linux/SCSI Host ID: %d\n",
-               tl_hba->tl_wwn_address, tl_hba->sh->host_no);
+               " %s Address: %s at Linux/SCSI Host ID: %d\n",
+               tcm_loop_dump_proto_id(tl_hba), tl_hba->tl_wwn_address,
+               tl_hba->sh->host_no);
        /*
         * Call device_unregister() on the original tl_hba->dev.
         * tcm_loop_fabric_scsi.c:tcm_loop_release_adapter() will
index 09e682b..dcc424a 100644 (file)
@@ -620,8 +620,6 @@ struct se_lun_acl *core_dev_init_initiator_node_lun_acl(
 
        lacl->mapped_lun = mapped_lun;
        lacl->se_lun_nacl = nacl;
-       snprintf(lacl->initiatorname, TRANSPORT_IQN_LEN, "%s",
-                nacl->initiatorname);
 
        return lacl;
 }
@@ -656,7 +654,7 @@ int core_dev_add_initiator_node_lun_acl(
                " InitiatorNode: %s\n", tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun, lacl->mapped_lun,
                (lun_access & TRANSPORT_LUNFLAGS_READ_WRITE) ? "RW" : "RO",
-               lacl->initiatorname);
+               nacl->initiatorname);
        /*
         * Check to see if there are any existing persistent reservation APTPL
         * pre-registrations that need to be enabled for this LUN ACL..
@@ -688,7 +686,7 @@ int core_dev_del_initiator_node_lun_acl(
                " InitiatorNode: %s Mapped LUN: %llu\n",
                tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun,
-               lacl->initiatorname, lacl->mapped_lun);
+               nacl->initiatorname, lacl->mapped_lun);
 
        return 0;
 }
@@ -701,7 +699,7 @@ void core_dev_free_initiator_node_lun_acl(
                " Mapped LUN: %llu\n", tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg),
                tpg->se_tpg_tfo->get_fabric_name(),
-               lacl->initiatorname, lacl->mapped_lun);
+               lacl->se_lun_nacl->initiatorname, lacl->mapped_lun);
 
        kfree(lacl);
 }
@@ -754,7 +752,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        dev->dev_link_magic = SE_DEV_LINK_MAGIC;
        dev->se_hba = hba;
        dev->transport = hba->backend->ops;
-       dev->prot_length = sizeof(struct se_dif_v1_tuple);
+       dev->prot_length = sizeof(struct t10_pi_tuple);
        dev->hba_index = hba->hba_index;
 
        INIT_LIST_HEAD(&dev->dev_list);
@@ -771,7 +769,6 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        spin_lock_init(&dev->se_tmr_lock);
        spin_lock_init(&dev->qf_cmd_lock);
        sema_init(&dev->caw_sem, 1);
-       atomic_set(&dev->dev_ordered_id, 0);
        INIT_LIST_HEAD(&dev->t10_wwn.t10_vpd_list);
        spin_lock_init(&dev->t10_wwn.t10_vpd_lock);
        INIT_LIST_HEAD(&dev->t10_pr.registration_list);
index 48a3698..be42429 100644 (file)
@@ -203,7 +203,7 @@ static ssize_t target_fabric_mappedlun_store_write_protect(
        pr_debug("%s_ConfigFS: Changed Initiator ACL: %s"
                " Mapped LUN: %llu Write Protect bit to %s\n",
                se_tpg->se_tpg_tfo->get_fabric_name(),
-               lacl->initiatorname, lacl->mapped_lun, (op) ? "ON" : "OFF");
+               se_nacl->initiatorname, lacl->mapped_lun, (op) ? "ON" : "OFF");
 
        return count;
 
index be9cefc..9522960 100644 (file)
@@ -184,3 +184,8 @@ core_delete_hba(struct se_hba *hba)
        kfree(hba);
        return 0;
 }
+
+bool target_sense_desc_format(struct se_device *dev)
+{
+       return dev->transport->get_blocks(dev) > U32_MAX;
+}
index e318ddb..0b4b2a6 100644 (file)
@@ -154,6 +154,38 @@ sbc_emulate_readcapacity_16(struct se_cmd *cmd)
        return 0;
 }
 
+static sense_reason_t
+sbc_emulate_startstop(struct se_cmd *cmd)
+{
+       unsigned char *cdb = cmd->t_task_cdb;
+
+       /*
+        * See sbc3r36 section 5.25
+        * Immediate bit should be set since there is nothing to complete
+        * POWER CONDITION MODIFIER 0h
+        */
+       if (!(cdb[1] & 1) || cdb[2] || cdb[3])
+               return TCM_INVALID_CDB_FIELD;
+
+       /*
+        * See sbc3r36 section 5.25
+        * POWER CONDITION 0h START_VALID - process START and LOEJ
+        */
+       if (cdb[4] >> 4 & 0xf)
+               return TCM_INVALID_CDB_FIELD;
+
+       /*
+        * See sbc3r36 section 5.25
+        * LOEJ 0h - nothing to load or unload
+        * START 1h - we are ready
+        */
+       if (!(cdb[4] & 1) || (cdb[4] & 2) || (cdb[4] & 4))
+               return TCM_INVALID_CDB_FIELD;
+
+       target_complete_cmd(cmd, SAM_STAT_GOOD);
+       return 0;
+}
+
 sector_t sbc_get_write_same_sectors(struct se_cmd *cmd)
 {
        u32 num_blocks;
@@ -960,6 +992,9 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops)
                               " than 1\n", sectors);
                        return TCM_INVALID_CDB_FIELD;
                }
+               if (sbc_check_dpofua(dev, cmd, cdb))
+                       return TCM_INVALID_CDB_FIELD;
+
                /*
                 * Double size because we have two buffers, note that
                 * zero is not an error..
@@ -1069,6 +1104,10 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops)
                size = 0;
                cmd->execute_cmd = sbc_emulate_noop;
                break;
+       case START_STOP:
+               size = 0;
+               cmd->execute_cmd = sbc_emulate_startstop;
+               break;
        default:
                ret = spc_parse_cdb(cmd, &size);
                if (ret)
@@ -1191,7 +1230,7 @@ void
 sbc_dif_generate(struct se_cmd *cmd)
 {
        struct se_device *dev = cmd->se_dev;
-       struct se_dif_v1_tuple *sdt;
+       struct t10_pi_tuple *sdt;
        struct scatterlist *dsg = cmd->t_data_sg, *psg;
        sector_t sector = cmd->t_task_lba;
        void *daddr, *paddr;
@@ -1203,7 +1242,7 @@ sbc_dif_generate(struct se_cmd *cmd)
                daddr = kmap_atomic(sg_page(dsg)) + dsg->offset;
 
                for (j = 0; j < psg->length;
-                               j += sizeof(struct se_dif_v1_tuple)) {
+                               j += sizeof(*sdt)) {
                        __u16 crc;
                        unsigned int avail;
 
@@ -1256,7 +1295,7 @@ sbc_dif_generate(struct se_cmd *cmd)
 }
 
 static sense_reason_t
-sbc_dif_v1_verify(struct se_cmd *cmd, struct se_dif_v1_tuple *sdt,
+sbc_dif_v1_verify(struct se_cmd *cmd, struct t10_pi_tuple *sdt,
                  __u16 crc, sector_t sector, unsigned int ei_lba)
 {
        __be16 csum;
@@ -1346,7 +1385,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors,
               unsigned int ei_lba, struct scatterlist *psg, int psg_off)
 {
        struct se_device *dev = cmd->se_dev;
-       struct se_dif_v1_tuple *sdt;
+       struct t10_pi_tuple *sdt;
        struct scatterlist *dsg = cmd->t_data_sg;
        sector_t sector = start;
        void *daddr, *paddr;
@@ -1361,7 +1400,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors,
 
                for (i = psg_off; i < psg->length &&
                                sector < start + sectors;
-                               i += sizeof(struct se_dif_v1_tuple)) {
+                               i += sizeof(*sdt)) {
                        __u16 crc;
                        unsigned int avail;
 
index f87d4ce..9413e1a 100644 (file)
@@ -484,8 +484,8 @@ static sense_reason_t
 spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
 {
        struct se_device *dev = cmd->se_dev;
-       int have_tp = 0;
-       int opt, min;
+       u32 mtl = 0;
+       int have_tp = 0, opt, min;
 
        /*
         * Following spc3r22 section 6.5.3 Block Limits VPD page, when
@@ -516,8 +516,15 @@ spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
 
        /*
         * Set MAXIMUM TRANSFER LENGTH
+        *
+        * XXX: Currently assumes single PAGE_SIZE per scatterlist for fabrics
+        * enforcing maximum HW scatter-gather-list entry limit
         */
-       put_unaligned_be32(dev->dev_attrib.hw_max_sectors, &buf[8]);
+       if (cmd->se_tfo->max_data_sg_nents) {
+               mtl = (cmd->se_tfo->max_data_sg_nents * PAGE_SIZE) /
+                      dev->dev_attrib.block_size;
+       }
+       put_unaligned_be32(min_not_zero(mtl, dev->dev_attrib.hw_max_sectors), &buf[8]);
 
        /*
         * Set OPTIMAL TRANSFER LENGTH
@@ -768,7 +775,12 @@ static int spc_modesense_control(struct se_cmd *cmd, u8 pc, u8 *p)
        if (pc == 1)
                goto out;
 
-       p[2] = 2;
+       /* GLTSD: No implicit save of log parameters */
+       p[2] = (1 << 1);
+       if (target_sense_desc_format(dev))
+               /* D_SENSE: Descriptor format sense data for 64bit sectors */
+               p[2] |= (1 << 2);
+
        /*
         * From spc4r23, 7.4.7 Control mode page
         *
@@ -1151,6 +1163,7 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd)
        unsigned char *rbuf;
        u8 ua_asc = 0, ua_ascq = 0;
        unsigned char buf[SE_SENSE_BUF];
+       bool desc_format = target_sense_desc_format(cmd->se_dev);
 
        memset(buf, 0, SE_SENSE_BUF);
 
@@ -1164,32 +1177,11 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd)
        if (!rbuf)
                return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-       if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq)) {
-               /*
-                * CURRENT ERROR, UNIT ATTENTION
-                */
-               buf[0] = 0x70;
-               buf[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION;
-
-               /*
-                * The Additional Sense Code (ASC) from the UNIT ATTENTION
-                */
-               buf[SPC_ASC_KEY_OFFSET] = ua_asc;
-               buf[SPC_ASCQ_KEY_OFFSET] = ua_ascq;
-               buf[7] = 0x0A;
-       } else {
-               /*
-                * CURRENT ERROR, NO SENSE
-                */
-               buf[0] = 0x70;
-               buf[SPC_SENSE_KEY_OFFSET] = NO_SENSE;
-
-               /*
-                * NO ADDITIONAL SENSE INFORMATION
-                */
-               buf[SPC_ASC_KEY_OFFSET] = 0x00;
-               buf[7] = 0x0A;
-       }
+       if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq))
+               scsi_build_sense_buffer(desc_format, buf, UNIT_ATTENTION,
+                                       ua_asc, ua_ascq);
+       else
+               scsi_build_sense_buffer(desc_format, buf, NO_SENSE, 0x0, 0x0);
 
        memcpy(rbuf, buf, min_t(u32, sizeof(buf), cmd->data_length));
        transport_kunmap_data_sg(cmd);
@@ -1418,9 +1410,6 @@ spc_parse_cdb(struct se_cmd *cmd, unsigned int *size)
                }
                break;
        default:
-               pr_warn("TARGET_CORE[%s]: Unsupported SCSI Opcode"
-                       " 0x%02x, sending CHECK_CONDITION.\n",
-                       cmd->se_tfo->get_fabric_name(), cdb[0]);
                return TCM_UNSUPPORTED_SCSI_OPCODE;
        }
 
index babde4a..2d0381d 100644 (file)
@@ -41,6 +41,7 @@
 #include "target_core_internal.h"
 #include "target_core_alua.h"
 #include "target_core_pr.h"
+#include "target_core_ua.h"
 
 extern struct se_device *g_lun0_dev;
 
@@ -83,6 +84,22 @@ struct se_node_acl *core_tpg_get_initiator_node_acl(
 }
 EXPORT_SYMBOL(core_tpg_get_initiator_node_acl);
 
+void core_allocate_nexus_loss_ua(
+       struct se_node_acl *nacl)
+{
+       struct se_dev_entry *deve;
+
+       if (!nacl)
+               return;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link)
+               core_scsi3_ua_allocate(deve, 0x29,
+                       ASCQ_29H_NEXUS_LOSS_OCCURRED);
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL(core_allocate_nexus_loss_ua);
+
 /*     core_tpg_add_node_to_devs():
  *
  *
index ce8574b..5bacc7b 100644 (file)
@@ -39,6 +39,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
 
 #include <target/target_core_base.h>
 #include <target/target_core_backend.h>
@@ -1074,6 +1075,55 @@ transport_set_vpd_ident(struct t10_vpd *vpd, unsigned char *page_83)
 }
 EXPORT_SYMBOL(transport_set_vpd_ident);
 
+static sense_reason_t
+target_check_max_data_sg_nents(struct se_cmd *cmd, struct se_device *dev,
+                              unsigned int size)
+{
+       u32 mtl;
+
+       if (!cmd->se_tfo->max_data_sg_nents)
+               return TCM_NO_SENSE;
+       /*
+        * Check if fabric enforced maximum SGL entries per I/O descriptor
+        * exceeds se_cmd->data_length.  If true, set SCF_UNDERFLOW_BIT +
+        * residual_count and reduce original cmd->data_length to maximum
+        * length based on single PAGE_SIZE entry scatter-lists.
+        */
+       mtl = (cmd->se_tfo->max_data_sg_nents * PAGE_SIZE);
+       if (cmd->data_length > mtl) {
+               /*
+                * If an existing CDB overflow is present, calculate new residual
+                * based on CDB size minus fabric maximum transfer length.
+                *
+                * If an existing CDB underflow is present, calculate new residual
+                * based on original cmd->data_length minus fabric maximum transfer
+                * length.
+                *
+                * Otherwise, set the underflow residual based on cmd->data_length
+                * minus fabric maximum transfer length.
+                */
+               if (cmd->se_cmd_flags & SCF_OVERFLOW_BIT) {
+                       cmd->residual_count = (size - mtl);
+               } else if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) {
+                       u32 orig_dl = size + cmd->residual_count;
+                       cmd->residual_count = (orig_dl - mtl);
+               } else {
+                       cmd->se_cmd_flags |= SCF_UNDERFLOW_BIT;
+                       cmd->residual_count = (cmd->data_length - mtl);
+               }
+               cmd->data_length = mtl;
+               /*
+                * Reset sbc_check_prot() calculated protection payload
+                * length based upon the new smaller MTL.
+                */
+               if (cmd->prot_length) {
+                       u32 sectors = (mtl / dev->dev_attrib.block_size);
+                       cmd->prot_length = dev->prot_length * sectors;
+               }
+       }
+       return TCM_NO_SENSE;
+}
+
 sense_reason_t
 target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
 {
@@ -1087,9 +1137,9 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
                        " 0x%02x\n", cmd->se_tfo->get_fabric_name(),
                                cmd->data_length, size, cmd->t_task_cdb[0]);
 
-               if (cmd->data_direction == DMA_TO_DEVICE) {
-                       pr_err("Rejecting underflow/overflow"
-                                       " WRITE data\n");
+               if (cmd->data_direction == DMA_TO_DEVICE &&
+                   cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) {
+                       pr_err("Rejecting underflow/overflow WRITE data\n");
                        return TCM_INVALID_CDB_FIELD;
                }
                /*
@@ -1119,7 +1169,7 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
                }
        }
 
-       return 0;
+       return target_check_max_data_sg_nents(cmd, dev, size);
 
 }
 
@@ -1177,14 +1227,7 @@ transport_check_alloc_task_attr(struct se_cmd *cmd)
                        " emulation is not supported\n");
                return TCM_INVALID_CDB_FIELD;
        }
-       /*
-        * Used to determine when ORDERED commands should go from
-        * Dormant to Active status.
-        */
-       cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id);
-       pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n",
-                       cmd->se_ordered_id, cmd->sam_task_attr,
-                       dev->transport->name);
+
        return 0;
 }
 
@@ -1246,6 +1289,11 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb)
        }
 
        ret = dev->transport->parse_cdb(cmd);
+       if (ret == TCM_UNSUPPORTED_SCSI_OPCODE)
+               pr_warn_ratelimited("%s/%s: Unsupported SCSI Opcode 0x%02x, sending CHECK_CONDITION.\n",
+                                   cmd->se_tfo->get_fabric_name(),
+                                   cmd->se_sess->se_node_acl->initiatorname,
+                                   cmd->t_task_cdb[0]);
        if (ret)
                return ret;
 
@@ -1693,8 +1741,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 
 check_stop:
        transport_lun_remove_cmd(cmd);
-       if (!transport_cmd_check_stop_to_fabric(cmd))
-               ;
+       transport_cmd_check_stop_to_fabric(cmd);
        return;
 
 queue_full:
@@ -1767,16 +1814,14 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
         */
        switch (cmd->sam_task_attr) {
        case TCM_HEAD_TAG:
-               pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x, "
-                        "se_ordered_id: %u\n",
-                        cmd->t_task_cdb[0], cmd->se_ordered_id);
+               pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
+                        cmd->t_task_cdb[0]);
                return false;
        case TCM_ORDERED_TAG:
                atomic_inc_mb(&dev->dev_ordered_sync);
 
-               pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, "
-                        " se_ordered_id: %u\n",
-                        cmd->t_task_cdb[0], cmd->se_ordered_id);
+               pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
+                        cmd->t_task_cdb[0]);
 
                /*
                 * Execute an ORDERED command if no other older commands
@@ -1800,10 +1845,8 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
        list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
        spin_unlock(&dev->delayed_cmd_lock);
 
-       pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to"
-               " delayed CMD list, se_ordered_id: %u\n",
-               cmd->t_task_cdb[0], cmd->sam_task_attr,
-               cmd->se_ordered_id);
+       pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD listn",
+               cmd->t_task_cdb[0], cmd->sam_task_attr);
        return true;
 }
 
@@ -1888,20 +1931,18 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
        if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
                atomic_dec_mb(&dev->simple_cmds);
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev->dev_cur_ordered_id: %u for"
-                       " SIMPLE: %u\n", dev->dev_cur_ordered_id,
-                       cmd->se_ordered_id);
+               pr_debug("Incremented dev->dev_cur_ordered_id: %u for SIMPLE\n",
+                        dev->dev_cur_ordered_id);
        } else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for"
-                       " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id,
-                       cmd->se_ordered_id);
+               pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
+                        dev->dev_cur_ordered_id);
        } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
                atomic_dec_mb(&dev->dev_ordered_sync);
 
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:"
-                       " %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id);
+               pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
+                        dev->dev_cur_ordered_id);
        }
 
        target_restart_delayed_cmds(dev);
@@ -2615,37 +2656,159 @@ bool transport_wait_for_tasks(struct se_cmd *cmd)
 }
 EXPORT_SYMBOL(transport_wait_for_tasks);
 
-static int transport_get_sense_codes(
-       struct se_cmd *cmd,
-       u8 *asc,
-       u8 *ascq)
+struct sense_info {
+       u8 key;
+       u8 asc;
+       u8 ascq;
+       bool add_sector_info;
+};
+
+static const struct sense_info sense_info_table[] = {
+       [TCM_NO_SENSE] = {
+               .key = NOT_READY
+       },
+       [TCM_NON_EXISTENT_LUN] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x25 /* LOGICAL UNIT NOT SUPPORTED */
+       },
+       [TCM_UNSUPPORTED_SCSI_OPCODE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x20, /* INVALID COMMAND OPERATION CODE */
+       },
+       [TCM_SECTOR_COUNT_TOO_MANY] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x20, /* INVALID COMMAND OPERATION CODE */
+       },
+       [TCM_UNKNOWN_MODE_PAGE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x24, /* INVALID FIELD IN CDB */
+       },
+       [TCM_CHECK_CONDITION_ABORT_CMD] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x29, /* BUS DEVICE RESET FUNCTION OCCURRED */
+               .ascq = 0x03,
+       },
+       [TCM_INCORRECT_AMOUNT_OF_DATA] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x0c, /* WRITE ERROR */
+               .ascq = 0x0d, /* NOT ENOUGH UNSOLICITED DATA */
+       },
+       [TCM_INVALID_CDB_FIELD] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x24, /* INVALID FIELD IN CDB */
+       },
+       [TCM_INVALID_PARAMETER_LIST] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x26, /* INVALID FIELD IN PARAMETER LIST */
+       },
+       [TCM_PARAMETER_LIST_LENGTH_ERROR] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x1a, /* PARAMETER LIST LENGTH ERROR */
+       },
+       [TCM_UNEXPECTED_UNSOLICITED_DATA] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x0c, /* WRITE ERROR */
+               .ascq = 0x0c, /* UNEXPECTED_UNSOLICITED_DATA */
+       },
+       [TCM_SERVICE_CRC_ERROR] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x47, /* PROTOCOL SERVICE CRC ERROR */
+               .ascq = 0x05, /* N/A */
+       },
+       [TCM_SNACK_REJECTED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x11, /* READ ERROR */
+               .ascq = 0x13, /* FAILED RETRANSMISSION REQUEST */
+       },
+       [TCM_WRITE_PROTECTED] = {
+               .key = DATA_PROTECT,
+               .asc = 0x27, /* WRITE PROTECTED */
+       },
+       [TCM_ADDRESS_OUT_OF_RANGE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x21, /* LOGICAL BLOCK ADDRESS OUT OF RANGE */
+       },
+       [TCM_CHECK_CONDITION_UNIT_ATTENTION] = {
+               .key = UNIT_ATTENTION,
+       },
+       [TCM_CHECK_CONDITION_NOT_READY] = {
+               .key = NOT_READY,
+       },
+       [TCM_MISCOMPARE_VERIFY] = {
+               .key = MISCOMPARE,
+               .asc = 0x1d, /* MISCOMPARE DURING VERIFY OPERATION */
+               .ascq = 0x00,
+       },
+       [TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x01, /* LOGICAL BLOCK GUARD CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x02, /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x03, /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE] = {
+               /*
+                * Returning ILLEGAL REQUEST would cause immediate IO errors on
+                * Solaris initiators.  Returning NOT READY instead means the
+                * operations will be retried a finite number of times and we
+                * can survive intermittent errors.
+                */
+               .key = NOT_READY,
+               .asc = 0x08, /* LOGICAL UNIT COMMUNICATION FAILURE */
+       },
+};
+
+static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason)
 {
-       *asc = cmd->scsi_asc;
-       *ascq = cmd->scsi_ascq;
+       const struct sense_info *si;
+       u8 *buffer = cmd->sense_buffer;
+       int r = (__force int)reason;
+       u8 asc, ascq;
+       bool desc_format = target_sense_desc_format(cmd->se_dev);
 
-       return 0;
-}
+       if (r < ARRAY_SIZE(sense_info_table) && sense_info_table[r].key)
+               si = &sense_info_table[r];
+       else
+               si = &sense_info_table[(__force int)
+                                      TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE];
 
-static
-void transport_err_sector_info(unsigned char *buffer, sector_t bad_sector)
-{
-       /* Place failed LBA in sense data information descriptor 0. */
-       buffer[SPC_ADD_SENSE_LEN_OFFSET] = 0xc;
-       buffer[SPC_DESC_TYPE_OFFSET] = 0; /* Information */
-       buffer[SPC_ADDITIONAL_DESC_LEN_OFFSET] = 0xa;
-       buffer[SPC_VALIDITY_OFFSET] = 0x80;
+       if (reason == TCM_CHECK_CONDITION_UNIT_ATTENTION) {
+               core_scsi3_ua_for_check_condition(cmd, &asc, &ascq);
+               WARN_ON_ONCE(asc == 0);
+       } else if (si->asc == 0) {
+               WARN_ON_ONCE(cmd->scsi_asc == 0);
+               asc = cmd->scsi_asc;
+               ascq = cmd->scsi_ascq;
+       } else {
+               asc = si->asc;
+               ascq = si->ascq;
+       }
+
+       scsi_build_sense_buffer(desc_format, buffer, si->key, asc, ascq);
+       if (si->add_sector_info)
+               return scsi_set_sense_information(buffer,
+                                                 cmd->scsi_sense_length,
+                                                 cmd->bad_sector);
 
-       /* Descriptor Information: failing sector */
-       put_unaligned_be64(bad_sector, &buffer[12]);
+       return 0;
 }
 
 int
 transport_send_check_condition_and_sense(struct se_cmd *cmd,
                sense_reason_t reason, int from_transport)
 {
-       unsigned char *buffer = cmd->sense_buffer;
        unsigned long flags;
-       u8 asc = 0, ascq = 0;
 
        spin_lock_irqsave(&cmd->t_state_lock, flags);
        if (cmd->se_cmd_flags & SCF_SENT_CHECK_CONDITION) {
@@ -2655,243 +2818,17 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd,
        cmd->se_cmd_flags |= SCF_SENT_CHECK_CONDITION;
        spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
-       if (!reason && from_transport)
-               goto after_reason;
+       if (!from_transport) {
+               int rc;
 
-       if (!from_transport)
                cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE;
-
-       /*
-        * Actual SENSE DATA, see SPC-3 7.23.2  SPC_SENSE_KEY_OFFSET uses
-        * SENSE KEY values from include/scsi/scsi.h
-        */
-       switch (reason) {
-       case TCM_NO_SENSE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* Not Ready */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               /* NO ADDITIONAL SENSE INFORMATION */
-               buffer[SPC_ASC_KEY_OFFSET] = 0;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0;
-               break;
-       case TCM_NON_EXISTENT_LUN:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL UNIT NOT SUPPORTED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x25;
-               break;
-       case TCM_UNSUPPORTED_SCSI_OPCODE:
-       case TCM_SECTOR_COUNT_TOO_MANY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID COMMAND OPERATION CODE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x20;
-               break;
-       case TCM_UNKNOWN_MODE_PAGE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN CDB */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x24;
-               break;
-       case TCM_CHECK_CONDITION_ABORT_CMD:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* BUS DEVICE RESET FUNCTION OCCURRED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x29;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x03;
-               break;
-       case TCM_INCORRECT_AMOUNT_OF_DATA:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* WRITE ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x0c;
-               /* NOT ENOUGH UNSOLICITED DATA */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x0d;
-               break;
-       case TCM_INVALID_CDB_FIELD:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN CDB */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x24;
-               break;
-       case TCM_INVALID_PARAMETER_LIST:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN PARAMETER LIST */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x26;
-               break;
-       case TCM_PARAMETER_LIST_LENGTH_ERROR:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* PARAMETER LIST LENGTH ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x1a;
-               break;
-       case TCM_UNEXPECTED_UNSOLICITED_DATA:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* WRITE ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x0c;
-               /* UNEXPECTED_UNSOLICITED_DATA */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x0c;
-               break;
-       case TCM_SERVICE_CRC_ERROR:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* PROTOCOL SERVICE CRC ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x47;
-               /* N/A */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x05;
-               break;
-       case TCM_SNACK_REJECTED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* READ ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x11;
-               /* FAILED RETRANSMISSION REQUEST */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x13;
-               break;
-       case TCM_WRITE_PROTECTED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* DATA PROTECT */
-               buffer[SPC_SENSE_KEY_OFFSET] = DATA_PROTECT;
-               /* WRITE PROTECTED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x27;
-               break;
-       case TCM_ADDRESS_OUT_OF_RANGE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK ADDRESS OUT OF RANGE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x21;
-               break;
-       case TCM_CHECK_CONDITION_UNIT_ATTENTION:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* UNIT ATTENTION */
-               buffer[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION;
-               core_scsi3_ua_for_check_condition(cmd, &asc, &ascq);
-               buffer[SPC_ASC_KEY_OFFSET] = asc;
-               buffer[SPC_ASCQ_KEY_OFFSET] = ascq;
-               break;
-       case TCM_CHECK_CONDITION_NOT_READY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* Not Ready */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               transport_get_sense_codes(cmd, &asc, &ascq);
-               buffer[SPC_ASC_KEY_OFFSET] = asc;
-               buffer[SPC_ASCQ_KEY_OFFSET] = ascq;
-               break;
-       case TCM_MISCOMPARE_VERIFY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               buffer[SPC_SENSE_KEY_OFFSET] = MISCOMPARE;
-               /* MISCOMPARE DURING VERIFY OPERATION */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x1d;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x00;
-               break;
-       case TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK GUARD CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x01;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x02;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x03;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE:
-       default:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /*
-                * Returning ILLEGAL REQUEST would cause immediate IO errors on
-                * Solaris initiators.  Returning NOT READY instead means the
-                * operations will be retried a finite number of times and we
-                * can survive intermittent errors.
-                */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               /* LOGICAL UNIT COMMUNICATION FAILURE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x08;
-               break;
+               cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
+               cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
+               rc = translate_sense_reason(cmd, reason);
+               if (rc)
+                       return rc;
        }
-       /*
-        * This code uses linux/include/scsi/scsi.h SAM status codes!
-        */
-       cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
-       /*
-        * Automatically padded, this value is encoded in the fabric's
-        * data_length response PDU containing the SCSI defined sense data.
-        */
-       cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
 
-after_reason:
        trace_target_cmd_complete(cmd);
        return cmd->se_tfo->queue_status(cmd);
 }
index c448ef4..937cebf 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/parser.h>
 #include <linux/vmalloc.h>
 #include <linux/uio_driver.h>
+#include <linux/stringify.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -538,14 +539,8 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
                UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size);
                pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n",
                        cmd->se_cmd);
-               transport_generic_request_failure(cmd->se_cmd,
-                       TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE);
-               cmd->se_cmd = NULL;
-               kmem_cache_free(tcmu_cmd_cache, cmd);
-               return;
-       }
-
-       if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
+               entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION;
+       } else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
                memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer,
                               se_cmd->scsi_sense_length);
 
@@ -577,7 +572,6 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
 static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 {
        struct tcmu_mailbox *mb;
-       LIST_HEAD(cpl_cmds);
        unsigned long flags;
        int handled = 0;
 
@@ -905,7 +899,7 @@ static int tcmu_configure_device(struct se_device *dev)
        WARN_ON(!PAGE_ALIGNED(udev->data_off));
        WARN_ON(udev->data_size % PAGE_SIZE);
 
-       info->version = xstr(TCMU_MAILBOX_VERSION);
+       info->version = __stringify(TCMU_MAILBOX_VERSION);
 
        info->mem[0].name = "tcm-user command & data buffer";
        info->mem[0].addr = (phys_addr_t) udev->mb_addr;
index 4515f52..47fe94e 100644 (file)
@@ -450,6 +450,8 @@ int target_xcopy_setup_pt(void)
        memset(&xcopy_pt_sess, 0, sizeof(struct se_session));
        INIT_LIST_HEAD(&xcopy_pt_sess.sess_list);
        INIT_LIST_HEAD(&xcopy_pt_sess.sess_acl_list);
+       INIT_LIST_HEAD(&xcopy_pt_sess.sess_cmd_list);
+       spin_lock_init(&xcopy_pt_sess.sess_cmd_lock);
 
        xcopy_pt_nacl.se_tpg = &xcopy_pt_tpg;
        xcopy_pt_nacl.nacl_sess = &xcopy_pt_sess;
@@ -644,7 +646,7 @@ static int target_xcopy_read_source(
        pr_debug("XCOPY: Built READ_16: LBA: %llu Sectors: %u Length: %u\n",
                (unsigned long long)src_lba, src_sectors, length);
 
-       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, NULL, length,
+       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
                              DMA_FROM_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
        xop->src_pt_cmd = xpt_cmd;
 
@@ -704,7 +706,7 @@ static int target_xcopy_write_destination(
        pr_debug("XCOPY: Built WRITE_16: LBA: %llu Sectors: %u Length: %u\n",
                (unsigned long long)dst_lba, dst_sectors, length);
 
-       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, NULL, length,
+       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
                              DMA_TO_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
        xop->dst_pt_cmd = xpt_cmd;
 
index 6803172..aa3caca 100644 (file)
@@ -255,7 +255,7 @@ static void ft_recv_seq(struct fc_seq *sp, struct fc_frame *fp, void *arg)
        struct ft_cmd *cmd = arg;
        struct fc_frame_header *fh;
 
-       if (unlikely(IS_ERR(fp))) {
+       if (IS_ERR(fp)) {
                /* XXX need to find cmd if queued */
                cmd->seq = NULL;
                cmd->aborted = true;
index 118938e..0390044 100644 (file)
@@ -340,6 +340,14 @@ config ACPI_THERMAL_REL
        tristate
        depends on ACPI
 
+config INTEL_PCH_THERMAL
+       tristate "Intel PCH Thermal Reporting Driver"
+       depends on X86 && PCI
+       help
+         Enable this to support thermal reporting on certain intel PCHs.
+         Thermal reporting device will provide temperature reading,
+         programmable trip points and other information.
+
 menu "Texas Instruments thermal drivers"
 source "drivers/thermal/ti-soc-thermal/Kconfig"
 endmenu
index 535dfee..26f1608 100644 (file)
@@ -41,6 +41,7 @@ obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)   += intel_soc_dts_thermal.o
 obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL)  += intel_quark_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)   += ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
+obj-$(CONFIG_INTEL_PCH_THERMAL)        += intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)       += st/
 obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra_soctherm.o
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
index 01255fd..26b8d32 100644 (file)
@@ -155,7 +155,7 @@ static bool armada_is_valid(struct armada_thermal_priv *priv)
 }
 
 static int armada_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        struct armada_thermal_priv *priv = thermal->devdata;
        unsigned long reg;
index 2fb273c..652acd8 100644 (file)
@@ -107,8 +107,7 @@ static int db8500_cdev_unbind(struct thermal_zone_device *thermal,
 }
 
 /* Callback to get current temperature */
-static int db8500_sys_get_temp(struct thermal_zone_device *thermal,
-               unsigned long *temp)
+static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
 
@@ -180,7 +179,7 @@ static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Callback to get trip point temperature */
 static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
        struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
@@ -195,7 +194,7 @@ static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
 
 /* Callback to get critical trip point temperature */
 static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal,
-               unsigned long *temp)
+               int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
        struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
index 09f6e30..a0bc9de 100644 (file)
@@ -93,7 +93,7 @@ static int dove_init_sensor(const struct dove_thermal_priv *priv)
 }
 
 static int dove_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        unsigned long reg;
        struct dove_thermal_priv *priv = thermal->devdata;
index c2c10bb..34fe365 100644 (file)
@@ -34,7 +34,7 @@
 static int get_trip_level(struct thermal_zone_device *tz)
 {
        int count = 0;
-       unsigned long trip_temp;
+       int trip_temp;
        enum thermal_trip_type trip_type;
 
        if (tz->trips == 0 || !tz->ops->get_trip_temp)
index c5dd76b..70836c5 100644 (file)
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-       long trip_temp;
-       unsigned long trip_hyst;
+       int trip_temp, trip_hyst;
        struct thermal_instance *instance;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
        tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
 
-       dev_dbg(&tz->device, "Trip%d[temp=%ld]:temp=%d:hyst=%ld\n",
+       dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
                                trip, trip_temp, tz->temperature,
                                trip_hyst);
 
index b49f97c..36d0729 100644 (file)
@@ -155,7 +155,7 @@ static void hisi_thermal_disable_sensor(struct hisi_thermal_data *data)
        mutex_unlock(&data->thermal_lock);
 }
 
-static int hisi_thermal_get_temp(void *_sensor, long *temp)
+static int hisi_thermal_get_temp(void *_sensor, int *temp)
 {
        struct hisi_thermal_sensor *sensor = _sensor;
        struct hisi_thermal_data *data = sensor->thermal;
@@ -178,7 +178,7 @@ static int hisi_thermal_get_temp(void *_sensor, long *temp)
        data->irq_bind_sensor = sensor_id;
        mutex_unlock(&data->thermal_lock);
 
-       dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%ld, thres=%d\n",
+       dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%d, thres=%d\n",
                sensor->id, data->irq_enabled, *temp, sensor->thres_temp);
        /*
         * Bind irq to sensor for two cases:
index fde4c28..4bec1d3 100644 (file)
@@ -98,10 +98,10 @@ struct imx_thermal_data {
        enum thermal_device_mode mode;
        struct regmap *tempmon;
        u32 c1, c2; /* See formula in imx_get_sensor_data() */
-       unsigned long temp_passive;
-       unsigned long temp_critical;
-       unsigned long alarm_temp;
-       unsigned long last_temp;
+       int temp_passive;
+       int temp_critical;
+       int alarm_temp;
+       int last_temp;
        bool irq_enabled;
        int irq;
        struct clk *thermal_clk;
@@ -109,7 +109,7 @@ struct imx_thermal_data {
 };
 
 static void imx_set_panic_temp(struct imx_thermal_data *data,
-                              signed long panic_temp)
+                              int panic_temp)
 {
        struct regmap *map = data->tempmon;
        int critical_value;
@@ -121,7 +121,7 @@ static void imx_set_panic_temp(struct imx_thermal_data *data,
 }
 
 static void imx_set_alarm_temp(struct imx_thermal_data *data,
-                              signed long alarm_temp)
+                              int alarm_temp)
 {
        struct regmap *map = data->tempmon;
        int alarm_value;
@@ -133,7 +133,7 @@ static void imx_set_alarm_temp(struct imx_thermal_data *data,
                        TEMPSENSE0_ALARM_VALUE_SHIFT);
 }
 
-static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+static int imx_get_temp(struct thermal_zone_device *tz, int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
        struct regmap *map = data->tempmon;
@@ -189,13 +189,13 @@ static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
                if (data->alarm_temp == data->temp_critical &&
                        *temp < data->temp_passive) {
                        imx_set_alarm_temp(data, data->temp_passive);
-                       dev_dbg(&tz->device, "thermal alarm off: T < %lu\n",
+                       dev_dbg(&tz->device, "thermal alarm off: T < %d\n",
                                data->alarm_temp / 1000);
                }
        }
 
        if (*temp != data->last_temp) {
-               dev_dbg(&tz->device, "millicelsius: %ld\n", *temp);
+               dev_dbg(&tz->device, "millicelsius: %d\n", *temp);
                data->last_temp = *temp;
        }
 
@@ -262,8 +262,7 @@ static int imx_get_trip_type(struct thermal_zone_device *tz, int trip,
        return 0;
 }
 
-static int imx_get_crit_temp(struct thermal_zone_device *tz,
-                            unsigned long *temp)
+static int imx_get_crit_temp(struct thermal_zone_device *tz, int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -272,7 +271,7 @@ static int imx_get_crit_temp(struct thermal_zone_device *tz,
 }
 
 static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
-                            unsigned long *temp)
+                            int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -282,7 +281,7 @@ static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int imx_set_trip_temp(struct thermal_zone_device *tz, int trip,
-                            unsigned long temp)
+                            int temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -434,7 +433,7 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev)
 {
        struct imx_thermal_data *data = dev;
 
-       dev_dbg(&data->tz->device, "THERMAL ALARM: T > %lu\n",
+       dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n",
                data->alarm_temp / 1000);
 
        thermal_zone_device_update(data->tz);
index 031018e..5836e55 100644 (file)
@@ -186,7 +186,7 @@ static int int3400_thermal_run_osc(acpi_handle handle,
 }
 
 static int int3400_thermal_get_temp(struct thermal_zone_device *thermal,
-                       unsigned long *temp)
+                       int *temp)
 {
        *temp = 20 * 1000; /* faked temp sensor with 20C */
        return 0;
index 1e25133..b9b2666 100644 (file)
@@ -20,7 +20,7 @@
 #include "int340x_thermal_zone.h"
 
 static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
-                                        unsigned long *temp)
+                                        int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        unsigned long long tmp;
@@ -49,7 +49,7 @@ static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone,
-                                        int trip, unsigned long *temp)
+                                        int trip, int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        int i;
@@ -114,7 +114,7 @@ static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
-                                     int trip, unsigned long temp)
+                                     int trip, int temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        acpi_status status;
@@ -136,7 +136,7 @@ static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
 
 
 static int int340x_thermal_get_trip_hyst(struct thermal_zone_device *zone,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        acpi_status status;
@@ -163,7 +163,7 @@ static struct thermal_zone_device_ops int340x_thermal_zone_ops = {
 };
 
 static int int340x_thermal_get_trip_config(acpi_handle handle, char *name,
-                                     unsigned long *temp)
+                                     int *temp)
 {
        unsigned long long r;
        acpi_status status;
index 9f38ab7..aaadf72 100644 (file)
@@ -21,7 +21,7 @@
 #define INT340X_THERMAL_MAX_ACT_TRIP_COUNT     10
 
 struct active_trip {
-       unsigned long temp;
+       int temp;
        int id;
        bool valid;
 };
@@ -31,11 +31,11 @@ struct int34x_thermal_zone {
        struct active_trip act_trips[INT340X_THERMAL_MAX_ACT_TRIP_COUNT];
        unsigned long *aux_trips;
        int aux_trip_nr;
-       unsigned long psv_temp;
+       int psv_temp;
        int psv_trip_id;
-       unsigned long crt_temp;
+       int crt_temp;
        int crt_trip_id;
-       unsigned long hot_temp;
+       int hot_temp;
        int hot_trip_id;
        struct thermal_zone_device *zone;
        struct thermal_zone_device_ops *override_ops;
index 3df3dc3..ccc0ad0 100644 (file)
@@ -145,7 +145,7 @@ static int get_tjmax(void)
        return -EINVAL;
 }
 
-static int read_temp_msr(unsigned long *temp)
+static int read_temp_msr(int *temp)
 {
        int cpu;
        u32 eax, edx;
@@ -177,7 +177,7 @@ err_ret:
 }
 
 static int proc_thermal_get_zone_temp(struct thermal_zone_device *zone,
-                                        unsigned long *temp)
+                                        int *temp)
 {
        int ret;
 
diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c
new file mode 100644 (file)
index 0000000..50c7da7
--- /dev/null
@@ -0,0 +1,283 @@
+/* intel_pch_thermal.c - Intel PCH Thermal driver
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * Authors:
+ *     Tushar Dave <tushar.n.dave@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/thermal.h>
+
+/* Intel PCH thermal Device IDs */
+#define PCH_THERMAL_DID_WPT    0x9CA4 /* Wildcat Point */
+
+/* Wildcat Point-LP  PCH Thermal registers */
+#define WPT_TEMP       0x0000  /* Temperature */
+#define WPT_TSC        0x04    /* Thermal Sensor Control */
+#define WPT_TSS        0x06    /* Thermal Sensor Status */
+#define WPT_TSEL       0x08    /* Thermal Sensor Enable and Lock */
+#define WPT_TSREL      0x0A    /* Thermal Sensor Report Enable and Lock */
+#define WPT_TSMIC      0x0C    /* Thermal Sensor SMI Control */
+#define WPT_CTT        0x0010  /* Catastrophic Trip Point */
+#define WPT_TAHV       0x0014  /* Thermal Alert High Value */
+#define WPT_TALV       0x0018  /* Thermal Alert Low Value */
+#define WPT_TL         0x00000040      /* Throttle Value */
+#define WPT_PHL        0x0060  /* PCH Hot Level */
+#define WPT_PHLC       0x62    /* PHL Control */
+#define WPT_TAS        0x80    /* Thermal Alert Status */
+#define WPT_TSPIEN     0x82    /* PCI Interrupt Event Enables */
+#define WPT_TSGPEN     0x84    /* General Purpose Event Enables */
+
+/*  Wildcat Point-LP  PCH Thermal Register bit definitions */
+#define WPT_TEMP_TSR   0x00ff  /* Temp TS Reading */
+#define WPT_TSC_CPDE   0x01    /* Catastrophic Power-Down Enable */
+#define WPT_TSS_TSDSS  0x10    /* Thermal Sensor Dynamic Shutdown Status */
+#define WPT_TSS_GPES   0x08    /* GPE status */
+#define WPT_TSEL_ETS   0x01    /* Enable TS */
+#define WPT_TSEL_PLDB  0x80    /* TSEL Policy Lock-Down Bit */
+#define WPT_TL_TOL     0x000001FF      /* T0 Level */
+#define WPT_TL_T1L     0x1ff00000      /* T1 Level */
+#define WPT_TL_TTEN    0x20000000      /* TT Enable */
+
+static char driver_name[] = "Intel PCH thermal driver";
+
+struct pch_thermal_device {
+       void __iomem *hw_base;
+       const struct pch_dev_ops *ops;
+       struct pci_dev *pdev;
+       struct thermal_zone_device *tzd;
+       int crt_trip_id;
+       unsigned long crt_temp;
+       int hot_trip_id;
+       unsigned long hot_temp;
+};
+
+static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
+{
+       u8 tsel;
+       u16 trip_temp;
+
+       *nr_trips = 0;
+
+       /* Check if BIOS has already enabled thermal sensor */
+       if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))
+               goto read_trips;
+
+       tsel = readb(ptd->hw_base + WPT_TSEL);
+       /*
+        * When TSEL's Policy Lock-Down bit is 1, TSEL become RO.
+        * If so, thermal sensor cannot enable. Bail out.
+        */
+       if (tsel & WPT_TSEL_PLDB) {
+               dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n");
+               return -ENODEV;
+       }
+
+       writeb(tsel|WPT_TSEL_ETS, ptd->hw_base + WPT_TSEL);
+       if (!(WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))) {
+               dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n");
+               return -ENODEV;
+       }
+
+read_trips:
+       ptd->crt_trip_id = -1;
+       trip_temp = readw(ptd->hw_base + WPT_CTT);
+       trip_temp &= 0x1FF;
+       if (trip_temp) {
+               /* Resolution of 1/2 degree C and an offset of -50C */
+               ptd->crt_temp = trip_temp * 1000 / 2 - 50000;
+               ptd->crt_trip_id = 0;
+               ++(*nr_trips);
+       }
+
+       ptd->hot_trip_id = -1;
+       trip_temp = readw(ptd->hw_base + WPT_PHL);
+       trip_temp &= 0x1FF;
+       if (trip_temp) {
+               /* Resolution of 1/2 degree C and an offset of -50C */
+               ptd->hot_temp = trip_temp * 1000 / 2 - 50000;
+               ptd->hot_trip_id = *nr_trips;
+               ++(*nr_trips);
+       }
+
+       return 0;
+}
+
+static int pch_wpt_get_temp(struct pch_thermal_device *ptd, int *temp)
+{
+       u8 wpt_temp;
+
+       wpt_temp = WPT_TEMP_TSR & readl(ptd->hw_base + WPT_TEMP);
+
+       /* Resolution of 1/2 degree C and an offset of -50C */
+       *temp = (wpt_temp * 1000 / 2 - 50000);
+
+       return 0;
+}
+
+struct pch_dev_ops {
+       int (*hw_init)(struct pch_thermal_device *ptd, int *nr_trips);
+       int (*get_temp)(struct pch_thermal_device *ptd, int *temp);
+};
+
+
+/* dev ops for Wildcat Point */
+static struct pch_dev_ops pch_dev_ops_wpt = {
+       .hw_init = pch_wpt_init,
+       .get_temp = pch_wpt_get_temp,
+};
+
+static int pch_thermal_get_temp(struct thermal_zone_device *tzd, int *temp)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       return  ptd->ops->get_temp(ptd, temp);
+}
+
+static int pch_get_trip_type(struct thermal_zone_device *tzd, int trip,
+                            enum thermal_trip_type *type)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       if (ptd->crt_trip_id == trip)
+               *type = THERMAL_TRIP_CRITICAL;
+       else if (ptd->hot_trip_id == trip)
+               *type = THERMAL_TRIP_HOT;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static int pch_get_trip_temp(struct thermal_zone_device *tzd, int trip, int *temp)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       if (ptd->crt_trip_id == trip)
+               *temp = ptd->crt_temp;
+       else if (ptd->hot_trip_id == trip)
+               *temp = ptd->hot_temp;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static struct thermal_zone_device_ops tzd_ops = {
+       .get_temp = pch_thermal_get_temp,
+       .get_trip_type = pch_get_trip_type,
+       .get_trip_temp = pch_get_trip_temp,
+};
+
+
+static int intel_pch_thermal_probe(struct pci_dev *pdev,
+                                  const struct pci_device_id *id)
+{
+       struct pch_thermal_device *ptd;
+       int err;
+       int nr_trips;
+       char *dev_name;
+
+       ptd = devm_kzalloc(&pdev->dev, sizeof(*ptd), GFP_KERNEL);
+       if (!ptd)
+               return -ENOMEM;
+
+       switch (pdev->device) {
+       case PCH_THERMAL_DID_WPT:
+               ptd->ops = &pch_dev_ops_wpt;
+               dev_name = "pch_wildcat_point";
+               break;
+       default:
+               dev_err(&pdev->dev, "unknown pch thermal device\n");
+               return -ENODEV;
+       }
+
+       pci_set_drvdata(pdev, ptd);
+       ptd->pdev = pdev;
+
+       err = pci_enable_device(pdev);
+       if (err) {
+               dev_err(&pdev->dev, "failed to enable pci device\n");
+               return err;
+       }
+
+       err = pci_request_regions(pdev, driver_name);
+       if (err) {
+               dev_err(&pdev->dev, "failed to request pci region\n");
+               goto error_disable;
+       }
+
+       ptd->hw_base = pci_ioremap_bar(pdev, 0);
+       if (!ptd->hw_base) {
+               err = -ENOMEM;
+               dev_err(&pdev->dev, "failed to map mem base\n");
+               goto error_release;
+       }
+
+       err = ptd->ops->hw_init(ptd, &nr_trips);
+       if (err)
+               goto error_cleanup;
+
+       ptd->tzd = thermal_zone_device_register(dev_name, nr_trips, 0, ptd,
+                                               &tzd_ops, NULL, 0, 0);
+       if (IS_ERR(ptd->tzd)) {
+               dev_err(&pdev->dev, "Failed to register thermal zone %s\n",
+                       dev_name);
+               err = PTR_ERR(ptd->tzd);
+               goto error_cleanup;
+       }
+
+       return 0;
+
+error_cleanup:
+       iounmap(ptd->hw_base);
+error_release:
+       pci_release_regions(pdev);
+error_disable:
+       pci_disable_device(pdev);
+       dev_err(&pdev->dev, "pci device failed to probe\n");
+       return err;
+}
+
+static void intel_pch_thermal_remove(struct pci_dev *pdev)
+{
+       struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+
+       thermal_zone_device_unregister(ptd->tzd);
+       iounmap(ptd->hw_base);
+       pci_set_drvdata(pdev, NULL);
+       pci_release_region(pdev, 0);
+       pci_disable_device(pdev);
+}
+
+static struct pci_device_id intel_pch_thermal_id[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_WPT) },
+       { 0, },
+};
+MODULE_DEVICE_TABLE(pci, intel_pch_thermal_id);
+
+static struct pci_driver intel_pch_thermal_driver = {
+       .name           = "intel_pch_thermal",
+       .id_table       = intel_pch_thermal_id,
+       .probe          = intel_pch_thermal_probe,
+       .remove         = intel_pch_thermal_remove,
+};
+
+module_pci_driver(intel_pch_thermal_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel PCH Thermal driver");
index 2ac0c70..6c79588 100644 (file)
@@ -693,11 +693,14 @@ static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
        { X86_VENDOR_INTEL, 6, 0x3f},
        { X86_VENDOR_INTEL, 6, 0x45},
        { X86_VENDOR_INTEL, 6, 0x46},
+       { X86_VENDOR_INTEL, 6, 0x47},
        { X86_VENDOR_INTEL, 6, 0x4c},
        { X86_VENDOR_INTEL, 6, 0x4d},
+       { X86_VENDOR_INTEL, 6, 0x4e},
        { X86_VENDOR_INTEL, 6, 0x4f},
        { X86_VENDOR_INTEL, 6, 0x56},
        { X86_VENDOR_INTEL, 6, 0x57},
+       { X86_VENDOR_INTEL, 6, 0x5e},
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
index 4434ec8..5ed90e6 100644 (file)
@@ -186,7 +186,7 @@ static int soc_dts_disable(struct thermal_zone_device *tzd)
        return ret;
 }
 
-static int _get_trip_temp(int trip, unsigned long *temp)
+static int _get_trip_temp(int trip, int *temp)
 {
        int status;
        u32 out;
@@ -212,19 +212,18 @@ static int _get_trip_temp(int trip, unsigned long *temp)
 }
 
 static inline int sys_get_trip_temp(struct thermal_zone_device *tzd,
-                               int trip, unsigned long *temp)
+                               int trip, int *temp)
 {
        return _get_trip_temp(trip, temp);
 }
 
-static inline int sys_get_crit_temp(struct thermal_zone_device *tzd,
-                               unsigned long *temp)
+static inline int sys_get_crit_temp(struct thermal_zone_device *tzd, int *temp)
 {
        return _get_trip_temp(QRK_DTS_ID_TP_CRITICAL, temp);
 }
 
 static int update_trip_temp(struct soc_sensor_entry *aux_entry,
-                               int trip, unsigned long temp)
+                               int trip, int temp)
 {
        u32 out;
        u32 temp_out;
@@ -272,7 +271,7 @@ failed:
 }
 
 static inline int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                               unsigned long temp)
+                               int temp)
 {
        return update_trip_temp(tzd->devdata, trip, temp);
 }
@@ -289,7 +288,7 @@ static int sys_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-                               unsigned long *temp)
+                               int *temp)
 {
        u32 out;
        int ret;
index 42e4b6a..5841d1d 100644 (file)
@@ -80,7 +80,7 @@ err_ret:
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
-                            unsigned long *temp)
+                            int *temp)
 {
        int status;
        u32 out;
@@ -106,7 +106,7 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
 }
 
 static int update_trip_temp(struct intel_soc_dts_sensor_entry *dts,
-                           int thres_index, unsigned long temp,
+                           int thres_index, int temp,
                            enum thermal_trip_type trip_type)
 {
        int status;
@@ -196,7 +196,7 @@ err_restore_ptps:
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                            unsigned long temp)
+                            int temp)
 {
        struct intel_soc_dts_sensor_entry *dts = tzd->devdata;
        struct intel_soc_dts_sensors *sensors = dts->sensors;
@@ -226,7 +226,7 @@ static int sys_get_trip_type(struct thermal_zone_device *tzd,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-                            unsigned long *temp)
+                            int *temp)
 {
        int status;
        u32 out;
index 11041fe..8922366 100644 (file)
@@ -33,7 +33,7 @@ struct kirkwood_thermal_priv {
 };
 
 static int kirkwood_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        unsigned long reg;
        struct kirkwood_thermal_priv *priv = thermal->devdata;
index b295b2b..42b7d42 100644 (file)
@@ -91,7 +91,7 @@ struct __thermal_zone {
 /***   DT thermal zone device callbacks   ***/
 
 static int of_thermal_get_temp(struct thermal_zone_device *tz,
-                              unsigned long *temp)
+                              int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -177,7 +177,7 @@ EXPORT_SYMBOL_GPL(of_thermal_get_trip_points);
  * Return: zero on success, error code otherwise
  */
 static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
-                                   unsigned long temp)
+                                   int temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -311,7 +311,7 @@ static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -324,7 +324,7 @@ static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
-                                   unsigned long temp)
+                                   int temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -338,7 +338,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
-                                   unsigned long *hyst)
+                                   int *hyst)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -351,7 +351,7 @@ static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
-                                   unsigned long hyst)
+                                   int hyst)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -365,7 +365,7 @@ static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_crit_temp(struct thermal_zone_device *tz,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
        int i;
index 2516769..9c8a7aa 100644 (file)
@@ -92,8 +92,8 @@ struct power_allocator_params {
  * Return: The power budget for the next period.
  */
 static u32 pid_controller(struct thermal_zone_device *tz,
-                         unsigned long current_temp,
-                         unsigned long control_temp,
+                         int current_temp,
+                         int control_temp,
                          u32 max_allocatable_power)
 {
        s64 p, i, d, power_range;
@@ -102,7 +102,7 @@ static u32 pid_controller(struct thermal_zone_device *tz,
 
        max_power_frac = int_to_frac(max_allocatable_power);
 
-       err = ((s32)control_temp - (s32)current_temp);
+       err = control_temp - current_temp;
        err = int_to_frac(err);
 
        /* Calculate the proportional term */
@@ -223,8 +223,8 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
 }
 
 static int allocate_power(struct thermal_zone_device *tz,
-                         unsigned long current_temp,
-                         unsigned long control_temp)
+                         int current_temp,
+                         int control_temp)
 {
        struct thermal_instance *instance;
        struct power_allocator_params *params = tz->governor_data;
@@ -331,7 +331,7 @@ static int allocate_power(struct thermal_zone_device *tz,
                                      granted_power, total_granted_power,
                                      num_actors, power_range,
                                      max_allocatable_power, current_temp,
-                                     (s32)control_temp - (s32)current_temp);
+                                     control_temp - current_temp);
 
        kfree(req_power);
 unlock:
@@ -416,7 +416,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz)
 {
        int ret;
        struct power_allocator_params *params;
-       unsigned long switch_on_temp, control_temp;
+       int switch_on_temp, control_temp;
        u32 temperature_threshold;
 
        if (!tz->tzp || !tz->tzp->sustainable_power) {
@@ -481,7 +481,7 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 {
        int ret;
-       unsigned long switch_on_temp, control_temp, current_temp;
+       int switch_on_temp, control_temp, current_temp;
        struct power_allocator_params *params = tz->governor_data;
 
        /*
index c8d27b8..b677aad 100644 (file)
@@ -117,7 +117,7 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip)
        return 0;
 }
 
-static int qpnp_tm_get_temp(void *data, long *temp)
+static int qpnp_tm_get_temp(void *data, int *temp)
 {
        struct qpnp_tm_chip *chip = data;
        int ret, mili_celsius;
index fe4e767..5d4ae7d 100644 (file)
@@ -200,8 +200,7 @@ err_out_unlock:
        return ret;
 }
 
-static int rcar_thermal_get_temp(struct thermal_zone_device *zone,
-                                unsigned long *temp)
+static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
 {
        struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
 
@@ -235,7 +234,7 @@ static int rcar_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int rcar_thermal_get_trip_temp(struct thermal_zone_device *zone,
-                                     int trip, unsigned long *temp)
+                                     int trip, int *temp)
 {
        struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
        struct device *dev = rcar_priv_to_dev(priv);
@@ -299,7 +298,7 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
 static void rcar_thermal_work(struct work_struct *work)
 {
        struct rcar_thermal_priv *priv;
-       unsigned long cctemp, nctemp;
+       int cctemp, nctemp;
 
        priv = container_of(work, struct rcar_thermal_priv, work.work);
 
index cd8f5f9..c89ffb2 100644 (file)
@@ -64,7 +64,7 @@ struct rockchip_tsadc_chip {
        void (*control)(void __iomem *reg, bool on);
 
        /* Per-sensor methods */
-       int (*get_temp)(int chn, void __iomem *reg, long *temp);
+       int (*get_temp)(int chn, void __iomem *reg, int *temp);
        void (*set_tshut_temp)(int chn, void __iomem *reg, long temp);
        void (*set_tshut_mode)(int chn, void __iomem *reg, enum tshut_mode m);
 };
@@ -191,7 +191,7 @@ static u32 rk_tsadcv2_temp_to_code(long temp)
        return 0;
 }
 
-static long rk_tsadcv2_code_to_temp(u32 code)
+static int rk_tsadcv2_code_to_temp(u32 code)
 {
        unsigned int low = 0;
        unsigned int high = ARRAY_SIZE(v2_code_table) - 1;
@@ -277,7 +277,7 @@ static void rk_tsadcv2_control(void __iomem *regs, bool enable)
        writel_relaxed(val, regs + TSADCV2_AUTO_CON);
 }
 
-static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, long *temp)
+static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, int *temp)
 {
        u32 val;
 
@@ -366,7 +366,7 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev)
        return IRQ_HANDLED;
 }
 
-static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
+static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 {
        struct rockchip_thermal_sensor *sensor = _sensor;
        struct rockchip_thermal_data *thermal = sensor->thermal;
@@ -374,7 +374,7 @@ static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
        int retval;
 
        retval = tsadc->get_temp(sensor->id, thermal->regs, out_temp);
-       dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %ld, retval: %d\n",
+       dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %d, retval: %d\n",
                sensor->id, *out_temp, retval);
 
        return retval;
index c96ff10..0bae8cc 100644 (file)
@@ -207,8 +207,7 @@ struct exynos_tmu_data {
        int (*tmu_initialize)(struct platform_device *pdev);
        void (*tmu_control)(struct platform_device *pdev, bool on);
        int (*tmu_read)(struct exynos_tmu_data *data);
-       void (*tmu_set_emulation)(struct exynos_tmu_data *data,
-                                 unsigned long temp);
+       void (*tmu_set_emulation)(struct exynos_tmu_data *data, int temp);
        void (*tmu_clear_irqs)(struct exynos_tmu_data *data);
 };
 
@@ -216,7 +215,7 @@ static void exynos_report_trigger(struct exynos_tmu_data *p)
 {
        char data[10], *envp[] = { data, NULL };
        struct thermal_zone_device *tz = p->tzd;
-       unsigned long temp;
+       int temp;
        unsigned int i;
 
        if (!tz) {
@@ -517,7 +516,7 @@ static int exynos5433_tmu_initialize(struct platform_device *pdev)
        struct thermal_zone_device *tz = data->tzd;
        unsigned int status, trim_info;
        unsigned int rising_threshold = 0, falling_threshold = 0;
-       unsigned long temp, temp_hist;
+       int temp, temp_hist;
        int ret = 0, threshold_code, i, sensor_id, cal_type;
 
        status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -610,7 +609,7 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev)
        struct exynos_tmu_data *data = platform_get_drvdata(pdev);
        unsigned int trim_info = 0, con, rising_threshold;
        int ret = 0, threshold_code;
-       unsigned long crit_temp = 0;
+       int crit_temp = 0;
 
        /*
         * For exynos5440 soc triminfo value is swapped between TMU0 and
@@ -663,7 +662,7 @@ static int exynos7_tmu_initialize(struct platform_device *pdev)
        unsigned int status, trim_info;
        unsigned int rising_threshold = 0, falling_threshold = 0;
        int ret = 0, threshold_code, i;
-       unsigned long temp, temp_hist;
+       int temp, temp_hist;
        unsigned int reg_off, bit_off;
 
        status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -876,7 +875,7 @@ static void exynos7_tmu_control(struct platform_device *pdev, bool on)
        writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
 }
 
-static int exynos_get_temp(void *p, long *temp)
+static int exynos_get_temp(void *p, int *temp)
 {
        struct exynos_tmu_data *data = p;
 
@@ -896,7 +895,7 @@ static int exynos_get_temp(void *p, long *temp)
 
 #ifdef CONFIG_THERMAL_EMULATION
 static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
-                           unsigned long temp)
+                           int temp)
 {
        if (temp) {
                temp /= MCELSIUS;
@@ -926,7 +925,7 @@ static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
 }
 
 static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
-                                        unsigned long temp)
+                                        int temp)
 {
        unsigned int val;
        u32 emul_con;
@@ -946,7 +945,7 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
 }
 
 static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
-                                        unsigned long temp)
+                                        int temp)
 {
        unsigned int val;
 
@@ -955,7 +954,7 @@ static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
        writel(val, data->base + EXYNOS5440_TMU_S0_7_DEBUG);
 }
 
-static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
 {
        struct exynos_tmu_data *data = drv_data;
        int ret = -EINVAL;
@@ -978,7 +977,7 @@ out:
 #else
 #define exynos4412_tmu_set_emulation NULL
 #define exynos5440_tmu_set_emulation NULL
-static int exynos_tmu_set_emulation(void *drv_data,    unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
        { return -EINVAL; }
 #endif /* CONFIG_THERMAL_EMULATION */
 
index bddb717..534dd91 100644 (file)
@@ -38,7 +38,7 @@ struct spear_thermal_dev {
 };
 
 static inline int thermal_get_temp(struct thermal_zone_device *thermal,
-                               unsigned long *temp)
+                               int *temp)
 {
        struct spear_thermal_dev *stdev = thermal->devdata;
 
index 88c759d..be637e6 100644 (file)
@@ -111,8 +111,7 @@ static int st_thermal_calibration(struct st_thermal_sensor *sensor)
 }
 
 /* Callback to get temperature from HW*/
-static int st_thermal_get_temp(struct thermal_zone_device *th,
-               unsigned long *temperature)
+static int st_thermal_get_temp(struct thermal_zone_device *th, int *temperature)
 {
        struct st_thermal_sensor *sensor = th->devdata;
        struct device *dev = sensor->dev;
@@ -159,7 +158,7 @@ static int st_thermal_get_trip_type(struct thermal_zone_device *th,
 }
 
 static int st_thermal_get_trip_temp(struct thermal_zone_device *th,
-                                   int trip, unsigned long *temp)
+                                   int trip, int *temp)
 {
        struct st_thermal_sensor *sensor = th->devdata;
        struct device *dev = sensor->dev;
index 5a0f12d..2f9f708 100644 (file)
@@ -113,7 +113,7 @@ static void update_passive_instance(struct thermal_zone_device *tz,
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-       long trip_temp;
+       int trip_temp;
        enum thermal_trip_type trip_type;
        enum thermal_trend trend;
        struct thermal_instance *instance;
@@ -135,7 +135,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
                trace_thermal_zone_trip(tz, trip, trip_type);
        }
 
-       dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n",
+       dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
                                trip, trip_type, trip_temp, trend, throttle);
 
        mutex_lock(&tz->lock);
index 9197fc0..74ea576 100644 (file)
@@ -293,7 +293,7 @@ static int enable_tsensor(struct tegra_soctherm *tegra,
  * H denotes an addition of 0.5 Celsius and N denotes negation
  * of the final value.
  */
-static long translate_temp(u16 val)
+static int translate_temp(u16 val)
 {
        long t;
 
@@ -306,7 +306,7 @@ static long translate_temp(u16 val)
        return t;
 }
 
-static int tegra_thermctl_get_temp(void *data, long *out_temp)
+static int tegra_thermctl_get_temp(void *data, int *out_temp)
 {
        struct tegra_thermctl_zone *zone = data;
        u32 val;
index 4ca211b..5e5fc70 100644 (file)
@@ -426,7 +426,7 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 static void handle_critical_trips(struct thermal_zone_device *tz,
                                int trip, enum thermal_trip_type trip_type)
 {
-       long trip_temp;
+       int trip_temp;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
 
@@ -465,7 +465,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
 }
 
 /**
- * thermal_zone_get_temp() - returns its the temperature of thermal zone
+ * thermal_zone_get_temp() - returns the temperature of a thermal zone
  * @tz: a valid pointer to a struct thermal_zone_device
  * @temp: a valid pointer to where to store the resulting temperature.
  *
@@ -474,14 +474,12 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
  *
  * Return: On success returns 0, an error code otherwise
  */
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 {
        int ret = -EINVAL;
-#ifdef CONFIG_THERMAL_EMULATION
        int count;
-       unsigned long crit_temp = -1UL;
+       int crit_temp = INT_MAX;
        enum thermal_trip_type type;
-#endif
 
        if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
                goto exit;
@@ -489,25 +487,26 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
        mutex_lock(&tz->lock);
 
        ret = tz->ops->get_temp(tz, temp);
-#ifdef CONFIG_THERMAL_EMULATION
-       if (!tz->emul_temperature)
-               goto skip_emul;
-
-       for (count = 0; count < tz->trips; count++) {
-               ret = tz->ops->get_trip_type(tz, count, &type);
-               if (!ret && type == THERMAL_TRIP_CRITICAL) {
-                       ret = tz->ops->get_trip_temp(tz, count, &crit_temp);
-                       break;
-               }
-       }
 
-       if (ret)
-               goto skip_emul;
+       if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
+               for (count = 0; count < tz->trips; count++) {
+                       ret = tz->ops->get_trip_type(tz, count, &type);
+                       if (!ret && type == THERMAL_TRIP_CRITICAL) {
+                               ret = tz->ops->get_trip_temp(tz, count,
+                                               &crit_temp);
+                               break;
+                       }
+               }
 
-       if (*temp < crit_temp)
-               *temp = tz->emul_temperature;
-skip_emul:
-#endif
+               /*
+                * Only allow emulating a temperature when the real temperature
+                * is below the critical temperature so that the emulation code
+                * cannot hide critical conditions.
+                */
+               if (!ret && *temp < crit_temp)
+                       *temp = tz->emul_temperature;
+       }
        mutex_unlock(&tz->lock);
 exit:
        return ret;
@@ -516,8 +515,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
 static void update_temperature(struct thermal_zone_device *tz)
 {
-       long temp;
-       int ret;
+       int temp, ret;
 
        ret = thermal_zone_get_temp(tz, &temp);
        if (ret) {
@@ -577,15 +575,14 @@ static ssize_t
 temp_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
-       long temperature;
-       int ret;
+       int temperature, ret;
 
        ret = thermal_zone_get_temp(tz, &temperature);
 
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -689,7 +686,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       long temperature;
+       int temperature;
 
        if (!tz->ops->get_trip_temp)
                return -EPERM;
@@ -702,7 +699,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -711,7 +708,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       unsigned long temperature;
+       int temperature;
 
        if (!tz->ops->set_trip_hyst)
                return -EPERM;
@@ -719,7 +716,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
        if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
                return -EINVAL;
 
-       if (kstrtoul(buf, 10, &temperature))
+       if (kstrtoint(buf, 10, &temperature))
                return -EINVAL;
 
        /*
@@ -738,7 +735,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       unsigned long temperature;
+       int temperature;
 
        if (!tz->ops->get_trip_hyst)
                return -EPERM;
@@ -748,7 +745,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 
        ret = tz->ops->get_trip_hyst(tz, trip, &temperature);
 
-       return ret ? ret : sprintf(buf, "%ld\n", temperature);
+       return ret ? ret : sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -847,7 +844,27 @@ policy_show(struct device *dev, struct device_attribute *devattr, char *buf)
        return sprintf(buf, "%s\n", tz->governor->name);
 }
 
-#ifdef CONFIG_THERMAL_EMULATION
+static ssize_t
+available_policies_show(struct device *dev, struct device_attribute *devattr,
+                       char *buf)
+{
+       struct thermal_governor *pos;
+       ssize_t count = 0;
+       ssize_t size = PAGE_SIZE;
+
+       mutex_lock(&thermal_governor_lock);
+
+       list_for_each_entry(pos, &thermal_governor_list, governor_list) {
+               size = PAGE_SIZE - count;
+               count += scnprintf(buf + count, size, "%s ", pos->name);
+       }
+       count += scnprintf(buf + count, size, "\n");
+
+       mutex_unlock(&thermal_governor_lock);
+
+       return count;
+}
+
 static ssize_t
 emul_temp_store(struct device *dev, struct device_attribute *attr,
                     const char *buf, size_t count)
@@ -873,7 +890,6 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
        return ret ? ret : count;
 }
 static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
-#endif/*CONFIG_THERMAL_EMULATION*/
 
 static ssize_t
 sustainable_power_show(struct device *dev, struct device_attribute *devattr,
@@ -1032,6 +1048,7 @@ static DEVICE_ATTR(temp, 0444, temp_show, NULL);
 static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
 static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, passive_store);
 static DEVICE_ATTR(policy, S_IRUGO | S_IWUSR, policy_show, policy_store);
+static DEVICE_ATTR(available_policies, S_IRUGO, available_policies_show, NULL);
 
 /* sys I/F for cooling device */
 #define to_cooling_device(_dev)        \
@@ -1803,11 +1820,12 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
                        goto unregister;
        }
 
-#ifdef CONFIG_THERMAL_EMULATION
-       result = device_create_file(&tz->device, &dev_attr_emul_temp);
-       if (result)
-               goto unregister;
-#endif
+       if (IS_ENABLED(CONFIG_THERMAL_EMULATION)) {
+               result = device_create_file(&tz->device, &dev_attr_emul_temp);
+               if (result)
+                       goto unregister;
+       }
+
        /* Create policy attribute */
        result = device_create_file(&tz->device, &dev_attr_policy);
        if (result)
@@ -1818,6 +1836,11 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
        if (result)
                goto unregister;
 
+       /* Create available_policies attribute */
+       result = device_create_file(&tz->device, &dev_attr_available_policies);
+       if (result)
+               goto unregister;
+
        /* Update 'this' zone's governor information */
        mutex_lock(&thermal_governor_lock);
 
@@ -1849,9 +1872,6 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 
        INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
 
-       if (!tz->ops->get_temp)
-               thermal_zone_device_set_polling(tz, 0);
-
        thermal_zone_device_update(tz);
 
        return tz;
@@ -1918,6 +1938,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
        if (tz->ops->get_mode)
                device_remove_file(&tz->device, &dev_attr_mode);
        device_remove_file(&tz->device, &dev_attr_policy);
+       device_remove_file(&tz->device, &dev_attr_available_policies);
        remove_trip_attrs(tz);
        thermal_set_governor(tz, NULL);
 
index 1967bee..06fd2ed 100644 (file)
@@ -69,7 +69,7 @@ static DEVICE_ATTR(name, 0444, name_show, NULL);
 static ssize_t
 temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
-       long temperature;
+       int temperature;
        int ret;
        struct thermal_hwmon_attr *hwmon_attr
                        = container_of(attr, struct thermal_hwmon_attr, attr);
@@ -83,7 +83,7 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -95,14 +95,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, char *buf)
                        = container_of(hwmon_attr, struct thermal_hwmon_temp,
                                       temp_crit);
        struct thermal_zone_device *tz = temp->tz;
-       long temperature;
+       int temperature;
        int ret;
 
        ret = tz->ops->get_trip_temp(tz, 0, &temperature);
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 
@@ -142,7 +142,7 @@ thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
 
 static bool thermal_zone_crit_temp_valid(struct thermal_zone_device *tz)
 {
-       unsigned long temp;
+       int temp;
        return tz->ops->get_crit_temp && !tz->ops->get_crit_temp(tz, &temp);
 }
 
index c7c5b37..b213a12 100644 (file)
@@ -76,14 +76,14 @@ static inline int ti_thermal_hotspot_temperature(int t, int s, int c)
 
 /* thermal zone ops */
 /* Get temperature callback function for thermal zone */
-static inline int __ti_thermal_get_temp(void *devdata, long *temp)
+static inline int __ti_thermal_get_temp(void *devdata, int *temp)
 {
        struct thermal_zone_device *pcb_tz = NULL;
        struct ti_thermal_data *data = devdata;
        struct ti_bandgap *bgp;
        const struct ti_temp_sensor *s;
        int ret, tmp, slope, constant;
-       unsigned long pcb_temp;
+       int pcb_temp;
 
        if (!data)
                return 0;
@@ -119,7 +119,7 @@ static inline int __ti_thermal_get_temp(void *devdata, long *temp)
 }
 
 static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
-                                     unsigned long *temp)
+                                     int *temp)
 {
        struct ti_thermal_data *data = thermal->devdata;
 
@@ -229,7 +229,7 @@ static int ti_thermal_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Get trip temperature callback functions for thermal zone */
 static int ti_thermal_get_trip_temp(struct thermal_zone_device *thermal,
-                                   int trip, unsigned long *temp)
+                                   int trip, int *temp)
 {
        if (!ti_thermal_is_valid_trip(trip))
                return -EINVAL;
@@ -280,7 +280,7 @@ static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
 
 /* Get critical temperature callback functions for thermal zone */
 static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        /* shutdown zone */
        return ti_thermal_get_trip_temp(thermal, OMAP_TRIP_NUMBER - 1, temp);
index 50d1d2c..7fc919f 100644 (file)
@@ -164,7 +164,7 @@ err_ret:
        return err;
 }
 
-static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
        u32 eax, edx;
        struct phy_dev_entry *phy_dev_entry;
@@ -175,7 +175,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
        if (eax & 0x80000000) {
                *temp = phy_dev_entry->tj_max -
                                ((eax >> 16) & 0x7f) * 1000;
-               pr_debug("sys_get_curr_temp %ld\n", *temp);
+               pr_debug("sys_get_curr_temp %d\n", *temp);
                return 0;
        }
 
@@ -183,7 +183,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        u32 eax, edx;
        struct phy_dev_entry *phy_dev_entry;
@@ -214,13 +214,13 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd,
                *temp = phy_dev_entry->tj_max - thres_reg_value * 1000;
        else
                *temp = 0;
-       pr_debug("sys_get_trip_temp %ld\n", *temp);
+       pr_debug("sys_get_trip_temp %d\n", *temp);
 
        return 0;
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                                                       unsigned long temp)
+                                                       int temp)
 {
        u32 l, h;
        struct phy_dev_entry *phy_dev_entry;
index a9d837f..10beb15 100644 (file)
@@ -200,7 +200,7 @@ static int xen_hvm_console_init(void)
 {
        int r;
        uint64_t v = 0;
-       unsigned long mfn;
+       unsigned long gfn;
        struct xencons_info *info;
 
        if (!xen_hvm_domain())
@@ -217,7 +217,7 @@ static int xen_hvm_console_init(void)
        }
        /*
         * If the toolstack (or the hypervisor) hasn't set these values, the
-        * default value is 0. Even though mfn = 0 and evtchn = 0 are
+        * default value is 0. Even though gfn = 0 and evtchn = 0 are
         * theoretically correct values, in practice they never are and they
         * mean that a legacy toolstack hasn't initialized the pv console correctly.
         */
@@ -229,8 +229,8 @@ static int xen_hvm_console_init(void)
        r = hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
        if (r < 0 || v == 0)
                goto err;
-       mfn = v;
-       info->intf = xen_remap(mfn << PAGE_SHIFT, PAGE_SIZE);
+       gfn = v;
+       info->intf = xen_remap(gfn << PAGE_SHIFT, PAGE_SIZE);
        if (info->intf == NULL)
                goto err;
        info->vtermno = HVC_COOKIE;
@@ -265,7 +265,8 @@ static int xen_pv_console_init(void)
                return 0;
        }
        info->evtchn = xen_start_info->console.domU.evtchn;
-       info->intf = mfn_to_virt(xen_start_info->console.domU.mfn);
+       /* GFN == MFN for PV guest */
+       info->intf = gfn_to_virt(xen_start_info->console.domU.mfn);
        info->vtermno = HVC_COOKIE;
 
        spin_lock(&xencons_lock);
@@ -374,7 +375,6 @@ static int xencons_connect_backend(struct xenbus_device *dev,
        int ret, evtchn, devid, ref, irq;
        struct xenbus_transaction xbt;
        grant_ref_t gref_head;
-       unsigned long mfn;
 
        ret = xenbus_alloc_evtchn(dev, &evtchn);
        if (ret)
@@ -389,10 +389,6 @@ static int xencons_connect_backend(struct xenbus_device *dev,
                        irq, &domU_hvc_ops, 256);
        if (IS_ERR(info->hvc))
                return PTR_ERR(info->hvc);
-       if (xen_pv_domain())
-               mfn = virt_to_mfn(info->intf);
-       else
-               mfn = __pa(info->intf) >> PAGE_SHIFT;
        ret = gnttab_alloc_grant_references(1, &gref_head);
        if (ret < 0)
                return ret;
@@ -401,7 +397,7 @@ static int xencons_connect_backend(struct xenbus_device *dev,
        if (ref < 0)
                return ref;
        gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id,
-                       mfn, 0);
+                                       virt_to_gfn(info->intf), 0);
 
  again:
        ret = xenbus_transaction_start(&xbt);
index 4f0cbb5..d3af01c 100644 (file)
@@ -1091,7 +1091,7 @@ static void mmap_user_close(struct vm_area_struct *vma)
        omapfb_put_mem_region(rg);
 }
 
-static struct vm_operations_struct mmap_user_ops = {
+static const struct vm_operations_struct mmap_user_ops = {
        .open = mmap_user_open,
        .close = mmap_user_close,
 };
index 09dc447..0567d51 100644 (file)
@@ -46,7 +46,7 @@ struct xenfb_info {
        int                     nr_pages;
        int                     irq;
        struct xenfb_page       *page;
-       unsigned long           *mfns;
+       unsigned long           *gfns;
        int                     update_wanted; /* XENFB_TYPE_UPDATE wanted */
        int                     feature_resize; /* XENFB_TYPE_RESIZE ok */
        struct xenfb_resize     resize;         /* protected by resize_lock */
@@ -402,8 +402,8 @@ static int xenfb_probe(struct xenbus_device *dev,
 
        info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-       info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
-       if (!info->mfns)
+       info->gfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
+       if (!info->gfns)
                goto error_nomem;
 
        /* set up shared page */
@@ -530,29 +530,29 @@ static int xenfb_remove(struct xenbus_device *dev)
                framebuffer_release(info->fb_info);
        }
        free_page((unsigned long)info->page);
-       vfree(info->mfns);
+       vfree(info->gfns);
        vfree(info->fb);
        kfree(info);
 
        return 0;
 }
 
-static unsigned long vmalloc_to_mfn(void *address)
+static unsigned long vmalloc_to_gfn(void *address)
 {
-       return pfn_to_mfn(vmalloc_to_pfn(address));
+       return xen_page_to_gfn(vmalloc_to_page(address));
 }
 
 static void xenfb_init_shared_page(struct xenfb_info *info,
                                   struct fb_info *fb_info)
 {
        int i;
-       int epd = PAGE_SIZE / sizeof(info->mfns[0]);
+       int epd = PAGE_SIZE / sizeof(info->gfns[0]);
 
        for (i = 0; i < info->nr_pages; i++)
-               info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
+               info->gfns[i] = vmalloc_to_gfn(info->fb + i * PAGE_SIZE);
 
        for (i = 0; i * epd < info->nr_pages; i++)
-               info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]);
+               info->page->pd[i] = vmalloc_to_gfn(&info->gfns[i * epd]);
 
        info->page->width = fb_info->var.xres;
        info->page->height = fb_info->var.yres;
@@ -586,7 +586,7 @@ static int xenfb_connect_backend(struct xenbus_device *dev,
                goto unbind_irq;
        }
        ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
-                           virt_to_mfn(info->page));
+                           virt_to_gfn(info->page));
        if (ret)
                goto error_xenbus;
        ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
index 55c4b5b..c68edc1 100644 (file)
@@ -188,6 +188,15 @@ config AT91SAM9X_WATCHDOG
          Watchdog timer embedded into AT91SAM9X and AT91CAP9 chips. This will
          reboot your system when the timeout is reached.
 
+config SAMA5D4_WATCHDOG
+       tristate "Atmel SAMA5D4 Watchdog Timer"
+       depends on ARCH_AT91
+       select WATCHDOG_CORE
+       help
+         Atmel SAMA5D4 watchdog timer is embedded into SAMA5D4 chips.
+         Its Watchdog Timer Mode Register can be written more than once.
+         This will reboot your system when the timeout is reached.
+
 config CADENCE_WATCHDOG
        tristate "Cadence Watchdog Timer"
        depends on HAS_IOMEM
@@ -558,6 +567,17 @@ config DIGICOLOR_WATCHDOG
          To compile this driver as a module, choose M here: the
          module will be called digicolor_wdt.
 
+config LPC18XX_WATCHDOG
+       tristate "LPC18xx/43xx Watchdog"
+       depends on ARCH_LPC18XX || COMPILE_TEST
+       select WATCHDOG_CORE
+       help
+         Say Y here if to include support for the watchdog timer
+         in NXP LPC SoCs family, which includes LPC18xx/LPC43xx
+         processors.
+         To compile this driver as a module, choose M here: the
+         module will be called lpc18xx_wdt.
+
 # AVR32 Architecture
 
 config AT32AP700X_WDT
@@ -1334,7 +1354,7 @@ config MPC5200_WDT
 
 config 8xxx_WDT
        tristate "MPC8xxx Platform Watchdog Timer"
-       depends on PPC_8xx || PPC_83xx || PPC_86xx
+       depends on PPC_8xx || PPC_83xx || PPC_86xx || PPC_MPC512x
        select WATCHDOG_CORE
        help
          This driver is for a SoC level watchdog that exists on some
index 59ea9a1..0c616e3 100644 (file)
@@ -41,6 +41,7 @@ obj-$(CONFIG_IXP4XX_WATCHDOG) += ixp4xx_wdt.o
 obj-$(CONFIG_KS8695_WATCHDOG) += ks8695_wdt.o
 obj-$(CONFIG_S3C2410_WATCHDOG) += s3c2410_wdt.o
 obj-$(CONFIG_SA1100_WATCHDOG) += sa1100_wdt.o
+obj-$(CONFIG_SAMA5D4_WATCHDOG) += sama5d4_wdt.o
 obj-$(CONFIG_DW_WATCHDOG) += dw_wdt.o
 obj-$(CONFIG_EP93XX_WATCHDOG) += ep93xx_wdt.o
 obj-$(CONFIG_PNX4008_WATCHDOG) += pnx4008_wdt.o
@@ -66,6 +67,7 @@ obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o
 obj-$(CONFIG_MESON_WATCHDOG) += meson_wdt.o
 obj-$(CONFIG_MEDIATEK_WATCHDOG) += mtk_wdt.o
 obj-$(CONFIG_DIGICOLOR_WATCHDOG) += digicolor_wdt.o
+obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o
 
 # AVR32 Architecture
 obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
index 9ba1153..e12a797 100644 (file)
@@ -244,7 +244,7 @@ static int at91wdt_probe(struct platform_device *pdev)
        }
 
        regmap_st = syscon_node_to_regmap(parent->of_node);
-       if (!regmap_st)
+       if (IS_ERR(regmap_st))
                return -ENODEV;
 
        res = misc_register(&at91wdt_miscdev);
index e4698f7..7e6acaf 100644 (file)
@@ -17,6 +17,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/clk.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -90,6 +91,7 @@ struct at91wdt {
        unsigned long heartbeat;        /* WDT heartbeat in jiffies */
        bool nowayout;
        unsigned int irq;
+       struct clk *sclk;
 };
 
 /* ......................................................................... */
@@ -352,15 +354,25 @@ static int __init at91wdt_probe(struct platform_device *pdev)
        if (IS_ERR(wdt->base))
                return PTR_ERR(wdt->base);
 
+       wdt->sclk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(wdt->sclk))
+               return PTR_ERR(wdt->sclk);
+
+       err = clk_prepare_enable(wdt->sclk);
+       if (err) {
+               dev_err(&pdev->dev, "Could not enable slow clock\n");
+               return err;
+       }
+
        if (pdev->dev.of_node) {
                err = of_at91wdt_init(pdev->dev.of_node, wdt);
                if (err)
-                       return err;
+                       goto err_clk;
        }
 
        err = at91_wdt_init(pdev, wdt);
        if (err)
-               return err;
+               goto err_clk;
 
        platform_set_drvdata(pdev, wdt);
 
@@ -368,6 +380,11 @@ static int __init at91wdt_probe(struct platform_device *pdev)
                wdt->wdd.timeout, wdt->nowayout);
 
        return 0;
+
+err_clk:
+       clk_disable_unprepare(wdt->sclk);
+
+       return err;
 }
 
 static int __exit at91wdt_remove(struct platform_device *pdev)
@@ -377,6 +394,7 @@ static int __exit at91wdt_remove(struct platform_device *pdev)
 
        pr_warn("I quit now, hardware will probably reboot!\n");
        del_timer(&wdt->timer);
+       clk_disable_unprepare(wdt->sclk);
 
        return 0;
 }
index c6fbb2e..b79a83b 100644 (file)
 
 #define AT91_WDT_MR            0x04                    /* Watchdog Mode Register */
 #define                AT91_WDT_WDV            (0xfff << 0)            /* Counter Value */
+#define                        AT91_WDT_SET_WDV(x)     ((x) & AT91_WDT_WDV)
 #define                AT91_WDT_WDFIEN         (1     << 12)           /* Fault Interrupt Enable */
 #define                AT91_WDT_WDRSTEN        (1     << 13)           /* Reset Processor */
 #define                AT91_WDT_WDRPROC        (1     << 14)           /* Timer Restart */
 #define                AT91_WDT_WDDIS          (1     << 15)           /* Watchdog Disable */
 #define                AT91_WDT_WDD            (0xfff << 16)           /* Delta Value */
+#define                        AT91_WDT_SET_WDD(x)     (((x) << 16) & AT91_WDT_WDD)
 #define                AT91_WDT_WDDBGHLT       (1     << 28)           /* Debug Halt */
 #define                AT91_WDT_WDIDLEHLT      (1     << 29)           /* Idle Halt */
 
index 7116968..66c3e65 100644 (file)
@@ -182,6 +182,7 @@ static int bcm2835_wdt_probe(struct platform_device *pdev)
        watchdog_set_drvdata(&bcm2835_wdt_wdd, wdt);
        watchdog_init_timeout(&bcm2835_wdt_wdd, heartbeat, dev);
        watchdog_set_nowayout(&bcm2835_wdt_wdd, nowayout);
+       bcm2835_wdt_wdd.parent = &pdev->dev;
        err = watchdog_register_device(&bcm2835_wdt_wdd);
        if (err) {
                dev_err(dev, "Failed to register watchdog device");
index b28a072..4064a43 100644 (file)
@@ -209,6 +209,7 @@ static int bcm47xx_wdt_probe(struct platform_device *pdev)
 
        wdt->wdd.info = &bcm47xx_wdt_info;
        wdt->wdd.timeout = WDT_DEFAULT_TIME;
+       wdt->wdd.parent = &pdev->dev;
        ret = wdt->wdd.ops->set_timeout(&wdt->wdd, timeout);
        if (ret)
                goto err_timer;
index 22d8ae6..e0c9842 100644 (file)
@@ -319,6 +319,7 @@ static int bcm_kona_wdt_probe(struct platform_device *pdev)
        spin_lock_init(&wdt->lock);
        platform_set_drvdata(pdev, wdt);
        watchdog_set_drvdata(&bcm_kona_wdt_wdd, wdt);
+       bcm_kona_wdt_wdd.parent = &pdev->dev;
 
        ret = bcm_kona_wdt_set_timeout_reg(&bcm_kona_wdt_wdd, 0);
        if (ret) {
index e96b09b..04da4b6 100644 (file)
@@ -186,8 +186,6 @@ static int booke_wdt_stop(struct watchdog_device *wdog)
 static int booke_wdt_set_timeout(struct watchdog_device *wdt_dev,
                                 unsigned int timeout)
 {
-       if (timeout > MAX_WDT_TIMEOUT)
-               return -EINVAL;
        wdt_dev->timeout = timeout;
        booke_wdt_set(wdt_dev);
 
@@ -211,7 +209,6 @@ static struct watchdog_device booke_wdt_dev = {
        .info = &booke_wdt_info,
        .ops = &booke_wdt_ops,
        .min_timeout = 1,
-       .max_timeout = 0xFFFF
 };
 
 static void __exit booke_wdt_exit(void)
@@ -229,6 +226,7 @@ static int __init booke_wdt_init(void)
        booke_wdt_set_timeout(&booke_wdt_dev,
                              period_to_sec(booke_wdt_period));
        watchdog_set_nowayout(&booke_wdt_dev, nowayout);
+       booke_wdt_dev.max_timeout = MAX_WDT_TIMEOUT;
        if (booke_wdt_enabled)
                booke_wdt_start(&booke_wdt_dev);
 
index ce12f43..a099b77 100644 (file)
@@ -358,6 +358,7 @@ static int __init coh901327_probe(struct platform_device *pdev)
        if (ret < 0)
                coh901327_wdt.timeout = 60;
 
+       coh901327_wdt.parent = &pdev->dev;
        ret = watchdog_register_device(&coh901327_wdt);
        if (ret == 0)
                dev_info(&pdev->dev,
index 2e95896..67e6797 100644 (file)
@@ -195,6 +195,7 @@ static int da9052_wdt_probe(struct platform_device *pdev)
        da9052_wdt->timeout = DA9052_DEF_TIMEOUT;
        da9052_wdt->info = &da9052_wdt_info;
        da9052_wdt->ops = &da9052_wdt_ops;
+       da9052_wdt->parent = &pdev->dev;
        watchdog_set_drvdata(da9052_wdt, driver_data);
 
        kref_init(&driver_data->kref);
index 495089d..04d1430 100644 (file)
@@ -161,6 +161,7 @@ static int da9055_wdt_probe(struct platform_device *pdev)
        da9055_wdt->timeout = DA9055_DEF_TIMEOUT;
        da9055_wdt->info = &da9055_wdt_info;
        da9055_wdt->ops = &da9055_wdt_ops;
+       da9055_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(da9055_wdt, nowayout);
        watchdog_set_drvdata(da9055_wdt, driver_data);
 
index b3a870c..7386111 100644 (file)
@@ -210,6 +210,7 @@ static int da9062_wdt_probe(struct platform_device *pdev)
        wdt->wdtdev.max_timeout = DA9062_WDT_MAX_TIMEOUT;
        wdt->wdtdev.timeout = DA9062_WDG_DEFAULT_TIMEOUT;
        wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS;
+       wdt->wdtdev.parent = &pdev->dev;
 
        watchdog_set_drvdata(&wdt->wdtdev, wdt);
        dev_set_drvdata(&pdev->dev, wdt);
index e2fe2eb..6bf130b 100644 (file)
@@ -175,6 +175,7 @@ static int da9063_wdt_probe(struct platform_device *pdev)
        wdt->wdtdev.min_timeout = DA9063_WDT_MIN_TIMEOUT;
        wdt->wdtdev.max_timeout = DA9063_WDT_MAX_TIMEOUT;
        wdt->wdtdev.timeout = DA9063_WDG_TIMEOUT;
+       wdt->wdtdev.parent = &pdev->dev;
 
        wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS;
 
index cfdf8a4..17454ca 100644 (file)
@@ -179,6 +179,7 @@ static int davinci_wdt_probe(struct platform_device *pdev)
        wdd->min_timeout        = 1;
        wdd->max_timeout        = MAX_HEARTBEAT;
        wdd->timeout            = DEFAULT_HEARTBEAT;
+       wdd->parent             = &pdev->dev;
 
        watchdog_init_timeout(wdd, heartbeat, dev);
 
index 31d8e49..50abe1b 100644 (file)
@@ -143,6 +143,7 @@ static int dc_wdt_probe(struct platform_device *pdev)
        }
        dc_wdt_wdd.max_timeout = U32_MAX / clk_get_rate(wdt->clk);
        dc_wdt_wdd.timeout = dc_wdt_wdd.max_timeout;
+       dc_wdt_wdd.parent = &pdev->dev;
 
        spin_lock_init(&wdt->lock);
 
index 7a2cc71..0a4d7cc 100644 (file)
@@ -132,6 +132,7 @@ static int ep93xx_wdt_probe(struct platform_device *pdev)
        val = readl(mmio_base + EP93XX_WATCHDOG);
        ep93xx_wdt_wdd.bootstatus = (val & 0x01) ? WDIOF_CARDRESET : 0;
        ep93xx_wdt_wdd.timeout = timeout;
+       ep93xx_wdt_wdd.parent = &pdev->dev;
 
        watchdog_set_nowayout(&ep93xx_wdt_wdd, nowayout);
 
index 1687cc2..90d59d3 100644 (file)
@@ -50,12 +50,41 @@ static void gpio_wdt_disable(struct gpio_wdt_priv *priv)
                gpio_direction_input(priv->gpio);
 }
 
+static void gpio_wdt_hwping(unsigned long data)
+{
+       struct watchdog_device *wdd = (struct watchdog_device *)data;
+       struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
+
+       if (priv->armed && time_after(jiffies, priv->last_jiffies +
+                                     msecs_to_jiffies(wdd->timeout * 1000))) {
+               dev_crit(wdd->dev, "Timer expired. System will reboot soon!\n");
+               return;
+       }
+
+       /* Restart timer */
+       mod_timer(&priv->timer, jiffies + priv->hw_margin);
+
+       switch (priv->hw_algo) {
+       case HW_ALGO_TOGGLE:
+               /* Toggle output pin */
+               priv->state = !priv->state;
+               gpio_set_value_cansleep(priv->gpio, priv->state);
+               break;
+       case HW_ALGO_LEVEL:
+               /* Pulse */
+               gpio_set_value_cansleep(priv->gpio, !priv->active_low);
+               udelay(1);
+               gpio_set_value_cansleep(priv->gpio, priv->active_low);
+               break;
+       }
+}
+
 static void gpio_wdt_start_impl(struct gpio_wdt_priv *priv)
 {
        priv->state = priv->active_low;
        gpio_direction_output(priv->gpio, priv->state);
        priv->last_jiffies = jiffies;
-       mod_timer(&priv->timer, priv->last_jiffies + priv->hw_margin);
+       gpio_wdt_hwping((unsigned long)&priv->wdd);
 }
 
 static int gpio_wdt_start(struct watchdog_device *wdd)
@@ -97,35 +126,6 @@ static int gpio_wdt_set_timeout(struct watchdog_device *wdd, unsigned int t)
        return gpio_wdt_ping(wdd);
 }
 
-static void gpio_wdt_hwping(unsigned long data)
-{
-       struct watchdog_device *wdd = (struct watchdog_device *)data;
-       struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
-
-       if (priv->armed && time_after(jiffies, priv->last_jiffies +
-                                     msecs_to_jiffies(wdd->timeout * 1000))) {
-               dev_crit(wdd->dev, "Timer expired. System will reboot soon!\n");
-               return;
-       }
-
-       /* Restart timer */
-       mod_timer(&priv->timer, jiffies + priv->hw_margin);
-
-       switch (priv->hw_algo) {
-       case HW_ALGO_TOGGLE:
-               /* Toggle output pin */
-               priv->state = !priv->state;
-               gpio_set_value_cansleep(priv->gpio, priv->state);
-               break;
-       case HW_ALGO_LEVEL:
-               /* Pulse */
-               gpio_set_value_cansleep(priv->gpio, !priv->active_low);
-               udelay(1);
-               gpio_set_value_cansleep(priv->gpio, priv->active_low);
-               break;
-       }
-}
-
 static int gpio_wdt_notify_sys(struct notifier_block *nb, unsigned long code,
                               void *unused)
 {
@@ -182,10 +182,10 @@ static int gpio_wdt_probe(struct platform_device *pdev)
        ret = of_property_read_string(pdev->dev.of_node, "hw_algo", &algo);
        if (ret)
                return ret;
-       if (!strncmp(algo, "toggle", 6)) {
+       if (!strcmp(algo, "toggle")) {
                priv->hw_algo = HW_ALGO_TOGGLE;
                f = GPIOF_IN;
-       } else if (!strncmp(algo, "level", 5)) {
+       } else if (!strcmp(algo, "level")) {
                priv->hw_algo = HW_ALGO_LEVEL;
                f = priv->active_low ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
        } else {
@@ -217,6 +217,7 @@ static int gpio_wdt_probe(struct platform_device *pdev)
        priv->wdd.ops           = &gpio_wdt_ops;
        priv->wdd.min_timeout   = SOFT_TIMEOUT_MIN;
        priv->wdd.max_timeout   = SOFT_TIMEOUT_MAX;
+       priv->wdd.parent        = &pdev->dev;
 
        if (watchdog_init_timeout(&priv->wdd, 0, &pdev->dev) < 0)
                priv->wdd.timeout = SOFT_TIMEOUT_DEF;
index 9bc39ae..78c2541 100644 (file)
@@ -267,6 +267,7 @@ static int ie6xx_wdt_probe(struct platform_device *pdev)
 
        ie6xx_wdt_dev.timeout = timeout;
        watchdog_set_nowayout(&ie6xx_wdt_dev, nowayout);
+       ie6xx_wdt_dev.parent = &pdev->dev;
 
        spin_lock_init(&ie6xx_wdt_data.unlock_sequence);
 
index 0f73621..15ab072 100644 (file)
@@ -316,6 +316,7 @@ static int pdc_wdt_remove(struct platform_device *pdev)
 {
        struct pdc_wdt_dev *pdc_wdt = platform_get_drvdata(pdev);
 
+       unregister_restart_handler(&pdc_wdt->restart_handler);
        pdc_wdt_stop(&pdc_wdt->wdt_dev);
        watchdog_unregister_device(&pdc_wdt->wdt_dev);
        clk_disable_unprepare(pdc_wdt->wdt_clk);
index 84f6701..0a436b5 100644 (file)
@@ -137,6 +137,7 @@ static int mid_wdt_probe(struct platform_device *pdev)
        wdt_dev->min_timeout = MID_WDT_TIMEOUT_MIN;
        wdt_dev->max_timeout = MID_WDT_TIMEOUT_MAX;
        wdt_dev->timeout = MID_WDT_DEFAULT_TIMEOUT;
+       wdt_dev->parent = &pdev->dev;
 
        watchdog_set_drvdata(wdt_dev, &pdev->dev);
        platform_set_drvdata(pdev, wdt_dev);
index 4c2cc09..6a7d5c3 100644 (file)
@@ -174,6 +174,7 @@ static int jz4740_wdt_probe(struct platform_device *pdev)
        jz4740_wdt->timeout = heartbeat;
        jz4740_wdt->min_timeout = 1;
        jz4740_wdt->max_timeout = MAX_HEARTBEAT;
+       jz4740_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(jz4740_wdt, nowayout);
        watchdog_set_drvdata(jz4740_wdt, drvdata);
 
diff --git a/drivers/watchdog/lpc18xx_wdt.c b/drivers/watchdog/lpc18xx_wdt.c
new file mode 100644 (file)
index 0000000..ab7b8b1
--- /dev/null
@@ -0,0 +1,340 @@
+/*
+ * NXP LPC18xx Watchdog Timer (WDT)
+ *
+ * Copyright (c) 2015 Ariel D'Alessandro <ariel@vanguardiasur.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * Notes
+ * -----
+ * The Watchdog consists of a fixed divide-by-4 clock pre-scaler and a 24-bit
+ * counter which decrements on every clock cycle.
+ */
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/watchdog.h>
+
+/* Registers */
+#define LPC18XX_WDT_MOD                        0x00
+#define LPC18XX_WDT_MOD_WDEN           BIT(0)
+#define LPC18XX_WDT_MOD_WDRESET                BIT(1)
+
+#define LPC18XX_WDT_TC                 0x04
+#define LPC18XX_WDT_TC_MIN             0xff
+#define LPC18XX_WDT_TC_MAX             0xffffff
+
+#define LPC18XX_WDT_FEED               0x08
+#define LPC18XX_WDT_FEED_MAGIC1                0xaa
+#define LPC18XX_WDT_FEED_MAGIC2                0x55
+
+#define LPC18XX_WDT_TV                 0x0c
+
+/* Clock pre-scaler */
+#define LPC18XX_WDT_CLK_DIV            4
+
+/* Timeout values in seconds */
+#define LPC18XX_WDT_DEF_TIMEOUT                30U
+
+static int heartbeat;
+module_param(heartbeat, int, 0);
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds (default="
+                __MODULE_STRING(LPC18XX_WDT_DEF_TIMEOUT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+                __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+struct lpc18xx_wdt_dev {
+       struct watchdog_device  wdt_dev;
+       struct clk              *reg_clk;
+       struct clk              *wdt_clk;
+       unsigned long           clk_rate;
+       void __iomem            *base;
+       struct timer_list       timer;
+       struct notifier_block   restart_handler;
+       spinlock_t              lock;
+};
+
+static int lpc18xx_wdt_feed(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned long flags;
+
+       /*
+        * An abort condition will occur if an interrupt happens during the feed
+        * sequence.
+        */
+       spin_lock_irqsave(&lpc18xx_wdt->lock, flags);
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC2, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       spin_unlock_irqrestore(&lpc18xx_wdt->lock, flags);
+
+       return 0;
+}
+
+static void lpc18xx_wdt_timer_feed(unsigned long data)
+{
+       struct watchdog_device *wdt_dev = (struct watchdog_device *)data;
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+
+       lpc18xx_wdt_feed(wdt_dev);
+
+       /* Use safe value (1/2 of real timeout) */
+       mod_timer(&lpc18xx_wdt->timer, jiffies +
+                 msecs_to_jiffies((wdt_dev->timeout * MSEC_PER_SEC) / 2));
+}
+
+/*
+ * Since LPC18xx Watchdog cannot be disabled in hardware, we must keep feeding
+ * it with a timer until userspace watchdog software takes over.
+ */
+static int lpc18xx_wdt_stop(struct watchdog_device *wdt_dev)
+{
+       lpc18xx_wdt_timer_feed((unsigned long)wdt_dev);
+
+       return 0;
+}
+
+static void __lpc18xx_wdt_set_timeout(struct lpc18xx_wdt_dev *lpc18xx_wdt)
+{
+       unsigned int val;
+
+       val = DIV_ROUND_UP(lpc18xx_wdt->wdt_dev.timeout * lpc18xx_wdt->clk_rate,
+                          LPC18XX_WDT_CLK_DIV);
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_TC);
+}
+
+static int lpc18xx_wdt_set_timeout(struct watchdog_device *wdt_dev,
+                                  unsigned int new_timeout)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+
+       lpc18xx_wdt->wdt_dev.timeout = new_timeout;
+       __lpc18xx_wdt_set_timeout(lpc18xx_wdt);
+
+       return 0;
+}
+
+static unsigned int lpc18xx_wdt_get_timeleft(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned int val;
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_TV);
+       return (val * LPC18XX_WDT_CLK_DIV) / lpc18xx_wdt->clk_rate;
+}
+
+static int lpc18xx_wdt_start(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned int val;
+
+       if (timer_pending(&lpc18xx_wdt->timer))
+               del_timer(&lpc18xx_wdt->timer);
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+       val |= LPC18XX_WDT_MOD_WDEN;
+       val |= LPC18XX_WDT_MOD_WDRESET;
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+
+       /*
+        * Setting the WDEN bit in the WDMOD register is not sufficient to
+        * enable the Watchdog. A valid feed sequence must be completed after
+        * setting WDEN before the Watchdog is capable of generating a reset.
+        */
+       lpc18xx_wdt_feed(wdt_dev);
+
+       return 0;
+}
+
+static struct watchdog_info lpc18xx_wdt_info = {
+       .identity       = "NXP LPC18xx Watchdog",
+       .options        = WDIOF_SETTIMEOUT |
+                         WDIOF_KEEPALIVEPING |
+                         WDIOF_MAGICCLOSE,
+};
+
+static const struct watchdog_ops lpc18xx_wdt_ops = {
+       .owner          = THIS_MODULE,
+       .start          = lpc18xx_wdt_start,
+       .stop           = lpc18xx_wdt_stop,
+       .ping           = lpc18xx_wdt_feed,
+       .set_timeout    = lpc18xx_wdt_set_timeout,
+       .get_timeleft   = lpc18xx_wdt_get_timeleft,
+};
+
+static int lpc18xx_wdt_restart(struct notifier_block *this, unsigned long mode,
+                              void *cmd)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = container_of(this,
+                               struct lpc18xx_wdt_dev, restart_handler);
+       unsigned long flags;
+       int val;
+
+       /*
+        * Incorrect feed sequence causes immediate watchdog reset if enabled.
+        */
+       spin_lock_irqsave(&lpc18xx_wdt->lock, flags);
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+       val |= LPC18XX_WDT_MOD_WDEN;
+       val |= LPC18XX_WDT_MOD_WDRESET;
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC2, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+
+       spin_unlock_irqrestore(&lpc18xx_wdt->lock, flags);
+
+       return NOTIFY_OK;
+}
+
+static int lpc18xx_wdt_probe(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt;
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       int ret;
+
+       lpc18xx_wdt = devm_kzalloc(dev, sizeof(*lpc18xx_wdt), GFP_KERNEL);
+       if (!lpc18xx_wdt)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       lpc18xx_wdt->base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(lpc18xx_wdt->base))
+               return PTR_ERR(lpc18xx_wdt->base);
+
+       lpc18xx_wdt->reg_clk = devm_clk_get(dev, "reg");
+       if (IS_ERR(lpc18xx_wdt->reg_clk)) {
+               dev_err(dev, "failed to get the reg clock\n");
+               return PTR_ERR(lpc18xx_wdt->reg_clk);
+       }
+
+       lpc18xx_wdt->wdt_clk = devm_clk_get(dev, "wdtclk");
+       if (IS_ERR(lpc18xx_wdt->wdt_clk)) {
+               dev_err(dev, "failed to get the wdt clock\n");
+               return PTR_ERR(lpc18xx_wdt->wdt_clk);
+       }
+
+       ret = clk_prepare_enable(lpc18xx_wdt->reg_clk);
+       if (ret) {
+               dev_err(dev, "could not prepare or enable sys clock\n");
+               return ret;
+       }
+
+       ret = clk_prepare_enable(lpc18xx_wdt->wdt_clk);
+       if (ret) {
+               dev_err(dev, "could not prepare or enable wdt clock\n");
+               goto disable_reg_clk;
+       }
+
+       /* We use the clock rate to calculate timeouts */
+       lpc18xx_wdt->clk_rate = clk_get_rate(lpc18xx_wdt->wdt_clk);
+       if (lpc18xx_wdt->clk_rate == 0) {
+               dev_err(dev, "failed to get clock rate\n");
+               ret = -EINVAL;
+               goto disable_wdt_clk;
+       }
+
+       lpc18xx_wdt->wdt_dev.info = &lpc18xx_wdt_info;
+       lpc18xx_wdt->wdt_dev.ops = &lpc18xx_wdt_ops;
+
+       lpc18xx_wdt->wdt_dev.min_timeout = DIV_ROUND_UP(LPC18XX_WDT_TC_MIN *
+                               LPC18XX_WDT_CLK_DIV, lpc18xx_wdt->clk_rate);
+
+       lpc18xx_wdt->wdt_dev.max_timeout = (LPC18XX_WDT_TC_MAX *
+                               LPC18XX_WDT_CLK_DIV) / lpc18xx_wdt->clk_rate;
+
+       lpc18xx_wdt->wdt_dev.timeout = min(lpc18xx_wdt->wdt_dev.max_timeout,
+                                          LPC18XX_WDT_DEF_TIMEOUT);
+
+       spin_lock_init(&lpc18xx_wdt->lock);
+
+       lpc18xx_wdt->wdt_dev.parent = dev;
+       watchdog_set_drvdata(&lpc18xx_wdt->wdt_dev, lpc18xx_wdt);
+
+       ret = watchdog_init_timeout(&lpc18xx_wdt->wdt_dev, heartbeat, dev);
+
+       __lpc18xx_wdt_set_timeout(lpc18xx_wdt);
+
+       setup_timer(&lpc18xx_wdt->timer, lpc18xx_wdt_timer_feed,
+                   (unsigned long)&lpc18xx_wdt->wdt_dev);
+
+       watchdog_set_nowayout(&lpc18xx_wdt->wdt_dev, nowayout);
+
+       platform_set_drvdata(pdev, lpc18xx_wdt);
+
+       ret = watchdog_register_device(&lpc18xx_wdt->wdt_dev);
+       if (ret)
+               goto disable_wdt_clk;
+
+       lpc18xx_wdt->restart_handler.notifier_call = lpc18xx_wdt_restart;
+       lpc18xx_wdt->restart_handler.priority = 128;
+       ret = register_restart_handler(&lpc18xx_wdt->restart_handler);
+       if (ret)
+               dev_warn(dev, "failed to register restart handler: %d\n", ret);
+
+       return 0;
+
+disable_wdt_clk:
+       clk_disable_unprepare(lpc18xx_wdt->wdt_clk);
+disable_reg_clk:
+       clk_disable_unprepare(lpc18xx_wdt->reg_clk);
+       return ret;
+}
+
+static void lpc18xx_wdt_shutdown(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = platform_get_drvdata(pdev);
+
+       lpc18xx_wdt_stop(&lpc18xx_wdt->wdt_dev);
+}
+
+static int lpc18xx_wdt_remove(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = platform_get_drvdata(pdev);
+
+       unregister_restart_handler(&lpc18xx_wdt->restart_handler);
+
+       dev_warn(&pdev->dev, "I quit now, hardware will probably reboot!\n");
+       del_timer(&lpc18xx_wdt->timer);
+
+       watchdog_unregister_device(&lpc18xx_wdt->wdt_dev);
+       clk_disable_unprepare(lpc18xx_wdt->wdt_clk);
+       clk_disable_unprepare(lpc18xx_wdt->reg_clk);
+
+       return 0;
+}
+
+static const struct of_device_id lpc18xx_wdt_match[] = {
+       { .compatible = "nxp,lpc1850-wwdt" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, lpc18xx_wdt_match);
+
+static struct platform_driver lpc18xx_wdt_driver = {
+       .driver = {
+               .name = "lpc18xx-wdt",
+               .of_match_table = lpc18xx_wdt_match,
+       },
+       .probe = lpc18xx_wdt_probe,
+       .remove = lpc18xx_wdt_remove,
+       .shutdown = lpc18xx_wdt_shutdown,
+};
+module_platform_driver(lpc18xx_wdt_driver);
+
+MODULE_AUTHOR("Ariel D'Alessandro <ariel@vanguardiasur.com.ar>");
+MODULE_DESCRIPTION("NXP LPC18xx Watchdog Timer Driver");
+MODULE_LICENSE("GPL v2");
index d193a5e..6901300 100644 (file)
@@ -197,6 +197,7 @@ static int a21_wdt_probe(struct platform_device *pdev)
        watchdog_init_timeout(&a21_wdt, 30, &pdev->dev);
        watchdog_set_nowayout(&a21_wdt, nowayout);
        watchdog_set_drvdata(&a21_wdt, drv);
+       a21_wdt.parent = &pdev->dev;
 
        reset = a21_wdt_get_bootstatus(drv);
        if (reset == 2)
index 59f0913..3aefdde 100644 (file)
@@ -130,6 +130,7 @@ static int menf21bmc_wdt_probe(struct platform_device *pdev)
        drv_data->wdt.info = &menf21bmc_wdt_info;
        drv_data->wdt.min_timeout = BMC_WD_TIMEOUT_MIN;
        drv_data->wdt.max_timeout = BMC_WD_TIMEOUT_MAX;
+       drv_data->wdt.parent = &pdev->dev;
        drv_data->i2c_client = i2c_client;
 
        /*
index 689381a..5f2273a 100644 (file)
@@ -50,8 +50,12 @@ struct mpc8xxx_wdt_type {
        bool hw_enabled;
 };
 
-static struct mpc8xxx_wdt __iomem *wd_base;
-static int mpc8xxx_wdt_init_late(void);
+struct mpc8xxx_wdt_ddata {
+       struct mpc8xxx_wdt __iomem *base;
+       struct watchdog_device wdd;
+       struct timer_list timer;
+       spinlock_t lock;
+};
 
 static u16 timeout = 0xffff;
 module_param(timeout, ushort, 0);
@@ -68,65 +72,59 @@ module_param(nowayout, bool, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
                 "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-/*
- * We always prescale, but if someone really doesn't want to they can set this
- * to 0
- */
-static int prescale = 1;
-
-static DEFINE_SPINLOCK(wdt_spinlock);
-
-static void mpc8xxx_wdt_keepalive(void)
+static void mpc8xxx_wdt_keepalive(struct mpc8xxx_wdt_ddata *ddata)
 {
        /* Ping the WDT */
-       spin_lock(&wdt_spinlock);
-       out_be16(&wd_base->swsrr, 0x556c);
-       out_be16(&wd_base->swsrr, 0xaa39);
-       spin_unlock(&wdt_spinlock);
+       spin_lock(&ddata->lock);
+       out_be16(&ddata->base->swsrr, 0x556c);
+       out_be16(&ddata->base->swsrr, 0xaa39);
+       spin_unlock(&ddata->lock);
 }
 
-static struct watchdog_device mpc8xxx_wdt_dev;
-static void mpc8xxx_wdt_timer_ping(unsigned long arg);
-static DEFINE_TIMER(wdt_timer, mpc8xxx_wdt_timer_ping, 0,
-               (unsigned long)&mpc8xxx_wdt_dev);
-
 static void mpc8xxx_wdt_timer_ping(unsigned long arg)
 {
-       struct watchdog_device *w = (struct watchdog_device *)arg;
+       struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
 
-       mpc8xxx_wdt_keepalive();
+       mpc8xxx_wdt_keepalive(ddata);
        /* We're pinging it twice faster than needed, just to be sure. */
-       mod_timer(&wdt_timer, jiffies + HZ * w->timeout / 2);
+       mod_timer(&ddata->timer, jiffies + HZ * ddata->wdd.timeout / 2);
 }
 
 static int mpc8xxx_wdt_start(struct watchdog_device *w)
 {
-       u32 tmp = SWCRR_SWEN;
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       u32 tmp = SWCRR_SWEN | SWCRR_SWPR;
 
        /* Good, fire up the show */
-       if (prescale)
-               tmp |= SWCRR_SWPR;
        if (reset)
                tmp |= SWCRR_SWRI;
 
        tmp |= timeout << 16;
 
-       out_be32(&wd_base->swcrr, tmp);
+       out_be32(&ddata->base->swcrr, tmp);
 
-       del_timer_sync(&wdt_timer);
+       del_timer_sync(&ddata->timer);
 
        return 0;
 }
 
 static int mpc8xxx_wdt_ping(struct watchdog_device *w)
 {
-       mpc8xxx_wdt_keepalive();
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       mpc8xxx_wdt_keepalive(ddata);
        return 0;
 }
 
 static int mpc8xxx_wdt_stop(struct watchdog_device *w)
 {
-       mod_timer(&wdt_timer, jiffies);
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       mod_timer(&ddata->timer, jiffies);
        return 0;
 }
 
@@ -143,53 +141,57 @@ static struct watchdog_ops mpc8xxx_wdt_ops = {
        .stop = mpc8xxx_wdt_stop,
 };
 
-static struct watchdog_device mpc8xxx_wdt_dev = {
-       .info = &mpc8xxx_wdt_info,
-       .ops = &mpc8xxx_wdt_ops,
-};
-
-static const struct of_device_id mpc8xxx_wdt_match[];
 static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
 {
        int ret;
-       const struct of_device_id *match;
-       struct device_node *np = ofdev->dev.of_node;
+       struct resource *res;
        const struct mpc8xxx_wdt_type *wdt_type;
+       struct mpc8xxx_wdt_ddata *ddata;
        u32 freq = fsl_get_sys_freq();
        bool enabled;
        unsigned int timeout_sec;
 
-       match = of_match_device(mpc8xxx_wdt_match, &ofdev->dev);
-       if (!match)
+       wdt_type = of_device_get_match_data(&ofdev->dev);
+       if (!wdt_type)
                return -EINVAL;
-       wdt_type = match->data;
 
        if (!freq || freq == -1)
                return -EINVAL;
 
-       wd_base = of_iomap(np, 0);
-       if (!wd_base)
+       ddata = devm_kzalloc(&ofdev->dev, sizeof(*ddata), GFP_KERNEL);
+       if (!ddata)
                return -ENOMEM;
 
-       enabled = in_be32(&wd_base->swcrr) & SWCRR_SWEN;
+       res = platform_get_resource(ofdev, IORESOURCE_MEM, 0);
+       ddata->base = devm_ioremap_resource(&ofdev->dev, res);
+       if (IS_ERR(ddata->base))
+               return PTR_ERR(ddata->base);
+
+       enabled = in_be32(&ddata->base->swcrr) & SWCRR_SWEN;
        if (!enabled && wdt_type->hw_enabled) {
                pr_info("could not be enabled in software\n");
-               ret = -ENOSYS;
-               goto err_unmap;
+               return -ENODEV;
        }
 
+       spin_lock_init(&ddata->lock);
+       setup_timer(&ddata->timer, mpc8xxx_wdt_timer_ping,
+                   (unsigned long)ddata);
+
+       ddata->wdd.info = &mpc8xxx_wdt_info,
+       ddata->wdd.ops = &mpc8xxx_wdt_ops,
+
        /* Calculate the timeout in seconds */
-       if (prescale)
-               timeout_sec = (timeout * wdt_type->prescaler) / freq;
-       else
-               timeout_sec = timeout / freq;
-
-       mpc8xxx_wdt_dev.timeout = timeout_sec;
-#ifdef MODULE
-       ret = mpc8xxx_wdt_init_late();
-       if (ret)
-               goto err_unmap;
-#endif
+       timeout_sec = (timeout * wdt_type->prescaler) / freq;
+
+       ddata->wdd.timeout = timeout_sec;
+
+       watchdog_set_nowayout(&ddata->wdd, nowayout);
+
+       ret = watchdog_register_device(&ddata->wdd);
+       if (ret) {
+               pr_err("cannot register watchdog device (err=%d)\n", ret);
+               return ret;
+       }
 
        pr_info("WDT driver for MPC8xxx initialized. mode:%s timeout=%d (%d seconds)\n",
                reset ? "reset" : "interrupt", timeout, timeout_sec);
@@ -200,21 +202,20 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
         * userspace handles it.
         */
        if (enabled)
-               mod_timer(&wdt_timer, jiffies);
+               mod_timer(&ddata->timer, jiffies);
+
+       platform_set_drvdata(ofdev, ddata);
        return 0;
-err_unmap:
-       iounmap(wd_base);
-       wd_base = NULL;
-       return ret;
 }
 
 static int mpc8xxx_wdt_remove(struct platform_device *ofdev)
 {
+       struct mpc8xxx_wdt_ddata *ddata = platform_get_drvdata(ofdev);
+
        pr_crit("Watchdog removed, expect the %s soon!\n",
                reset ? "reset" : "machine check exception");
-       del_timer_sync(&wdt_timer);
-       watchdog_unregister_device(&mpc8xxx_wdt_dev);
-       iounmap(wd_base);
+       del_timer_sync(&ddata->timer);
+       watchdog_unregister_device(&ddata->wdd);
 
        return 0;
 }
@@ -253,31 +254,6 @@ static struct platform_driver mpc8xxx_wdt_driver = {
        },
 };
 
-/*
- * We do wdt initialization in two steps: arch_initcall probes the wdt
- * very early to start pinging the watchdog (misc devices are not yet
- * available), and later module_init() just registers the misc device.
- */
-static int mpc8xxx_wdt_init_late(void)
-{
-       int ret;
-
-       if (!wd_base)
-               return -ENODEV;
-
-       watchdog_set_nowayout(&mpc8xxx_wdt_dev, nowayout);
-
-       ret = watchdog_register_device(&mpc8xxx_wdt_dev);
-       if (ret) {
-               pr_err("cannot register watchdog device (err=%d)\n", ret);
-               return ret;
-       }
-       return 0;
-}
-#ifndef MODULE
-module_init(mpc8xxx_wdt_init_late);
-#endif
-
 static int __init mpc8xxx_wdt_init(void)
 {
        return platform_driver_register(&mpc8xxx_wdt_driver);
index 938b987..6ad9df9 100644 (file)
@@ -210,6 +210,14 @@ static int mtk_wdt_probe(struct platform_device *pdev)
        return 0;
 }
 
+static void mtk_wdt_shutdown(struct platform_device *pdev)
+{
+       struct mtk_wdt_dev *mtk_wdt = platform_get_drvdata(pdev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev))
+               mtk_wdt_stop(&mtk_wdt->wdt_dev);
+}
+
 static int mtk_wdt_remove(struct platform_device *pdev)
 {
        struct mtk_wdt_dev *mtk_wdt = platform_get_drvdata(pdev);
@@ -221,17 +229,48 @@ static int mtk_wdt_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int mtk_wdt_suspend(struct device *dev)
+{
+       struct mtk_wdt_dev *mtk_wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev))
+               mtk_wdt_stop(&mtk_wdt->wdt_dev);
+
+       return 0;
+}
+
+static int mtk_wdt_resume(struct device *dev)
+{
+       struct mtk_wdt_dev *mtk_wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev)) {
+               mtk_wdt_start(&mtk_wdt->wdt_dev);
+               mtk_wdt_ping(&mtk_wdt->wdt_dev);
+       }
+
+       return 0;
+}
+#endif
+
 static const struct of_device_id mtk_wdt_dt_ids[] = {
        { .compatible = "mediatek,mt6589-wdt" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, mtk_wdt_dt_ids);
 
+static const struct dev_pm_ops mtk_wdt_pm_ops = {
+       SET_SYSTEM_SLEEP_PM_OPS(mtk_wdt_suspend,
+                               mtk_wdt_resume)
+};
+
 static struct platform_driver mtk_wdt_driver = {
        .probe          = mtk_wdt_probe,
        .remove         = mtk_wdt_remove,
+       .shutdown       = mtk_wdt_shutdown,
        .driver         = {
                .name           = DRV_NAME,
+               .pm             = &mtk_wdt_pm_ops,
                .of_match_table = mtk_wdt_dt_ids,
        },
 };
index c028454..bd917bb 100644 (file)
@@ -294,6 +294,8 @@ static const struct pci_device_id tco_pci_tbl[] = {
          PCI_ANY_ID, PCI_ANY_ID, },
        { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS,
          PCI_ANY_ID, PCI_ANY_ID, },
+       { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS,
+         PCI_ANY_ID, PCI_ANY_ID, },
        { 0, },                 /* End of list */
 };
 MODULE_DEVICE_TABLE(pci, tco_pci_tbl);
index de911c7..d96bee0 100644 (file)
@@ -253,6 +253,7 @@ static int omap_wdt_probe(struct platform_device *pdev)
        wdev->wdog.ops = &omap_wdt_ops;
        wdev->wdog.min_timeout = TIMER_MARGIN_MIN;
        wdev->wdog.max_timeout = TIMER_MARGIN_MAX;
+       wdev->wdog.parent = &pdev->dev;
 
        if (watchdog_init_timeout(&wdev->wdog, timer_margin, &pdev->dev) < 0)
                wdev->wdog.timeout = TIMER_MARGIN_DEFAULT;
index ef0c628..c6b8f4a 100644 (file)
@@ -567,6 +567,7 @@ static int orion_wdt_probe(struct platform_device *pdev)
 
        dev->wdt.timeout = wdt_max_duration;
        dev->wdt.max_timeout = wdt_max_duration;
+       dev->wdt.parent = &pdev->dev;
        watchdog_init_timeout(&dev->wdt, heartbeat, &pdev->dev);
 
        platform_set_drvdata(pdev, &dev->wdt);
index b9c6049..4224b3e 100644 (file)
@@ -167,6 +167,7 @@ static int pnx4008_wdt_probe(struct platform_device *pdev)
 
        pnx4008_wdd.bootstatus = (readl(WDTIM_RES(wdt_base)) & WDOG_RESET) ?
                        WDIOF_CARDRESET : 0;
+       pnx4008_wdd.parent = &pdev->dev;
        watchdog_set_nowayout(&pnx4008_wdd, nowayout);
 
        pnx4008_wdt_stop(&pnx4008_wdd); /* disable for now */
index aa03ca8..773dcfa 100644 (file)
@@ -171,6 +171,7 @@ static int qcom_wdt_probe(struct platform_device *pdev)
        wdt->wdd.ops = &qcom_wdt_ops;
        wdt->wdd.min_timeout = 1;
        wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
+       wdt->wdd.parent = &pdev->dev;
 
        /*
         * If 'timeout-sec' unspecified in devicetree, assume a 30 second
index b7c68e2..39cd51d 100644 (file)
@@ -127,6 +127,7 @@ static int retu_wdt_probe(struct platform_device *pdev)
        retu_wdt->timeout       = RETU_WDT_MAX_TIMER;
        retu_wdt->min_timeout   = 0;
        retu_wdt->max_timeout   = RETU_WDT_MAX_TIMER;
+       retu_wdt->parent        = &pdev->dev;
 
        watchdog_set_drvdata(retu_wdt, wdev);
        watchdog_set_nowayout(retu_wdt, nowayout);
index a6f7e2e..1967919 100644 (file)
@@ -161,6 +161,7 @@ static int rt288x_wdt_probe(struct platform_device *pdev)
        rt288x_wdt_dev.dev = &pdev->dev;
        rt288x_wdt_dev.bootstatus = rt288x_wdt_bootcause();
        rt288x_wdt_dev.max_timeout = (0xfffful / rt288x_wdt_freq);
+       rt288x_wdt_dev.parent = &pdev->dev;
 
        watchdog_init_timeout(&rt288x_wdt_dev, rt288x_wdt_dev.max_timeout,
                              &pdev->dev);
index e89ae02..d781000 100644 (file)
@@ -607,6 +607,7 @@ static int s3c2410wdt_probe(struct platform_device *pdev)
        watchdog_set_nowayout(&wdt->wdt_device, nowayout);
 
        wdt->wdt_device.bootstatus = s3c2410wdt_get_bootstatus(wdt);
+       wdt->wdt_device.parent = &pdev->dev;
 
        ret = watchdog_register_device(&wdt->wdt_device);
        if (ret) {
diff --git a/drivers/watchdog/sama5d4_wdt.c b/drivers/watchdog/sama5d4_wdt.c
new file mode 100644 (file)
index 0000000..a49634c
--- /dev/null
@@ -0,0 +1,280 @@
+/*
+ * Driver for Atmel SAMA5D4 Watchdog Timer
+ *
+ * Copyright (C) 2015 Atmel Corporation
+ *
+ * Licensed under GPLv2.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/watchdog.h>
+
+#include "at91sam9_wdt.h"
+
+/* minimum and maximum watchdog timeout, in seconds */
+#define MIN_WDT_TIMEOUT                1
+#define MAX_WDT_TIMEOUT                16
+#define WDT_DEFAULT_TIMEOUT    MAX_WDT_TIMEOUT
+
+#define WDT_SEC2TICKS(s)       ((s) ? (((s) << 8) - 1) : 0)
+
+struct sama5d4_wdt {
+       struct watchdog_device  wdd;
+       void __iomem            *reg_base;
+       u32     config;
+};
+
+static int wdt_timeout = WDT_DEFAULT_TIMEOUT;
+static bool nowayout = WATCHDOG_NOWAYOUT;
+
+module_param(wdt_timeout, int, 0);
+MODULE_PARM_DESC(wdt_timeout,
+       "Watchdog timeout in seconds. (default = "
+       __MODULE_STRING(WDT_DEFAULT_TIMEOUT) ")");
+
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout,
+       "Watchdog cannot be stopped once started (default="
+       __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+#define wdt_read(wdt, field) \
+       readl_relaxed((wdt)->reg_base + (field))
+
+#define wdt_write(wtd, field, val) \
+       writel_relaxed((val), (wdt)->reg_base + (field))
+
+static int sama5d4_wdt_start(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_stop(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg |= AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_ping(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+
+       wdt_write(wdt, AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
+
+       return 0;
+}
+
+static int sama5d4_wdt_set_timeout(struct watchdog_device *wdd,
+                                unsigned int timeout)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 value = WDT_SEC2TICKS(timeout);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDV;
+       reg &= ~AT91_WDT_WDD;
+       reg |= AT91_WDT_SET_WDV(value);
+       reg |= AT91_WDT_SET_WDD(value);
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       wdd->timeout = timeout;
+
+       return 0;
+}
+
+static const struct watchdog_info sama5d4_wdt_info = {
+       .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
+       .identity = "Atmel SAMA5D4 Watchdog",
+};
+
+static struct watchdog_ops sama5d4_wdt_ops = {
+       .owner = THIS_MODULE,
+       .start = sama5d4_wdt_start,
+       .stop = sama5d4_wdt_stop,
+       .ping = sama5d4_wdt_ping,
+       .set_timeout = sama5d4_wdt_set_timeout,
+};
+
+static irqreturn_t sama5d4_wdt_irq_handler(int irq, void *dev_id)
+{
+       struct sama5d4_wdt *wdt = platform_get_drvdata(dev_id);
+
+       if (wdt_read(wdt, AT91_WDT_SR)) {
+               pr_crit("Atmel Watchdog Software Reset\n");
+               emergency_restart();
+               pr_crit("Reboot didn't succeed\n");
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int of_sama5d4_wdt_init(struct device_node *np, struct sama5d4_wdt *wdt)
+{
+       const char *tmp;
+
+       wdt->config = AT91_WDT_WDDIS;
+
+       if (!of_property_read_string(np, "atmel,watchdog-type", &tmp) &&
+           !strcmp(tmp, "software"))
+               wdt->config |= AT91_WDT_WDFIEN;
+       else
+               wdt->config |= AT91_WDT_WDRSTEN;
+
+       if (of_property_read_bool(np, "atmel,idle-halt"))
+               wdt->config |= AT91_WDT_WDIDLEHLT;
+
+       if (of_property_read_bool(np, "atmel,dbg-halt"))
+               wdt->config |= AT91_WDT_WDDBGHLT;
+
+       return 0;
+}
+
+static int sama5d4_wdt_init(struct sama5d4_wdt *wdt)
+{
+       struct watchdog_device *wdd = &wdt->wdd;
+       u32 value = WDT_SEC2TICKS(wdd->timeout);
+       u32 reg;
+
+       /*
+        * Because the fields WDV and WDD must not be modified when the WDDIS
+        * bit is set, so clear the WDDIS bit before writing the WDT_MR.
+        */
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       reg = wdt->config;
+       reg |= AT91_WDT_SET_WDD(value);
+       reg |= AT91_WDT_SET_WDV(value);
+
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_probe(struct platform_device *pdev)
+{
+       struct watchdog_device *wdd;
+       struct sama5d4_wdt *wdt;
+       struct resource *res;
+       void __iomem *regs;
+       u32 irq = 0;
+       int ret;
+
+       wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL);
+       if (!wdt)
+               return -ENOMEM;
+
+       wdd = &wdt->wdd;
+       wdd->timeout = wdt_timeout;
+       wdd->info = &sama5d4_wdt_info;
+       wdd->ops = &sama5d4_wdt_ops;
+       wdd->min_timeout = MIN_WDT_TIMEOUT;
+       wdd->max_timeout = MAX_WDT_TIMEOUT;
+
+       watchdog_set_drvdata(wdd, wdt);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(regs))
+               return PTR_ERR(regs);
+
+       wdt->reg_base = regs;
+
+       if (pdev->dev.of_node) {
+               irq = irq_of_parse_and_map(pdev->dev.of_node, 0);
+               if (!irq)
+                       dev_warn(&pdev->dev, "failed to get IRQ from DT\n");
+
+               ret = of_sama5d4_wdt_init(pdev->dev.of_node, wdt);
+               if (ret)
+                       return ret;
+       }
+
+       if ((wdt->config & AT91_WDT_WDFIEN) && irq) {
+               ret = devm_request_irq(&pdev->dev, irq, sama5d4_wdt_irq_handler,
+                                      IRQF_SHARED | IRQF_IRQPOLL |
+                                      IRQF_NO_SUSPEND, pdev->name, pdev);
+               if (ret) {
+                       dev_err(&pdev->dev,
+                               "cannot register interrupt handler\n");
+                       return ret;
+               }
+       }
+
+       ret = watchdog_init_timeout(wdd, wdt_timeout, &pdev->dev);
+       if (ret) {
+               dev_err(&pdev->dev, "unable to set timeout value\n");
+               return ret;
+       }
+
+       ret = sama5d4_wdt_init(wdt);
+       if (ret)
+               return ret;
+
+       watchdog_set_nowayout(wdd, nowayout);
+
+       ret = watchdog_register_device(wdd);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to register watchdog device\n");
+               return ret;
+       }
+
+       platform_set_drvdata(pdev, wdt);
+
+       dev_info(&pdev->dev, "initialized (timeout = %d sec, nowayout = %d)\n",
+                wdt_timeout, nowayout);
+
+       return 0;
+}
+
+static int sama5d4_wdt_remove(struct platform_device *pdev)
+{
+       struct sama5d4_wdt *wdt = platform_get_drvdata(pdev);
+
+       sama5d4_wdt_stop(&wdt->wdd);
+
+       watchdog_unregister_device(&wdt->wdd);
+
+       return 0;
+}
+
+static const struct of_device_id sama5d4_wdt_of_match[] = {
+       { .compatible = "atmel,sama5d4-wdt", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, sama5d4_wdt_of_match);
+
+static struct platform_driver sama5d4_wdt_driver = {
+       .probe          = sama5d4_wdt_probe,
+       .remove         = sama5d4_wdt_remove,
+       .driver         = {
+               .name   = "sama5d4_wdt",
+               .of_match_table = sama5d4_wdt_of_match,
+       }
+};
+module_platform_driver(sama5d4_wdt_driver);
+
+MODULE_AUTHOR("Atmel Corporation");
+MODULE_DESCRIPTION("Atmel SAMA5D4 Watchdog Timer driver");
+MODULE_LICENSE("GPL v2");
index 567458b..f908121 100644 (file)
@@ -252,6 +252,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
 
        watchdog_set_nowayout(&sh_wdt_dev, nowayout);
        watchdog_set_drvdata(&sh_wdt_dev, wdt);
+       sh_wdt_dev.parent = &pdev->dev;
 
        spin_lock_init(&wdt->lock);
 
index 42fa5c0..d0578ab 100644 (file)
@@ -154,6 +154,7 @@ static int sirfsoc_wdt_probe(struct platform_device *pdev)
 
        watchdog_init_timeout(&sirfsoc_wdd, timeout, &pdev->dev);
        watchdog_set_nowayout(&sirfsoc_wdd, nowayout);
+       sirfsoc_wdd.parent = &pdev->dev;
 
        ret = watchdog_register_device(&sirfsoc_wdd);
        if (ret)
index 4e7fec3..01d8162 100644 (file)
@@ -226,6 +226,7 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id)
        wdt->adev = adev;
        wdt->wdd.info = &wdt_info;
        wdt->wdd.ops = &wdt_ops;
+       wdt->wdd.parent = &adev->dev;
 
        spin_lock_init(&wdt->lock);
        watchdog_set_nowayout(&wdt->wdd, nowayout);
index 6785afd..14e9bad 100644 (file)
@@ -241,6 +241,7 @@ static int st_wdog_probe(struct platform_device *pdev)
                return -EINVAL;
        }
        st_wdog_dev.max_timeout = 0xFFFFFFFF / st_wdog->clkrate;
+       st_wdog_dev.parent = &pdev->dev;
 
        ret = clk_prepare_enable(clk);
        if (ret) {
index e7f0d5b..3ee6128 100644 (file)
@@ -76,6 +76,7 @@ static int stmp3xxx_wdt_probe(struct platform_device *pdev)
        watchdog_set_drvdata(&stmp3xxx_wdd, &pdev->dev);
 
        stmp3xxx_wdd.timeout = clamp_t(unsigned, heartbeat, 1, STMP3XXX_MAX_TIMEOUT);
+       stmp3xxx_wdd.parent = &pdev->dev;
 
        ret = watchdog_register_device(&stmp3xxx_wdd);
        if (ret < 0) {
index a29afb3..47bd8a1 100644 (file)
@@ -184,7 +184,7 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev)
        /* Set system reset function */
        reg = readl(wdt_base + regs->wdt_cfg);
        reg &= ~(regs->wdt_reset_mask);
-       reg |= ~(regs->wdt_reset_val);
+       reg |= regs->wdt_reset_val;
        writel(reg, wdt_base + regs->wdt_cfg);
 
        /* Enable watchdog */
index 30451ea..7f97cdd 100644 (file)
@@ -218,6 +218,7 @@ static int tegra_wdt_probe(struct platform_device *pdev)
        wdd->ops = &tegra_wdt_ops;
        wdd->min_timeout = MIN_WDT_TIMEOUT;
        wdd->max_timeout = MAX_WDT_TIMEOUT;
+       wdd->parent = &pdev->dev;
 
        watchdog_set_drvdata(wdd, wdt);
 
index 2c1db6f..9bf3cc0 100644 (file)
@@ -83,6 +83,7 @@ static int twl4030_wdt_probe(struct platform_device *pdev)
        wdt->timeout            = 30;
        wdt->min_timeout        = 1;
        wdt->max_timeout        = 30;
+       wdt->parent = &pdev->dev;
 
        watchdog_set_nowayout(wdt, nowayout);
        platform_set_drvdata(pdev, wdt);
index 7f61593..c2da880 100644 (file)
@@ -131,6 +131,7 @@ static int __init txx9wdt_probe(struct platform_device *dev)
        txx9wdt.timeout = timeout;
        txx9wdt.min_timeout = 1;
        txx9wdt.max_timeout = WD_MAX_TIMEOUT;
+       txx9wdt.parent = &dev->dev;
        watchdog_set_nowayout(&txx9wdt, nowayout);
 
        ret = watchdog_register_device(&txx9wdt);
index 9de09ab..37c0843 100644 (file)
@@ -96,6 +96,7 @@ static int ux500_wdt_probe(struct platform_device *pdev)
                        ux500_wdt.max_timeout = WATCHDOG_MAX28;
        }
 
+       ux500_wdt.parent = &pdev->dev;
        watchdog_set_nowayout(&ux500_wdt, nowayout);
 
        /* disable auto off on sleep */
index 56369c4..5f9cbc3 100644 (file)
@@ -206,6 +206,7 @@ static int wdt_probe(struct pci_dev *pdev,
                timeout = WDT_TIMEOUT;
 
        wdt_dev.timeout = timeout;
+       wdt_dev.parent = &pdev->dev;
        watchdog_set_nowayout(&wdt_dev, nowayout);
        if (readl(wdt_mem) & VIA_WDT_FIRED)
                wdt_dev.bootstatus |= WDIOF_CARDRESET;
index 2fa17e7..8d1184a 100644 (file)
@@ -215,6 +215,7 @@ static int wm831x_wdt_probe(struct platform_device *pdev)
 
        wm831x_wdt->info = &wm831x_wdt_info;
        wm831x_wdt->ops = &wm831x_wdt_ops;
+       wm831x_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(wm831x_wdt, nowayout);
        watchdog_set_drvdata(wm831x_wdt, driver_data);
 
index 34d272a..4ab4b83 100644 (file)
@@ -151,6 +151,7 @@ static int wm8350_wdt_probe(struct platform_device *pdev)
 
        watchdog_set_nowayout(&wm8350_wdt, nowayout);
        watchdog_set_drvdata(&wm8350_wdt, wm8350);
+       wm8350_wdt.parent = &pdev->dev;
 
        /* Default to 4s timeout */
        wm8350_wdt_set_timeout(&wm8350_wdt, 4);
index 1fa633b..c79329f 100644 (file)
@@ -441,7 +441,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
        /* Update direct mapping, invalidate P2M, and add to balloon. */
        for (i = 0; i < nr_pages; i++) {
                pfn = frame_list[i];
-               frame_list[i] = pfn_to_mfn(pfn);
+               frame_list[i] = pfn_to_gfn(pfn);
                page = pfn_to_page(pfn);
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
index 0edb91c..8ae2fc9 100644 (file)
@@ -6,10 +6,10 @@
 bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
                               const struct bio_vec *vec2)
 {
-       unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
-       unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+       unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page));
+       unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page));
 
        return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
-               ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+               ((bfn1 == bfn2) || ((bfn1+1) == bfn2));
 }
 EXPORT_SYMBOL(xen_biovec_phys_mergeable);
index 68d1290..6cd5e65 100644 (file)
@@ -1688,7 +1688,7 @@ void __init xen_init_IRQ(void)
                struct physdev_pirq_eoi_gmfn eoi_gmfn;
 
                pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-               eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map);
+               eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
                rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
                /* TODO: No PVH support for PIRQ EOI */
                if (rc != 0) {
index ed673e1..1d4baf5 100644 (file)
@@ -111,7 +111,7 @@ static int init_control_block(int cpu,
        for (i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++)
                q->head[i] = 0;
 
-       init_control.control_gfn = virt_to_mfn(control_block);
+       init_control.control_gfn = virt_to_gfn(control_block);
        init_control.offset      = 0;
        init_control.vcpu        = cpu;
 
@@ -167,7 +167,7 @@ static int evtchn_fifo_setup(struct irq_info *info)
                /* Mask all events in this page before adding it. */
                init_array_page(array_page);
 
-               expand_array.array_gfn = virt_to_mfn(array_page);
+               expand_array.array_gfn = virt_to_gfn(array_page);
 
                ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
                if (ret < 0)
index e53fe19..4547a91 100644 (file)
@@ -142,7 +142,8 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
 
                /* Grant foreign access to the page. */
                rc = gnttab_grant_foreign_access(op->domid,
-                       pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+                                                xen_page_to_gfn(gref->page),
+                                                readonly);
                if (rc < 0)
                        goto undo;
                gref_ids[i] = gref->gref_id = rc;
@@ -493,7 +494,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma)
        mutex_unlock(&gref_mutex);
 }
 
-static struct vm_operations_struct gntalloc_vmops = {
+static const struct vm_operations_struct gntalloc_vmops = {
        .open = gntalloc_vma_open,
        .close = gntalloc_vma_close,
 };
index 0dbb222..2ea0b3b 100644 (file)
@@ -433,7 +433,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
        return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
 }
 
-static struct vm_operations_struct gntdev_vmops = {
+static const struct vm_operations_struct gntdev_vmops = {
        .open = gntdev_vma_open,
        .close = gntdev_vma_close,
        .find_special_page = gntdev_vma_find_special_page,
index d10effe..e12bd36 100644 (file)
@@ -80,7 +80,7 @@ static int xen_suspend(void *data)
         * is resuming in a new domain.
         */
        si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
-                                           ? virt_to_mfn(xen_start_info)
+                                           ? virt_to_gfn(xen_start_info)
                                            : 0);
 
        xen_arch_post_suspend(si->cancelled);
index 5a29616..5e9adac 100644 (file)
@@ -193,16 +193,16 @@ static int traverse_pages_block(unsigned nelem, size_t size,
        return ret;
 }
 
-struct mmap_mfn_state {
+struct mmap_gfn_state {
        unsigned long va;
        struct vm_area_struct *vma;
        domid_t domain;
 };
 
-static int mmap_mfn_range(void *data, void *state)
+static int mmap_gfn_range(void *data, void *state)
 {
        struct privcmd_mmap_entry *msg = data;
-       struct mmap_mfn_state *st = state;
+       struct mmap_gfn_state *st = state;
        struct vm_area_struct *vma = st->vma;
        int rc;
 
@@ -216,7 +216,7 @@ static int mmap_mfn_range(void *data, void *state)
            ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
                return -EINVAL;
 
-       rc = xen_remap_domain_mfn_range(vma,
+       rc = xen_remap_domain_gfn_range(vma,
                                        msg->va & PAGE_MASK,
                                        msg->mfn, msg->npages,
                                        vma->vm_page_prot,
@@ -236,7 +236,7 @@ static long privcmd_ioctl_mmap(void __user *udata)
        struct vm_area_struct *vma;
        int rc;
        LIST_HEAD(pagelist);
-       struct mmap_mfn_state state;
+       struct mmap_gfn_state state;
 
        /* We only support privcmd_ioctl_mmap_batch for auto translated. */
        if (xen_feature(XENFEAT_auto_translated_physmap))
@@ -273,7 +273,7 @@ static long privcmd_ioctl_mmap(void __user *udata)
 
        rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
                            &pagelist,
-                           mmap_mfn_range, &state);
+                           mmap_gfn_range, &state);
 
 
 out_up:
@@ -299,18 +299,18 @@ struct mmap_batch_state {
        int global_error;
        int version;
 
-       /* User-space mfn array to store errors in the second pass for V1. */
-       xen_pfn_t __user *user_mfn;
+       /* User-space gfn array to store errors in the second pass for V1. */
+       xen_pfn_t __user *user_gfn;
        /* User-space int array to store errors in the second pass for V2. */
        int __user *user_err;
 };
 
-/* auto translated dom0 note: if domU being created is PV, then mfn is
- * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
+/* auto translated dom0 note: if domU being created is PV, then gfn is
+ * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
  */
 static int mmap_batch_fn(void *data, int nr, void *state)
 {
-       xen_pfn_t *mfnp = data;
+       xen_pfn_t *gfnp = data;
        struct mmap_batch_state *st = state;
        struct vm_area_struct *vma = st->vma;
        struct page **pages = vma->vm_private_data;
@@ -321,8 +321,8 @@ static int mmap_batch_fn(void *data, int nr, void *state)
                cur_pages = &pages[st->index];
 
        BUG_ON(nr < 0);
-       ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
-                                        (int *)mfnp, st->vma->vm_page_prot,
+       ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
+                                        (int *)gfnp, st->vma->vm_page_prot,
                                         st->domain, cur_pages);
 
        /* Adjust the global_error? */
@@ -347,22 +347,22 @@ static int mmap_return_error(int err, struct mmap_batch_state *st)
 
        if (st->version == 1) {
                if (err) {
-                       xen_pfn_t mfn;
+                       xen_pfn_t gfn;
 
-                       ret = get_user(mfn, st->user_mfn);
+                       ret = get_user(gfn, st->user_gfn);
                        if (ret < 0)
                                return ret;
                        /*
                         * V1 encodes the error codes in the 32bit top
-                        * nibble of the mfn (with its known
+                        * nibble of the gfn (with its known
                         * limitations vis-a-vis 64 bit callers).
                         */
-                       mfn |= (err == -ENOENT) ?
+                       gfn |= (err == -ENOENT) ?
                                PRIVCMD_MMAPBATCH_PAGED_ERROR :
                                PRIVCMD_MMAPBATCH_MFN_ERROR;
-                       return __put_user(mfn, st->user_mfn++);
+                       return __put_user(gfn, st->user_gfn++);
                } else
-                       st->user_mfn++;
+                       st->user_gfn++;
        } else { /* st->version == 2 */
                if (err)
                        return __put_user(err, st->user_err++);
@@ -388,7 +388,7 @@ static int mmap_return_errors(void *data, int nr, void *state)
        return 0;
 }
 
-/* Allocate pfns that are then mapped with gmfns from foreign domid. Update
+/* Allocate pfns that are then mapped with gfns from foreign domid. Update
  * the vma with the page info to use later.
  * Returns: 0 if success, otherwise -errno
  */
@@ -414,7 +414,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
        return 0;
 }
 
-static struct vm_operations_struct privcmd_vm_ops;
+static const struct vm_operations_struct privcmd_vm_ops;
 
 static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 {
@@ -526,7 +526,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 
        if (state.global_error) {
                /* Write back errors in second pass. */
-               state.user_mfn = (xen_pfn_t *)m.arr;
+               state.user_gfn = (xen_pfn_t *)m.arr;
                state.user_err = m.err;
                ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
                                           &pagelist, mmap_return_errors, &state);
@@ -587,7 +587,7 @@ static void privcmd_close(struct vm_area_struct *vma)
        if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
                return;
 
-       rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
+       rc = xen_unmap_domain_gfn_range(vma, numpgs, pages);
        if (rc == 0)
                free_xenballooned_pages(numpgs, pages);
        else
@@ -605,7 +605,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct privcmd_vm_ops = {
+static const struct vm_operations_struct privcmd_vm_ops = {
        .close = privcmd_close,
        .fault = privcmd_fault
 };
index 4c54932..79bc493 100644 (file)
@@ -82,8 +82,8 @@ static u64 start_dma_addr;
  */
 static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 {
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr));
-       dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT;
+       unsigned long bfn = pfn_to_bfn(PFN_DOWN(paddr));
+       dma_addr_t dma = (dma_addr_t)bfn << PAGE_SHIFT;
 
        dma |= paddr & ~PAGE_MASK;
 
@@ -92,7 +92,7 @@ static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 
 static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
 {
-       unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr));
+       unsigned long pfn = bfn_to_pfn(PFN_DOWN(baddr));
        dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT;
        phys_addr_t paddr = dma;
 
@@ -110,15 +110,15 @@ static int check_pages_physically_contiguous(unsigned long pfn,
                                             unsigned int offset,
                                             size_t length)
 {
-       unsigned long next_mfn;
+       unsigned long next_bfn;
        int i;
        int nr_pages;
 
-       next_mfn = pfn_to_mfn(pfn);
+       next_bfn = pfn_to_bfn(pfn);
        nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
 
        for (i = 1; i < nr_pages; i++) {
-               if (pfn_to_mfn(++pfn) != ++next_mfn)
+               if (pfn_to_bfn(++pfn) != ++next_bfn)
                        return 0;
        }
        return 1;
@@ -138,8 +138,8 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
 
 static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
 {
-       unsigned long mfn = PFN_DOWN(dma_addr);
-       unsigned long pfn = mfn_to_local_pfn(mfn);
+       unsigned long bfn = PFN_DOWN(dma_addr);
+       unsigned long pfn = bfn_to_local_pfn(bfn);
        phys_addr_t paddr;
 
        /* If the address is outside our domain, it CAN
@@ -311,9 +311,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        */
        flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
 
-       if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
-               return ret;
-
        /* On ARM this function returns an ioremap'ped virtual address for
         * which virt_to_phys doesn't return the corresponding physical
         * address. In fact on ARM virt_to_phys only works for kernel direct
@@ -356,9 +353,6 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
        phys_addr_t phys;
        u64 dma_mask = DMA_BIT_MASK(32);
 
-       if (dma_release_from_coherent(hwdev, order, vaddr))
-               return;
-
        if (hwdev && hwdev->coherent_dma_mask)
                dma_mask = hwdev->coherent_dma_mask;
 
index 239738f..945fc43 100644 (file)
@@ -129,21 +129,17 @@ static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
 /* xen generic tmem ops */
 
 static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
-                            u32 index, unsigned long pfn)
+                            u32 index, struct page *page)
 {
-       unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
-
        return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
-               gmfn, 0, 0, 0);
+                          xen_page_to_gfn(page), 0, 0, 0);
 }
 
 static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
-                            u32 index, unsigned long pfn)
+                            u32 index, struct page *page)
 {
-       unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
-
        return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
-               gmfn, 0, 0, 0);
+                          xen_page_to_gfn(page), 0, 0, 0);
 }
 
 static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
@@ -173,14 +169,13 @@ static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
 {
        u32 ind = (u32) index;
        struct tmem_oid oid = *(struct tmem_oid *)&key;
-       unsigned long pfn = page_to_pfn(page);
 
        if (pool < 0)
                return;
        if (ind != index)
                return;
        mb(); /* ensure page is quiescent; tmem may address it with an alias */
-       (void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+       (void)xen_tmem_put_page((u32)pool, oid, ind, page);
 }
 
 static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
@@ -188,7 +183,6 @@ static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
 {
        u32 ind = (u32) index;
        struct tmem_oid oid = *(struct tmem_oid *)&key;
-       unsigned long pfn = page_to_pfn(page);
        int ret;
 
        /* translate return values to linux semantics */
@@ -196,7 +190,7 @@ static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
                return -1;
        if (ind != index)
                return -1;
-       ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+       ret = xen_tmem_get_page((u32)pool, oid, ind, page);
        if (ret == 1)
                return 0;
        else
@@ -287,7 +281,6 @@ static int tmem_frontswap_store(unsigned type, pgoff_t offset,
 {
        u64 ind64 = (u64)offset;
        u32 ind = (u32)offset;
-       unsigned long pfn = page_to_pfn(page);
        int pool = tmem_frontswap_poolid;
        int ret;
 
@@ -296,7 +289,7 @@ static int tmem_frontswap_store(unsigned type, pgoff_t offset,
        if (ind64 != ind)
                return -1;
        mb(); /* ensure page is quiescent; tmem may address it with an alias */
-       ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), page);
        /* translate Xen tmem return values to linux semantics */
        if (ret == 1)
                return 0;
@@ -313,7 +306,6 @@ static int tmem_frontswap_load(unsigned type, pgoff_t offset,
 {
        u64 ind64 = (u64)offset;
        u32 ind = (u32)offset;
-       unsigned long pfn = page_to_pfn(page);
        int pool = tmem_frontswap_poolid;
        int ret;
 
@@ -321,7 +313,7 @@ static int tmem_frontswap_load(unsigned type, pgoff_t offset,
                return -1;
        if (ind64 != ind)
                return -1;
-       ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), page);
        /* translate Xen tmem return values to linux semantics */
        if (ret == 1)
                return 0;
index e303535..2ba09c1 100644 (file)
@@ -380,7 +380,7 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
 
        for (i = 0; i < nr_pages; i++) {
                err = gnttab_grant_foreign_access(dev->otherend_id,
-                                                 virt_to_mfn(vaddr), 0);
+                                                 virt_to_gfn(vaddr), 0);
                if (err < 0) {
                        xenbus_dev_fatal(dev, err,
                                         "granting access to ring page");
index b17707e..ee6d9ef 100644 (file)
@@ -49,7 +49,7 @@ static long xenbus_alloc(domid_t domid)
                goto out_err;
 
        gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid,
-                       virt_to_mfn(xen_store_interface), 0 /* writable */);
+                       virt_to_gfn(xen_store_interface), 0 /* writable */);
 
        arg.dom = DOMID_SELF;
        arg.remote_dom = domid;
index 4308fb3..3cbe055 100644 (file)
@@ -75,7 +75,7 @@ EXPORT_SYMBOL_GPL(xen_store_interface);
 enum xenstore_init xen_store_domain_type;
 EXPORT_SYMBOL_GPL(xen_store_domain_type);
 
-static unsigned long xen_store_mfn;
+static unsigned long xen_store_gfn;
 
 static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
 
@@ -711,9 +711,7 @@ static int __init xenstored_local_init(void)
        if (!page)
                goto out_err;
 
-       xen_store_mfn = xen_start_info->store_mfn =
-               pfn_to_mfn(virt_to_phys((void *)page) >>
-                          PAGE_SHIFT);
+       xen_store_gfn = xen_start_info->store_mfn = virt_to_gfn((void *)page);
 
        /* Next allocate a local port which xenstored can bind to */
        alloc_unbound.dom        = DOMID_SELF;
@@ -787,12 +785,12 @@ static int __init xenbus_init(void)
                err = xenstored_local_init();
                if (err)
                        goto out_error;
-               xen_store_interface = mfn_to_virt(xen_store_mfn);
+               xen_store_interface = gfn_to_virt(xen_store_gfn);
                break;
        case XS_PV:
                xen_store_evtchn = xen_start_info->store_evtchn;
-               xen_store_mfn = xen_start_info->store_mfn;
-               xen_store_interface = mfn_to_virt(xen_store_mfn);
+               xen_store_gfn = xen_start_info->store_mfn;
+               xen_store_interface = gfn_to_virt(xen_store_gfn);
                break;
        case XS_HVM:
                err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
@@ -802,9 +800,9 @@ static int __init xenbus_init(void)
                err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
                if (err)
                        goto out_error;
-               xen_store_mfn = (unsigned long)v;
+               xen_store_gfn = (unsigned long)v;
                xen_store_interface =
-                       xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+                       xen_remap(xen_store_gfn << PAGE_SHIFT, PAGE_SIZE);
                break;
        default:
                pr_warn("Xenstore state unknown\n");
index 58a5389..cff2387 100644 (file)
@@ -38,8 +38,8 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 
-/* map fgmfn of domid to lpfn in the current domain */
-static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
+/* map fgfn of domid to lpfn in the current domain */
+static int map_foreign_page(unsigned long lpfn, unsigned long fgfn,
                            unsigned int domid)
 {
        int rc;
@@ -49,7 +49,7 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
                .size = 1,
                .space = XENMAPSPACE_gmfn_foreign,
        };
-       xen_ulong_t idx = fgmfn;
+       xen_ulong_t idx = fgfn;
        xen_pfn_t gpfn = lpfn;
        int err = 0;
 
@@ -62,13 +62,13 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
 }
 
 struct remap_data {
-       xen_pfn_t *fgmfn; /* foreign domain's gmfn */
+       xen_pfn_t *fgfn; /* foreign domain's gfn */
        pgprot_t prot;
        domid_t  domid;
        struct vm_area_struct *vma;
        int index;
        struct page **pages;
-       struct xen_remap_mfn_info *info;
+       struct xen_remap_gfn_info *info;
        int *err_ptr;
        int mapped;
 };
@@ -82,20 +82,20 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
        pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
        int rc;
 
-       rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
+       rc = map_foreign_page(pfn, *info->fgfn, info->domid);
        *info->err_ptr++ = rc;
        if (!rc) {
                set_pte_at(info->vma->vm_mm, addr, ptep, pte);
                info->mapped++;
        }
-       info->fgmfn++;
+       info->fgfn++;
 
        return 0;
 }
 
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
                              unsigned long addr,
-                             xen_pfn_t *mfn, int nr,
+                             xen_pfn_t *gfn, int nr,
                              int *err_ptr, pgprot_t prot,
                              unsigned domid,
                              struct page **pages)
@@ -108,7 +108,7 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
           x86 PVOPS */
        BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
-       data.fgmfn = mfn;
+       data.fgfn = gfn;
        data.prot  = prot;
        data.domid = domid;
        data.vma   = vma;
index 3f89c9e..5b50c4c 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "affs.h"
 
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         * blocks, we will have to change it.
         */
 
-       size = sb->s_bdev->bd_inode->i_size >> 9;
+       size = i_size_read(sb->s_bdev->bd_inode) >> 9;
        pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
        affs_set_blocksize(sb, PAGE_SIZE);
        /* Try to find root block. Its location depends on the block size. */
 
-       i = 512;
-       j = 4096;
+       i = bdev_logical_block_size(sb->s_bdev);
+       j = PAGE_SIZE;
        if (blocksize > 0) {
                i = j = blocksize;
                size = size / (blocksize / 512);
        }
+
        for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
                sbi->s_root_block = root_block;
                if (root_block < 0)
index 1ce06c8..3e36e4a 100644 (file)
@@ -42,8 +42,14 @@ struct __btrfs_workqueue {
 
        /* Thresholding related variants */
        atomic_t pending;
-       int max_active;
-       int current_max;
+
+       /* Up limit of concurrency workers */
+       int limit_active;
+
+       /* Current number of concurrency workers */
+       int current_active;
+
+       /* Threshold to change current_active */
        int thresh;
        unsigned int count;
        spinlock_t thres_lock;
@@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper);
 BTRFS_WORK_HELPER(scrubparity_helper);
 
 static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
                         int thresh)
 {
        struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
        if (!ret)
                return NULL;
 
-       ret->max_active = max_active;
+       ret->limit_active = limit_active;
        atomic_set(&ret->pending, 0);
        if (thresh == 0)
                thresh = DFT_THRESHOLD;
        /* For low threshold, disabling threshold is a better choice */
        if (thresh < DFT_THRESHOLD) {
-               ret->current_max = max_active;
+               ret->current_active = limit_active;
                ret->thresh = NO_THRESHOLD;
        } else {
-               ret->current_max = 1;
+               /*
+                * For threshold-able wq, let its concurrency grow on demand.
+                * Use minimal max_active at alloc time to reduce resource
+                * usage.
+                */
+               ret->current_active = 1;
                ret->thresh = thresh;
        }
 
        if (flags & WQ_HIGHPRI)
                ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
-                                                ret->max_active,
-                                                "btrfs", name);
+                                                ret->current_active, "btrfs",
+                                                name);
        else
                ret->normal_wq = alloc_workqueue("%s-%s", flags,
-                                                ret->max_active, "btrfs",
+                                                ret->current_active, "btrfs",
                                                 name);
        if (!ret->normal_wq) {
                kfree(ret);
@@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                                              unsigned int flags,
-                                             int max_active,
+                                             int limit_active,
                                              int thresh)
 {
        struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                return NULL;
 
        ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
-                                             max_active, thresh);
+                                             limit_active, thresh);
        if (!ret->normal) {
                kfree(ret);
                return NULL;
        }
 
        if (flags & WQ_HIGHPRI) {
-               ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+               ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
                                                    thresh);
                if (!ret->high) {
                        __btrfs_destroy_workqueue(ret->normal);
@@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
  */
 static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
 {
-       int new_max_active;
+       int new_current_active;
        long pending;
        int need_change = 0;
 
@@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
        wq->count %= (wq->thresh / 4);
        if (!wq->count)
                goto  out;
-       new_max_active = wq->current_max;
+       new_current_active = wq->current_active;
 
        /*
         * pending may be changed later, but it's OK since we really
@@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
         */
        pending = atomic_read(&wq->pending);
        if (pending > wq->thresh)
-               new_max_active++;
+               new_current_active++;
        if (pending < wq->thresh / 2)
-               new_max_active--;
-       new_max_active = clamp_val(new_max_active, 1, wq->max_active);
-       if (new_max_active != wq->current_max)  {
+               new_current_active--;
+       new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+       if (new_current_active != wq->current_active)  {
                need_change = 1;
-               wq->current_max = new_max_active;
+               wq->current_active = new_current_active;
        }
 out:
        spin_unlock(&wq->thres_lock);
 
        if (need_change) {
-               workqueue_set_max_active(wq->normal_wq, wq->current_max);
+               workqueue_set_max_active(wq->normal_wq, wq->current_active);
        }
 }
 
@@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
        kfree(wq);
 }
 
-void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
 {
        if (!wq)
                return;
-       wq->normal->max_active = max;
+       wq->normal->limit_active = limit_active;
        if (wq->high)
-               wq->high->max_active = max;
+               wq->high->limit_active = limit_active;
 }
 
 void btrfs_set_work_high_priority(struct btrfs_work *work)
index b0b093b..ad4d064 100644 (file)
@@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                                              unsigned int flags,
-                                             int max_active,
+                                             int limit_active,
                                              int thresh);
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
                     btrfs_func_t func,
index 564a7de..e54dd59 100644 (file)
@@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found:
        }
 
 out:
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        return ret;
 }
 
index 9ebd34f..0d98aee 100644 (file)
@@ -3443,6 +3443,26 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
        return 0;
 }
 
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+       if ((flags & (BTRFS_BLOCK_GROUP_DUP |
+                     BTRFS_BLOCK_GROUP_RAID0 |
+                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
+           ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
+               return 0;
+
+       if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_RAID5 |
+                    BTRFS_BLOCK_GROUP_RAID10))
+               return 1;
+
+       if (flags & BTRFS_BLOCK_GROUP_RAID6)
+               return 2;
+
+       pr_warn("BTRFS: unknown raid type: %llu\n", flags);
+       return 0;
+}
+
 int btrfs_calc_num_tolerated_disk_barrier_failures(
        struct btrfs_fs_info *fs_info)
 {
@@ -3452,13 +3472,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
                       BTRFS_BLOCK_GROUP_SYSTEM,
                       BTRFS_BLOCK_GROUP_METADATA,
                       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
-       int num_types = 4;
        int i;
        int c;
        int num_tolerated_disk_barrier_failures =
                (int)fs_info->fs_devices->num_devices;
 
-       for (i = 0; i < num_types; i++) {
+       for (i = 0; i < ARRAY_SIZE(types); i++) {
                struct btrfs_space_info *tmp;
 
                sinfo = NULL;
@@ -3476,44 +3495,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 
                down_read(&sinfo->groups_sem);
                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
-                       if (!list_empty(&sinfo->block_groups[c])) {
-                               u64 flags;
-
-                               btrfs_get_block_group_info(
-                                       &sinfo->block_groups[c], &space);
-                               if (space.total_bytes == 0 ||
-                                   space.used_bytes == 0)
-                                       continue;
-                               flags = space.flags;
-                               /*
-                                * return
-                                * 0: if dup, single or RAID0 is configured for
-                                *    any of metadata, system or data, else
-                                * 1: if RAID5 is configured, or if RAID1 or
-                                *    RAID10 is configured and only two mirrors
-                                *    are used, else
-                                * 2: if RAID6 is configured, else
-                                * num_mirrors - 1: if RAID1 or RAID10 is
-                                *                  configured and more than
-                                *                  2 mirrors are used.
-                                */
-                               if (num_tolerated_disk_barrier_failures > 0 &&
-                                   ((flags & (BTRFS_BLOCK_GROUP_DUP |
-                                              BTRFS_BLOCK_GROUP_RAID0)) ||
-                                    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
-                                     == 0)))
-                                       num_tolerated_disk_barrier_failures = 0;
-                               else if (num_tolerated_disk_barrier_failures > 1) {
-                                       if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-                                           BTRFS_BLOCK_GROUP_RAID5 |
-                                           BTRFS_BLOCK_GROUP_RAID10)) {
-                                               num_tolerated_disk_barrier_failures = 1;
-                                       } else if (flags &
-                                                  BTRFS_BLOCK_GROUP_RAID6) {
-                                               num_tolerated_disk_barrier_failures = 2;
-                                       }
-                               }
-                       }
+                       u64 flags;
+
+                       if (list_empty(&sinfo->block_groups[c]))
+                               continue;
+
+                       btrfs_get_block_group_info(&sinfo->block_groups[c],
+                                                  &space);
+                       if (space.total_bytes == 0 || space.used_bytes == 0)
+                               continue;
+                       flags = space.flags;
+
+                       num_tolerated_disk_barrier_failures = min(
+                               num_tolerated_disk_barrier_failures,
+                               btrfs_get_num_tolerated_disk_barrier_failures(
+                                       flags));
                }
                up_read(&sinfo->groups_sem);
        }
index d4cbfee..bdfb479 100644 (file)
@@ -139,6 +139,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
                                void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int btrfs_calc_num_tolerated_disk_barrier_failures(
        struct btrfs_fs_info *fs_info);
 int __init btrfs_end_io_wq_init(void);
index 237da01..a0fa725 100644 (file)
@@ -6909,8 +6909,7 @@ out:
 
        trace_btrfs_get_extent(root, em);
 
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        if (trans) {
                ret = btrfs_end_transaction(trans, root);
                if (!err)
index 9a11db0..a39f5d1 100644 (file)
@@ -3267,13 +3267,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        scrub_blocked_if_needed(fs_info);
                }
 
-               /* for raid56, we skip parity stripe */
                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        ret = get_raid56_logic_offset(physical, num, map,
                                                      &logical,
                                                      &stripe_logical);
                        logical += base;
                        if (ret) {
+                               /* it is parity strip */
                                stripe_logical += base;
                                stripe_end = stripe_logical + increment;
                                ret = scrub_raid56_parity(sctx, map, scrub_dev,
@@ -3480,7 +3480,6 @@ out:
 
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                                          struct btrfs_device *scrub_dev,
-                                         u64 chunk_tree, u64 chunk_objectid,
                                          u64 chunk_offset, u64 length,
                                          u64 dev_offset, int is_dev_replace)
 {
@@ -3531,8 +3530,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
        struct btrfs_root *root = sctx->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 length;
-       u64 chunk_tree;
-       u64 chunk_objectid;
        u64 chunk_offset;
        int ret = 0;
        int slot;
@@ -3596,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                if (found_key.offset + length <= start)
                        goto skip;
 
-               chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
-               chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 
                /*
@@ -3630,9 +3625,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
-               ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
-                                 chunk_offset, length, found_key.offset,
-                                 is_dev_replace);
+               ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
+                                 found_key.offset, is_dev_replace);
 
                /*
                 * flush, submit all pending read and write bios, afterwards
index a4b9c8b..f31db43 100644 (file)
@@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                ret = -EAGAIN;
        }
 out:
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        if (ret == -EAGAIN) {
                if (root->defrag_max.objectid > root->defrag_progress.objectid)
                        goto done;
index 76201d6..6fc7358 100644 (file)
@@ -3585,23 +3585,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-               int num_tolerated_disk_barrier_failures;
-               u64 target = bctl->sys.target;
-
-               num_tolerated_disk_barrier_failures =
-                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-               if (num_tolerated_disk_barrier_failures > 0 &&
-                   (target &
-                    (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
-                       num_tolerated_disk_barrier_failures = 0;
-               else if (num_tolerated_disk_barrier_failures > 1 &&
-                        (target &
-                         (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
-                       num_tolerated_disk_barrier_failures = 1;
-
-               fs_info->num_tolerated_disk_barrier_failures =
-                       num_tolerated_disk_barrier_failures;
+               fs_info->num_tolerated_disk_barrier_failures = min(
+                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
+                       btrfs_get_num_tolerated_disk_barrier_failures(
+                               bctl->sys.target));
        }
 
        ret = insert_balance_item(fs_info->tree_root, bctl);
index 890c509..9d23e78 100644 (file)
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
        for (i = 0; i < num_pages; i++) {
                struct page *page = osd_data->pages[i];
 
-               if (rc < 0)
+               if (rc < 0 && rc != ENOENT)
                        goto unlock;
                if (bytes < (int)PAGE_CACHE_SIZE) {
                        /* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                pr_warn("writepage_start %p on forced umount\n", inode);
+               truncate_pagecache(inode, 0);
+               mapping_set_error(mapping, -EIO);
                return -EIO; /* we're in a forced umount, don't write! */
        }
        if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1593,7 +1595,7 @@ out:
        return err;
 }
 
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
        .fault          = ceph_filemap_fault,
        .page_mkwrite   = ceph_page_mkwrite,
 };
index ddd5e94..27b5668 100644 (file)
@@ -2413,6 +2413,14 @@ again:
                        goto out_unlock;
                }
 
+               if (!__ceph_is_any_caps(ci) &&
+                   ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+                       dout("get_cap_refs %p forced umount\n", inode);
+                       *err = -EIO;
+                       ret = 1;
+                       goto out_unlock;
+               }
+
                dout("get_cap_refs %p have %s needed %s\n", inode,
                     ceph_cap_string(have), ceph_cap_string(need));
        }
index 8b79d87..0c62868 100644 (file)
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_file_info *cf = file->private_data;
-       struct inode *parent_inode = NULL;
        int err;
        int flags, fmode, wanted;
 
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
        ihold(inode);
 
        req->r_num_caps = 1;
-       if (flags & O_CREAT)
-               parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
-       err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-       iput(parent_inode);
+       err = ceph_mdsc_do_request(mdsc, NULL, req);
        if (!err)
                err = ceph_init_file(inode, file, req->r_fmode);
        ceph_mdsc_put_request(req);
@@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
        if (err)
                goto out_req;
 
-       if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+       if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
 
        if (d_unhashed(dentry)) {
@@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
 
+       if (iocb->ki_flags & IOCB_APPEND) {
+               err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+               if (err < 0)
+                       goto out;
+       }
+
        err = generic_write_checks(iocb, from);
        if (err <= 0)
                goto out;
index 6aa07af..51cb02d 100644 (file)
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        msg = create_request_message(mdsc, req, mds, drop_cap_releases);
        if (IS_ERR(msg)) {
                req->r_err = PTR_ERR(msg);
-               complete_request(mdsc, req);
                return PTR_ERR(msg);
        }
        req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 {
        struct ceph_mds_session *session = NULL;
        int mds = -1;
-       int err = -EAGAIN;
+       int err = 0;
 
        if (req->r_err || req->r_got_result) {
                if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
                err = -EIO;
                goto finish;
        }
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               dout("do_request forced umount\n");
+               err = -EIO;
+               goto finish;
+       }
 
        put_request_session(req);
 
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 out_session:
        ceph_put_mds_session(session);
+finish:
+       if (err) {
+               dout("__do_request early error %d\n", err);
+               req->r_err = err;
+               complete_request(mdsc, req);
+               __unregister_request(mdsc, req);
+       }
 out:
        return err;
-
-finish:
-       req->r_err = err;
-       complete_request(mdsc, req);
-       goto out;
 }
 
 /*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
        if (req->r_err) {
                err = req->r_err;
-               __unregister_request(mdsc, req);
-               dout("do_request early error %d\n", err);
                goto out;
        }
 
@@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
-       if (req->r_got_safe && !head->safe) {
+       if (req->r_got_safe) {
                pr_warn("got unsafe after safe on %llu from mds%d\n",
                           tid, mds);
                mutex_unlock(&mdsc->mutex);
@@ -2520,8 +2524,7 @@ out_err:
                if (err) {
                        req->r_err = err;
                } else {
-                       req->r_reply = msg;
-                       ceph_msg_get(msg);
+                       req->r_reply =  ceph_msg_get(msg);
                        req->r_got_result = true;
                }
        } else {
@@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush, want_snap;
 
-       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
 
        dout("sync\n");
@@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
-       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return true;
        return atomic_read(&mdsc->num_sessions) == 0;
 }
@@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
        dout("stopped\n");
 }
 
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+       struct ceph_mds_session *session;
+       int mds;
+
+       dout("force umount\n");
+
+       mutex_lock(&mdsc->mutex);
+       for (mds = 0; mds < mdsc->max_sessions; mds++) {
+               session = __ceph_lookup_mds_session(mdsc, mds);
+               if (!session)
+                       continue;
+               mutex_unlock(&mdsc->mutex);
+               mutex_lock(&session->s_mutex);
+               __close_session(mdsc, session);
+               if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+                       cleanup_session_requests(mdsc, session);
+                       remove_session_caps(session);
+               }
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+               mutex_lock(&mdsc->mutex);
+               kick_requests(mdsc, mds);
+       }
+       __wake_requests(mdsc, &mdsc->waiting_for_map);
+       mutex_unlock(&mdsc->mutex);
+}
+
 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
        dout("stop\n");
index 762757e..f575eaf 100644 (file)
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
index 233d906..4aa7122 100644 (file)
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
                return 0;
        }
 
-       if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
-               ceph_get_snap_context(ceph_empty_snapc);
-               snapc = ceph_empty_snapc;
-               goto done;
-       }
-
        /* alloc new snap context */
        err = -ENOMEM;
        if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
             realm->ino, realm, snapc, snapc->seq,
             (unsigned int) snapc->num_snaps);
 
-done:
        ceph_put_snap_context(realm->cached_context);
        realm->cached_context = snapc;
        return 0;
index 7b6bfcb..f446afa 100644 (file)
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
        if (!fsc)
                return;
        fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+       ceph_mdsc_force_umount(fsc->mdsc);
        return;
 }
 
index 6a1119e..e739950 100644 (file)
@@ -325,8 +325,11 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 static void
 cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
 {
-       if (ses->sectype == Unspecified)
+       if (ses->sectype == Unspecified) {
+               if (ses->user_name == NULL)
+                       seq_puts(s, ",sec=none");
                return;
+       }
 
        seq_puts(s, ",sec=");
 
index 3f50cee..e2a6af1 100644 (file)
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_LOCKED;
 }
 
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = cifs_page_mkwrite,
index c63f522..28a77bf 100644 (file)
@@ -67,6 +67,12 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
                goto out_drop_write;
        }
 
+       if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+               rc = -EBADF;
+               cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+               goto out_fput;
+       }
+
        if ((!src_file.file->private_data) || (!dst_file->private_data)) {
                rc = -EBADF;
                cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
index 9b1ffaa..f6c6c8a 100644 (file)
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
         char *result;
         
        insize = max_t(unsigned int,
-                    INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+                    INSIZE(readlink), OUTSIZE(readlink)+ *length);
        UPARG(CODA_READLINK);
 
         inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
        error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
        if (!error) {
                retlen = outp->coda_readlink.count;
-               if ( retlen > *length )
-                       retlen = *length;
+               if (retlen >= *length)
+                       retlen = *length - 1;
                *length = retlen;
                result =  (char *)outp + (long)outp->coda_readlink.data;
                memcpy(buffer, result, retlen);
index c5ecde6..a8f7564 100644 (file)
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
        const struct cred *old_cred;
        struct cred *cred;
        int retval = 0;
-       int flag = 0;
        int ispipe;
        struct files_struct *displaced;
-       bool need_nonrelative = false;
+       /* require nonrelative corefile path and be extra careful */
+       bool need_suid_safe = false;
        bool core_dumped = false;
        static atomic_t core_dump_count = ATOMIC_INIT(0);
        struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
         */
        if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
                /* Setuid core dump mode */
-               flag = O_EXCL;          /* Stop rewrite attacks */
                cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
-               need_nonrelative = true;
+               need_suid_safe = true;
        }
 
        retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
                if (cprm.limit < binfmt->min_coredump)
                        goto fail_unlock;
 
-               if (need_nonrelative && cn.corename[0] != '/') {
+               if (need_suid_safe && cn.corename[0] != '/') {
                        printk(KERN_WARNING "Pid %d(%s) can only dump core "\
                                "to fully qualified path!\n",
                                task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
                        goto fail_unlock;
                }
 
+               /*
+                * Unlink the file if it exists unless this is a SUID
+                * binary - in that case, we're running around with root
+                * privs and don't want to unlink another user's coredump.
+                */
+               if (!need_suid_safe) {
+                       mm_segment_t old_fs;
+
+                       old_fs = get_fs();
+                       set_fs(KERNEL_DS);
+                       /*
+                        * If it doesn't exist, that's fine. If there's some
+                        * other problem, we'll catch it at the filp_open().
+                        */
+                       (void) sys_unlink((const char __user *)cn.corename);
+                       set_fs(old_fs);
+               }
+
+               /*
+                * There is a race between unlinking and creating the
+                * file, but if that causes an EEXIST here, that's
+                * fine - another process raced with us while creating
+                * the corefile, and the other process won. To userspace,
+                * what matters is that at least one of the two processes
+                * writes its coredump successfully, not which one.
+                */
                cprm.file = filp_open(cn.corename,
-                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+                                O_CREAT | 2 | O_NOFOLLOW |
+                                O_LARGEFILE | O_EXCL,
                                 0600);
                if (IS_ERR(cprm.file))
                        goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
                if (!S_ISREG(inode->i_mode))
                        goto close_fail;
                /*
-                * Dont allow local users get cute and trick others to coredump
-                * into their pre-created files.
+                * Don't dump core if the filesystem changed owner or mode
+                * of the file during file creation. This is an issue when
+                * a process dumps core while its cwd is e.g. on a vfat
+                * filesystem.
                 */
                if (!uid_eq(inode->i_uid, current_fsuid()))
                        goto close_fail;
+               if ((inode->i_mode & 0677) != 0600)
+                       goto close_fail;
                if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
                        goto close_fail;
                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
index ae0f438..587ac08 100644 (file)
@@ -53,8 +53,6 @@ struct wb_writeback_work {
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;       /* free on completion */
-       unsigned int single_wait:1;
-       unsigned int single_done:1;
        enum wb_reason reason;          /* why was writeback initiated? */
 
        struct list_head list;          /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
 static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
 {
-       trace_writeback_queue(wb->bdi, work);
+       trace_writeback_queue(wb, work);
 
        spin_lock_bh(&wb->work_lock);
-       if (!test_bit(WB_registered, &wb->state)) {
-               if (work->single_wait)
-                       work->single_done = 1;
+       if (!test_bit(WB_registered, &wb->state))
                goto out_unlock;
-       }
        if (work->done)
                atomic_inc(&work->done->cnt);
        list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
 
 /**
  * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
  * @cong_bits: mask of WB_[a]sync_congested bits to test
  *
  * Tests whether @inode is congested.  @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
  * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
  * associated with @inode is congested; otherwise, the root wb's congestion
  * state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
@@ -737,32 +735,6 @@ int inode_congested(struct inode *inode, int cong_bits)
 }
 EXPORT_SYMBOL_GPL(inode_congested);
 
-/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's.  The caller must have set @work->single_wait before
- * issuing it.  This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
-                                   struct wb_writeback_work *work)
-{
-       if (WARN_ON_ONCE(!work->single_wait))
-               return;
-
-       wait_event(bdi->wb_waitq, work->single_done);
-
-       /*
-        * Paired with smp_wmb() in wb_do_writeback() and ensures that all
-        * modifications to @work prior to assertion of ->single_done is
-        * visible to the caller once this function returns.
-        */
-       smp_rmb();
-}
-
 /**
  * wb_split_bdi_pages - split nr_pages to write according to bandwidth
  * @wb: target bdi_writeback to split @nr_pages to
@@ -791,38 +763,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
                return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 }
 
-/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb.  If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned.  In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion.  @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
-                                   struct wb_writeback_work *base_work)
-{
-       struct wb_writeback_work *work;
-
-       work = kmalloc(sizeof(*work), GFP_ATOMIC);
-       if (work) {
-               *work = *base_work;
-               work->auto_free = 1;
-               work->single_wait = 0;
-       } else {
-               work = base_work;
-               work->auto_free = 0;
-               work->single_wait = 1;
-       }
-       work->single_done = 0;
-       wb_queue_work(wb, work);
-       return work != base_work;
-}
-
 /**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
 {
-       long nr_pages = base_work->nr_pages;
-       int next_blkcg_id = 0;
+       int next_memcg_id = 0;
        struct bdi_writeback *wb;
        struct wb_iter iter;
 
        might_sleep();
 restart:
        rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+               DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+               struct wb_writeback_work fallback_work;
+               struct wb_writeback_work *work;
+               long nr_pages;
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;
 
-               base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
-               if (!wb_clone_and_queue_work(wb, base_work)) {
-                       next_blkcg_id = wb->blkcg_css->id + 1;
-                       rcu_read_unlock();
-                       wb_wait_for_single_work(bdi, base_work);
-                       goto restart;
+               nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+               work = kmalloc(sizeof(*work), GFP_ATOMIC);
+               if (work) {
+                       *work = *base_work;
+                       work->nr_pages = nr_pages;
+                       work->auto_free = 1;
+                       wb_queue_work(wb, work);
+                       continue;
                }
+
+               /* alloc failed, execute synchronously using on-stack fallback */
+               work = &fallback_work;
+               *work = *base_work;
+               work->nr_pages = nr_pages;
+               work->auto_free = 0;
+               work->done = &fallback_work_done;
+
+               wb_queue_work(wb, work);
+
+               next_memcg_id = wb->memcg_css->id + 1;
+               rcu_read_unlock();
+               wb_wait_for_completion(bdi, &fallback_work_done);
+               goto restart;
        }
        rcu_read_unlock();
 }
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 
        if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
                base_work->auto_free = 0;
-               base_work->single_wait = 0;
-               base_work->single_done = 0;
                wb_queue_work(&bdi->wb, base_work);
        }
 }
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               trace_writeback_nowork(wb->bdi);
+               trace_writeback_nowork(wb);
                wb_wakeup(wb);
                return;
        }
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
-       trace_writeback_wake_background(wb->bdi);
+       trace_writeback_wake_background(wb);
        wb_wakeup(wb);
 }
 
@@ -1421,6 +1380,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
  * Write a portion of b_io inodes which belong to @sb.
  *
  * Return the number of pages and/or inodes written.
+ *
+ * NOTE! This is called with wb->list_lock held, and will
+ * unlock and relock that for each inode it ends up doing
+ * IO for.
  */
 static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
@@ -1439,9 +1402,7 @@ static long writeback_sb_inodes(struct super_block *sb,
        unsigned long start_time = jiffies;
        long write_chunk;
        long wrote = 0;  /* count both pages and inodes */
-       struct blk_plug plug;
 
-       blk_start_plug(&plug);
        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
 
@@ -1539,7 +1500,6 @@ static long writeback_sb_inodes(struct super_block *sb,
                                break;
                }
        }
-       blk_finish_plug(&plug);
        return wrote;
 }
 
@@ -1586,12 +1546,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                .range_cyclic   = 1,
                .reason         = reason,
        };
+       struct blk_plug plug;
 
+       blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        if (list_empty(&wb->b_io))
                queue_io(wb, &work);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
+       blk_finish_plug(&plug);
 
        return nr_pages - work.nr_pages;
 }
@@ -1619,10 +1582,12 @@ static long wb_writeback(struct bdi_writeback *wb,
        unsigned long oldest_jif;
        struct inode *inode;
        long progress;
+       struct blk_plug plug;
 
        oldest_jif = jiffies;
        work->older_than_this = &oldest_jif;
 
+       blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        for (;;) {
                /*
@@ -1660,14 +1625,14 @@ static long wb_writeback(struct bdi_writeback *wb,
                } else if (work->for_background)
                        oldest_jif = jiffies;
 
-               trace_writeback_start(wb->bdi, work);
+               trace_writeback_start(wb, work);
                if (list_empty(&wb->b_io))
                        queue_io(wb, work);
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
-               trace_writeback_written(wb->bdi, work);
+               trace_writeback_written(wb, work);
 
                wb_update_bandwidth(wb, wb_start);
 
@@ -1692,7 +1657,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * we'll just busyloop.
                 */
                if (!list_empty(&wb->b_more_io))  {
-                       trace_writeback_wait(wb->bdi, work);
+                       trace_writeback_wait(wb, work);
                        inode = wb_inode(wb->b_more_io.prev);
                        spin_lock(&inode->i_lock);
                        spin_unlock(&wb->list_lock);
@@ -1702,6 +1667,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                }
        }
        spin_unlock(&wb->list_lock);
+       blk_finish_plug(&plug);
 
        return nr_pages - work->nr_pages;
 }
@@ -1797,26 +1763,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
        set_bit(WB_writeback_running, &wb->state);
        while ((work = get_next_work_item(wb)) != NULL) {
                struct wb_completion *done = work->done;
-               bool need_wake_up = false;
 
-               trace_writeback_exec(wb->bdi, work);
+               trace_writeback_exec(wb, work);
 
                wrote += wb_writeback(wb, work);
 
-               if (work->single_wait) {
-                       WARN_ON_ONCE(work->auto_free);
-                       /* paired w/ rmb in wb_wait_for_single_work() */
-                       smp_wmb();
-                       work->single_done = 1;
-                       need_wake_up = true;
-               } else if (work->auto_free) {
+               if (work->auto_free)
                        kfree(work);
-               }
-
                if (done && atomic_dec_and_test(&done->cnt))
-                       need_wake_up = true;
-
-               if (need_wake_up)
                        wake_up_all(&wb->bdi->wb_waitq);
        }
 
index a38e38f..9bd1244 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/list_sort.h>
 #include <linux/lockref.h>
+#include <linux/rhashtable.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -50,9 +51,8 @@
 #include "trace_gfs2.h"
 
 struct gfs2_glock_iter {
-       int hash;                       /* hash bucket index           */
-       unsigned nhash;                 /* Index within current bucket */
        struct gfs2_sbd *sdp;           /* incore superblock           */
+       struct rhashtable_iter hti;     /* rhashtable iterator         */
        struct gfs2_glock *gl;          /* current glock struct        */
        loff_t last_pos;                /* last position               */
 };
@@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
-#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
-static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
-static struct dentry *gfs2_root;
-
-/**
- * gl_hash() - Turn glock number into hash bucket number
- * @lock: The glock number
- *
- * Returns: The number of the corresponding hash bucket
- */
-
-static unsigned int gl_hash(const struct gfs2_sbd *sdp,
-                           const struct lm_lockname *name)
-{
-       unsigned int h;
-
-       h = jhash(&name->ln_number, sizeof(u64), 0);
-       h = jhash(&name->ln_type, sizeof(unsigned int), h);
-       h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
-       h &= GFS2_GL_HASH_MASK;
-
-       return h;
-}
-
-static inline void spin_lock_bucket(unsigned int hash)
-{
-       hlist_bl_lock(&gl_hash_table[hash]);
-}
+static struct rhashtable_params ht_parms = {
+       .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
+       .key_len = sizeof(struct lm_lockname),
+       .key_offset = offsetof(struct gfs2_glock, gl_name),
+       .head_offset = offsetof(struct gfs2_glock, gl_node),
+};
 
-static inline void spin_unlock_bucket(unsigned int hash)
-{
-       hlist_bl_unlock(&gl_hash_table[hash]);
-}
+static struct rhashtable gl_hash_table;
 
-static void gfs2_glock_dealloc(struct rcu_head *rcu)
+void gfs2_glock_free(struct gfs2_glock *gl)
 {
-       struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (gl->gl_ops->go_flags & GLOF_ASPACE) {
                kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
                kfree(gl->gl_lksb.sb_lvbptr);
                kmem_cache_free(gfs2_glock_cachep, gl);
        }
-}
-
-void gfs2_glock_free(struct gfs2_glock *gl)
-{
-       struct gfs2_sbd *sdp = gl->gl_sbd;
-
-       call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
        if (atomic_dec_and_test(&sdp->sd_glock_disposal))
                wake_up(&sdp->sd_glock_wait);
 }
@@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 
 void gfs2_glock_put(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = gfs2_glock2aspace(gl);
 
        if (lockref_put_or_lock(&gl->gl_lockref))
@@ -202,42 +170,13 @@ void gfs2_glock_put(struct gfs2_glock *gl)
 
        gfs2_glock_remove_from_lru(gl);
        spin_unlock(&gl->gl_lockref.lock);
-       spin_lock_bucket(gl->gl_hash);
-       hlist_bl_del_rcu(&gl->gl_list);
-       spin_unlock_bucket(gl->gl_hash);
+       rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
        GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
        GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
        trace_gfs2_glock_put(gl);
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 }
 
-/**
- * search_bucket() - Find struct gfs2_glock by lock number
- * @bucket: the bucket to search
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *search_bucket(unsigned int hash,
-                                       const struct gfs2_sbd *sdp,
-                                       const struct lm_lockname *name)
-{
-       struct gfs2_glock *gl;
-       struct hlist_bl_node *h;
-
-       hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
-               if (!lm_name_equal(&gl->gl_name, name))
-                       continue;
-               if (gl->gl_sbd != sdp)
-                       continue;
-               if (lockref_get_not_dead(&gl->gl_lockref))
-                       return gl;
-       }
-
-       return NULL;
-}
-
 /**
  * may_grant - check if its ok to grant a new lock
  * @gl: The glock
@@ -506,7 +445,7 @@ __releases(&gl->gl_spin)
 __acquires(&gl->gl_spin)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        unsigned int lck_flags = gh ? gh->gh_flags : 0;
        int ret;
 
@@ -628,7 +567,7 @@ out_unlock:
 static void delete_work_func(struct work_struct *work)
 {
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip;
        struct inode *inode;
        u64 no_addr = gl->gl_name.ln_number;
@@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                   struct gfs2_glock **glp)
 {
        struct super_block *s = sdp->sd_vfs;
-       struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
-       struct gfs2_glock *gl, *tmp;
-       unsigned int hash = gl_hash(sdp, &name);
+       struct lm_lockname name = { .ln_number = number,
+                                   .ln_type = glops->go_type,
+                                   .ln_sbd = sdp };
+       struct gfs2_glock *gl, *tmp = NULL;
        struct address_space *mapping;
        struct kmem_cache *cachep;
+       int ret, tries = 0;
 
-       rcu_read_lock();
-       gl = search_bucket(hash, sdp, &name);
-       rcu_read_unlock();
+       gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+       if (gl && !lockref_get_not_dead(&gl->gl_lockref))
+               gl = NULL;
 
        *glp = gl;
        if (gl)
@@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        }
 
        atomic_inc(&sdp->sd_glock_disposal);
-       gl->gl_sbd = sdp;
+       gl->gl_node.next = NULL;
        gl->gl_flags = 0;
        gl->gl_name = name;
        gl->gl_lockref.count = 1;
        gl->gl_state = LM_ST_UNLOCKED;
        gl->gl_target = LM_ST_UNLOCKED;
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
-       gl->gl_hash = hash;
        gl->gl_ops = glops;
        gl->gl_dstamp = ktime_set(0, 0);
        preempt_disable();
@@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                mapping->writeback_index = 0;
        }
 
-       spin_lock_bucket(hash);
-       tmp = search_bucket(hash, sdp, &name);
-       if (tmp) {
-               spin_unlock_bucket(hash);
-               kfree(gl->gl_lksb.sb_lvbptr);
-               kmem_cache_free(cachep, gl);
-               atomic_dec(&sdp->sd_glock_disposal);
-               gl = tmp;
-       } else {
-               hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
-               spin_unlock_bucket(hash);
+again:
+       ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
+                                           ht_parms);
+       if (ret == 0) {
+               *glp = gl;
+               return 0;
        }
 
-       *glp = gl;
+       if (ret == -EEXIST) {
+               ret = 0;
+               tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+               if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
+                       if (++tries < 100) {
+                               cond_resched();
+                               goto again;
+                       }
+                       tmp = NULL;
+                       ret = -ENOMEM;
+               }
+       } else {
+               WARN_ON_ONCE(ret);
+       }
+       kfree(gl->gl_lksb.sb_lvbptr);
+       kmem_cache_free(cachep, gl);
+       atomic_dec(&sdp->sd_glock_disposal);
+       *glp = tmp;
 
-       return 0;
+       return ret;
 }
 
 /**
@@ -928,7 +880,7 @@ __releases(&gl->gl_spin)
 __acquires(&gl->gl_spin)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct list_head *insert_pt = NULL;
        struct gfs2_holder *gh2;
        int try_futile = 0;
@@ -1006,7 +958,7 @@ trap_recursive:
 int gfs2_glock_nq(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        int error = 0;
 
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -1313,7 +1265,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
 
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
 
        spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
@@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = {
  *
  */
 
-static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
-                         unsigned int hash)
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 {
        struct gfs2_glock *gl;
-       struct hlist_bl_head *head = &gl_hash_table[hash];
-       struct hlist_bl_node *pos;
+       struct rhash_head *pos, *next;
+       const struct bucket_table *tbl;
+       int i;
 
        rcu_read_lock();
-       hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-               if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
-                       examiner(gl);
+       tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+       for (i = 0; i < tbl->size; i++) {
+               rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+                       if ((gl->gl_name.ln_sbd == sdp) &&
+                           lockref_get_not_dead(&gl->gl_lockref))
+                               examiner(gl);
+               }
        }
        rcu_read_unlock();
        cond_resched();
 }
 
-static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
-{
-       unsigned x;
-
-       for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-               examine_bucket(examiner, sdp, x);
-}
-
-
 /**
  * thaw_glock - thaw out a glock which has an unprocessed reply waiting
  * @gl: The glock to thaw
@@ -1569,7 +1516,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
        int ret;
 
        ret = gfs2_truncatei_resume(ip);
-       gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+       gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
 
        spin_lock(&gl->gl_spin);
        clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1733,17 +1680,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
 {
        struct gfs2_glock *gl = iter_ptr;
 
-       seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+       seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
                   gl->gl_name.ln_type,
                   (unsigned long long)gl->gl_name.ln_number,
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
        return 0;
 }
 
@@ -1776,11 +1723,10 @@ static const char *gfs2_stype[] = {
 
 static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-       struct gfs2_glock_iter *gi = seq->private;
-       struct gfs2_sbd *sdp = gi->sdp;
-       unsigned index = gi->hash >> 3;
-       unsigned subindex = gi->hash & 0x07;
-       s64 value;
+       struct gfs2_sbd *sdp = seq->private;
+       loff_t pos = *(loff_t *)iter_ptr;
+       unsigned index = pos >> 3;
+       unsigned subindex = pos & 0x07;
        int i;
 
        if (index == 0 && subindex != 0)
@@ -1791,12 +1737,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 
        for_each_possible_cpu(i) {
                 const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
-               if (index == 0) {
-                       value = i;
-               } else {
-                       value = lkstats->lkstats[index - 1].stats[subindex];
-               }
-               seq_printf(seq, " %15lld", (long long)value);
+
+               if (index == 0)
+                       seq_printf(seq, " %15u", i);
+               else
+                       seq_printf(seq, " %15llu", (unsigned long long)lkstats->
+                                  lkstats[index - 1].stats[subindex]);
        }
        seq_putc(seq, '\n');
        return 0;
@@ -1804,20 +1750,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 
 int __init gfs2_glock_init(void)
 {
-       unsigned i;
-       for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
-               INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
-       }
+       int ret;
+
+       ret = rhashtable_init(&gl_hash_table, &ht_parms);
+       if (ret < 0)
+               return ret;
 
        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZABLE, 0);
-       if (!glock_workqueue)
+       if (!glock_workqueue) {
+               rhashtable_destroy(&gl_hash_table);
                return -ENOMEM;
+       }
        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
                                                WQ_MEM_RECLAIM | WQ_FREEZABLE,
                                                0);
        if (!gfs2_delete_workqueue) {
                destroy_workqueue(glock_workqueue);
+               rhashtable_destroy(&gl_hash_table);
                return -ENOMEM;
        }
 
@@ -1829,72 +1779,41 @@ int __init gfs2_glock_init(void)
 void gfs2_glock_exit(void)
 {
        unregister_shrinker(&glock_shrinker);
+       rhashtable_destroy(&gl_hash_table);
        destroy_workqueue(glock_workqueue);
        destroy_workqueue(gfs2_delete_workqueue);
 }
 
-static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
-       return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
-                             struct gfs2_glock, gl_list);
-}
-
-static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
-{
-       return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
-                             struct gfs2_glock, gl_list);
-}
-
-static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
-{
-       struct gfs2_glock *gl;
-
        do {
-               gl = gi->gl;
-               if (gl) {
-                       gi->gl = glock_hash_next(gl);
-                       gi->nhash++;
-               } else {
-                       if (gi->hash >= GFS2_GL_HASH_SIZE) {
-                               rcu_read_unlock();
-                               return 1;
-                       }
-                       gi->gl = glock_hash_chain(gi->hash);
-                       gi->nhash = 0;
-               }
-               while (gi->gl == NULL) {
-                       gi->hash++;
-                       if (gi->hash >= GFS2_GL_HASH_SIZE) {
-                               rcu_read_unlock();
-                               return 1;
-                       }
-                       gi->gl = glock_hash_chain(gi->hash);
-                       gi->nhash = 0;
+               gi->gl = rhashtable_walk_next(&gi->hti);
+               if (IS_ERR(gi->gl)) {
+                       if (PTR_ERR(gi->gl) == -EAGAIN)
+                               continue;
+                       gi->gl = NULL;
                }
        /* Skip entries for other sb and dead entries */
-       } while (gi->sdp != gi->gl->gl_sbd ||
-                __lockref_is_dead(&gi->gl->gl_lockref));
-
-       return 0;
+       } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
+                             __lockref_is_dead(&gi->gl->gl_lockref)));
 }
 
 static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
        struct gfs2_glock_iter *gi = seq->private;
        loff_t n = *pos;
+       int ret;
 
        if (gi->last_pos <= *pos)
-               n = gi->nhash + (*pos - gi->last_pos);
-       else
-               gi->hash = 0;
+               n = (*pos - gi->last_pos);
 
-       gi->nhash = 0;
-       rcu_read_lock();
+       ret = rhashtable_walk_start(&gi->hti);
+       if (ret)
+               return NULL;
 
        do {
-               if (gfs2_glock_iter_next(gi))
-                       return NULL;
-       } while (n--);
+               gfs2_glock_iter_next(gi);
+       } while (gi->gl && n--);
 
        gi->last_pos = *pos;
        return gi->gl;
@@ -1907,9 +1826,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
 
        (*pos)++;
        gi->last_pos = *pos;
-       if (gfs2_glock_iter_next(gi))
-               return NULL;
-
+       gfs2_glock_iter_next(gi);
        return gi->gl;
 }
 
@@ -1917,9 +1834,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
        struct gfs2_glock_iter *gi = seq->private;
 
-       if (gi->gl)
-               rcu_read_unlock();
        gi->gl = NULL;
+       rhashtable_walk_stop(&gi->hti);
 }
 
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1930,26 +1846,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 
 static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
 {
-       struct gfs2_glock_iter *gi = seq->private;
-
-       gi->hash = *pos;
+       preempt_disable();
        if (*pos >= GFS2_NR_SBSTATS)
                return NULL;
-       preempt_disable();
-       return SEQ_START_TOKEN;
+       return pos;
 }
 
 static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
                                   loff_t *pos)
 {
-       struct gfs2_glock_iter *gi = seq->private;
        (*pos)++;
-       gi->hash++;
-       if (gi->hash >= GFS2_NR_SBSTATS) {
-               preempt_enable();
+       if (*pos >= GFS2_NR_SBSTATS)
                return NULL;
-       }
-       return SEQ_START_TOKEN;
+       return pos;
 }
 
 static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
@@ -1987,14 +1896,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
        if (ret == 0) {
                struct seq_file *seq = file->private_data;
                struct gfs2_glock_iter *gi = seq->private;
+
                gi->sdp = inode->i_private;
+               gi->last_pos = 0;
                seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
                if (seq->buf)
                        seq->size = GFS2_SEQ_GOODSIZE;
+               gi->gl = NULL;
+               ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
        }
        return ret;
 }
 
+static int gfs2_glocks_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct gfs2_glock_iter *gi = seq->private;
+
+       gi->gl = NULL;
+       rhashtable_walk_exit(&gi->hti);
+       return seq_release_private(inode, file);
+}
+
 static int gfs2_glstats_open(struct inode *inode, struct file *file)
 {
        int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
@@ -2003,21 +1926,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
                struct seq_file *seq = file->private_data;
                struct gfs2_glock_iter *gi = seq->private;
                gi->sdp = inode->i_private;
+               gi->last_pos = 0;
                seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
                if (seq->buf)
                        seq->size = GFS2_SEQ_GOODSIZE;
+               gi->gl = NULL;
+               ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
        }
        return ret;
 }
 
 static int gfs2_sbstats_open(struct inode *inode, struct file *file)
 {
-       int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
-                                  sizeof(struct gfs2_glock_iter));
+       int ret = seq_open(file, &gfs2_sbstats_seq_ops);
        if (ret == 0) {
                struct seq_file *seq = file->private_data;
-               struct gfs2_glock_iter *gi = seq->private;
-               gi->sdp = inode->i_private;
+               seq->private = inode->i_private;  /* sdp */
        }
        return ret;
 }
@@ -2027,7 +1951,7 @@ static const struct file_operations gfs2_glocks_fops = {
        .open    = gfs2_glocks_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = gfs2_glocks_release,
 };
 
 static const struct file_operations gfs2_glstats_fops = {
@@ -2035,7 +1959,7 @@ static const struct file_operations gfs2_glstats_fops = {
        .open    = gfs2_glstats_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = gfs2_glocks_release,
 };
 
 static const struct file_operations gfs2_sbstats_fops = {
@@ -2043,7 +1967,7 @@ static const struct file_operations gfs2_sbstats_fops = {
        .open    = gfs2_sbstats_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = seq_release,
 };
 
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
index fa3fa5e..1f6c9c3 100644 (file)
@@ -32,13 +32,15 @@ struct workqueue_struct *gfs2_freeze_wq;
 
 static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
 {
-       fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+       fs_err(gl->gl_name.ln_sbd,
+              "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
+              "state 0x%lx\n",
               bh, (unsigned long long)bh->b_blocknr, bh->b_state,
               bh->b_page->mapping, bh->b_page->flags);
-       fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+       fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
               gl->gl_name.ln_type, gl->gl_name.ln_number,
               gfs2_glock2aspace(gl));
-       gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+       gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
 }
 
 /**
@@ -52,7 +54,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
 static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
                             unsigned int nr_revokes)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct list_head *head = &gl->gl_ail_list;
        struct gfs2_bufdata *bd, *tmp;
        struct buffer_head *bh;
@@ -80,7 +82,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
 
 static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_trans tr;
 
        memset(&tr, 0, sizeof(tr));
@@ -109,7 +111,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 
 void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        unsigned int revokes = atomic_read(&gl->gl_ail_count);
        unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
        int ret;
@@ -139,7 +141,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = &sdp->sd_aspace;
        struct gfs2_rgrpd *rgd;
        int error;
@@ -179,7 +181,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = &sdp->sd_aspace;
        struct gfs2_rgrpd *rgd = gl->gl_object;
 
@@ -218,7 +220,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
        GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-       gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+       gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
        filemap_fdatawrite(metamapping);
        if (ip) {
                struct address_space *mapping = ip->i_inode.i_mapping;
@@ -252,7 +254,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
        struct gfs2_inode *ip = gl->gl_object;
 
-       gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+       gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
 
        if (flags & DIO_METADATA) {
                struct address_space *mapping = gfs2_glock2aspace(gl);
@@ -264,9 +266,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
                }
        }
 
-       if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
-               gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
-               gl->gl_sbd->sd_rindex_uptodate = 0;
+       if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
+               gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+               gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
        }
        if (ip && S_ISREG(ip->i_inode.i_mode))
                truncate_inode_pages(ip->i_inode.i_mapping, 0);
@@ -281,7 +283,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 
 static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_holder *gh;
 
        if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
@@ -416,7 +418,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 static int inode_go_lock(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = gl->gl_object;
        int error = 0;
 
@@ -477,7 +479,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 static void freeze_go_sync(struct gfs2_glock *gl)
 {
        int error = 0;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (gl->gl_state == LM_ST_SHARED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
@@ -500,7 +502,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
 
 static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
        struct gfs2_glock *j_gl = ip->i_gl;
        struct gfs2_log_header_host head;
@@ -545,7 +547,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
 static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 {
        struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
                return;
index a1ec7c2..121ed08 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/ktime.h>
 #include <linux/percpu.h>
 #include <linux/lockref.h>
+#include <linux/rhashtable.h>
 
 #define DIO_WAIT       0x00000010
 #define DIO_METADATA   0x00000020
@@ -203,13 +204,15 @@ enum {
 };
 
 struct lm_lockname {
+       struct gfs2_sbd *ln_sbd;
        u64 ln_number;
        unsigned int ln_type;
 };
 
 #define lm_name_equal(name1, name2) \
-        (((name1)->ln_number == (name2)->ln_number) && \
-         ((name1)->ln_type == (name2)->ln_type))
+        (((name1)->ln_number == (name2)->ln_number) && \
+        ((name1)->ln_type == (name2)->ln_type) &&      \
+        ((name1)->ln_sbd == (name2)->ln_sbd))
 
 
 struct gfs2_glock_operations {
@@ -241,7 +244,7 @@ enum {
 };
 
 struct gfs2_lkstats {
-       s64 stats[GFS2_NR_LKSTATS];
+       u64 stats[GFS2_NR_LKSTATS];
 };
 
 enum {
@@ -327,7 +330,6 @@ enum {
 
 struct gfs2_glock {
        struct hlist_bl_node gl_list;
-       struct gfs2_sbd *gl_sbd;
        unsigned long gl_flags;         /* GLF_... */
        struct lm_lockname gl_name;
 
@@ -341,7 +343,6 @@ struct gfs2_glock {
                     gl_req:2,          /* State in last dlm request */
                     gl_reply:8;        /* Last reply from the dlm */
 
-       unsigned int gl_hash;
        unsigned long gl_demote_time; /* time of first demote request */
        long gl_hold_time;
        struct list_head gl_holders;
@@ -367,7 +368,7 @@ struct gfs2_glock {
                        loff_t end;
                } gl_vm;
        };
-       struct rcu_head gl_rcu;
+       struct rhash_head gl_node;
 };
 
 #define GFS2_MIN_LVB_SIZE 32   /* Min size of LVB that gfs2 supports */
@@ -835,7 +836,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
 
 static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
 {
-       const struct gfs2_sbd *sdp = gl->gl_sbd;
+       const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        preempt_disable();
        this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
        preempt_enable();
index 641383a..284c154 100644 (file)
@@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq;
  *
  * @delta is the difference between the current rtt sample and the
  * running average srtt. We add 1/8 of that to the srtt in order to
- * update the current srtt estimate. The varience estimate is a bit
+ * update the current srtt estimate. The variance estimate is a bit
  * more complicated. We subtract the abs value of the @delta from
  * the current variance estimate and add 1/4 of that to the running
  * total.
@@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
 
        preempt_disable();
        rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
-       lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+       lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
        gfs2_update_stats(&gl->gl_stats, index, rtt);           /* Local */
        gfs2_update_stats(&lks->lkstats[gltype], index, rtt);   /* Global */
        preempt_enable();
@@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
        dstamp = gl->gl_dstamp;
        gl->gl_dstamp = ktime_get_real();
        irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
-       lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+       lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
        gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt);           /* Local */
        gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt);   /* Global */
        preempt_enable();
@@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value)
 static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
                     unsigned int flags)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
        int req;
        u32 lkf;
        char strname[GDLM_STRNAME_BYTES] = "";
@@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
 
 static void gdlm_put_lock(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
        int lvb_needs_unlock = 0;
        int error;
@@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 
 static void gdlm_cancel(struct gfs2_glock *gl)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
        dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
 }
 
index 92324ac..d5369a1 100644 (file)
@@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
 static void maybe_release_space(struct gfs2_bufdata *bd)
 {
        struct gfs2_glock *gl = bd->bd_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_rgrpd *rgd = gl->gl_object;
        unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
        struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -578,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 static void gfs2_meta_sync(struct gfs2_glock *gl)
 {
        struct address_space *mapping = gfs2_glock2aspace(gl);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        int error;
 
        if (mapping == NULL)
@@ -588,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl)
        error = filemap_fdatawait(mapping);
 
        if (error)
-               gfs2_io_error(gl->gl_sbd);
+               gfs2_io_error(gl->gl_name.ln_sbd);
 }
 
 static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
index b984a6e..0e1d4be 100644 (file)
@@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
        struct address_space *mapping = gfs2_glock2aspace(gl);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct page *page;
        struct buffer_head *bh;
        unsigned int shift;
@@ -200,7 +200,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct buffer_head *bh;
 
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
@@ -362,7 +362,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 
 struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct buffer_head *first_bh, *bh;
        u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
                          sdp->sd_sb.sb_bsize_shift;
index ac5d802..8ca1615 100644 (file)
@@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 {
        struct inode *inode = mapping->host;
        if (mapping->a_ops == &gfs2_meta_aops)
-               return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+               return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
        else if (mapping->a_ops == &gfs2_rgrp_aops)
                return container_of(mapping, struct gfs2_sbd, sd_aspace);
        else
index 9b61f92..3a31226 100644 (file)
@@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list)
 
        while (!list_empty(list)) {
                qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
-               sdp = qd->qd_gl->gl_sbd;
+               sdp = qd->qd_gl->gl_name.ln_sbd;
 
                list_del(&qd->qd_lru);
 
@@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 
 static void qd_hold(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
        lockref_get(&qd->qd_lockref);
 }
@@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd)
 
 static int bh_get(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
        unsigned int block, offset;
        struct buffer_head *bh;
@@ -414,7 +414,7 @@ fail:
 
 static void bh_put(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
 
        mutex_lock(&sdp->sd_quota_mutex);
        gfs2_assert(sdp, qd->qd_bh_count);
@@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 
 static void qd_unlock(struct gfs2_quota_data *qd)
 {
-       gfs2_assert_warn(qd->qd_gl->gl_sbd,
+       gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
                         test_bit(QDF_LOCKED, &qd->qd_flags));
        clear_bit(QDF_LOCKED, &qd->qd_flags);
        bh_put(qd);
@@ -614,7 +614,7 @@ static int sort_qd(const void *a, const void *b)
 
 static void do_qc(struct gfs2_quota_data *qd, s64 change)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
        struct gfs2_quota_change *qc = qd->qd_bh_qc;
        s64 x;
@@ -831,7 +831,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 
 static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 {
-       struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_alloc_parms ap = { .aflags = 0, };
        unsigned int data_blocks, ind_blocks;
@@ -922,7 +922,7 @@ out:
                gfs2_glock_dq_uninit(&ghs[qx]);
        mutex_unlock(&ip->i_inode.i_mutex);
        kfree(ghs);
-       gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+       gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
        return error;
 }
 
@@ -954,7 +954,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
 static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
                    struct gfs2_holder *q_gh)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_holder i_gh;
        int error;
@@ -1037,7 +1037,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 
 static int need_sync(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_tune *gt = &sdp->sd_tune;
        s64 value;
        unsigned int num, den;
@@ -1125,7 +1125,7 @@ out:
 
 static int print_message(struct gfs2_quota_data *qd, char *type)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
 
        fs_info(sdp, "quota %s for %s %u\n",
                type,
index c6c6232..475985d 100644 (file)
@@ -1860,13 +1860,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
 {
        const struct gfs2_glock *gl = rgd->rd_gl;
-       const struct gfs2_sbd *sdp = gl->gl_sbd;
+       const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_lkstats *st;
-       s64 r_dcount, l_dcount;
-       s64 l_srttb, a_srttb = 0;
+       u64 r_dcount, l_dcount;
+       u64 l_srttb, a_srttb = 0;
        s64 srttb_diff;
-       s64 sqr_diff;
-       s64 var;
+       u64 sqr_diff;
+       u64 var;
        int cpu, nonzero = 0;
 
        preempt_disable();
index 20c007d..49ac55d 100644 (file)
@@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote,
        ),
 
        TP_fast_assign(
-               __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev    = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
                __entry->gltype = gh->gh_gl->gl_name.ln_type;
                __entry->first  = first;
@@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue,
        ),
 
        TP_fast_assign(
-               __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev    = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
                __entry->gltype = gh->gh_gl->gl_name.ln_type;
                __entry->queue  = queue;
@@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time,
                __field(        int,    status          )
                __field(        char,   flags           )
                __field(        s64,    tdiff           )
-               __field(        s64,    srtt            )
-               __field(        s64,    srttvar         )
-               __field(        s64,    srttb           )
-               __field(        s64,    srttvarb        )
-               __field(        s64,    sirt            )
-               __field(        s64,    sirtvar         )
-               __field(        s64,    dcount          )
-               __field(        s64,    qcount          )
+               __field(        u64,    srtt            )
+               __field(        u64,    srttvar         )
+               __field(        u64,    srttb           )
+               __field(        u64,    srttvarb        )
+               __field(        u64,    sirt            )
+               __field(        u64,    sirtvar         )
+               __field(        u64,    dcount          )
+               __field(        u64,    qcount          )
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->status         = gl->gl_lksb.sb_status;
@@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin,
        ),
 
        TP_fast_assign(
-               __entry->dev            = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->pin            = pin;
                __entry->len            = bd->bd_bh->b_size;
                __entry->block          = bd->bd_bh->b_blocknr;
@@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->lblock         = lblock;
                __entry->pblock         = buffer_mapped(bh) ?  bh->b_blocknr : 0;
                __entry->inum           = ip->i_no_addr;
@@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->start          = block;
                __entry->inum           = ip->i_no_addr;
                __entry->len            = len;
index 88bff24..b95d0d6 100644 (file)
@@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
 void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 {
        struct gfs2_trans *tr = current->journal_info;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = bh->b_page->mapping;
        struct gfs2_inode *ip = GFS2_I(mapping->host);
        struct gfs2_bufdata *bd;
@@ -224,7 +224,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 {
 
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_bufdata *bd;
 
        lock_buffer(bh);
index d3fa6bd..221719e 100644 (file)
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -398,11 +397,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-       //int i;
+       int i;
 
-       //for (i = 0; i < node->tree->pages_per_bnode; i++)
-       //      if (node->page[i])
-       //              page_cache_release(node->page[i]);
+       for (i = 0; i < node->tree->pages_per_bnode; i++)
+               if (node->page[i])
+                       page_cache_release(node->page[i]);
        kfree(node);
 }
 
index 9f4ee7f..6fc766d 100644 (file)
@@ -131,13 +131,16 @@ skip:
        hfs_bnode_write(node, entry, data_off + key_len, entry_len);
        hfs_bnode_dump(node);
 
-       if (new_node) {
-               /* update parent key if we inserted a key
-                * at the start of the first node
-                */
-               if (!rec && new_node != node)
-                       hfs_brec_update_parent(fd);
+       /*
+        * update parent key if we inserted a key
+        * at the start of the node and it is not the new node
+        */
+       if (!rec && new_node != node) {
+               hfs_bnode_read_key(node, fd->search_key, data_off + size);
+               hfs_brec_update_parent(fd);
+       }
 
+       if (new_node) {
                hfs_bnode_put(fd->bnode);
                if (!new_node->parent) {
                        hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
                goto again;
        }
 
-       if (!rec)
-               hfs_brec_update_parent(fd);
-
        return 0;
 }
 
@@ -366,6 +366,8 @@ again:
        if (IS_ERR(parent))
                return PTR_ERR(parent);
        __hfs_brec_find(parent, fd);
+       if (fd->record < 0)
+               return -ENOENT;
        hfs_bnode_dump(parent);
        rec = fd->record;
 
index 759708f..6392466 100644 (file)
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -566,13 +565,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-#if 0
        int i;
 
        for (i = 0; i < node->tree->pages_per_bnode; i++)
                if (node->page[i])
                        page_cache_release(node->page[i]);
-#endif
        kfree(node);
 }
 
index 2d48d28..91e0045 100644 (file)
@@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
        return ret;
 }
 
+/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+       size_t len = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+       do {
+               len += strlen(kn->name) + 1;
+               kn = kn->parent;
+       } while (kn && kn->parent);
+
+       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+       return len;
+}
+
 /**
  * kernfs_path - build full path of a given node
  * @kn: kernfs_node of interest
index 29b9279..726d211 100644 (file)
@@ -2438,7 +2438,7 @@ done:
 
 /**
  * path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd:                lookup context
  * @flags:     lookup flags
  * @path:      pointer to container for result
  *
index e4905fb..8f20d60 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
        struct inode *inode = d_inode(dentry);
        const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
 
-       return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+       seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+       return 0;
 }
 
 static const struct super_operations nsfs_ops = {
index d0e436d..ce12e0b 100644 (file)
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                                     struct dlm_migratable_lockres *mres)
 {
        struct dlm_migratable_lock *ml;
-       struct list_head *queue;
+       struct list_head *queue, *iter;
        struct list_head *tmpq = NULL;
        struct dlm_lock *newlock = NULL;
        struct dlm_lockstatus *lksb = NULL;
@@ -1821,7 +1821,9 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
                                tmpq = dlm_list_idx_to_ptr(res, j);
-                               list_for_each_entry(lock, tmpq, list) {
+                               list_for_each(iter, tmpq) {
+                                       lock = list_entry(iter,
+                                                 struct dlm_lock, list);
                                        if (lock->ml.cookie == ml->cookie)
                                                break;
                                        lock = NULL;
index aa50d1a..b25eee4 100644 (file)
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
 {
        struct inode * inode = file_inode(file);
-       char *page, *tmp;
-       ssize_t length;
        uid_t loginuid;
        kuid_t kloginuid;
+       int rv;
 
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        }
        rcu_read_unlock();
 
-       if (count >= PAGE_SIZE)
-               count = PAGE_SIZE - 1;
-
        if (*ppos != 0) {
                /* No partial writes. */
                return -EINVAL;
        }
-       page = (char*)__get_free_page(GFP_TEMPORARY);
-       if (!page)
-               return -ENOMEM;
-       length = -EFAULT;
-       if (copy_from_user(page, buf, count))
-               goto out_free_page;
-
-       page[count] = '\0';
-       loginuid = simple_strtoul(page, &tmp, 10);
-       if (tmp == page) {
-               length = -EINVAL;
-               goto out_free_page;
 
-       }
+       rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+       if (rv < 0)
+               return rv;
 
        /* is userspace tring to explicitly UNSET the loginuid? */
        if (loginuid == AUDIT_UID_UNSET) {
                kloginuid = INVALID_UID;
        } else {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-               if (!uid_valid(kloginuid)) {
-                       length = -EINVAL;
-                       goto out_free_page;
-               }
+               if (!uid_valid(kloginuid))
+                       return -EINVAL;
        }
 
-       length = audit_set_loginuid(kloginuid);
-       if (likely(length == 0))
-               length = count;
-
-out_free_page:
-       free_page((unsigned long) page);
-       return length;
+       rv = audit_set_loginuid(kloginuid);
+       if (rv < 0)
+               return rv;
+       return count;
 }
 
 static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
 {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF], *end;
+       char buffer[PROC_NUMBUF];
        int make_it_fail;
+       int rv;
 
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-       make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
-       if (*end)
-               return -EINVAL;
+       rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+       if (rv < 0)
+               return rv;
        if (make_it_fail < 0 || make_it_fail > 1)
                return -EINVAL;
 
@@ -1836,8 +1818,6 @@ end_instantiate:
        return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
 /*
  * dname_to_vma_addr - maps a dentry name into two unsigned longs
  * which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               status = -EPERM;
-               goto out_notask;
-       }
-
        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_map_files_follow_link,
+       .setattr        = proc_setattr,
+};
+
 static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
        ei = PROC_I(inode);
        ei->op.proc_get_link = proc_map_files_get_link;
 
-       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
 
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
        int result;
        struct mm_struct *mm;
 
-       result = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        result = -ENOENT;
        task = get_proc_task(dir);
        if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
        struct map_files_info *p;
        int ret;
 
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
 
 static int proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 {
        struct task_struct *task;
        struct mm_struct *mm;
-       char buffer[PROC_NUMBUF], *end;
        unsigned int val;
        int ret;
        int i;
        unsigned long mask;
 
-       ret = -EFAULT;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               goto out_no_task;
-
-       ret = -EINVAL;
-       val = (unsigned int)simple_strtoul(buffer, &end, 0);
-       if (*end == '\n')
-               end++;
-       if (end - buffer == 0)
-               goto out_no_task;
+       ret = kstrtouint_from_user(buf, count, 0, &val);
+       if (ret < 0)
+               return ret;
 
        ret = -ESRCH;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out_no_task;
 
-       ret = end - buffer;
        mm = get_task_mm(task);
        if (!mm)
                goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
  out_no_mm:
        put_task_struct(task);
  out_no_task:
-       return ret;
+       if (ret < 0)
+               return ret;
+       return count;
 }
 
 static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
index e5dee5c..ff3ffc7 100644 (file)
@@ -26,7 +26,7 @@
 
 #include "internal.h"
 
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
 
 static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 {
        int rv;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        rv = __xlate_proc_name(name, ret, residual);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return rv;
 }
 
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 {
        struct inode *inode;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
        if (de) {
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                inode = proc_get_inode(dir->i_sb, de);
                if (!inode)
                        return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                d_add(dentry, inode);
                return NULL;
        }
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return ERR_PTR(-ENOENT);
 }
 
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_first(de);
        i = ctx->pos - 2;
        for (;;) {
                if (!de) {
-                       spin_unlock(&proc_subdir_lock);
+                       read_unlock(&proc_subdir_lock);
                        return 0;
                }
                if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        do {
                struct proc_dir_entry *next;
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                if (!dir_emit(ctx, de->name, de->namelen,
                            de->low_ino, de->mode >> 12)) {
                        pde_put(de);
                        return 0;
                }
-               spin_lock(&proc_subdir_lock);
+               read_lock(&proc_subdir_lock);
                ctx->pos++;
                next = pde_subdir_next(de);
                pde_put(de);
                de = next;
        } while (de);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return 1;
 }
 
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
        if (ret)
                return ret;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        dp->parent = dir;
        if (pde_subdir_insert(dir, dp) == false) {
                WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                     dir->name, dp->name);
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                proc_free_inum(dp->low_ino);
                return -EEXIST;
        }
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
 
        return 0;
 }
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return;
        }
        len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        de = pde_subdir_find(parent, fn, len);
        if (de)
                rb_erase(&de->subdir_node, &parent->subdir);
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
        if (!de) {
                WARN(1, "name '%s'\n", name);
                return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        len = strlen(fn);
 
        root = pde_subdir_find(parent, fn, len);
        if (!root) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        de = next;
                        continue;
                }
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
 
                proc_entry_rundown(de);
                next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        break;
                pde_put(de);
 
-               spin_lock(&proc_subdir_lock);
+               write_lock(&proc_subdir_lock);
                de = next;
        }
        pde_put(root);
index 7eee2d8..9348403 100644 (file)
@@ -9,12 +9,16 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
 /* /proc/kpagecount - an array exposing page counts
  *
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
        if (PageBalloon(page))
                u |= 1 << KPF_BALLOON;
 
+       if (page_is_idle(page))
+               u |= 1 << KPF_IDLE;
+
        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
 
        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
        .read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       u64 __user *out = (u64 __user *)buf;
+       struct page *ppage;
+       unsigned long src = *ppos;
+       unsigned long pfn;
+       ssize_t ret = 0;
+       u64 ino;
+
+       pfn = src / KPMSIZE;
+       count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+       if (src & KPMMASK || count & KPMMASK)
+               return -EINVAL;
+
+       while (count > 0) {
+               if (pfn_valid(pfn))
+                       ppage = pfn_to_page(pfn);
+               else
+                       ppage = NULL;
+
+               if (ppage)
+                       ino = page_cgroup_ino(ppage);
+               else
+                       ino = 0;
+
+               if (put_user(ino, out)) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               pfn++;
+               out++;
+               count -= KPMSIZE;
+
+               cond_resched();
+       }
+
+       *ppos += (char __user *)out - buf;
+       if (!ret)
+               ret = (char __user *)out - buf;
+       return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+       .llseek = mem_lseek,
+       .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
        proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
        proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+       proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
        return 0;
 }
 fs_initcall(proc_page_init);
index 41f1a50..e2d46ad 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -459,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 
        mss->resident += size;
        /* Accumulate the size in pages that have been accessed. */
-       if (young || PageReferenced(page))
+       if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
        mapcount = page_mapcount(page);
        if (mapcount >= 2) {
@@ -807,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
                /* Clear accessed and referenced bits. */
                pmdp_test_and_clear_young(vma, addr, pmd);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
 out:
                spin_unlock(ptl);
@@ -834,6 +836,7 @@ out:
 
                /* Clear accessed and referenced bits. */
                ptep_test_and_clear_young(vma, addr, pte);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
        }
        pte_unmap_unlock(pte - 1, ptl);
index ce9e39f..225586e 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/mm.h>
+#include <linux/printk.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -371,16 +372,16 @@ EXPORT_SYMBOL(seq_release);
  *     @esc:   set of characters that need escaping
  *
  *     Puts string into buffer, replacing each occurrence of character from
- *     @esc with usual octal escape.  Returns 0 in case of success, -1 - in
- *     case of overflow.
+ *     @esc with usual octal escape.
+ *     Use seq_has_overflowed() to check for errors.
  */
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
 {
        char *end = m->buf + m->size;
-        char *p;
+       char *p;
        char c;
 
-        for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+       for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
                if (!strchr(esc, c)) {
                        *p++ = c;
                        continue;
@@ -393,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
                        continue;
                }
                seq_set_overflow(m);
-               return -1;
-        }
+               return;
+       }
        m->count = p - m->buf;
-        return 0;
 }
 EXPORT_SYMBOL(seq_escape);
 
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
        int len;
 
@@ -408,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
                len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
                if (m->count + len < m->size) {
                        m->count += len;
-                       return 0;
+                       return;
                }
        }
        seq_set_overflow(m);
-       return -1;
 }
 EXPORT_SYMBOL(seq_vprintf);
 
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
 {
-       int ret;
        va_list args;
 
        va_start(args, f);
-       ret = seq_vprintf(m, f, args);
+       seq_vprintf(m, f, args);
        va_end(args);
-
-       return ret;
 }
 EXPORT_SYMBOL(seq_printf);
 
@@ -663,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
 }
 EXPORT_SYMBOL(seq_open_private);
 
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
 {
-       if (m->count < m->size) {
-               m->buf[m->count++] = c;
-               return 0;
-       }
-       return -1;
+       if (m->count >= m->size)
+               return;
+
+       m->buf[m->count++] = c;
 }
 EXPORT_SYMBOL(seq_putc);
 
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
 {
        int len = strlen(s);
-       if (m->count + len < m->size) {
-               memcpy(m->buf + m->count, s, len);
-               m->count += len;
-               return 0;
+
+       if (m->count + len >= m->size) {
+               seq_set_overflow(m);
+               return;
        }
-       seq_set_overflow(m);
-       return -1;
+       memcpy(m->buf + m->count, s, len);
+       m->count += len;
 }
 EXPORT_SYMBOL(seq_puts);
 
@@ -693,8 +688,8 @@ EXPORT_SYMBOL(seq_puts);
  * This routine is very quick when you show lots of numbers.
  * In usual cases, it will be better to use seq_printf(). It's easier to read.
  */
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-                       unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+                        unsigned long long num)
 {
        int len;
 
@@ -706,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
 
        if (num < 10) {
                m->buf[m->count++] = num + '0';
-               return 0;
+               return;
        }
 
        len = num_to_str(m->buf + m->count, m->size - m->count, num);
        if (!len)
                goto overflow;
        m->count += len;
-       return 0;
+       return;
+
 overflow:
        seq_set_overflow(m);
-       return -1;
 }
 EXPORT_SYMBOL(seq_put_decimal_ull);
 
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-                       long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
 {
        if (num < 0) {
                if (m->count + 3 >= m->size) {
                        seq_set_overflow(m);
-                       return -1;
+                       return;
                }
                if (delimiter)
                        m->buf[m->count++] = delimiter;
                num = -num;
                delimiter = '-';
        }
-       return seq_put_decimal_ull(m, delimiter, num);
-
+       seq_put_decimal_ull(m, delimiter, num);
 }
 EXPORT_SYMBOL(seq_put_decimal_ll);
 
@@ -773,6 +766,47 @@ void seq_pad(struct seq_file *m, char c)
 }
 EXPORT_SYMBOL(seq_pad);
 
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii)
+{
+       const u8 *ptr = buf;
+       int i, linelen, remaining = len;
+       int ret;
+
+       if (rowsize != 16 && rowsize != 32)
+               rowsize = 16;
+
+       for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+               linelen = min(remaining, rowsize);
+               remaining -= rowsize;
+
+               switch (prefix_type) {
+               case DUMP_PREFIX_ADDRESS:
+                       seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+                       break;
+               case DUMP_PREFIX_OFFSET:
+                       seq_printf(m, "%s%.8x: ", prefix_str, i);
+                       break;
+               default:
+                       seq_printf(m, "%s", prefix_str);
+                       break;
+               }
+
+               ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+                                        m->buf + m->count, m->size - m->count,
+                                        ascii);
+               if (ret >= m->size - m->count) {
+                       seq_set_overflow(m);
+               } else {
+                       m->count += ret;
+                       seq_putc(m, '\n');
+               }
+       }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
        struct list_head *lh;
index 940d5ec..b1bc954 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
+#include <asm-generic/dma-coherent.h>
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
                                              size_t size,
@@ -237,4 +238,121 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
 
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev, flag)        (true)
+#endif
+
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+                                      dma_addr_t *dma_handle, gfp_t flag,
+                                      struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+       void *cpu_addr;
+
+       BUG_ON(!ops);
+
+       if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
+               return cpu_addr;
+
+       if (!arch_dma_alloc_attrs(&dev, &flag))
+               return NULL;
+       if (!ops->alloc)
+               return NULL;
+
+       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+       return cpu_addr;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+                                    void *cpu_addr, dma_addr_t dma_handle,
+                                    struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       BUG_ON(!ops);
+       WARN_ON(irqs_disabled());
+
+       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
+               return;
+
+       if (!ops->free)
+               return;
+
+       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+       ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flag)
+{
+       return dma_alloc_attrs(dev, size, dma_handle, flag, NULL);
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL);
+}
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t gfp)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
+}
+
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
+}
+
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+       debug_dma_mapping_error(dev, dma_addr);
+
+       if (get_dma_ops(dev)->mapping_error)
+               return get_dma_ops(dev)->mapping_error(dev, dma_addr);
+
+#ifdef DMA_ERROR_CODE
+       return dma_addr == DMA_ERROR_CODE;
+#else
+       return 0;
+#endif
+}
+
+#ifndef HAVE_ARCH_DMA_SUPPORTED
+static inline int dma_supported(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (!ops)
+               return 0;
+       if (!ops->dma_supported)
+               return 1;
+       return ops->dma_supported(dev, mask);
+}
+#endif
+
+#ifndef HAVE_ARCH_DMA_SET_MASK
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (ops->set_dma_mask)
+               return ops->set_dma_mask(dev, mask);
+
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+       *dev->dma_mask = mask;
+       return 0;
+}
+#endif
+
 #endif
index 83bfb87..e2aadbc 100644 (file)
@@ -111,8 +111,8 @@ static inline void queued_spin_unlock_wait(struct qspinlock *lock)
                cpu_relax();
 }
 
-#ifndef virt_queued_spin_lock
-static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifndef virt_spin_lock
+static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 {
        return false;
 }
index e596675..e1e4d7c 100644 (file)
@@ -52,13 +52,16 @@ struct arch_timer_cpu {
 
        /* Timer IRQ */
        const struct kvm_irq_level      *irq;
+
+       /* VGIC mapping */
+       struct irq_phys_map             *map;
 };
 
 int kvm_timer_hyp_init(void);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq);
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
index 133ea00..d901f1a 100644 (file)
@@ -95,11 +95,15 @@ enum vgic_type {
 #define LR_STATE_ACTIVE                (1 << 1)
 #define LR_STATE_MASK          (3 << 0)
 #define LR_EOI_INT             (1 << 2)
+#define LR_HW                  (1 << 3)
 
 struct vgic_lr {
-       u16     irq;
-       u8      source;
-       u8      state;
+       unsigned irq:10;
+       union {
+               unsigned hwirq:10;
+               unsigned source:3;
+       };
+       unsigned state:4;
 };
 
 struct vgic_vmcr {
@@ -155,6 +159,19 @@ struct vgic_io_device {
        struct kvm_io_device dev;
 };
 
+struct irq_phys_map {
+       u32                     virt_irq;
+       u32                     phys_irq;
+       u32                     irq;
+       bool                    active;
+};
+
+struct irq_phys_map_entry {
+       struct list_head        entry;
+       struct rcu_head         rcu;
+       struct irq_phys_map     map;
+};
+
 struct vgic_dist {
        spinlock_t              lock;
        bool                    in_kernel;
@@ -252,6 +269,10 @@ struct vgic_dist {
        struct vgic_vm_ops      vm_ops;
        struct vgic_io_device   dist_iodev;
        struct vgic_io_device   *redist_iodevs;
+
+       /* Virtual irq to hwirq mapping */
+       spinlock_t              irq_phys_map_lock;
+       struct list_head        irq_phys_map_list;
 };
 
 struct vgic_v2_cpu_if {
@@ -303,6 +324,9 @@ struct vgic_cpu {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
+
+       /* Protected by the distributor's irq_phys_map_lock */
+       struct list_head        irq_phys_map_list;
 };
 
 #define LR_EMPTY       0xff
@@ -317,16 +341,25 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_get_max_vcpus(void);
+void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
index 0fe9df9..5a5d79e 100644 (file)
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
         * %current's blkcg equals the effective blkcg of its memcg.  No
         * need to use the relatively expensive cgroup_get_e_css().
         */
-       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+       if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
                return wb;
        return NULL;
 }
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 }
 
 struct wb_iter {
-       int                     start_blkcg_id;
+       int                     start_memcg_id;
        struct radix_tree_iter  tree_iter;
        void                    **slot;
 };
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
        WARN_ON_ONCE(!rcu_read_lock_held());
 
-       if (iter->start_blkcg_id >= 0) {
-               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
-               iter->start_blkcg_id = -1;
+       if (iter->start_memcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+               iter->start_memcg_id = -1;
        } else {
                iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
        }
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
                                                   struct backing_dev_info *bdi,
-                                                  int start_blkcg_id)
+                                                  int start_memcg_id)
 {
-       iter->start_blkcg_id = start_blkcg_id;
+       iter->start_memcg_id = start_memcg_id;
 
-       if (start_blkcg_id)
+       if (start_memcg_id)
                return __wb_iter_next(iter, bdi);
        else
                return &bdi->wb;
 }
 
 /**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
  * @wb_cur: cursor struct bdi_writeback pointer
  * @bdi: bdi to walk wb's of
  * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
  *
  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
  * to be used as temp storage during iteration.  rcu_read_lock() must be
  * held throughout iteration.
  */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
-       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)             \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
             (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
index a4cd164..0a5cc7a 100644 (file)
  */
 
 #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH    (INT_MAX / 2)
+
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX                UINT_MAX
 
@@ -45,7 +48,7 @@ struct blkcg {
        struct blkcg_gq                 *blkg_hint;
        struct hlist_head               blkg_list;
 
-       struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+       struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];
 
        struct list_head                all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
 #endif
 };
 
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
 struct blkg_stat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt;
+       struct percpu_counter           cpu_cnt;
+       atomic64_t                      aux_cnt;
 };
 
 struct blkg_rwstat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt[BLKG_RWSTAT_NR];
+       struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
+       atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
 };
 
 /*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
  * request_queue (q).  This is used by blkcg policies which need to track
  * information per blkcg - q pair.
  *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
  */
 struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                 *blkg;
        int                             plid;
-
-       /* used during policy activation */
-       struct list_head                alloc_node;
 };
 
 /*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods.  A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
-       /* the policy id this per-policy data belongs to */
+       /* the blkcg and policy id this per-policy data belongs to */
+       struct blkcg                    *blkcg;
        int                             plid;
 };
 
@@ -123,40 +127,50 @@ struct blkcg_gq {
        /* is this blkg online? protected by both blkcg and q locks */
        bool                            online;
 
+       struct blkg_rwstat              stat_bytes;
+       struct blkg_rwstat              stat_ios;
+
        struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
 
        struct rcu_head                 rcu_head;
 };
 
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 
 struct blkcg_policy {
        int                             plid;
-       /* policy specific private data size */
-       size_t                          pd_size;
-       /* policy specific per-blkcg data size */
-       size_t                          cpd_size;
        /* cgroup files for the policy */
-       struct cftype                   *cftypes;
+       struct cftype                   *dfl_cftypes;
+       struct cftype                   *legacy_cftypes;
 
        /* operations */
+       blkcg_pol_alloc_cpd_fn          *cpd_alloc_fn;
        blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_free_cpd_fn           *cpd_free_fn;
+       blkcg_pol_bind_cpd_fn           *cpd_bind_fn;
+
+       blkcg_pol_alloc_pd_fn           *pd_alloc_fn;
        blkcg_pol_init_pd_fn            *pd_init_fn;
        blkcg_pol_online_pd_fn          *pd_online_fn;
        blkcg_pol_offline_pd_fn         *pd_offline_fn;
-       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_free_pd_fn            *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
 };
 
 extern struct blkcg blkcg_root;
 extern struct cgroup_subsys_state * const blkcg_root_css;
 
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol);
 
+const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off);
 
 struct blkg_conf_ctx {
        struct gendisk                  *disk;
        struct blkcg_gq                 *blkg;
-       u64                             v;
+       char                            *body;
 };
 
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx);
+                  char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+       return css_to_blkcg(task_css(tsk, io_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 static inline struct cgroup_subsys_state *
 task_get_blkcg_css(struct task_struct *task)
 {
-       return task_get_css(task, blkio_cgrp_id);
+       return task_get_css(task, io_cgrp_id);
 }
 
 /**
@@ -232,6 +252,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
        return css_to_blkcg(blkcg->css.parent);
 }
 
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+                                            struct request_queue *q,
+                                            bool update_hint)
+{
+       struct blkcg_gq *blkg;
+
+       if (blkcg == &blkcg_root)
+               return q->root_blkg;
+
+       blkg = rcu_dereference(blkcg->blkg_hint);
+       if (blkg && blkg->q == q)
+               return blkg;
+
+       return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+                                          struct request_queue *q)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (unlikely(blk_queue_bypass(q)))
+               return NULL;
+       return __blkg_lookup(blkcg, q, false);
+}
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
 {
-       return blkcg ? blkcg->pd[pol->plid] : NULL;
+       return blkcg ? blkcg->cpd[pol->plid] : NULL;
 }
 
 /**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
        return pd ? pd->blkg : NULL;
 }
 
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+       return cpd ? cpd->blkcg : NULL;
+}
+
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
                call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint);
-
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
         * or if either the blkcg or queue is going away.  Fall back to
         * root_rl in such cases.
         */
-       blkg = blkg_lookup_create(blkcg, q);
-       if (IS_ERR(blkg))
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg))
                goto root_rl;
 
        blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
  */
 static inline void blk_put_rl(struct request_list *rl)
 {
-       /* root_rl may not have blkg set */
-       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+       if (rl->blkg->blkcg != &blkcg_root)
                blkg_put(rl->blkg);
 }
 
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 #define blk_queue_for_each_rl(rl, q)   \
        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
 
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
-       u64_stats_init(&stat->syncp);
+       int ret;
+
+       ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+       if (ret)
+               return ret;
+
+       atomic64_set(&stat->aux_cnt, 0);
+       return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+       percpu_counter_destroy(&stat->cpu_cnt);
 }
 
 /**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
  * @stat: target blkg_stat
  * @val: value to add
  *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-       u64_stats_update_begin(&stat->syncp);
-       stat->cnt += val;
-       u64_stats_update_end(&stat->syncp);
+       __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
-       unsigned int start;
-       uint64_t v;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&stat->syncp);
-               v = stat->cnt;
-       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-       return v;
+       return percpu_counter_sum_positive(&stat->cpu_cnt);
 }
 
 /**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  */
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
-       stat->cnt = 0;
+       percpu_counter_set(&stat->cpu_cnt, 0);
+       atomic64_set(&stat->aux_cnt, 0);
 }
 
 /**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
  * @to: the destination blkg_stat
  * @from: the source
  *
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+                                    struct blkg_stat *from)
 {
-       blkg_stat_add(to, blkg_stat_read(from));
+       atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+                    &to->aux_cnt);
 }
 
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
-       u64_stats_init(&rwstat->syncp);
+       int i, ret;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+               if (ret) {
+                       while (--i >= 0)
+                               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+                       return ret;
+               }
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
+       return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
                                   int rw, uint64_t val)
 {
-       u64_stats_update_begin(&rwstat->syncp);
+       struct percpu_counter *cnt;
 
        if (rw & REQ_WRITE)
-               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
        else
-               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
        if (rw & REQ_SYNC)
-               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
        else
-               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-       u64_stats_update_end(&rwstat->syncp);
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
-       unsigned int start;
-       struct blkg_rwstat tmp;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-               tmp = *rwstat;
-       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       struct blkg_rwstat result;
+       int i;
 
-       return tmp;
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               atomic64_set(&result.aux_cnt[i],
+                            percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+       return result;
 }
 
 /**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
        struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
-       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+       return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
 
 /**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
  */
 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
-       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
 }
 
 /**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
  * @to: the destination blkg_rwstat
  * @from: the source
  *
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-                                    struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+                                      struct blkg_rwstat *from)
 {
        struct blkg_rwstat v = blkg_rwstat_read(from);
        int i;
 
-       u64_stats_update_begin(&to->syncp);
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               to->cnt[i] += v.cnt[i];
-       u64_stats_update_end(&to->syncp);
+               atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+                            atomic64_read(&from->aux_cnt[i]),
+                            &to->aux_cnt[i]);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                          struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                                 struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio)
+{
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       bool throtl = false;
+
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg)) {
+               spin_lock_irq(q->queue_lock);
+               blkg = blkg_lookup_create(blkcg, q);
+               if (IS_ERR(blkg))
+                       blkg = NULL;
+               spin_unlock_irq(q->queue_lock);
+       }
+
+       throtl = blk_throtl_bio(q, blkg, bio);
+
+       if (!throtl) {
+               blkg = blkg ?: q->root_blkg;
+               blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+                               bio->bi_iter.bi_size);
+               blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+       }
+
+       rcu_read_unlock();
+       return !throtl;
 }
 
 #else  /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
 static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
 static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
 
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio) { return true; }
+
 #define blk_queue_for_each_rl(rl, q)   \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
index 708923b..38a5ff7 100644 (file)
@@ -584,7 +584,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define list_entry_rq(ptr)     list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)                (((rq)->cmd_flags & 1) != 0)
+#define rq_data_dir(rq)                ((int)((rq)->cmd_flags & 1))
 
 /*
  * Driver can handle struct request, if it either has an old style
index 9ebee53..397c5cd 100644 (file)
@@ -46,6 +46,7 @@ struct ceph_options {
        unsigned long mount_timeout;            /* jiffies */
        unsigned long osd_idle_ttl;             /* jiffies */
        unsigned long osd_keepalive_timeout;    /* jiffies */
+       unsigned long monc_ping_timeout;        /* jiffies */
 
        /*
         * any type that can't be simply compared or doesn't need need
@@ -66,6 +67,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT     msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT     msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT      msecs_to_jiffies(60 * 1000)
+#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN        (16*1024*1024)
index 3775327..7e1252e 100644 (file)
@@ -248,6 +248,8 @@ struct ceph_connection {
        int in_base_pos;     /* bytes read */
        __le64 in_temp_ack;  /* for reading an ack */
 
+       struct timespec last_keepalive_ack;
+
        struct delayed_work work;           /* send|recv work */
        unsigned long       delay;          /* current delay interval */
 };
@@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+                                      unsigned long interval);
 
 extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
                                size_t length, size_t alignment);
index 1c18872..0fe2656 100644 (file)
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_MSG           7  /* message */
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
 
 
 /*
index 1f36945..1a96fda 100644 (file)
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
 #endif
 
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
 #endif
 
 #if IS_ENABLED(CONFIG_MEMCG)
index 31ce435..bdcf358 100644 (file)
 struct clock_event_device;
 struct module;
 
-/* Clock event mode commands for legacy ->set_mode(): OBSOLETE */
-enum clock_event_mode {
-       CLOCK_EVT_MODE_UNUSED,
-       CLOCK_EVT_MODE_SHUTDOWN,
-       CLOCK_EVT_MODE_PERIODIC,
-       CLOCK_EVT_MODE_ONESHOT,
-       CLOCK_EVT_MODE_RESUME,
-};
-
 /*
  * Possible states of a clock event device.
  *
@@ -86,16 +77,14 @@ enum clock_event_state {
  * @min_delta_ns:      minimum delta value in ns
  * @mult:              nanosecond to cycles multiplier
  * @shift:             nanoseconds to cycles divisor (power of two)
- * @mode:              operating mode, relevant only to ->set_mode(), OBSOLETE
  * @state_use_accessors:current state of the device, assigned by the core code
  * @features:          features
  * @retries:           number of forced programming retries
- * @set_mode:          legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
- * @set_state_periodic:        switch state to periodic, if !set_mode
- * @set_state_oneshot: switch state to oneshot, if !set_mode
- * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode
- * @set_state_shutdown:        switch state to shutdown, if !set_mode
- * @tick_resume:       resume clkevt device, if !set_mode
+ * @set_state_periodic:        switch state to periodic
+ * @set_state_oneshot: switch state to oneshot
+ * @set_state_oneshot_stopped: switch state to oneshot_stopped
+ * @set_state_shutdown:        switch state to shutdown
+ * @tick_resume:       resume clkevt device
  * @broadcast:         function to broadcast events
  * @min_delta_ticks:   minimum delta value in ticks stored for reconfiguration
  * @max_delta_ticks:   maximum delta value in ticks stored for reconfiguration
@@ -116,18 +105,10 @@ struct clock_event_device {
        u64                     min_delta_ns;
        u32                     mult;
        u32                     shift;
-       enum clock_event_mode   mode;
        enum clock_event_state  state_use_accessors;
        unsigned int            features;
        unsigned long           retries;
 
-       /*
-        * State transition callback(s): Only one of the two groups should be
-        * defined:
-        * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
-        * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
-        */
-       void                    (*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
        int                     (*set_state_periodic)(struct clock_event_device *);
        int                     (*set_state_oneshot)(struct clock_event_device *);
        int                     (*set_state_oneshot_stopped)(struct clock_event_device *);
index 71e4faf..9eeeb95 100644 (file)
 
 #define ICH_LR_EOI                     (1UL << 41)
 #define ICH_LR_GROUP                   (1UL << 60)
+#define ICH_LR_HW                      (1UL << 61)
 #define ICH_LR_STATE                   (3UL << 62)
 #define ICH_LR_PENDING_BIT             (1UL << 62)
 #define ICH_LR_ACTIVE_BIT              (1UL << 63)
+#define ICH_LR_PHYS_ID_SHIFT           32
+#define ICH_LR_PHYS_ID_MASK            (0x3ffUL << ICH_LR_PHYS_ID_SHIFT)
 
 #define ICH_MISR_EOI                   (1 << 0)
 #define ICH_MISR_U                     (1 << 1)
index af3d29f..b8901df 100644 (file)
 
 #define GICH_LR_VIRTUALID              (0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT     (10)
-#define GICH_LR_PHYSID_CPUID           (7 << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PHYSID_CPUID           (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
 #define GICH_LR_STATE                  (3 << 28)
 #define GICH_LR_PENDING_BIT            (1 << 28)
 #define GICH_LR_ACTIVE_BIT             (1 << 29)
 #define GICH_LR_EOI                    (1 << 19)
+#define GICH_LR_HW                     (1 << 31)
 
 #define GICH_VMCR_CTRL_SHIFT           0
 #define GICH_VMCR_CTRL_MASK            (0x21f << GICH_VMCR_CTRL_SHIFT)
index 7f653e8..f109423 100644 (file)
@@ -21,8 +21,8 @@
  *
  * DEFINE_STATIC_KEY_TRUE(key);
  * DEFINE_STATIC_KEY_FALSE(key);
- * static_key_likely()
- * statick_key_unlikely()
+ * static_branch_likely()
+ * static_branch_unlikely()
  *
  * Jump labels provide an interface to generate dynamic branches using
  * self-modifying code. Assuming toolchain and architecture support, if we
  * statement, setting the key to true requires us to patch in a jump
  * to the out-of-line of true branch.
  *
- * In addtion to static_branch_{enable,disable}, we can also reference count
+ * In addition to static_branch_{enable,disable}, we can also reference count
  * the key or branch direction via static_branch_{inc,dec}. Thus,
  * static_branch_inc() can be thought of as a 'make more true' and
- * static_branch_dec() as a 'make more false'. The inc()/dec()
- * interface is meant to be used exclusively from the inc()/dec() for a given
- * key.
+ * static_branch_dec() as a 'make more false'.
  *
  * Since this relies on modifying code, the branch modifying functions
  * must be considered absolute slow paths (machine wide synchronization etc.).
index 123be25..5d4e9c4 100644 (file)
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 }
 
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
 char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                size_t buflen);
 void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 { return -ENOSYS; }
 
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
 static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                              size_t buflen)
 { return NULL; }
index b63218f..d140b1e 100644 (file)
@@ -16,7 +16,7 @@
 
 #include <uapi/linux/kexec.h>
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -318,13 +318,24 @@ int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
 
-#else /* !CONFIG_KEXEC */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len);
+void * __weak arch_kexec_kernel_image_load(struct kimage *image);
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len);
+int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
+                                       Elf_Shdr *sechdrs, unsigned int relsec);
+int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                       unsigned int relsec);
+
+#else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #endif /* !defined(__ASSEBMLY__) */
 
index 0555cc6..fcfd2bf 100644 (file)
@@ -85,8 +85,6 @@ enum umh_disable_depth {
        UMH_DISABLED,
 };
 
-extern void usermodehelper_init(void);
-
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
index 81089cf..1bef9e2 100644 (file)
@@ -242,6 +242,7 @@ struct kvm_vcpu {
        int sigset_active;
        sigset_t sigset;
        struct kvm_vcpu_stat stat;
+       unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
        int mmio_needed;
index d92b80b..ad800e6 100644 (file)
@@ -305,11 +305,9 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
-
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
        return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -345,6 +343,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
 
 static inline bool mem_cgroup_disabled(void)
 {
@@ -555,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
        return &zone->lruvec;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
 {
diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h
new file mode 100644 (file)
index 0000000..eb492d4
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _MICROCHIPPHY_H
+#define _MICROCHIPPHY_H
+
+#define LAN88XX_INT_MASK                       (0x19)
+#define LAN88XX_INT_MASK_MDINTPIN_EN_          (0x8000)
+#define LAN88XX_INT_MASK_SPEED_CHANGE_         (0x4000)
+#define LAN88XX_INT_MASK_LINK_CHANGE_          (0x2000)
+#define LAN88XX_INT_MASK_FDX_CHANGE_           (0x1000)
+#define LAN88XX_INT_MASK_AUTONEG_ERR_          (0x0800)
+#define LAN88XX_INT_MASK_AUTONEG_DONE_         (0x0400)
+#define LAN88XX_INT_MASK_POE_DETECT_           (0x0200)
+#define LAN88XX_INT_MASK_SYMBOL_ERR_           (0x0100)
+#define LAN88XX_INT_MASK_FAST_LINK_FAIL_       (0x0080)
+#define LAN88XX_INT_MASK_WOL_EVENT_            (0x0040)
+#define LAN88XX_INT_MASK_EXTENDED_INT_         (0x0020)
+#define LAN88XX_INT_MASK_RESERVED_             (0x0010)
+#define LAN88XX_INT_MASK_FALSE_CARRIER_                (0x0008)
+#define LAN88XX_INT_MASK_LINK_SPEED_DS_                (0x0004)
+#define LAN88XX_INT_MASK_MASTER_SLAVE_DONE_    (0x0002)
+#define LAN88XX_INT_MASK_RX__ER_               (0x0001)
+
+#define LAN88XX_INT_STS                                (0x1A)
+#define LAN88XX_INT_STS_INT_ACTIVE_            (0x8000)
+#define LAN88XX_INT_STS_SPEED_CHANGE_          (0x4000)
+#define LAN88XX_INT_STS_LINK_CHANGE_           (0x2000)
+#define LAN88XX_INT_STS_FDX_CHANGE_            (0x1000)
+#define LAN88XX_INT_STS_AUTONEG_ERR_           (0x0800)
+#define LAN88XX_INT_STS_AUTONEG_DONE_          (0x0400)
+#define LAN88XX_INT_STS_POE_DETECT_            (0x0200)
+#define LAN88XX_INT_STS_SYMBOL_ERR_            (0x0100)
+#define LAN88XX_INT_STS_FAST_LINK_FAIL_                (0x0080)
+#define LAN88XX_INT_STS_WOL_EVENT_             (0x0040)
+#define LAN88XX_INT_STS_EXTENDED_INT_          (0x0020)
+#define LAN88XX_INT_STS_RESERVED_              (0x0010)
+#define LAN88XX_INT_STS_FALSE_CARRIER_         (0x0008)
+#define LAN88XX_INT_STS_LINK_SPEED_DS_         (0x0004)
+#define LAN88XX_INT_STS_MASTER_SLAVE_DONE_     (0x0002)
+#define LAN88XX_INT_STS_RX_ER_                 (0x0001)
+
+#define LAN88XX_EXT_PAGE_ACCESS                        (0x1F)
+#define LAN88XX_EXT_PAGE_SPACE_0               (0x0000)
+#define LAN88XX_EXT_PAGE_SPACE_1               (0x0001)
+#define LAN88XX_EXT_PAGE_SPACE_2               (0x0002)
+
+/* Extended Register Page 1 space */
+#define LAN88XX_EXT_MODE_CTRL                  (0x13)
+#define LAN88XX_EXT_MODE_CTRL_MDIX_MASK_       (0x000C)
+#define LAN88XX_EXT_MODE_CTRL_AUTO_MDIX_       (0x0000)
+#define LAN88XX_EXT_MODE_CTRL_MDI_             (0x0008)
+#define LAN88XX_EXT_MODE_CTRL_MDI_X_           (0x000C)
+
+/* MMD 3 Registers */
+#define        LAN88XX_MMD3_CHIP_ID                    (32877)
+#define        LAN88XX_MMD3_CHIP_REV                   (32878)
+
+#endif /* _MICROCHIPPHY_H */
index f25a957..91c08f6 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/shrinker.h>
 #include <linux/resource.h>
 #include <linux/page_ext.h>
+#include <linux/err.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1214,6 +1215,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
                    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
+
+/* Container for pinned pfns / pages */
+struct frame_vector {
+       unsigned int nr_allocated;      /* Number of frames we have space for */
+       unsigned int nr_frames; /* Number of frames stored in ptrs array */
+       bool got_ref;           /* Did we pin pages by getting page ref? */
+       bool is_pfns;           /* Does array contain pages or pfns? */
+       void *ptrs[0];          /* Array of pinned pfns / pages. Use
+                                * pfns_vector_pages() or pfns_vector_pfns()
+                                * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+                    bool write, bool force, struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+       return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+       if (vec->is_pfns) {
+               int err = frame_vector_to_pages(vec);
+
+               if (err)
+                       return ERR_PTR(err);
+       }
+       return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+       if (!vec->is_pfns)
+               frame_vector_to_pfns(vec);
+       return (unsigned long *)(vec->ptrs);
+}
+
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                        struct page **pages);
@@ -1873,11 +1917,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long pgoff, unsigned long *populate);
+       vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+static inline unsigned long
+do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, unsigned long *populate)
+{
+       return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+}
+
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
index 61cd67f..a1a210d 100644 (file)
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
                                 unsigned long start,
                                 unsigned long end);
 
+       /*
+        * clear_young is a lightweight version of clear_flush_young. Like the
+        * latter, it is supposed to test-and-clear the young/accessed bitflag
+        * in the secondary pte, but it may omit flushing the secondary tlb.
+        */
+       int (*clear_young)(struct mmu_notifier *mn,
+                          struct mm_struct *mm,
+                          unsigned long start,
+                          unsigned long end);
+
        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+                                     unsigned long start,
+                                     unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_clear_young(mm, start, end);
+       return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
 {
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PAGE_SIZE);    \
+       __young;                                                        \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PMD_SIZE);     \
+       __young;                                                        \
+})
+
 #define        ptep_clear_flush_notify(__vma, __address, __ptep)               \
 ({                                                                     \
        unsigned long ___addr = __address & PAGE_MASK;                  \
@@ -427,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define        ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
index 9120edb..639e9b8 100644 (file)
@@ -68,8 +68,17 @@ extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
-extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-                                        u32 dst_portid, gfp_t gfp_mask);
+
+extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+                                          unsigned int ldiff, u32 dst_portid,
+                                          gfp_t gfp_mask);
+static inline struct sk_buff *
+netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
+                 gfp_t gfp_mask)
+{
+       return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
+}
+
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
                             __u32 group, gfp_t allocation);
index b02f72b..f798e2a 100644 (file)
@@ -522,10 +522,9 @@ static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx)
  * @speed:     OUT - The link speed expressed as PCIe generation number.
  * @width:     OUT - The link width expressed as the number of PCIe lanes.
  *
- * Set the translation of a memory window.  The peer may access local memory
- * through the window starting at the address, up to the size.  The address
- * must be aligned to the alignment specified by ntb_mw_get_range().  The size
- * must be aligned to the size alignment specified by ntb_mw_get_range().
+ * Get the current state of the ntb link.  It is recommended to query the link
+ * state once after every link event.  It is safe to query the link state in
+ * the context of the link event callback.
  *
  * Return: One if the link is up, zero if the link is down, otherwise a
  *             negative value indicating the error number.
@@ -795,7 +794,7 @@ static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 }
 
 /**
- * ntb_peer_db_clear() - clear bits in the local doorbell register
+ * ntb_peer_db_clear() - clear bits in the peer doorbell register
  * @ntb:       NTB device context.
  * @db_bits:   Doorbell bits to clear.
  *
index 2862861..7243eb9 100644 (file)
@@ -83,3 +83,4 @@ void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
 void ntb_transport_link_up(struct ntb_transport_qp *qp);
 void ntb_transport_link_down(struct ntb_transport_qp *qp);
 bool ntb_transport_link_query(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
index 41c9384..416509e 100644 (file)
@@ -108,6 +108,10 @@ enum pageflags {
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        PG_compound_lock,
+#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       PG_young,
+       PG_idle,
 #endif
        __NR_PAGEFLAGS,
 
@@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
index c42981c..17f118a 100644 (file)
@@ -26,6 +26,10 @@ enum page_ext_flags {
        PAGE_EXT_DEBUG_POISON,          /* Page is poisoned */
        PAGE_EXT_DEBUG_GUARD,
        PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       PAGE_EXT_YOUNG,
+       PAGE_EXT_IDLE,
+#endif
 };
 
 /*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644 (file)
index 0000000..bf268fa
--- /dev/null
@@ -0,0 +1,110 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+       return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+       return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return test_and_clear_bit(PAGE_EXT_YOUNG,
+                                 &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
index cab7ba5..e817722 100644 (file)
@@ -34,6 +34,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
                                              unsigned long freq,
@@ -80,6 +81,11 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
        return 0;
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+       return NULL;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
                                        unsigned long freq, bool available)
 {
index 2110a81..317e16d 100644 (file)
@@ -19,8 +19,8 @@
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
 #define ATM_POISON_FREE                0x12
 #define ATM_POISON             0xdeadbeef
 
-/********** net/ **********/
-#define NEIGHBOR_DEAD          0xdeadbeef
-#define NETFILTER_LINK_POISON  0xdead57ac
-
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT       0x11
 #define MUTEX_DEBUG_FREE       0x22
@@ -83,7 +79,4 @@
 /********** security/ **********/
 #define KEY_DESTROY            0xbd
 
-/********** sound/oss/ **********/
-#define OSS_POISON_FREE                0xAB
-
 #endif
index a6298b2..9729565 100644 (file)
@@ -404,10 +404,10 @@ do {                                                                      \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
                                      DEFAULT_RATELIMIT_INTERVAL,       \
                                      DEFAULT_RATELIMIT_BURST);         \
-       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);                 \
+       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));         \
        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&        \
            __ratelimit(&_rs))                                          \
-               __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__);    \
+               __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);    \
 } while (0)
 #elif defined(DEBUG)
 #define pr_debug_ratelimited(fmt, ...)                                 \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,      \
                         groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
 #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,         \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,    \
                       groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+                                       int rowsize, int groupsize,
+                                       const void *buf, size_t len, bool ascii)
+{
+}
+#endif
 
 #endif
index da5602b..7f65f9c 100644 (file)
@@ -74,6 +74,20 @@ static inline int device_reset_optional(struct device *dev)
        return -ENOSYS;
 }
 
+static inline struct reset_control *__must_check reset_control_get(
+                                       struct device *dev, const char *id)
+{
+       WARN_ON(1);
+       return ERR_PTR(-EINVAL);
+}
+
+static inline struct reset_control *__must_check devm_reset_control_get(
+                                       struct device *dev, const char *id)
+{
+       WARN_ON(1);
+       return ERR_PTR(-EINVAL);
+}
+
 static inline struct reset_control *reset_control_get_optional(
                                        struct device *dev, const char *id)
 {
index d4c7271..dde00de 100644 (file)
@@ -114,13 +114,22 @@ int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
 int seq_release(struct inode *, struct file *);
-int seq_escape(struct seq_file *, const char *, const char *);
-int seq_putc(struct seq_file *m, char c);
-int seq_puts(struct seq_file *m, const char *s);
 int seq_write(struct seq_file *seq, const void *data, size_t len);
 
-__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
-__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
+__printf(2, 0)
+void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
+__printf(2, 3)
+void seq_printf(struct seq_file *m, const char *fmt, ...);
+void seq_putc(struct seq_file *m, char c);
+void seq_puts(struct seq_file *m, const char *s);
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+                        unsigned long long num);
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num);
+void seq_escape(struct seq_file *m, const char *s, const char *esc);
+
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii);
 
 int seq_path(struct seq_file *, const struct path *, const char *);
 int seq_file_path(struct seq_file *, struct file *, const char *);
@@ -134,10 +143,6 @@ int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-                       unsigned long long num);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-                       long long num);
 
 static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 {
index 71f711d..dabe643 100644 (file)
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_HEX             0x20
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-               unsigned int flags, const char *esc);
+               unsigned int flags, const char *only);
 
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
-               char *dst, size_t osz, const char *esc)
+               char *dst, size_t osz, const char *only)
 {
-       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
+       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
 }
 
 static inline int string_escape_str(const char *src, char *dst, size_t sz,
-               unsigned int flags, const char *esc)
+               unsigned int flags, const char *only)
 {
-       return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
+       return string_escape_mem(src, strlen(src), dst, sz, flags, only);
 }
 
 static inline int string_escape_str_any_np(const char *src, char *dst,
-               size_t sz, const char *esc)
+               size_t sz, const char *only)
 {
-       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
+       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
 }
 
 #endif
index 0800131..a460e2e 100644 (file)
@@ -885,4 +885,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
                        const char __user *const __user *argv,
                        const char __user *const __user *envp, int flags);
 
+asmlinkage long sys_membarrier(int cmd, int flags);
+
 #endif
index 037e9df..17292fe 100644 (file)
@@ -92,23 +92,19 @@ struct thermal_zone_device_ops {
                     struct thermal_cooling_device *);
        int (*unbind) (struct thermal_zone_device *,
                       struct thermal_cooling_device *);
-       int (*get_temp) (struct thermal_zone_device *, unsigned long *);
+       int (*get_temp) (struct thermal_zone_device *, int *);
        int (*get_mode) (struct thermal_zone_device *,
                         enum thermal_device_mode *);
        int (*set_mode) (struct thermal_zone_device *,
                enum thermal_device_mode);
        int (*get_trip_type) (struct thermal_zone_device *, int,
                enum thermal_trip_type *);
-       int (*get_trip_temp) (struct thermal_zone_device *, int,
-                             unsigned long *);
-       int (*set_trip_temp) (struct thermal_zone_device *, int,
-                             unsigned long);
-       int (*get_trip_hyst) (struct thermal_zone_device *, int,
-                             unsigned long *);
-       int (*set_trip_hyst) (struct thermal_zone_device *, int,
-                             unsigned long);
-       int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *);
-       int (*set_emul_temp) (struct thermal_zone_device *, unsigned long);
+       int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
+       int (*set_trip_temp) (struct thermal_zone_device *, int, int);
+       int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
+       int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
+       int (*get_crit_temp) (struct thermal_zone_device *, int *);
+       int (*set_emul_temp) (struct thermal_zone_device *, int);
        int (*get_trend) (struct thermal_zone_device *, int,
                          enum thermal_trend *);
        int (*notify) (struct thermal_zone_device *, int,
@@ -332,9 +328,9 @@ struct thermal_genl_event {
  *                temperature.
  */
 struct thermal_zone_of_device_ops {
-       int (*get_temp)(void *, long *);
+       int (*get_temp)(void *, int *);
        int (*get_trend)(void *, long *);
-       int (*set_emul_temp)(void *, unsigned long);
+       int (*set_emul_temp)(void *, int);
 };
 
 /**
@@ -406,7 +402,7 @@ thermal_of_cooling_device_register(struct device_node *np, char *, void *,
                                   const struct thermal_cooling_device_ops *);
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
 struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp);
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
@@ -457,7 +453,7 @@ static inline struct thermal_zone_device *thermal_zone_get_zone_by_name(
                const char *name)
 { return ERR_PTR(-ENODEV); }
 static inline int thermal_zone_get_temp(
-               struct thermal_zone_device *tz, unsigned long *temp)
+               struct thermal_zone_device *tz, int *temp)
 { return -ENODEV; }
 static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
 { return -ENODEV; }
index 48d901f..e312219 100644 (file)
@@ -147,11 +147,20 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
                cpumask_or(mask, mask, tick_nohz_full_mask);
 }
 
+static inline int housekeeping_any_cpu(void)
+{
+       return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+}
+
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(void);
 #else
+static inline int housekeeping_any_cpu(void)
+{
+       return smp_processor_id();
+}
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
index c924a28..42f8ec9 100644 (file)
@@ -36,6 +36,8 @@ enum zpool_mapmode {
        ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
 };
 
+bool zpool_has_pool(char *type);
+
 struct zpool *zpool_create_pool(char *type, char *name,
                        gfp_t gfp, const struct zpool_ops *ops);
 
index 9f36641..6513c7e 100644 (file)
@@ -15,6 +15,7 @@
 #define _MEDIA_VIDEOBUF2_MEMOPS_H
 
 #include <media/videobuf2-core.h>
+#include <linux/mm.h>
 
 /**
  * struct vb2_vmarea_handler - common vma refcount tracking handler
@@ -31,11 +32,9 @@ struct vb2_vmarea_handler {
 
 extern const struct vm_operations_struct vb2_common_vm_ops;
 
-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-                          struct vm_area_struct **res_vma, dma_addr_t *res_pa);
-
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma);
-void vb2_put_vma(struct vm_area_struct *vma);
-
+struct frame_vector *vb2_create_framevec(unsigned long start,
+                                        unsigned long length,
+                                        bool write);
+void vb2_destroy_framevec(struct frame_vector *vec);
 
 #endif
index 4e8f804..59160de 100644 (file)
@@ -66,7 +66,6 @@ struct fib_rules_ops {
                                           struct nlattr **);
        int                     (*fill)(struct fib_rule *, struct sk_buff *,
                                        struct fib_rule_hdr *);
-       u32                     (*default_pref)(struct fib_rules_ops *ops);
        size_t                  (*nlmsg_payload)(struct fib_rule *);
 
        /* Called after modifications to the rules set, must flush
@@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
                     struct fib_lookup_arg *);
 int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
                         u32 flags);
-u32 fib_default_rule_pref(struct fib_rules_ops *ops);
 #endif
index e3314e5..bfc5694 100644 (file)
@@ -477,7 +477,9 @@ struct ieee80211_event {
  * @chandef: Channel definition for this BSS -- the hardware might be
  *     configured a higher bandwidth than this BSS uses, for example.
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
- *     This field is only valid when the channel type is one of the HT types.
+ *     This field is only valid when the channel is a wide HT/VHT channel.
+ *     Note that with TDLS this can be the case (channel is HT, protection must
+ *     be used from this field) even when the BSS association isn't using HT.
  * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value
  *     implies disabled
  * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
index bab824b..d4c6b5f 100644 (file)
@@ -59,7 +59,7 @@ static inline unsigned int
 br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb,
                       const struct nf_hook_state *state)
 {
-       return NF_DROP;
+       return NF_ACCEPT;
 }
 #endif
 
index f5e23c6..e8ad468 100644 (file)
@@ -298,6 +298,7 @@ void init_nf_conntrack_hash_rnd(void);
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags);
+void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 #define NF_CT_STAT_INC(net, count)       __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
index 2a24668..aa8bee7 100644 (file)
@@ -125,7 +125,7 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
 
 static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
 {
-       return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1;
+       return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
 }
 
 unsigned int nft_parse_register(const struct nlattr *attr);
index 676b03b..11571b2 100644 (file)
@@ -61,4 +61,9 @@ static inline bool scsi_sense_valid(const struct scsi_sense_hdr *sshdr)
 extern bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len,
                                 struct scsi_sense_hdr *sshdr);
 
+extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq);
+int scsi_set_sense_information(u8 *buf, int buf_len, u64 info);
+extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
+                                      int desc_type);
+
 #endif /* _SCSI_COMMON_H_ */
index 50c2a36..fe89d7c 100644 (file)
@@ -196,34 +196,13 @@ struct scsi_device {
        struct execute_work     ew; /* used to get process context on put */
        struct work_struct      requeue_work;
 
-       struct scsi_dh_data     *scsi_dh_data;
+       struct scsi_device_handler *handler;
+       void                    *handler_data;
+
        enum scsi_device_state sdev_state;
        unsigned long           sdev_data[0];
 } __attribute__((aligned(sizeof(unsigned long))));
 
-typedef void (*activate_complete)(void *, int);
-struct scsi_device_handler {
-       /* Used by the infrastructure */
-       struct list_head list; /* list of scsi_device_handlers */
-
-       /* Filled by the hardware handler */
-       struct module *module;
-       const char *name;
-       int (*check_sense)(struct scsi_device *, struct scsi_sense_hdr *);
-       struct scsi_dh_data *(*attach)(struct scsi_device *);
-       void (*detach)(struct scsi_device *);
-       int (*activate)(struct scsi_device *, activate_complete, void *);
-       int (*prep_fn)(struct scsi_device *, struct request *);
-       int (*set_params)(struct scsi_device *, const char *);
-       bool (*match)(struct scsi_device *);
-};
-
-struct scsi_dh_data {
-       struct scsi_device_handler *scsi_dh;
-       struct scsi_device *sdev;
-       struct kref kref;
-};
-
 #define        to_scsi_device(d)       \
        container_of(d, struct scsi_device, sdev_gendev)
 #define        class_to_sdev(d)        \
index 620c723..85d7317 100644 (file)
@@ -55,11 +55,26 @@ enum {
        SCSI_DH_NOSYS,
        SCSI_DH_DRIVER_MAX,
 };
-#if defined(CONFIG_SCSI_DH) || defined(CONFIG_SCSI_DH_MODULE)
+
+typedef void (*activate_complete)(void *, int);
+struct scsi_device_handler {
+       /* Used by the infrastructure */
+       struct list_head list; /* list of scsi_device_handlers */
+
+       /* Filled by the hardware handler */
+       struct module *module;
+       const char *name;
+       int (*check_sense)(struct scsi_device *, struct scsi_sense_hdr *);
+       int (*attach)(struct scsi_device *);
+       void (*detach)(struct scsi_device *);
+       int (*activate)(struct scsi_device *, activate_complete, void *);
+       int (*prep_fn)(struct scsi_device *, struct request *);
+       int (*set_params)(struct scsi_device *, const char *);
+};
+
+#ifdef CONFIG_SCSI_DH
 extern int scsi_dh_activate(struct request_queue *, activate_complete, void *);
-extern int scsi_dh_handler_exist(const char *);
 extern int scsi_dh_attach(struct request_queue *, const char *);
-extern void scsi_dh_detach(struct request_queue *);
 extern const char *scsi_dh_attached_handler_name(struct request_queue *, gfp_t);
 extern int scsi_dh_set_params(struct request_queue *, const char *);
 #else
@@ -69,18 +84,10 @@ static inline int scsi_dh_activate(struct request_queue *req,
        fn(data, 0);
        return 0;
 }
-static inline int scsi_dh_handler_exist(const char *name)
-{
-       return 0;
-}
 static inline int scsi_dh_attach(struct request_queue *req, const char *name)
 {
        return SCSI_DH_NOSYS;
 }
-static inline void scsi_dh_detach(struct request_queue *q)
-{
-       return;
-}
 static inline const char *scsi_dh_attached_handler_name(struct request_queue *q,
                                                        gfp_t gfp)
 {
index 8d1d7fa..dbb8c64 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/scatterlist.h>
 
 #include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_common.h>
 struct scsi_device;
 struct Scsi_Host;
 
@@ -21,14 +22,9 @@ static inline bool scsi_sense_is_deferred(const struct scsi_sense_hdr *sshdr)
        return ((sshdr->response_code >= 0x70) && (sshdr->response_code & 1));
 }
 
-extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
-                                      int desc_type);
-
 extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
                                   u64 * info_out);
 
-extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq);
-
 extern int scsi_ioctl_reset(struct scsi_device *, int __user *);
 
 struct scsi_eh_save {
index 0aedbb2..373d334 100644 (file)
@@ -62,6 +62,8 @@
 /* T10 protection information disabled by default */
 #define TA_DEFAULT_T10_PI              0
 #define TA_DEFAULT_FABRIC_PROT_TYPE    0
+/* TPG status needs to be enabled to return sendtargets discovery endpoint info */
+#define TA_DEFAULT_TPG_ENABLED_SENDTARGETS 1
 
 #define ISCSI_IOV_DATA_BUFFER          5
 
@@ -517,7 +519,6 @@ struct iscsi_conn {
        u16                     cid;
        /* Remote TCP Port */
        u16                     login_port;
-       u16                     local_port;
        int                     net_size;
        int                     login_family;
        u32                     auth_id;
@@ -527,9 +528,8 @@ struct iscsi_conn {
        u32                     exp_statsn;
        /* Per connection status sequence number */
        u32                     stat_sn;
-#define IPV6_ADDRESS_SPACE                             48
-       unsigned char           login_ip[IPV6_ADDRESS_SPACE];
-       unsigned char           local_ip[IPV6_ADDRESS_SPACE];
+       struct sockaddr_storage login_sockaddr;
+       struct sockaddr_storage local_sockaddr;
        int                     conn_usage_count;
        int                     conn_waiting_on_uc;
        atomic_t                check_immediate_queue;
@@ -636,7 +636,7 @@ struct iscsi_session {
        /* session wide counter: expected command sequence number */
        u32                     exp_cmd_sn;
        /* session wide counter: maximum allowed command sequence number */
-       u32                     max_cmd_sn;
+       atomic_t                max_cmd_sn;
        struct list_head        sess_ooo_cmdsn_list;
 
        /* LIO specific session ID */
@@ -764,6 +764,7 @@ struct iscsi_tpg_attrib {
        u32                     default_erl;
        u8                      t10_pi;
        u32                     fabric_prot_type;
+       u32                     tpg_enabled_sendtargets;
        struct iscsi_portal_group *tpg;
 };
 
@@ -776,12 +777,10 @@ struct iscsi_np {
        enum iscsi_timer_flags_table np_login_timer_flags;
        u32                     np_exports;
        enum np_flags_table     np_flags;
-       unsigned char           np_ip[IPV6_ADDRESS_SPACE];
-       u16                     np_port;
        spinlock_t              np_thread_lock;
        struct completion       np_restart_comp;
        struct socket           *np_socket;
-       struct __kernel_sockaddr_storage np_sockaddr;
+       struct sockaddr_storage np_sockaddr;
        struct task_struct      *np_thread;
        struct timer_list       np_login_timer;
        void                    *np_context;
index 3ff76b4..e615bb4 100644 (file)
@@ -50,7 +50,7 @@ struct iscsi_login_stats {
        u64             last_fail_time;         /* time stamp (jiffies) */
        u32             last_fail_type;
        int             last_intr_fail_ip_family;
-       unsigned char   last_intr_fail_ip_addr[IPV6_ADDRESS_SPACE];
+       struct sockaddr_storage last_intr_fail_sockaddr;
        char            last_intr_fail_name[224];
 } ____cacheline_aligned;
 
index e6bb166..90e37fa 100644 (file)
@@ -9,7 +9,7 @@ struct iscsit_transport {
        int priv_size;
        struct module *owner;
        struct list_head t_node;
-       int (*iscsit_setup_np)(struct iscsi_np *, struct __kernel_sockaddr_storage *);
+       int (*iscsit_setup_np)(struct iscsi_np *, struct sockaddr_storage *);
        int (*iscsit_accept_np)(struct iscsi_np *, struct iscsi_conn *);
        void (*iscsit_free_np)(struct iscsi_np *);
        void (*iscsit_wait_conn)(struct iscsi_conn *);
index 1e5c8f9..56cf8e4 100644 (file)
@@ -93,4 +93,6 @@ bool  target_lun_is_rdonly(struct se_cmd *);
 sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
        sense_reason_t (*exec_cmd)(struct se_cmd *cmd));
 
+bool target_sense_desc_format(struct se_device *dev);
+
 #endif /* TARGET_CORE_BACKEND_H */
index 17ae2d6..ac9bf1c 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
 #include <linux/percpu_ida.h>
+#include <linux/t10-pi.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
@@ -426,12 +427,6 @@ enum target_core_dif_check {
        TARGET_DIF_CHECK_REFTAG = 0x1 << 2,
 };
 
-struct se_dif_v1_tuple {
-       __be16                  guard_tag;
-       __be16                  app_tag;
-       __be32                  ref_tag;
-};
-
 /* for sam_task_attr */
 #define TCM_SIMPLE_TAG 0x20
 #define TCM_HEAD_TAG   0x21
@@ -444,6 +439,9 @@ struct se_cmd {
        u8                      scsi_asc;
        u8                      scsi_ascq;
        u16                     scsi_sense_length;
+       unsigned                cmd_wait_set:1;
+       unsigned                unknown_data_length:1;
+       bool                    state_active:1;
        u64                     tag; /* SAM command identifier aka task tag */
        /* Delay for ALUA Active/NonOptimized state access in milliseconds */
        int                     alua_nonop_delay;
@@ -455,11 +453,8 @@ struct se_cmd {
        unsigned int            map_tag;
        /* Transport protocol dependent state, see transport_state_table */
        enum transport_state_table t_state;
-       unsigned                cmd_wait_set:1;
-       unsigned                unknown_data_length:1;
        /* See se_cmd_flags_table */
        u32                     se_cmd_flags;
-       u32                     se_ordered_id;
        /* Total size in bytes associated with command */
        u32                     data_length;
        u32                     residual_count;
@@ -477,7 +472,6 @@ struct se_cmd {
        struct se_tmr_req       *se_tmr_req;
        struct list_head        se_cmd_list;
        struct completion       cmd_wait_comp;
-       struct kref             cmd_kref;
        const struct target_core_fabric_ops *se_tfo;
        sense_reason_t          (*execute_cmd)(struct se_cmd *);
        sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool);
@@ -497,6 +491,7 @@ struct se_cmd {
 #define CMD_T_REQUEST_STOP     (1 << 8)
 #define CMD_T_BUSY             (1 << 9)
        spinlock_t              t_state_lock;
+       struct kref             cmd_kref;
        struct completion       t_transport_stop_comp;
 
        struct work_struct      work;
@@ -509,8 +504,10 @@ struct se_cmd {
        struct scatterlist      *t_bidi_data_sg;
        unsigned int            t_bidi_data_nents;
 
+       /* Used for lun->lun_ref counting */
+       int                     lun_ref_active;
+
        struct list_head        state_list;
-       bool                    state_active;
 
        /* old task stop completion, consider merging with some of the above */
        struct completion       task_stop_comp;
@@ -518,20 +515,17 @@ struct se_cmd {
        /* backend private data */
        void                    *priv;
 
-       /* Used for lun->lun_ref counting */
-       int                     lun_ref_active;
-
        /* DIF related members */
        enum target_prot_op     prot_op;
        enum target_prot_type   prot_type;
        u8                      prot_checks;
+       bool                    prot_pto;
        u32                     prot_length;
        u32                     reftag_seed;
        struct scatterlist      *t_prot_sg;
        unsigned int            t_prot_nents;
        sense_reason_t          pi_err;
        sector_t                bad_sector;
-       bool                    prot_pto;
 };
 
 struct se_ua {
@@ -598,7 +592,6 @@ struct se_ml_stat_grps {
 };
 
 struct se_lun_acl {
-       char                    initiatorname[TRANSPORT_IQN_LEN];
        u64                     mapped_lun;
        struct se_node_acl      *se_lun_nacl;
        struct se_lun           *se_lun;
@@ -685,7 +678,6 @@ struct se_lun {
 #define SE_LUN_LINK_MAGIC                      0xffff7771
        u32                     lun_link_magic;
        u32                     lun_access;
-       u32                     lun_flags;
        u32                     lun_index;
 
        /* RELATIVE TARGET PORT IDENTIFER */
@@ -751,7 +743,6 @@ struct se_device {
        atomic_long_t           write_bytes;
        /* Active commands on this virtual SE device */
        atomic_t                simple_cmds;
-       atomic_t                dev_ordered_id;
        atomic_t                dev_ordered_sync;
        atomic_t                dev_qf_count;
        u32                     export_count;
index 18afef9..7fb2557 100644 (file)
@@ -5,6 +5,19 @@ struct target_core_fabric_ops {
        struct module *module;
        const char *name;
        size_t node_acl_size;
+       /*
+        * Limits number of scatterlist entries per SCF_SCSI_DATA_CDB payload.
+        * Setting this value tells target-core to enforce this limit, and
+        * report as INQUIRY EVPD=b0 MAXIMUM TRANSFER LENGTH.
+        *
+        * target-core will currently reset se_cmd->data_length to this
+        * maximum size, and set UNDERFLOW residual count if length exceeds
+        * this limit.
+        *
+        * XXX: Not all initiator hosts honor this block-limit EVPD
+        * XXX: Currently assumes single PAGE_SIZE per scatterlist entry
+        */
+       u32 max_data_sg_nents;
        char *(*get_fabric_name)(void);
        char *(*tpg_get_wwn)(struct se_portal_group *);
        u16 (*tpg_get_tag)(struct se_portal_group *);
@@ -152,6 +165,7 @@ int transport_generic_handle_tmr(struct se_cmd *);
 void   transport_generic_request_failure(struct se_cmd *, sense_reason_t);
 void   __target_execute_cmd(struct se_cmd *);
 int    transport_lookup_tmr_lun(struct se_cmd *, u64);
+void   core_allocate_nexus_loss_ua(struct se_node_acl *acl);
 
 struct se_node_acl *core_tpg_get_initiator_node_acl(struct se_portal_group *tpg,
                unsigned char *);
index a44062d..d6f8322 100644 (file)
@@ -358,6 +358,36 @@ TRACE_EVENT(
 
 #endif
 
+TRACE_EVENT(kvm_halt_poll_ns,
+       TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+       TP_ARGS(grow, vcpu_id, new, old),
+
+       TP_STRUCT__entry(
+               __field(bool, grow)
+               __field(unsigned int, vcpu_id)
+               __field(int, new)
+               __field(int, old)
+       ),
+
+       TP_fast_assign(
+               __entry->grow           = grow;
+               __entry->vcpu_id        = vcpu_id;
+               __entry->new            = new;
+               __entry->old            = old;
+       ),
+
+       TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+                       __entry->vcpu_id,
+                       __entry->new,
+                       __entry->grow ? "grow" : "shrink",
+                       __entry->old)
+);
+
+#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
+#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
index 12e1321..5afae8f 100644 (file)
@@ -11,7 +11,7 @@ TRACE_EVENT(thermal_power_allocator,
                 u32 total_req_power, u32 *granted_power,
                 u32 total_granted_power, size_t num_actors,
                 u32 power_range, u32 max_allocatable_power,
-                unsigned long current_temp, s32 delta_temp),
+                int current_temp, s32 delta_temp),
        TP_ARGS(tz, req_power, total_req_power, granted_power,
                total_granted_power, num_actors, power_range,
                max_allocatable_power, current_temp, delta_temp),
@@ -24,7 +24,7 @@ TRACE_EVENT(thermal_power_allocator,
                __field(size_t,        num_actors               )
                __field(u32,           power_range              )
                __field(u32,           max_allocatable_power    )
-               __field(unsigned long, current_temp             )
+               __field(int,           current_temp             )
                __field(s32,           delta_temp               )
        ),
        TP_fast_assign(
@@ -42,7 +42,7 @@ TRACE_EVENT(thermal_power_allocator,
                __entry->delta_temp = delta_temp;
        ),
 
-       TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%lu delta_temperature=%d",
+       TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%d delta_temperature=%d",
                __entry->tz_id,
                __print_array(__get_dynamic_array(req_power),
                               __entry->num_actors, 4),
index a7aa607..fff846b 100644 (file)
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
        TP_ARGS(inode, flags)
 );
 
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       struct cgroup *cgrp = wb->memcg_css->cgroup;
+       char *path;
+
+       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
+       WARN_ON_ONCE(path != buf);
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       if (wbc->wb)
+               return __trace_wb_cgroup_size(wbc->wb);
+       else
+               return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       if (wbc->wb)
+               __trace_wb_assign_cgroup(buf, wbc->wb);
+       else
+               strcpy(buf, "/");
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       return 2;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       strcpy(buf, "/");
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       strcpy(buf, "/");
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+#endif /* CREATE_TRACE_POINTS */
+
 DECLARE_EVENT_CLASS(writeback_write_inode_template,
 
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                __array(char, name, 32)
                __field(unsigned long, ino)
                __field(int, sync_mode)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                        dev_name(inode_to_bdi(inode)->dev), 32);
                __entry->ino            = inode->i_ino;
                __entry->sync_mode      = wbc->sync_mode;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
-       TP_printk("bdi %s: ino=%lu sync_mode=%d",
+       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
                __entry->name,
                __entry->ino,
-               __entry->sync_mode
+               __entry->sync_mode,
+               __get_str(cgroup)
        )
 );
 
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
 );
 
 DECLARE_EVENT_CLASS(writeback_work_class,
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
-       TP_ARGS(bdi, work),
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+       TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
                strncpy(__entry->name,
-                       bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+                       wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background = work->for_background;
                __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-                 "kupdate=%d range_cyclic=%d background=%d reason=%s",
+                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
-                 __print_symbolic(__entry->reason, WB_WORK_REASON)
+                 __print_symbolic(__entry->reason, WB_WORK_REASON),
+                 __get_str(cgroup)
        )
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
 DEFINE_EVENT(writeback_work_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
-       TP_ARGS(bdi, work))
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+       TP_ARGS(wb, work))
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
 );
 
 DECLARE_EVENT_CLASS(writeback_class,
-       TP_PROTO(struct backing_dev_info *bdi),
-       TP_ARGS(bdi),
+       TP_PROTO(struct bdi_writeback *wb),
+       TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
-               strncpy(__entry->name, dev_name(bdi->dev), 32);
+               strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
-       TP_printk("bdi %s",
-                 __entry->name
+       TP_printk("bdi %s: cgroup=%s",
+                 __entry->name,
+                 __get_str(cgroup)
        )
 );
 #define DEFINE_WRITEBACK_EVENT(name) \
 DEFINE_EVENT(writeback_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi), \
-       TP_ARGS(bdi))
+       TP_PROTO(struct bdi_writeback *wb), \
+       TP_ARGS(wb))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+       TP_PROTO(struct backing_dev_info *bdi),
+       TP_ARGS(bdi),
+       TP_STRUCT__entry(
+               __array(char, name, 32)
+       ),
+       TP_fast_assign(
+               strncpy(__entry->name, dev_name(bdi->dev), 32);
+       ),
+       TP_printk("bdi %s",
+               __entry->name
+       )
+);
 
 DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
                __entry->range_cyclic   = wbc->range_cyclic;
                __entry->range_start    = (long)wbc->range_start;
                __entry->range_end      = (long)wbc->range_end;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
-               "start=0x%lx end=0x%lx",
+               "start=0x%lx end=0x%lx cgroup=%s",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
-               __entry->range_end)
+               __entry->range_end,
+               __get_str(cgroup)
+       )
 )
 
 #define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
                __field(long,           age)
                __field(int,            moved)
                __field(int,            reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
                unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
                                  (jiffies - *older_than_this) * 1000 / HZ : -1;
                __entry->moved  = moved;
                __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
-       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
                __entry->name,
                __entry->older, /* older_than_this in jiffies */
                __entry->age,   /* older_than_this in relative milliseconds */
                __entry->moved,
-               __print_symbolic(__entry->reason, WB_WORK_REASON)
+               __print_symbolic(__entry->reason, WB_WORK_REASON),
+               __get_str(cgroup)
        )
 );
 
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
 
 TRACE_EVENT(bdi_dirty_ratelimit,
 
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),
 
-       TP_ARGS(bdi, dirty_rate, task_ratelimit),
+       TP_ARGS(wb, dirty_rate, task_ratelimit),
 
        TP_STRUCT__entry(
                __array(char,           bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
                __field(unsigned long,  dirty_ratelimit)
                __field(unsigned long,  task_ratelimit)
                __field(unsigned long,  balanced_dirty_ratelimit)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
 
        TP_fast_assign(
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+               __entry->write_bw       = KBps(wb->write_bandwidth);
+               __entry->avg_write_bw   = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
-                                       KBps(bdi->wb.balanced_dirty_ratelimit);
+                                       KBps(wb->balanced_dirty_ratelimit);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
 
        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
-                 "balanced_dirty_ratelimit=%lu",
+                 "balanced_dirty_ratelimit=%lu cgroup=%s",
                  __entry->bdi,
                  __entry->write_bw,            /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,          /* bdi dirty rate */
                  __entry->dirty_ratelimit,     /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
-                 __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+                 __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+                 __get_str(cgroup)
        )
 );
 
 TRACE_EVENT(balance_dirty_pages,
 
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
                 long pause,
                 unsigned long start_time),
 
-       TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+       TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),
 
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
                __field(         long,  pause)
                __field(unsigned long,  period)
                __field(         long,  think)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
 
        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
 
                __entry->limit          = global_wb_domain.dirty_limit;
                __entry->setpoint       = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
                __entry->period         = period * 1000 / HZ;
                __entry->pause          = pause * 1000 / HZ;
                __entry->paused         = (jiffies - start_time) * 1000 / HZ;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
 
 
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
-                 "paused=%lu pause=%ld period=%lu think=%ld",
+                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
                  __entry->paused,      /* ms */
                  __entry->pause,       /* ms */
                  __entry->period,      /* ms */
-                 __entry->think        /* ms */
+                 __entry->think,       /* ms */
+                 __get_str(cgroup)
          )
 );
 
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                __field(unsigned long, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
+               __dynamic_array(char, cgroup,
+                               __trace_wb_cgroup_size(inode_to_wb(inode)))
        ),
 
        TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                __entry->ino            = inode->i_ino;
                __entry->state          = inode->i_state;
                __entry->dirtied_when   = inode->dirtied_when;
+               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
        ),
 
-       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
                  __entry->name,
                  __entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
-                 (jiffies - __entry->dirtied_when) / HZ
+                 (jiffies - __entry->dirtied_when) / HZ,
+                 __get_str(cgroup)
        )
 );
 
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write    = nr_to_write;
                __entry->wrote          = nr_to_write - wbc->nr_to_write;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-                 "index=%lu to_write=%ld wrote=%lu",
+                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
                  __entry->name,
                  __entry->ino,
                  show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
-                 __entry->wrote
+                 __entry->wrote,
+                 __get_str(cgroup)
        )
 );
 
index e016bd9..8da542a 100644 (file)
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
 __SYSCALL(__NR_bpf, sys_bpf)
 #define __NR_execveat 281
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_membarrier 282
+__SYSCALL(__NR_membarrier, sys_membarrier)
 
 #undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
 
 /*
  * All syscalls below here should go away really,
index dbd16a2..fd5aa47 100644 (file)
@@ -358,7 +358,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
-       s32 param;
+       __s32 param;
        /*
         * WARNING: Using pointers instead of fixed-size u64 means we need to write
         * compat32 code. Don't repeat this mistake.
index 70ff1d9..f7b2db4 100644 (file)
@@ -252,6 +252,7 @@ header-y += mdio.h
 header-y += media.h
 header-y += media-bus-format.h
 header-y += mei.h
+header-y += membarrier.h
 header-y += memfd.h
 header-y += mempolicy.h
 header-y += meye.h
index 3429a3b..b56dfcf 100644 (file)
@@ -39,6 +39,7 @@
 #define EM_TI_C6000    140     /* TI C6X DSPs */
 #define EM_AARCH64     183     /* ARM 64 bit */
 #define EM_TILEPRO     188     /* Tilera TILEPro */
+#define EM_MICROBLAZE  189     /* Xilinx MicroBlaze */
 #define EM_TILEGX      191     /* Tilera TILE-Gx */
 #define EM_FRV         0x5441  /* Fujitsu FR-V */
 #define EM_AVR32       0x18ad  /* Atmel AVR32 */
index aa63ed0..ea9221b 100644 (file)
@@ -42,6 +42,7 @@
 #define ETH_P_LOOP     0x0060          /* Ethernet Loopback packet     */
 #define ETH_P_PUP      0x0200          /* Xerox PUP packet             */
 #define ETH_P_PUPAT    0x0201          /* Xerox PUP Addr Trans packet  */
+#define ETH_P_TSN      0x22F0          /* TSN (IEEE 1722) packet       */
 #define ETH_P_IP       0x0800          /* Internet Protocol packet     */
 #define ETH_P_X25      0x0805          /* CCITT X.25                   */
 #define ETH_P_ARP      0x0806          /* Address Resolution packet    */
index a6c4962..5da5f87 100644 (file)
@@ -33,6 +33,7 @@
 #define KPF_THP                        22
 #define KPF_BALLOON            23
 #define KPF_ZERO_PAGE          24
+#define KPF_IDLE               25
 
 
 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
index 0d831f9..a9256f0 100644 (file)
@@ -237,6 +237,7 @@ struct kvm_run {
                        __u32 count;
                        __u64 data_offset; /* relative to kvm_run start */
                } io;
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
@@ -285,6 +286,7 @@ struct kvm_run {
                        __u32 data;
                        __u8  is_write;
                } dcr;
+               /* KVM_EXIT_INTERNAL_ERROR */
                struct {
                        __u32 suberror;
                        /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
                struct {
                        __u64 gprs[32];
                } osi;
+               /* KVM_EXIT_PAPR_HCALL */
                struct {
                        __u64 nr;
                        __u64 ret;
@@ -819,6 +822,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
new file mode 100644 (file)
index 0000000..e0b108b
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _UAPI_LINUX_MEMBARRIER_H
+#define _UAPI_LINUX_MEMBARRIER_H
+
+/*
+ * linux/membarrier.h
+ *
+ * membarrier system call API
+ *
+ * Copyright (c) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * enum membarrier_cmd - membarrier system call command
+ * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
+ *                          a bitmask of valid commands.
+ * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
+ *
+ * Command to be passed to the membarrier system call. The commands need to
+ * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
+ * the value 0.
+ */
+enum membarrier_cmd {
+       MEMBARRIER_CMD_QUERY = 0,
+       MEMBARRIER_CMD_SHARED = (1 << 0),
+};
+
+#endif /* _UAPI_LINUX_MEMBARRIER_H */
index b67f99d..95c6521 100644 (file)
 #define TCMU_MAILBOX_VERSION 2
 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */
 
-/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */
-#define xstr(s) str(s)
-#define str(s) #s
-
 struct tcmu_mailbox {
        __u16 version;
        __u16 flags;
index a853168..7ddeeda 100644 (file)
@@ -44,6 +44,10 @@ struct privcmd_hypercall {
 
 struct privcmd_mmap_entry {
        __u64 va;
+       /*
+        * This should be a GFN. It's not possible to change the name because
+        * it's exposed to the user-space.
+        */
        __u64 mfn;
        __u64 npages;
 };
index a5983da..1daae48 100644 (file)
@@ -3,9 +3,9 @@
 
 #include <asm/xen/page.h>
 
-static inline unsigned long page_to_mfn(struct page *page)
+static inline unsigned long xen_page_to_gfn(struct page *page)
 {
-       return pfn_to_mfn(page_to_pfn(page));
+       return pfn_to_gfn(page_to_pfn(page));
 }
 
 struct xen_memory_region {
index 0ce4f32..e4e214a 100644 (file)
@@ -30,7 +30,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
 struct vm_area_struct;
 
 /*
- * xen_remap_domain_mfn_array() - map an array of foreign frames
+ * xen_remap_domain_gfn_array() - map an array of foreign frames
  * @vma:     VMA to map the pages into
  * @addr:    Address at which to map the pages
  * @gfn:     Array of GFNs to map
@@ -46,14 +46,14 @@ struct vm_area_struct;
  * Returns the number of successfully mapped frames, or a -ve error
  * code.
  */
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
                               xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid,
                               struct page **pages);
 
-/* xen_remap_domain_mfn_range() - map a range of foreign frames
+/* xen_remap_domain_gfn_range() - map a range of foreign frames
  * @vma:     VMA to map the pages into
  * @addr:    Address at which to map the pages
  * @gfn:     First GFN to map.
@@ -65,12 +65,12 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
  * Returns the number of successfully mapped frames, or a -ve error
  * code.
  */
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
                               xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages);
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int numpgs, struct page **pages);
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
                              unsigned long addr,
index 02da9f1..c24b6f7 100644 (file)
@@ -1602,6 +1602,18 @@ config PCI_QUIRKS
          bugs/quirks. Disable this only if your target machine is
          unaffected by PCI quirks.
 
+config MEMBARRIER
+       bool "Enable membarrier() system call" if EXPERT
+       default y
+       help
+         Enable the membarrier() system call that allows issuing memory
+         barriers across all running threads, which can be used to distribute
+         the cost of user-space memory barriers asymmetrically by transforming
+         pairs of memory barriers into pairs consisting of membarrier() and a
+         compiler barrier.
+
+         If unsure, say Y.
+
 config EMBEDDED
        bool "Embedded system"
        option allnoconfig_y
index ad1bd77..b32ad7d 100644 (file)
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
 
 static void __init free_initrd(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
        unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
 #endif
        if (do_retain_initrd)
                goto skip;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If the initrd region is overlapped with crashkernel reserved region,
         * free only memory that is not part of crashkernel region.
index 5650655..9e64d70 100644 (file)
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
        cpuset_init_smp();
-       usermodehelper_init();
        shmem_init();
        driver_init();
        init_irq_proc();
index 2b49159..71f448e 100644 (file)
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
        size_t len = src->m_ts;
        size_t alen;
 
-       BUG_ON(dst == NULL);
+       WARN_ON(dst == NULL);
        if (src->m_ts > dst->m_ts)
                return ERR_PTR(-EINVAL);
 
index 4aef24d..222131e 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
         * We raced in the idr lookup or with shm_destroy().  Either way, the
         * ID is busted.
         */
-       BUG_ON(IS_ERR(ipcp));
+       WARN_ON(IS_ERR(ipcp));
 
        return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
                return ret;
        sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
-       BUG_ON(!sfd->vm_ops->fault);
+       WARN_ON(!sfd->vm_ops->fault);
 #endif
        vma->vm_ops = &shm_vm_ops;
        shm_open(vma);
index e0d7587..53abf00 100644 (file)
@@ -49,7 +49,9 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
@@ -98,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
index dc9b464..35bac8e 100644 (file)
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value, *ptr;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr)
 {
        void __user *ukey = u64_to_ptr(attr->key);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
+       struct fd f;
        void *key;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *unext_key = u64_to_ptr(attr->next_key);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *next_key;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
index ed12e38..b074b23 100644 (file)
@@ -283,7 +283,7 @@ static const char *const bpf_class_string[] = {
        [BPF_ALU64] = "alu64",
 };
 
-static const char *const bpf_alu_string[] = {
+static const char *const bpf_alu_string[16] = {
        [BPF_ADD >> 4]  = "+=",
        [BPF_SUB >> 4]  = "-=",
        [BPF_MUL >> 4]  = "*=",
@@ -307,7 +307,7 @@ static const char *const bpf_ldst_string[] = {
        [BPF_DW >> 3] = "u64",
 };
 
-static const char *const bpf_jmp_string[] = {
+static const char *const bpf_jmp_string[16] = {
        [BPF_JA >> 4]   = "jmp",
        [BPF_JEQ >> 4]  = "==",
        [BPF_JGT >> 4]  = ">",
index 9656a3c..009cc9a 100644 (file)
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
  * low power state that may have caused some blocks in the same power domain
  * to reset.
  *
- * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * Must be called after cpu_cluster_pm_enter has been called for the power
  * domain, and before cpu_pm_exit has been called on any cpu in the power
  * domain. Notified drivers can include VFP co-processor, interrupt controller
  * and its PM extensions, local CPU timers context save/restore which
index ec1c076..71179a0 100644 (file)
 #include <linux/cn_proc.h>
 
 #if 0
-#define kdebug(FMT, ...) \
-       printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+       printk("[%-5.5s%5u] " FMT "\n",                                 \
+              current->comm, current->pid, ##__VA_ARGS__)
 #else
-#define kdebug(FMT, ...) \
-       no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+do {                                                                   \
+       if (0)                                                          \
+               no_printk("[%-5.5s%5u] " FMT "\n",                      \
+                         current->comm, current->pid, ##__VA_ARGS__);  \
+} while (0)
 #endif
 
 static struct kmem_cache *cred_jar;
index e818389..f548f69 100644 (file)
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
index c98f926..e820cce 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/ftrace.h>
 #include <linux/memory.h>
 #include <linux/module.h>
-#include <linux/ftrace.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
 
index a785c10..4c5edc3 100644 (file)
 /*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)    "kexec: " fmt
-
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
 #include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
-       /*
-        * If crash_kexec_post_notifiers is enabled, don't run
-        * crash_kexec() here yet, which must be run after panic
-        * notifiers in panic().
-        */
-       if (crash_kexec_post_notifiers)
-               return 0;
-       /*
-        * There are 4 panic() calls in do_exit() path, each of which
-        * corresponds to each of these 4 conditions.
-        */
-       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-               return 1;
-       return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses.  On processors
- * where you can disable the MMU this is trivial, and easy.  For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place.  This means I can only support memory whose
- * physical address can fit in an unsigned long.  In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages.  As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it).  The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- *  - allocating a page table with the control code buffer identity
- *    mapped, to simplify machine_kexec and make kexec_on_panic more
- *    reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
 
-static int kimage_is_destination_range(struct kimage *image,
-                                      unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
-                                      gfp_t gfp_mask,
-                                      unsigned long dest);
+#include "kexec_internal.h"
 
 static int copy_user_segment_list(struct kimage *image,
                                  unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
        return ret;
 }
 
-static int sanity_check_segment_list(struct kimage *image)
-{
-       int result, i;
-       unsigned long nr_segments = image->nr_segments;
-
-       /*
-        * Verify we have good destination addresses.  The caller is
-        * responsible for making certain we don't attempt to load
-        * the new image into invalid or reserved areas of RAM.  This
-        * just verifies it is an address we can use.
-        *
-        * Since the kernel does everything in page size chunks ensure
-        * the destination addresses are page aligned.  Too many
-        * special cases crop of when we don't do this.  The most
-        * insidious is getting overlapping destination addresses
-        * simply because addresses are changed to page size
-        * granularity.
-        */
-       result = -EADDRNOTAVAIL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-                       return result;
-               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-                       return result;
-       }
-
-       /* Verify our destination addresses do not overlap.
-        * If we alloed overlapping destination addresses
-        * through very weird things can happen with no
-        * easy explanation as one segment stops on another.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-               unsigned long j;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               for (j = 0; j < i; j++) {
-                       unsigned long pstart, pend;
-                       pstart = image->segment[j].mem;
-                       pend   = pstart + image->segment[j].memsz;
-                       /* Do the segments overlap ? */
-                       if ((mend > pstart) && (mstart < pend))
-                               return result;
-               }
-       }
-
-       /* Ensure our buffer sizes are strictly less than
-        * our memory sizes.  This should always be the case,
-        * and it is easier to check up front than to be surprised
-        * later on.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               if (image->segment[i].bufsz > image->segment[i].memsz)
-                       return result;
-       }
-
-       /*
-        * Verify we have good destination addresses.  Normally
-        * the caller is responsible for making certain we don't
-        * attempt to load the new image into invalid or reserved
-        * areas of RAM.  But crash kernels are preloaded into a
-        * reserved area of ram.  We must ensure the addresses
-        * are in the reserved area otherwise preloading the
-        * kernel could corrupt things.
-        */
-
-       if (image->type == KEXEC_TYPE_CRASH) {
-               result = -EADDRNOTAVAIL;
-               for (i = 0; i < nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend = mstart + image->segment[i].memsz - 1;
-                       /* Ensure we are within the crash kernel limits */
-                       if ((mstart < crashk_res.start) ||
-                           (mend > crashk_res.end))
-                               return result;
-               }
-       }
-
-       return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
-       struct kimage *image;
-
-       /* Allocate a controlling structure */
-       image = kzalloc(sizeof(*image), GFP_KERNEL);
-       if (!image)
-               return NULL;
-
-       image->head = 0;
-       image->entry = &image->head;
-       image->last_entry = &image->head;
-       image->control_page = ~0; /* By default this does not apply */
-       image->type = KEXEC_TYPE_DEFAULT;
-
-       /* Initialize the list of control pages */
-       INIT_LIST_HEAD(&image->control_pages);
-
-       /* Initialize the list of destination pages */
-       INIT_LIST_HEAD(&image->dest_pages);
-
-       /* Initialize the list of unusable pages */
-       INIT_LIST_HEAD(&image->unusable_pages);
-
-       return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
                             unsigned long nr_segments,
                             struct kexec_segment __user *segments,
@@ -354,2427 +101,155 @@ out_free_image:
        return ret;
 }
 
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
-       struct fd f = fdget(fd);
-       int ret;
-       struct kstat stat;
-       loff_t pos;
-       ssize_t bytes = 0;
-
-       if (!f.file)
-               return -EBADF;
-
-       ret = vfs_getattr(&f.file->f_path, &stat);
-       if (ret)
-               goto out;
-
-       if (stat.size > INT_MAX) {
-               ret = -EFBIG;
-               goto out;
-       }
-
-       /* Don't hand 0 to vmalloc, it whines. */
-       if (stat.size == 0) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       *buf = vmalloc(stat.size);
-       if (!*buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       pos = 0;
-       while (pos < stat.size) {
-               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
-                                   stat.size - pos);
-               if (bytes < 0) {
-                       vfree(*buf);
-                       ret = bytes;
-                       goto out;
-               }
-
-               if (bytes == 0)
-                       break;
-               pos += bytes;
-       }
-
-       if (pos != stat.size) {
-               ret = -EBADF;
-               vfree(*buf);
-               goto out;
-       }
-
-       *buf_len = pos;
-out:
-       fdput(f);
-       return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-                                        unsigned long buf_len)
-{
-       return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
-       return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-                                       unsigned long buf_len)
-{
-       return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                                unsigned int relsec)
-{
-       pr_err("RELA relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                            unsigned int relsec)
-{
-       pr_err("REL relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
 /*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to it's final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
  */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       vfree(image->kernel_buf);
-       image->kernel_buf = NULL;
 
-       vfree(image->initrd_buf);
-       image->initrd_buf = NULL;
-
-       kfree(image->cmdline_buf);
-       image->cmdline_buf = NULL;
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+               struct kexec_segment __user *, segments, unsigned long, flags)
+{
+       struct kimage **dest_image, *image;
+       int result;
 
-       vfree(pi->purgatory_buf);
-       pi->purgatory_buf = NULL;
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
 
-       vfree(pi->sechdrs);
-       pi->sechdrs = NULL;
+       /*
+        * Verify we have a legal set of flags
+        * This leaves us room for future extensions.
+        */
+       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+               return -EINVAL;
 
-       /* See if architecture has anything to cleanup post load */
-       arch_kimage_file_post_load_cleanup(image);
+       /* Verify we are on the appropriate architecture */
+       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+               return -EINVAL;
 
-       /*
-        * Above call should have called into bootloader to free up
-        * any data stored in kimage->image_loader_data. It should
-        * be ok now to free it up.
+       /* Put an artificial cap on the number
+        * of segments passed to kexec_load.
         */
-       kfree(image->image_loader_data);
-       image->image_loader_data = NULL;
-}
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
-                            const char __user *cmdline_ptr,
-                            unsigned long cmdline_len, unsigned flags)
-{
-       int ret = 0;
-       void *ldata;
+       image = NULL;
+       result = 0;
 
-       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
-                               &image->kernel_buf_len);
-       if (ret)
-               return ret;
+       /* Because we write directly to the reserved memory
+        * region when loading crash kernels we need a mutex here to
+        * prevent multiple crash  kernels from attempting to load
+        * simultaneously, and to prevent a crash kernel from loading
+        * over the top of a in use crash kernel.
+        *
+        * KISS: always take the mutex.
+        */
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
 
-       /* Call arch image probe handlers */
-       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
-                                           image->kernel_buf_len);
+       dest_image = &kexec_image;
+       if (flags & KEXEC_ON_CRASH)
+               dest_image = &kexec_crash_image;
+       if (nr_segments > 0) {
+               unsigned long i;
 
-       if (ret)
-               goto out;
+               if (flags & KEXEC_ON_CRASH) {
+                       /*
+                        * Loading another kernel to switch to if this one
+                        * crashes.  Free any current crash dump kernel before
+                        * we corrupt it.
+                        */
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-                                          image->kernel_buf_len);
-       if (ret) {
-               pr_debug("kernel signature verification failed.\n");
-               goto out;
-       }
-       pr_debug("kernel signature verification successful.\n");
-#endif
-       /* It is possible that there no initramfs is being loaded */
-       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
-               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
-                                       &image->initrd_buf_len);
-               if (ret)
-                       goto out;
-       }
+                       kimage_free(xchg(&kexec_crash_image, NULL));
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
+                       crash_map_reserved_pages();
+               } else {
+                       /* Loading another kernel to reboot into. */
 
-       if (cmdline_len) {
-               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
-               if (!image->cmdline_buf) {
-                       ret = -ENOMEM;
-                       goto out;
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
                }
-
-               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
-                                    cmdline_len);
-               if (ret) {
-                       ret = -EFAULT;
+               if (result)
                        goto out;
-               }
-
-               image->cmdline_buf_len = cmdline_len;
 
-               /* command line should be a string with last byte null */
-               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
-                       ret = -EINVAL;
+               if (flags & KEXEC_PRESERVE_CONTEXT)
+                       image->preserve_context = 1;
+               result = machine_kexec_prepare(image);
+               if (result)
                        goto out;
-               }
-       }
 
-       /* Call arch image load handlers */
-       ldata = arch_kexec_kernel_image_load(image);
-
-       if (IS_ERR(ldata)) {
-               ret = PTR_ERR(ldata);
-               goto out;
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result)
+                               goto out;
+               }
+               kimage_terminate(image);
+               if (flags & KEXEC_ON_CRASH)
+                       crash_unmap_reserved_pages();
        }
+       /* Install the new kernel, and  Uninstall the old */
+       image = xchg(dest_image, image);
 
-       image->image_loader_data = ldata;
 out:
-       /* In case of error, free up all allocated memory in this function */
-       if (ret)
-               kimage_file_post_load_cleanup(image);
-       return ret;
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+
+       return result;
 }
 
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
-                      int initrd_fd, const char __user *cmdline_ptr,
-                      unsigned long cmdline_len, unsigned long flags)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+                      compat_ulong_t, nr_segments,
+                      struct compat_kexec_segment __user *, segments,
+                      compat_ulong_t, flags)
 {
-       int ret;
-       struct kimage *image;
-       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
-       image = do_kimage_alloc_init();
-       if (!image)
-               return -ENOMEM;
+       struct compat_kexec_segment in;
+       struct kexec_segment out, __user *ksegments;
+       unsigned long i, result;
 
-       image->file_mode = 1;
+       /* Don't allow clients that don't understand the native
+        * architecture to do anything.
+        */
+       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+               return -EINVAL;
 
-       if (kexec_on_panic) {
-               /* Enable special crash kernel control page alloc policy. */
-               image->control_page = crashk_res.start;
-               image->type = KEXEC_TYPE_CRASH;
-       }
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
-                                          cmdline_ptr, cmdline_len, flags);
-       if (ret)
-               goto out_free_image;
-
-       ret = sanity_check_segment_list(image);
-       if (ret)
-               goto out_free_post_load_bufs;
-
-       ret = -ENOMEM;
-       image->control_code_page = kimage_alloc_control_pages(image,
-                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
-       if (!image->control_code_page) {
-               pr_err("Could not allocate control_code_buffer\n");
-               goto out_free_post_load_bufs;
-       }
-
-       if (!kexec_on_panic) {
-               image->swap_page = kimage_alloc_control_pages(image, 0);
-               if (!image->swap_page) {
-                       pr_err("Could not allocate swap buffer\n");
-                       goto out_free_control_pages;
-               }
-       }
-
-       *rimage = image;
-       return 0;
-out_free_control_pages:
-       kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
-       kimage_file_post_load_cleanup(image);
-out_free_image:
-       kfree(image);
-       return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
-                                       unsigned long start,
-                                       unsigned long end)
-{
-       unsigned long i;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend = mstart + image->segment[i].memsz;
-               if ((end > mstart) && (start < mend))
-                       return 1;
-       }
-
-       return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-       struct page *pages;
-
-       pages = alloc_pages(gfp_mask, order);
-       if (pages) {
-               unsigned int count, i;
-               pages->mapping = NULL;
-               set_page_private(pages, order);
-               count = 1 << order;
-               for (i = 0; i < count; i++)
-                       SetPageReserved(pages + i);
-       }
-
-       return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
-       unsigned int order, count, i;
-
-       order = page_private(page);
-       count = 1 << order;
-       for (i = 0; i < count; i++)
-               ClearPageReserved(page + i);
-       __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
-       struct list_head *pos, *next;
-
-       list_for_each_safe(pos, next, list) {
-               struct page *page;
-
-               page = list_entry(pos, struct page, lru);
-               list_del(&page->lru);
-               kimage_free_pages(page);
-       }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
-                                                       unsigned int order)
-{
-       /* Control pages are special, they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * The only case where we really need more than one of
-        * these are for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * At worst this runs in O(N) of the image size.
-        */
-       struct list_head extra_pages;
-       struct page *pages;
-       unsigned int count;
-
-       count = 1 << order;
-       INIT_LIST_HEAD(&extra_pages);
-
-       /* Loop while I can allocate a page and the page allocated
-        * is a destination page.
-        */
-       do {
-               unsigned long pfn, epfn, addr, eaddr;
-
-               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
-               if (!pages)
-                       break;
-               pfn   = page_to_pfn(pages);
-               epfn  = pfn + count;
-               addr  = pfn << PAGE_SHIFT;
-               eaddr = epfn << PAGE_SHIFT;
-               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-                             kimage_is_destination_range(image, addr, eaddr)) {
-                       list_add(&pages->lru, &extra_pages);
-                       pages = NULL;
-               }
-       } while (!pages);
-
-       if (pages) {
-               /* Remember the allocated page... */
-               list_add(&pages->lru, &image->control_pages);
-
-               /* Because the page is already in it's destination
-                * location we will never allocate another page at
-                * that address.  Therefore kimage_alloc_pages
-                * will not return it (again) and we don't need
-                * to give it an entry in image->segment[].
-                */
-       }
-       /* Deal with the destination pages I have inadvertently allocated.
-        *
-        * Ideally I would convert multi-page allocations into single
-        * page allocations, and add everything to image->dest_pages.
-        *
-        * For now it is simpler to just free the pages.
-        */
-       kimage_free_page_list(&extra_pages);
-
-       return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-                                                     unsigned int order)
-{
-       /* Control pages are special, they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * Control pages are also the only pags we must allocate
-        * when loading a crash kernel.  All of the other pages
-        * are specified by the segments and we just memcpy
-        * into them directly.
-        *
-        * The only case where we really need more than one of
-        * these are for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * Given the low demand this implements a very simple
-        * allocator that finds the first hole of the appropriate
-        * size in the reserved memory region, and allocates all
-        * of the memory up to and including the hole.
-        */
-       unsigned long hole_start, hole_end, size;
-       struct page *pages;
-
-       pages = NULL;
-       size = (1 << order) << PAGE_SHIFT;
-       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
-       hole_end   = hole_start + size - 1;
-       while (hole_end <= crashk_res.end) {
-               unsigned long i;
-
-               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
-                       break;
-               /* See if I overlap any of the segments */
-               for (i = 0; i < image->nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend   = mstart + image->segment[i].memsz - 1;
-                       if ((hole_end >= mstart) && (hole_start <= mend)) {
-                               /* Advance the hole to the end of the segment */
-                               hole_start = (mend + (size - 1)) & ~(size - 1);
-                               hole_end   = hole_start + size - 1;
-                               break;
-                       }
-               }
-               /* If I don't overlap any segments I have found my hole! */
-               if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-                       break;
-               }
-       }
-       if (pages)
-               image->control_page = hole_end;
-
-       return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
-                                        unsigned int order)
-{
-       struct page *pages = NULL;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               pages = kimage_alloc_normal_control_pages(image, order);
-               break;
-       case KEXEC_TYPE_CRASH:
-               pages = kimage_alloc_crash_control_pages(image, order);
-               break;
-       }
-
-       return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       if (image->entry == image->last_entry) {
-               kimage_entry_t *ind_page;
-               struct page *page;
-
-               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-               if (!page)
-                       return -ENOMEM;
-
-               ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
-               image->entry = ind_page;
-               image->last_entry = ind_page +
-                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
-       }
-       *image->entry = entry;
-       image->entry++;
-       *image->entry = 0;
-
-       return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
-                                  unsigned long destination)
-{
-       int result;
-
-       destination &= PAGE_MASK;
-       result = kimage_add_entry(image, destination | IND_DESTINATION);
-
-       return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
-       int result;
-
-       page &= PAGE_MASK;
-       result = kimage_add_entry(image, page | IND_SOURCE);
-
-       return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
-       /* Walk through and free any extra destination pages I may have */
-       kimage_free_page_list(&image->dest_pages);
-
-       /* Walk through and free any unusable pages I have cached */
-       kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
-       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
-               ptr = (entry & IND_INDIRECTION) ? \
-                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
-       struct page *page;
-
-       page = pfn_to_page(entry >> PAGE_SHIFT);
-       kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
-       kimage_entry_t *ptr, entry;
-       kimage_entry_t ind = 0;
-
-       if (!image)
-               return;
-
-       kimage_free_extra_pages(image);
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_INDIRECTION) {
-                       /* Free the previous indirection page */
-                       if (ind & IND_INDIRECTION)
-                               kimage_free_entry(ind);
-                       /* Save this indirection page until we are
-                        * done with it.
-                        */
-                       ind = entry;
-               } else if (entry & IND_SOURCE)
-                       kimage_free_entry(entry);
-       }
-       /* Free the final indirection page */
-       if (ind & IND_INDIRECTION)
-               kimage_free_entry(ind);
-
-       /* Handle any machine specific cleanup */
-       machine_kexec_cleanup(image);
-
-       /* Free the kexec control pages... */
-       kimage_free_page_list(&image->control_pages);
-
-       /*
-        * Free up any temporary buffers allocated. This might hit if
-        * error occurred much later after buffer allocation.
-        */
-       if (image->file_mode)
-               kimage_file_post_load_cleanup(image);
-
-       kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
-                                       unsigned long page)
-{
-       kimage_entry_t *ptr, entry;
-       unsigned long destination = 0;
-
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_DESTINATION)
-                       destination = entry & PAGE_MASK;
-               else if (entry & IND_SOURCE) {
-                       if (page == destination)
-                               return ptr;
-                       destination += PAGE_SIZE;
-               }
-       }
-
-       return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
-                                       gfp_t gfp_mask,
-                                       unsigned long destination)
-{
-       /*
-        * Here we implement safeguards to ensure that a source page
-        * is not copied to its destination page before the data on
-        * the destination page is no longer useful.
-        *
-        * To do this we maintain the invariant that a source page is
-        * either its own destination page, or it is not a
-        * destination page at all.
-        *
-        * That is slightly stronger than required, but the proof
-        * that no problems will not occur is trivial, and the
-        * implementation is simply to verify.
-        *
-        * When allocating all pages normally this algorithm will run
-        * in O(N) time, but in the worst case it will run in O(N^2)
-        * time.   If the runtime is a problem the data structures can
-        * be fixed.
-        */
-       struct page *page;
-       unsigned long addr;
-
-       /*
-        * Walk through the list of destination pages, and see if I
-        * have a match.
-        */
-       list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-               if (addr == destination) {
-                       list_del(&page->lru);
-                       return page;
-               }
-       }
-       page = NULL;
-       while (1) {
-               kimage_entry_t *old;
-
-               /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
-               if (!page)
-                       return NULL;
-               /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
-                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
-                       list_add(&page->lru, &image->unusable_pages);
-                       continue;
-               }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-
-               /* If it is the destination page we want use it */
-               if (addr == destination)
-                       break;
-
-               /* If the page is not a destination page use it */
-               if (!kimage_is_destination_range(image, addr,
-                                                 addr + PAGE_SIZE))
-                       break;
-
-               /*
-                * I know that the page is someones destination page.
-                * See if there is already a source page for this
-                * destination page.  And if so swap the source pages.
-                */
-               old = kimage_dst_used(image, addr);
-               if (old) {
-                       /* If so move it */
-                       unsigned long old_addr;
-                       struct page *old_page;
-
-                       old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
-                       copy_highpage(page, old_page);
-                       *old = addr | (*old & ~PAGE_MASK);
-
-                       /* The old page I have found cannot be a
-                        * destination page, so return it if it's
-                        * gfp_flags honor the ones passed in.
-                        */
-                       if (!(gfp_mask & __GFP_HIGHMEM) &&
-                           PageHighMem(old_page)) {
-                               kimage_free_pages(old_page);
-                               continue;
-                       }
-                       addr = old_addr;
-                       page = old_page;
-                       break;
-               } else {
-                       /* Place the page on the destination list I
-                        * will use it later.
-                        */
-                       list_add(&page->lru, &image->dest_pages);
-               }
-       }
-
-       return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
-                                        struct kexec_segment *segment)
-{
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-
-       result = kimage_set_destination(image, maddr);
-       if (result < 0)
-               goto out;
-
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               result = kimage_add_page(image, page_to_pfn(page)
-                                                               << PAGE_SHIFT);
-               if (result < 0)
-                       goto out;
-
-               ptr = kmap(page);
-               /* Start with a clear page */
-               clear_page(ptr);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
-                                       struct kexec_segment *segment)
-{
-       /* For crash dumps kernels we simply copy the data from
-        * user space to it's destination.
-        * We do things a page at a time for the sake of kmap.
-        */
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               ptr = kmap(page);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-               if (mchunk > uchunk) {
-                       /* Zero the trailing part of the page */
-                       memset(ptr + uchunk, 0, mchunk - uchunk);
-               }
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kexec_flush_icache_page(page);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
-                               struct kexec_segment *segment)
-{
-       int result = -ENOMEM;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               result = kimage_load_normal_segment(image, segment);
-               break;
-       case KEXEC_TYPE_CRASH:
-               result = kimage_load_crash_segment(image, segment);
-               break;
-       }
-
-       return result;
-}
-
-/*
- * Exec Kernel system call: for obvious reasons only root may call it.
- *
- * This call breaks up into three pieces.
- * - A generic part which loads the new kernel from the current
- *   address space, and very carefully places the data in the
- *   allocated pages.
- *
- * - A generic part that interacts with the kernel and tells all of
- *   the devices to shut down.  Preventing on-going dmas, and placing
- *   the devices in a consistent state so a later kernel can
- *   reinitialize them.
- *
- * - A machine specific part that includes the syscall number
- *   and then copies the image to it's final destination.  And
- *   jumps into the image at entry.
- *
- * kexec does not sync, or unmount filesystems so if you need
- * that to happen you need to do that yourself.
- */
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
-
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
-               struct kexec_segment __user *, segments, unsigned long, flags)
-{
-       struct kimage **dest_image, *image;
-       int result;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /*
-        * Verify we have a legal set of flags
-        * This leaves us room for future extensions.
-        */
-       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
-               return -EINVAL;
-
-       /* Verify we are on the appropriate architecture */
-       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
-               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-               return -EINVAL;
-
-       /* Put an artificial cap on the number
-        * of segments passed to kexec_load.
-        */
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       image = NULL;
-       result = 0;
-
-       /* Because we write directly to the reserved memory
-        * region when loading crash kernels we need a mutex here to
-        * prevent multiple crash  kernels from attempting to load
-        * simultaneously, and to prevent a crash kernel from loading
-        * over the top of a in use crash kernel.
-        *
-        * KISS: always take the mutex.
-        */
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_ON_CRASH)
-               dest_image = &kexec_crash_image;
-       if (nr_segments > 0) {
-               unsigned long i;
-
-               if (flags & KEXEC_ON_CRASH) {
-                       /*
-                        * Loading another kernel to switch to if this one
-                        * crashes.  Free any current crash dump kernel before
-                        * we corrupt it.
-                        */
-
-                       kimage_free(xchg(&kexec_crash_image, NULL));
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-                       crash_map_reserved_pages();
-               } else {
-                       /* Loading another kernel to reboot into. */
-
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-               }
-               if (result)
-                       goto out;
-
-               if (flags & KEXEC_PRESERVE_CONTEXT)
-                       image->preserve_context = 1;
-               result = machine_kexec_prepare(image);
-               if (result)
-                       goto out;
-
-               for (i = 0; i < nr_segments; i++) {
-                       result = kimage_load_segment(image, &image->segment[i]);
-                       if (result)
-                               goto out;
-               }
-               kimage_terminate(image);
-               if (flags & KEXEC_ON_CRASH)
-                       crash_unmap_reserved_pages();
-       }
-       /* Install the new kernel, and  Uninstall the old */
-       image = xchg(dest_image, image);
-
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-
-       return result;
-}
-
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
-                      compat_ulong_t, nr_segments,
-                      struct compat_kexec_segment __user *, segments,
-                      compat_ulong_t, flags)
-{
-       struct compat_kexec_segment in;
-       struct kexec_segment out, __user *ksegments;
-       unsigned long i, result;
-
-       /* Don't allow clients that don't understand the native
-        * architecture to do anything.
-        */
-       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
-               return -EINVAL;
-
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
-       for (i = 0; i < nr_segments; i++) {
-               result = copy_from_user(&in, &segments[i], sizeof(in));
-               if (result)
-                       return -EFAULT;
-
-               out.buf   = compat_ptr(in.buf);
-               out.bufsz = in.bufsz;
-               out.mem   = in.mem;
-               out.memsz = in.memsz;
-
-               result = copy_to_user(&ksegments[i], &out, sizeof(out));
-               if (result)
-                       return -EFAULT;
-       }
-
-       return sys_kexec_load(entry, nr_segments, ksegments, flags);
-}
-#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
-               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
-               unsigned long, flags)
-{
-       int ret = 0, i;
-       struct kimage **dest_image, *image;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /* Make sure we have a legal set of flags */
-       if (flags != (flags & KEXEC_FILE_FLAGS))
-               return -EINVAL;
-
-       image = NULL;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_FILE_ON_CRASH)
-               dest_image = &kexec_crash_image;
-
-       if (flags & KEXEC_FILE_UNLOAD)
-               goto exchange;
-
-       /*
-        * In case of crash, new kernel gets loaded in reserved region. It is
-        * same memory where old crash kernel might be loaded. Free any
-        * current crash dump kernel before we corrupt it.
-        */
-       if (flags & KEXEC_FILE_ON_CRASH)
-               kimage_free(xchg(&kexec_crash_image, NULL));
-
-       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
-                                    cmdline_len, flags);
-       if (ret)
-               goto out;
-
-       ret = machine_kexec_prepare(image);
-       if (ret)
-               goto out;
-
-       ret = kexec_calculate_store_digests(image);
-       if (ret)
-               goto out;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
-                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-                        ksegment->memsz);
-
-               ret = kimage_load_segment(image, &image->segment[i]);
-               if (ret)
-                       goto out;
-       }
-
-       kimage_terminate(image);
-
-       /*
-        * Free up any temporary buffers allocated which are not needed
-        * after image has been loaded
-        */
-       kimage_file_post_load_cleanup(image);
-exchange:
-       image = xchg(dest_image, image);
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-       return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
-       /* Take the kexec_mutex here to prevent sys_kexec_load
-        * running on one cpu from replacing the crash kernel
-        * we are using after a panic on a different cpu.
-        *
-        * If the crash kernel was not located in a fixed area
-        * of memory the xchg(&kexec_crash_image) would be
-        * sufficient.  But since I reuse the memory...
-        */
-       if (mutex_trylock(&kexec_mutex)) {
-               if (kexec_crash_image) {
-                       struct pt_regs fixed_regs;
-
-                       crash_setup_regs(&fixed_regs, regs);
-                       crash_save_vmcoreinfo();
-                       machine_crash_shutdown(&fixed_regs);
-                       machine_kexec(kexec_crash_image);
-               }
-               mutex_unlock(&kexec_mutex);
-       }
-}
-
-size_t crash_get_memory_size(void)
-{
-       size_t size = 0;
-       mutex_lock(&kexec_mutex);
-       if (crashk_res.end != crashk_res.start)
-               size = resource_size(&crashk_res);
-       mutex_unlock(&kexec_mutex);
-       return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-                                          unsigned long end)
-{
-       unsigned long addr;
-
-       for (addr = begin; addr < end; addr += PAGE_SIZE)
-               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-       int ret = 0;
-       unsigned long start, end;
-       unsigned long old_size;
-       struct resource *ram_res;
-
-       mutex_lock(&kexec_mutex);
-
-       if (kexec_crash_image) {
-               ret = -ENOENT;
-               goto unlock;
-       }
-       start = crashk_res.start;
-       end = crashk_res.end;
-       old_size = (end == 0) ? 0 : end - start + 1;
-       if (new_size >= old_size) {
-               ret = (new_size == old_size) ? 0 : -EINVAL;
-               goto unlock;
-       }
-
-       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-       if (!ram_res) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
-
-       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
-       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
-       crash_map_reserved_pages();
-       crash_free_reserved_phys_range(end, crashk_res.end);
-
-       if ((start == end) && (crashk_res.parent != NULL))
-               release_resource(&crashk_res);
-
-       ram_res->start = end;
-       ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-       ram_res->name = "System RAM";
-
-       crashk_res.end = end - 1;
-
-       insert_resource(&iomem_resource, ram_res);
-       crash_unmap_reserved_pages();
-
-unlock:
-       mutex_unlock(&kexec_mutex);
-       return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-                           size_t data_len)
-{
-       struct elf_note note;
-
-       note.n_namesz = strlen(name) + 1;
-       note.n_descsz = data_len;
-       note.n_type   = type;
-       memcpy(buf, &note, sizeof(note));
-       buf += (sizeof(note) + 3)/4;
-       memcpy(buf, name, note.n_namesz);
-       buf += (note.n_namesz + 3)/4;
-       memcpy(buf, data, note.n_descsz);
-       buf += (note.n_descsz + 3)/4;
-
-       return buf;
-}
-
-static void final_note(u32 *buf)
-{
-       struct elf_note note;
-
-       note.n_namesz = 0;
-       note.n_descsz = 0;
-       note.n_type   = 0;
-       memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-       struct elf_prstatus prstatus;
-       u32 *buf;
-
-       if ((cpu < 0) || (cpu >= nr_cpu_ids))
-               return;
-
-       /* Using ELF notes here is opportunistic.
-        * I need a well defined structure format
-        * for the data I pass, and I need tags
-        * on the data to indicate what information I have
-        * squirrelled away.  ELF notes happen to provide
-        * all of that, so there is no need to invent something new.
-        */
-       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-       if (!buf)
-               return;
-       memset(&prstatus, 0, sizeof(prstatus));
-       prstatus.pr_pid = current->pid;
-       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
-       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-                             &prstatus, sizeof(prstatus));
-       final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
-       /* Allocate memory for saving cpu registers. */
-       crash_notes = alloc_percpu(note_buf_t);
-       if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
-               return -ENOMEM;
-       }
-       return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-                                       unsigned long long system_ram,
-                                       unsigned long long *crash_size,
-                                       unsigned long long *crash_base)
-{
-       char *cur = cmdline, *tmp;
-
-       /* for each entry of the comma-separated list */
-       do {
-               unsigned long long start, end = ULLONG_MAX, size;
-
-               /* get the start of the range */
-               start = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("crashkernel: Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (*cur != '-') {
-                       pr_warn("crashkernel: '-' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               /* if no ':' is here, than we read the end */
-               if (*cur != ':') {
-                       end = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("crashkernel: Memory value expected\n");
-                               return -EINVAL;
-                       }
-                       cur = tmp;
-                       if (end <= start) {
-                               pr_warn("crashkernel: end <= start\n");
-                               return -EINVAL;
-                       }
-               }
-
-               if (*cur != ':') {
-                       pr_warn("crashkernel: ':' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               size = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (size >= system_ram) {
-                       pr_warn("crashkernel: invalid size\n");
-                       return -EINVAL;
-               }
-
-               /* match ? */
-               if (system_ram >= start && system_ram < end) {
-                       *crash_size = size;
-                       break;
-               }
-       } while (*cur++ == ',');
-
-       if (*crash_size > 0) {
-               while (*cur && *cur != ' ' && *cur != '@')
-                       cur++;
-               if (*cur == '@') {
-                       cur++;
-                       *crash_base = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("Memory value expected after '@'\n");
-                               return -EINVAL;
-                       }
-               }
-       }
-
-       return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- *     crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-                                          unsigned long long *crash_size,
-                                          unsigned long long *crash_base)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       if (*cur == '@')
-               *crash_base = memparse(cur+1, &cur);
-       else if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-       [SUFFIX_HIGH] = ",high",
-       [SUFFIX_LOW]  = ",low",
-       [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix"  crashkernel command lines like
- *
- *     crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-                                          unsigned long long   *crash_size,
-                                          const char *suffix)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       /* check with suffix */
-       if (strncmp(cur, suffix, strlen(suffix))) {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-       cur += strlen(suffix);
-       if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
-                            const char *name,
-                            const char *suffix)
-{
-       char *p = cmdline, *ck_cmdline = NULL;
-
-       /* find crashkernel and use the last one if there are more */
-       p = strstr(p, name);
-       while (p) {
-               char *end_p = strchr(p, ' ');
-               char *q;
-
-               if (!end_p)
-                       end_p = p + strlen(p);
-
-               if (!suffix) {
-                       int i;
-
-                       /* skip the one with any known suffix */
-                       for (i = 0; suffix_tbl[i]; i++) {
-                               q = end_p - strlen(suffix_tbl[i]);
-                               if (!strncmp(q, suffix_tbl[i],
-                                            strlen(suffix_tbl[i])))
-                                       goto next;
-                       }
-                       ck_cmdline = p;
-               } else {
-                       q = end_p - strlen(suffix);
-                       if (!strncmp(q, suffix, strlen(suffix)))
-                               ck_cmdline = p;
-               }
-next:
-               p = strstr(p+1, name);
-       }
-
-       if (!ck_cmdline)
-               return NULL;
-
-       return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base,
-                            const char *name,
-                            const char *suffix)
-{
-       char    *first_colon, *first_space;
-       char    *ck_cmdline;
-
-       BUG_ON(!crash_size || !crash_base);
-       *crash_size = 0;
-       *crash_base = 0;
-
-       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
-       if (!ck_cmdline)
-               return -EINVAL;
-
-       ck_cmdline += strlen(name);
-
-       if (suffix)
-               return parse_crashkernel_suffix(ck_cmdline, crash_size,
-                               suffix);
-       /*
-        * if the commandline contains a ':', then that's the extended
-        * syntax -- if not, it must be the classic syntax
-        */
-       first_colon = strchr(ck_cmdline, ':');
-       first_space = strchr(ck_cmdline, ' ');
-       if (first_colon && (!first_space || first_colon < first_space))
-               return parse_crashkernel_mem(ck_cmdline, system_ram,
-                               crash_size, crash_base);
-
-       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                                       "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
-       u32 *buf = vmcoreinfo_note;
-
-       if (!vmcoreinfo_size)
-               return;
-       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-                             vmcoreinfo_size);
-       final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
-       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
-       update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-       va_list args;
-       char buf[0x50];
-       size_t r;
-
-       va_start(args, fmt);
-       r = vscnprintf(buf, sizeof(buf), fmt, args);
-       va_end(args);
-
-       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
-       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-       vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
-       return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-       VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-       VMCOREINFO_SYMBOL(init_uts_ns);
-       VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-       VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
-       VMCOREINFO_SYMBOL(_stext);
-       VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-       VMCOREINFO_SYMBOL(mem_map);
-       VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-       VMCOREINFO_SYMBOL(mem_section);
-       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-       VMCOREINFO_STRUCT_SIZE(mem_section);
-       VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
-       VMCOREINFO_STRUCT_SIZE(page);
-       VMCOREINFO_STRUCT_SIZE(pglist_data);
-       VMCOREINFO_STRUCT_SIZE(zone);
-       VMCOREINFO_STRUCT_SIZE(free_area);
-       VMCOREINFO_STRUCT_SIZE(list_head);
-       VMCOREINFO_SIZE(nodemask_t);
-       VMCOREINFO_OFFSET(page, flags);
-       VMCOREINFO_OFFSET(page, _count);
-       VMCOREINFO_OFFSET(page, mapping);
-       VMCOREINFO_OFFSET(page, lru);
-       VMCOREINFO_OFFSET(page, _mapcount);
-       VMCOREINFO_OFFSET(page, private);
-       VMCOREINFO_OFFSET(pglist_data, node_zones);
-       VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-       VMCOREINFO_OFFSET(pglist_data, node_id);
-       VMCOREINFO_OFFSET(zone, free_area);
-       VMCOREINFO_OFFSET(zone, vm_stat);
-       VMCOREINFO_OFFSET(zone, spanned_pages);
-       VMCOREINFO_OFFSET(free_area, free_list);
-       VMCOREINFO_OFFSET(list_head, next);
-       VMCOREINFO_OFFSET(list_head, prev);
-       VMCOREINFO_OFFSET(vmap_area, va_start);
-       VMCOREINFO_OFFSET(vmap_area, list);
-       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
-       log_buf_kexec_setup();
-       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-       VMCOREINFO_NUMBER(NR_FREE_PAGES);
-       VMCOREINFO_NUMBER(PG_lru);
-       VMCOREINFO_NUMBER(PG_private);
-       VMCOREINFO_NUMBER(PG_swapcache);
-       VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-       VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-       VMCOREINFO_NUMBER(PG_head_mask);
-       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
-       VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
-       arch_crash_save_vmcoreinfo();
-       update_vmcoreinfo_note();
-
-       return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
-                                   struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_end = min(end, kbuf->buf_max);
-       temp_start = temp_end - kbuf->memsz;
-
-       do {
-               /* align down start */
-               temp_start = temp_start & (~(kbuf->buf_align - 1));
-
-               if (temp_start < start || temp_start < kbuf->buf_min)
-                       return 0;
-
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               /*
-                * Make sure this does not conflict with any of existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start - PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
-                                    struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_start = max(start, kbuf->buf_min);
-
-       do {
-               temp_start = ALIGN(temp_start, kbuf->buf_align);
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               if (temp_end > end || temp_end > kbuf->buf_max)
-                       return 0;
-               /*
-                * Make sure this does not conflict with any of existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start + PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
-       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
-       unsigned long sz = end - start + 1;
-
-       /* Returning 0 will take to next memory range */
-       if (sz < kbuf->memsz)
-               return 0;
-
-       if (end < kbuf->buf_min || start > kbuf->buf_max)
-               return 0;
-
-       /*
-        * Allocate memory top down with-in ram range. Otherwise bottom up
-        * allocation.
-        */
-       if (kbuf->top_down)
-               return locate_mem_hole_top_down(start, end, kbuf);
-       return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
-                    unsigned long memsz, unsigned long buf_align,
-                    unsigned long buf_min, unsigned long buf_max,
-                    bool top_down, unsigned long *load_addr)
-{
-
-       struct kexec_segment *ksegment;
-       struct kexec_buf buf, *kbuf;
-       int ret;
-
-       /* Currently adding segment this way is allowed only in file mode */
-       if (!image->file_mode)
-               return -EINVAL;
-
-       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       /*
-        * Make sure we are not trying to add buffer after allocating
-        * control pages. All segments need to be placed first before
-        * any control pages are allocated. As control page allocation
-        * logic goes through list of segments to make sure there are
-        * no destination overlaps.
-        */
-       if (!list_empty(&image->control_pages)) {
-               WARN_ON(1);
-               return -EINVAL;
-       }
-
-       memset(&buf, 0, sizeof(struct kexec_buf));
-       kbuf = &buf;
-       kbuf->image = image;
-       kbuf->buffer = buffer;
-       kbuf->bufsz = bufsz;
-
-       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
-       kbuf->buf_align = max(buf_align, PAGE_SIZE);
-       kbuf->buf_min = buf_min;
-       kbuf->buf_max = buf_max;
-       kbuf->top_down = top_down;
-
-       /* Walk the RAM ranges and allocate a suitable range for the buffer */
-       if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
-       else
-               ret = walk_system_ram_res(0, -1, kbuf,
-                                         locate_mem_hole_callback);
-       if (ret != 1) {
-               /* A suitable memory range could not be found for buffer */
-               return -EADDRNOTAVAIL;
-       }
-
-       /* Found a suitable memory range */
-       ksegment = &image->segment[image->nr_segments];
-       ksegment->kbuf = kbuf->buffer;
-       ksegment->bufsz = kbuf->bufsz;
-       ksegment->mem = kbuf->mem;
-       ksegment->memsz = kbuf->memsz;
-       image->nr_segments++;
-       *load_addr = ksegment->mem;
-       return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
-       struct crypto_shash *tfm;
-       struct shash_desc *desc;
-       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
-       size_t desc_size, nullsz;
-       char *digest;
-       void *zero_buf;
-       struct kexec_sha_region *sha_regions;
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
-       zero_buf_sz = PAGE_SIZE;
-
-       tfm = crypto_alloc_shash("sha256", 0, 0);
-       if (IS_ERR(tfm)) {
-               ret = PTR_ERR(tfm);
-               goto out;
-       }
-
-       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
-       desc = kzalloc(desc_size, GFP_KERNEL);
-       if (!desc) {
-               ret = -ENOMEM;
-               goto out_free_tfm;
-       }
-
-       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
-       sha_regions = vzalloc(sha_region_sz);
-       if (!sha_regions)
-               goto out_free_desc;
-
-       desc->tfm   = tfm;
-       desc->flags = 0;
-
-       ret = crypto_shash_init(desc);
-       if (ret < 0)
-               goto out_free_sha_regions;
-
-       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
-       if (!digest) {
-               ret = -ENOMEM;
-               goto out_free_sha_regions;
-       }
-
-       for (j = i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               /*
-                * Skip purgatory as it will be modified once we put digest
-                * info in purgatory.
-                */
-               if (ksegment->kbuf == pi->purgatory_buf)
-                       continue;
-
-               ret = crypto_shash_update(desc, ksegment->kbuf,
-                                         ksegment->bufsz);
-               if (ret)
-                       break;
-
-               /*
-                * Assume rest of the buffer is filled with zero and
-                * update digest accordingly.
-                */
-               nullsz = ksegment->memsz - ksegment->bufsz;
-               while (nullsz) {
-                       unsigned long bytes = nullsz;
-
-                       if (bytes > zero_buf_sz)
-                               bytes = zero_buf_sz;
-                       ret = crypto_shash_update(desc, zero_buf, bytes);
-                       if (ret)
-                               break;
-                       nullsz -= bytes;
-               }
-
-               if (ret)
-                       break;
-
-               sha_regions[j].start = ksegment->mem;
-               sha_regions[j].len = ksegment->memsz;
-               j++;
-       }
-
-       if (!ret) {
-               ret = crypto_shash_final(desc, digest);
-               if (ret)
-                       goto out_free_digest;
-               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
-                                               sha_regions, sha_region_sz, 0);
-               if (ret)
-                       goto out_free_digest;
-
-               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
-                                               digest, SHA256_DIGEST_SIZE, 0);
-               if (ret)
-                       goto out_free_digest;
-       }
-
-out_free_digest:
-       kfree(digest);
-out_free_sha_regions:
-       vfree(sha_regions);
-out_free_desc:
-       kfree(desc);
-out_free_tfm:
-       kfree(tfm);
-out:
-       return ret;
-}
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
-                                 unsigned long max, int top_down)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
-       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
-       unsigned char *buf_addr, *src;
-       int i, ret = 0, entry_sidx = -1;
-       const Elf_Shdr *sechdrs_c;
-       Elf_Shdr *sechdrs = NULL;
-       void *purgatory_buf = NULL;
-
-       /*
-        * sechdrs_c points to section headers in purgatory and are read
-        * only. No modifications allowed.
-        */
-       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
-       /*
-        * We can not modify sechdrs_c[] and its fields. It is read only.
-        * Copy it over to a local copy where one can store some temporary
-        * data and free it at the end. We need to modify ->sh_addr and
-        * ->sh_offset fields to keep track of permanent and temporary
-        * locations of sections.
-        */
-       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-       if (!sechdrs)
-               return -ENOMEM;
-
-       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
-       /*
-        * We seem to have multiple copies of sections. First copy is which
-        * is embedded in kernel in read only section. Some of these sections
-        * will be copied to a temporary buffer and relocated. And these
-        * sections will finally be copied to their final destination at
-        * segment load time.
-        *
-        * Use ->sh_offset to reflect section address in memory. It will
-        * point to original read only copy if section is not allocatable.
-        * Otherwise it will point to temporary copy which will be relocated.
-        *
-        * Use ->sh_addr to contain final address of the section where it
-        * will go during execution time.
-        */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type == SHT_NOBITS)
-                       continue;
-
-               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
-                                               sechdrs[i].sh_offset;
-       }
-
-       /*
-        * Identify entry point section and make entry relative to section
-        * start.
-        */
-       entry = pi->ehdr->e_entry;
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
-                       continue;
-
-               /* Make entry section relative */
-               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
-                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
-                    pi->ehdr->e_entry)) {
-                       entry_sidx = i;
-                       entry -= sechdrs[i].sh_addr;
-                       break;
-               }
-       }
-
-       /* Determine how much memory is needed to load relocatable object. */
-       buf_align = 1;
-       bss_align = 1;
-       buf_sz = 0;
-       bss_sz = 0;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       if (buf_align < align)
-                               buf_align = align;
-                       buf_sz = ALIGN(buf_sz, align);
-                       buf_sz += sechdrs[i].sh_size;
-               } else {
-                       /* bss section */
-                       if (bss_align < align)
-                               bss_align = align;
-                       bss_sz = ALIGN(bss_sz, align);
-                       bss_sz += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Determine the bss padding required to align bss properly */
-       bss_pad = 0;
-       if (buf_sz & (bss_align - 1))
-               bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
-       memsz = buf_sz + bss_pad + bss_sz;
-
-       /* Allocate buffer for purgatory */
-       purgatory_buf = vzalloc(buf_sz);
-       if (!purgatory_buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (buf_align < bss_align)
-               buf_align = bss_align;
-
-       /* Add buffer to segment list */
-       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
-                               buf_align, min, max, top_down,
-                               &pi->purgatory_load_addr);
-       if (ret)
-               goto out;
-
-       /* Load SHF_ALLOC sections */
-       buf_addr = purgatory_buf;
-       load_addr = curr_load_addr = pi->purgatory_load_addr;
-       bss_addr = load_addr + buf_sz + bss_pad;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       curr_load_addr = ALIGN(curr_load_addr, align);
-                       offset = curr_load_addr - load_addr;
-                       /* We already modifed ->sh_offset to keep src addr */
-                       src = (char *) sechdrs[i].sh_offset;
-                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
-                       /* Store load address and source address of section */
-                       sechdrs[i].sh_addr = curr_load_addr;
-
-                       /*
-                        * This section got copied to temporary buffer. Update
-                        * ->sh_offset accordingly.
-                        */
-                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
-                       /* Advance to the next address */
-                       curr_load_addr += sechdrs[i].sh_size;
-               } else {
-                       bss_addr = ALIGN(bss_addr, align);
-                       sechdrs[i].sh_addr = bss_addr;
-                       bss_addr += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Update entry point based on load address of text section */
-       if (entry_sidx >= 0)
-               entry += sechdrs[entry_sidx].sh_addr;
-
-       /* Make kernel jump to purgatory after shutdown */
-       image->start = entry;
-
-       /* Used later to get/set symbol values */
-       pi->sechdrs = sechdrs;
-
-       /*
-        * Used later to identify which section is purgatory and skip it
-        * from checksumming.
-        */
-       pi->purgatory_buf = purgatory_buf;
-       return ret;
-out:
-       vfree(sechdrs);
-       vfree(purgatory_buf);
-       return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
-       int i, ret;
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Shdr *sechdrs = pi->sechdrs;
-
-       /* Apply relocations */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               Elf_Shdr *section, *symtab;
-
-               if (sechdrs[i].sh_type != SHT_RELA &&
-                   sechdrs[i].sh_type != SHT_REL)
-                       continue;
-
-               /*
-                * For section of type SHT_RELA/SHT_REL,
-                * ->sh_link contains section header index of associated
-                * symbol table. And ->sh_info contains section header
-                * index of section to which relocations apply.
-                */
-               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
-                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
-                       return -ENOEXEC;
-
-               section = &sechdrs[sechdrs[i].sh_info];
-               symtab = &sechdrs[sechdrs[i].sh_link];
-
-               if (!(section->sh_flags & SHF_ALLOC))
-                       continue;
-
-               /*
-                * symtab->sh_link contain section header index of associated
-                * string table.
-                */
-               if (symtab->sh_link >= pi->ehdr->e_shnum)
-                       /* Invalid section number? */
-                       continue;
-
-               /*
-                * Respective architecture needs to provide support for applying
-                * relocations of type SHT_RELA/SHT_REL.
-                */
-               if (sechdrs[i].sh_type == SHT_RELA)
-                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
-                                                              sechdrs, i);
-               else if (sechdrs[i].sh_type == SHT_REL)
-                       ret = arch_kexec_apply_relocations(pi->ehdr,
-                                                          sechdrs, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
-                        unsigned long max, int top_down,
-                        unsigned long *load_addr)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       int ret;
-
-       if (kexec_purgatory_size <= 0)
-               return -EINVAL;
-
-       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
-               return -ENOEXEC;
-
-       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
-       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
-           || pi->ehdr->e_type != ET_REL
-           || !elf_check_arch(pi->ehdr)
-           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
-               return -ENOEXEC;
-
-       if (pi->ehdr->e_shoff >= kexec_purgatory_size
-           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
-           kexec_purgatory_size - pi->ehdr->e_shoff))
-               return -ENOEXEC;
-
-       ret = __kexec_load_purgatory(image, min, max, top_down);
-       if (ret)
-               return ret;
-
-       ret = kexec_apply_relocations(image);
-       if (ret)
-               goto out;
-
-       *load_addr = pi->purgatory_load_addr;
-       return 0;
-out:
-       vfree(pi->sechdrs);
-       vfree(pi->purgatory_buf);
-       return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
-                                           const char *name)
-{
-       Elf_Sym *syms;
-       Elf_Shdr *sechdrs;
-       Elf_Ehdr *ehdr;
-       int i, k;
-       const char *strtab;
-
-       if (!pi->sechdrs || !pi->ehdr)
-               return NULL;
-
-       sechdrs = pi->sechdrs;
-       ehdr = pi->ehdr;
-
-       for (i = 0; i < ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type != SHT_SYMTAB)
-                       continue;
-
-               if (sechdrs[i].sh_link >= ehdr->e_shnum)
-                       /* Invalid strtab section number */
-                       continue;
-               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
-               syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
-               /* Go through symbols for a match */
-               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
-                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
-                               continue;
-
-                       if (strcmp(strtab + syms[k].st_name, name) != 0)
-                               continue;
-
-                       if (syms[k].st_shndx == SHN_UNDEF ||
-                           syms[k].st_shndx >= ehdr->e_shnum) {
-                               pr_debug("Symbol: %s has bad section index %d.\n",
-                                               name, syms[k].st_shndx);
-                               return NULL;
-                       }
-
-                       /* Found the symbol we are looking for */
-                       return &syms[k];
-               }
-       }
-
-       return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Sym *sym;
-       Elf_Shdr *sechdr;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return ERR_PTR(-EINVAL);
-
-       sechdr = &pi->sechdrs[sym->st_shndx];
-
-       /*
-        * Returns the address where symbol will finally be loaded after
-        * kexec_load_segment()
-        */
-       return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
-                                  void *buf, unsigned int size, bool get_value)
-{
-       Elf_Sym *sym;
-       Elf_Shdr *sechdrs;
-       struct purgatory_info *pi = &image->purgatory_info;
-       char *sym_buf;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return -EINVAL;
-
-       if (sym->st_size != size) {
-               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
-                      name, (unsigned long)sym->st_size, size);
-               return -EINVAL;
-       }
+       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+       for (i = 0; i < nr_segments; i++) {
+               result = copy_from_user(&in, &segments[i], sizeof(in));
+               if (result)
+                       return -EFAULT;
 
-       sechdrs = pi->sechdrs;
+               out.buf   = compat_ptr(in.buf);
+               out.bufsz = in.bufsz;
+               out.mem   = in.mem;
+               out.memsz = in.memsz;
 
-       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
-                      get_value ? "get" : "set");
-               return -EINVAL;
+               result = copy_to_user(&ksegments[i], &out, sizeof(out));
+               if (result)
+                       return -EFAULT;
        }
 
-       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
-                                       sym->st_value;
-
-       if (get_value)
-               memcpy((void *)buf, sym_buf, size);
-       else
-               memcpy((void *)sym_buf, buf, size);
-
-       return 0;
+       return sys_kexec_load(entry, nr_segments, ksegments, flags);
 }
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable.  If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
-       int error = 0;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-       if (!kexec_image) {
-               error = -EINVAL;
-               goto Unlock;
-       }
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               lock_system_sleep();
-               pm_prepare_console();
-               error = freeze_processes();
-               if (error) {
-                       error = -EBUSY;
-                       goto Restore_console;
-               }
-               suspend_console();
-               error = dpm_suspend_start(PMSG_FREEZE);
-               if (error)
-                       goto Resume_console;
-               /* At this point, dpm_suspend_start() has been called,
-                * but *not* dpm_suspend_end(). We *must* call
-                * dpm_suspend_end() now.  Otherwise, drivers for
-                * some devices (e.g. interrupt controllers) become
-                * desynchronized with the actual state of the
-                * hardware at resume time, and evil weirdness ensues.
-                */
-               error = dpm_suspend_end(PMSG_FREEZE);
-               if (error)
-                       goto Resume_devices;
-               error = disable_nonboot_cpus();
-               if (error)
-                       goto Enable_cpus;
-               local_irq_disable();
-               error = syscore_suspend();
-               if (error)
-                       goto Enable_irqs;
-       } else
-#endif
-       {
-               kexec_in_progress = true;
-               kernel_restart_prepare(NULL);
-               migrate_to_reboot_cpu();
-
-               /*
-                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-                * no further code needs to use CPU hotplug (which is true in
-                * the reboot case). However, the kexec path depends on using
-                * CPU hotplug again; so re-enable it here.
-                */
-               cpu_hotplug_enable();
-               pr_emerg("Starting new kernel\n");
-               machine_shutdown();
-       }
-
-       machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               syscore_resume();
- Enable_irqs:
-               local_irq_enable();
- Enable_cpus:
-               enable_nonboot_cpus();
-               dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
-               dpm_resume_end(PMSG_RESTORE);
- Resume_console:
-               resume_console();
-               thaw_processes();
- Restore_console:
-               pm_restore_console();
-               unlock_system_sleep();
-       }
 #endif
-
- Unlock:
-       mutex_unlock(&kexec_mutex);
-       return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644 (file)
index 0000000..201b453
--- /dev/null
@@ -0,0 +1,1534 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt)    "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+       /*
+        * If crash_kexec_post_notifiers is enabled, don't run
+        * crash_kexec() here yet, which must be run after panic
+        * notifiers in panic().
+        */
+       if (crash_kexec_post_notifiers)
+               return 0;
+       /*
+        * There are 4 panic() calls in do_exit() path, each of which
+        * corresponds to each of these 4 conditions.
+        */
+       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+               return 1;
+       return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                      gfp_t gfp_mask,
+                                      unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+       int result, i;
+       unsigned long nr_segments = image->nr_segments;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        *
+        * Since the kernel does everything in page size chunks ensure
+        * the destination addresses are page aligned.  Too many
+        * special cases crop of when we don't do this.  The most
+        * insidious is getting overlapping destination addresses
+        * simply because addresses are changed to page size
+        * granularity.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+                       return result;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       return result;
+       }
+
+       /* Verify our destination addresses do not overlap.
+        * If we alloed overlapping destination addresses
+        * through very weird things can happen with no
+        * easy explanation as one segment stops on another.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+               unsigned long j;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               for (j = 0; j < i; j++) {
+                       unsigned long pstart, pend;
+
+                       pstart = image->segment[j].mem;
+                       pend   = pstart + image->segment[j].memsz;
+                       /* Do the segments overlap ? */
+                       if ((mend > pstart) && (mstart < pend))
+                               return result;
+               }
+       }
+
+       /* Ensure our buffer sizes are strictly less than
+        * our memory sizes.  This should always be the case,
+        * and it is easier to check up front than to be surprised
+        * later on.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               if (image->segment[i].bufsz > image->segment[i].memsz)
+                       return result;
+       }
+
+       /*
+        * Verify we have good destination addresses.  Normally
+        * the caller is responsible for making certain we don't
+        * attempt to load the new image into invalid or reserved
+        * areas of RAM.  But crash kernels are preloaded into a
+        * reserved area of ram.  We must ensure the addresses
+        * are in the reserved area otherwise preloading the
+        * kernel could corrupt things.
+        */
+
+       if (image->type == KEXEC_TYPE_CRASH) {
+               result = -EADDRNOTAVAIL;
+               for (i = 0; i < nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend = mstart + image->segment[i].memsz - 1;
+                       /* Ensure we are within the crash kernel limits */
+                       if ((mstart < crashk_res.start) ||
+                           (mend > crashk_res.end))
+                               return result;
+               }
+       }
+
+       return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+       struct kimage *image;
+
+       /* Allocate a controlling structure */
+       image = kzalloc(sizeof(*image), GFP_KERNEL);
+       if (!image)
+               return NULL;
+
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+       image->control_page = ~0; /* By default this does not apply */
+       image->type = KEXEC_TYPE_DEFAULT;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unusable pages */
+       INIT_LIST_HEAD(&image->unusable_pages);
+
+       return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+       struct page *pages;
+
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+
+               pages->mapping = NULL;
+               set_page_private(pages, order);
+               count = 1 << order;
+               for (i = 0; i < count; i++)
+                       SetPageReserved(pages + i);
+       }
+
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+
+       order = page_private(page);
+       count = 1 << order;
+       for (i = 0; i < count; i++)
+               ClearPageReserved(page + i);
+       __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+               kimage_free_pages(page);
+       }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+                                                       unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these are for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+
+               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                             kimage_is_destination_range(image, addr, eaddr)) {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while (!pages);
+
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in it's destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+
+       return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+                                                     unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * Control pages are also the only pags we must allocate
+        * when loading a crash kernel.  All of the other pages
+        * are specified by the segments and we just memcpy
+        * into them directly.
+        *
+        * The only case where we really need more than one of
+        * these are for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * Given the low demand this implements a very simple
+        * allocator that finds the first hole of the appropriate
+        * size in the reserved memory region, and allocates all
+        * of the memory up to and including the hole.
+        */
+       unsigned long hole_start, hole_end, size;
+       struct page *pages;
+
+       pages = NULL;
+       size = (1 << order) << PAGE_SHIFT;
+       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+       hole_end   = hole_start + size - 1;
+       while (hole_end <= crashk_res.end) {
+               unsigned long i;
+
+               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+                       break;
+               /* See if I overlap any of the segments */
+               for (i = 0; i < image->nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend   = mstart + image->segment[i].memsz - 1;
+                       if ((hole_end >= mstart) && (hole_start <= mend)) {
+                               /* Advance the hole to the end of the segment */
+                               hole_start = (mend + (size - 1)) & ~(size - 1);
+                               hole_end   = hole_start + size - 1;
+                               break;
+                       }
+               }
+               /* If I don't overlap any segments I have found my hole! */
+               if (i == image->nr_segments) {
+                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       image->control_page = hole_end;
+                       break;
+               }
+       }
+
+       return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       struct page *pages = NULL;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               pages = kimage_alloc_normal_control_pages(image, order);
+               break;
+       case KEXEC_TYPE_CRASH:
+               pages = kimage_alloc_crash_control_pages(image, order);
+               break;
+       }
+
+       return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page)
+                       return -ENOMEM;
+
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry = ind_page +
+                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+
+       return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+                                  unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unusable pages I have cached */
+       kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION) ? \
+                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION)
+                               kimage_free_entry(ind);
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               } else if (entry & IND_SOURCE)
+                       kimage_free_entry(entry);
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION)
+               kimage_free_entry(ind);
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+
+       /*
+        * Free up any temporary buffers allocated. This might hit if
+        * error occurred much later after buffer allocation.
+        */
+       if (image->file_mode)
+               kimage_file_post_load_cleanup(image);
+
+       kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+                                       unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION)
+                       destination = entry & PAGE_MASK;
+               else if (entry & IND_SOURCE) {
+                       if (page == destination)
+                               return ptr;
+                       destination += PAGE_SIZE;
+               }
+       }
+
+       return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                       gfp_t gfp_mask,
+                                       unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but the proof
+        * that no problems will not occur is trivial, and the
+        * implementation is simply to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page, if we run out of memory give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page)
+                       return NULL;
+               /* If the page cannot be used file it away */
+               if (page_to_pfn(page) >
+                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unusable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page use it */
+               if (!kimage_is_destination_range(image, addr,
+                                                 addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someones destination page.
+                * See if there is already a source page for this
+                * destination page.  And if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it if it's
+                        * gfp_flags honor the ones passed in.
+                        */
+                       if (!(gfp_mask & __GFP_HIGHMEM) &&
+                           PageHighMem(old_page)) {
+                               kimage_free_pages(old_page);
+                               continue;
+                       }
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               /* Place the page on the destination list, to be used later */
+               list_add(&page->lru, &image->dest_pages);
+       }
+
+       return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+                                        struct kexec_segment *segment)
+{
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+
+       result = kimage_set_destination(image, maddr);
+       if (result < 0)
+               goto out;
+
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page)
+                                                               << PAGE_SHIFT);
+               if (result < 0)
+                       goto out;
+
+               ptr = kmap(page);
+               /* Start with a clear page */
+               clear_page(ptr);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+                                       struct kexec_segment *segment)
+{
+       /* For crash dumps kernels we simply copy the data from
+        * user space to it's destination.
+        * We do things a page at a time for the sake of kmap.
+        */
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               ptr = kmap(page);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+               if (mchunk > uchunk) {
+                       /* Zero the trailing part of the page */
+                       memset(ptr + uchunk, 0, mchunk - uchunk);
+               }
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kexec_flush_icache_page(page);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       int result = -ENOMEM;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               result = kimage_load_normal_segment(image, segment);
+               break;
+       case KEXEC_TYPE_CRASH:
+               result = kimage_load_crash_segment(image, segment);
+               break;
+       }
+
+       return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+       /* Take the kexec_mutex here to prevent sys_kexec_load
+        * running on one cpu from replacing the crash kernel
+        * we are using after a panic on a different cpu.
+        *
+        * If the crash kernel was not located in a fixed area
+        * of memory the xchg(&kexec_crash_image) would be
+        * sufficient.  But since I reuse the memory...
+        */
+       if (mutex_trylock(&kexec_mutex)) {
+               if (kexec_crash_image) {
+                       struct pt_regs fixed_regs;
+
+                       crash_setup_regs(&fixed_regs, regs);
+                       crash_save_vmcoreinfo();
+                       machine_crash_shutdown(&fixed_regs);
+                       machine_kexec(kexec_crash_image);
+               }
+               mutex_unlock(&kexec_mutex);
+       }
+}
+
+size_t crash_get_memory_size(void)
+{
+       size_t size = 0;
+
+       mutex_lock(&kexec_mutex);
+       if (crashk_res.end != crashk_res.start)
+               size = resource_size(&crashk_res);
+       mutex_unlock(&kexec_mutex);
+       return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+                                          unsigned long end)
+{
+       unsigned long addr;
+
+       for (addr = begin; addr < end; addr += PAGE_SIZE)
+               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+       int ret = 0;
+       unsigned long start, end;
+       unsigned long old_size;
+       struct resource *ram_res;
+
+       mutex_lock(&kexec_mutex);
+
+       if (kexec_crash_image) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+       start = crashk_res.start;
+       end = crashk_res.end;
+       old_size = (end == 0) ? 0 : end - start + 1;
+       if (new_size >= old_size) {
+               ret = (new_size == old_size) ? 0 : -EINVAL;
+               goto unlock;
+       }
+
+       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+       if (!ram_res) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+       crash_map_reserved_pages();
+       crash_free_reserved_phys_range(end, crashk_res.end);
+
+       if ((start == end) && (crashk_res.parent != NULL))
+               release_resource(&crashk_res);
+
+       ram_res->start = end;
+       ram_res->end = crashk_res.end;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->name = "System RAM";
+
+       crashk_res.end = end - 1;
+
+       insert_resource(&iomem_resource, ram_res);
+       crash_unmap_reserved_pages();
+
+unlock:
+       mutex_unlock(&kexec_mutex);
+       return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                           size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) + 3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= nr_cpu_ids))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+                             &prstatus, sizeof(prstatus));
+       final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+       /* Allocate memory for saving cpu registers. */
+       size_t size, align;
+
+       /*
+        * crash_notes could be allocated across 2 vmalloc pages when percpu
+        * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+        * pages are also on 2 continuous physical pages. In this case the
+        * 2nd part of crash_notes in 2nd page could be lost since only the
+        * starting address and size of crash_notes are exported through sysfs.
+        * Here round up the size of crash_notes to the nearest power of two
+        * and pass it to __alloc_percpu as align value. This can make sure
+        * crash_notes is allocated inside one physical page.
+        */
+       size = sizeof(note_buf_t);
+       align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+       /*
+        * Break compile if size is bigger than PAGE_SIZE since crash_notes
+        * definitely will be in 2 pages with that.
+        */
+       BUILD_BUG_ON(size > PAGE_SIZE);
+
+       crash_notes = __alloc_percpu(size, align);
+       if (!crash_notes) {
+               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               return -ENOMEM;
+       }
+       return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+                                       unsigned long long system_ram,
+                                       unsigned long long *crash_size,
+                                       unsigned long long *crash_base)
+{
+       char *cur = cmdline, *tmp;
+
+       /* for each entry of the comma-separated list */
+       do {
+               unsigned long long start, end = ULLONG_MAX, size;
+
+               /* get the start of the range */
+               start = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("crashkernel: Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (*cur != '-') {
+                       pr_warn("crashkernel: '-' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               /* if no ':' is here, than we read the end */
+               if (*cur != ':') {
+                       end = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("crashkernel: Memory value expected\n");
+                               return -EINVAL;
+                       }
+                       cur = tmp;
+                       if (end <= start) {
+                               pr_warn("crashkernel: end <= start\n");
+                               return -EINVAL;
+                       }
+               }
+
+               if (*cur != ':') {
+                       pr_warn("crashkernel: ':' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               size = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (size >= system_ram) {
+                       pr_warn("crashkernel: invalid size\n");
+                       return -EINVAL;
+               }
+
+               /* match ? */
+               if (system_ram >= start && system_ram < end) {
+                       *crash_size = size;
+                       break;
+               }
+       } while (*cur++ == ',');
+
+       if (*crash_size > 0) {
+               while (*cur && *cur != ' ' && *cur != '@')
+                       cur++;
+               if (*cur == '@') {
+                       cur++;
+                       *crash_base = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("Memory value expected after '@'\n");
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ *     crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+                                          unsigned long long *crash_size,
+                                          unsigned long long *crash_base)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       if (*cur == '@')
+               *crash_base = memparse(cur+1, &cur);
+       else if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+       [SUFFIX_HIGH] = ",high",
+       [SUFFIX_LOW]  = ",low",
+       [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix"  crashkernel command lines like
+ *
+ *     crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+                                          unsigned long long   *crash_size,
+                                          const char *suffix)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       /* check with suffix */
+       if (strncmp(cur, suffix, strlen(suffix))) {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+       cur += strlen(suffix);
+       if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+                            const char *name,
+                            const char *suffix)
+{
+       char *p = cmdline, *ck_cmdline = NULL;
+
+       /* find crashkernel and use the last one if there are more */
+       p = strstr(p, name);
+       while (p) {
+               char *end_p = strchr(p, ' ');
+               char *q;
+
+               if (!end_p)
+                       end_p = p + strlen(p);
+
+               if (!suffix) {
+                       int i;
+
+                       /* skip the one with any known suffix */
+                       for (i = 0; suffix_tbl[i]; i++) {
+                               q = end_p - strlen(suffix_tbl[i]);
+                               if (!strncmp(q, suffix_tbl[i],
+                                            strlen(suffix_tbl[i])))
+                                       goto next;
+                       }
+                       ck_cmdline = p;
+               } else {
+                       q = end_p - strlen(suffix);
+                       if (!strncmp(q, suffix, strlen(suffix)))
+                               ck_cmdline = p;
+               }
+next:
+               p = strstr(p+1, name);
+       }
+
+       if (!ck_cmdline)
+               return NULL;
+
+       return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base,
+                            const char *name,
+                            const char *suffix)
+{
+       char    *first_colon, *first_space;
+       char    *ck_cmdline;
+
+       BUG_ON(!crash_size || !crash_base);
+       *crash_size = 0;
+       *crash_base = 0;
+
+       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+       if (!ck_cmdline)
+               return -EINVAL;
+
+       ck_cmdline += strlen(name);
+
+       if (suffix)
+               return parse_crashkernel_suffix(ck_cmdline, crash_size,
+                               suffix);
+       /*
+        * if the commandline contains a ':', then that's the extended
+        * syntax -- if not, it must be the classic syntax
+        */
+       first_colon = strchr(ck_cmdline, ':');
+       first_space = strchr(ck_cmdline, ' ');
+       if (first_colon && (!first_space || first_colon < first_space))
+               return parse_crashkernel_mem(ck_cmdline, system_ram,
+                               crash_size, crash_base);
+
+       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                                       "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+       u32 *buf = vmcoreinfo_note;
+
+       if (!vmcoreinfo_size)
+               return;
+       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+                             vmcoreinfo_size);
+       final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+       update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+       va_list args;
+       char buf[0x50];
+       size_t r;
+
+       va_start(args, fmt);
+       r = vscnprintf(buf, sizeof(buf), fmt, args);
+       va_end(args);
+
+       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+       vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+       return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+       VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+       VMCOREINFO_SYMBOL(init_uts_ns);
+       VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+       VMCOREINFO_SYMBOL(_stext);
+       VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+       VMCOREINFO_SYMBOL(mem_map);
+       VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+       VMCOREINFO_SYMBOL(mem_section);
+       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+       VMCOREINFO_STRUCT_SIZE(mem_section);
+       VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+       VMCOREINFO_STRUCT_SIZE(page);
+       VMCOREINFO_STRUCT_SIZE(pglist_data);
+       VMCOREINFO_STRUCT_SIZE(zone);
+       VMCOREINFO_STRUCT_SIZE(free_area);
+       VMCOREINFO_STRUCT_SIZE(list_head);
+       VMCOREINFO_SIZE(nodemask_t);
+       VMCOREINFO_OFFSET(page, flags);
+       VMCOREINFO_OFFSET(page, _count);
+       VMCOREINFO_OFFSET(page, mapping);
+       VMCOREINFO_OFFSET(page, lru);
+       VMCOREINFO_OFFSET(page, _mapcount);
+       VMCOREINFO_OFFSET(page, private);
+       VMCOREINFO_OFFSET(pglist_data, node_zones);
+       VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+       VMCOREINFO_OFFSET(pglist_data, node_id);
+       VMCOREINFO_OFFSET(zone, free_area);
+       VMCOREINFO_OFFSET(zone, vm_stat);
+       VMCOREINFO_OFFSET(zone, spanned_pages);
+       VMCOREINFO_OFFSET(free_area, free_list);
+       VMCOREINFO_OFFSET(list_head, next);
+       VMCOREINFO_OFFSET(list_head, prev);
+       VMCOREINFO_OFFSET(vmap_area, va_start);
+       VMCOREINFO_OFFSET(vmap_area, list);
+       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+       log_buf_kexec_setup();
+       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+       VMCOREINFO_NUMBER(NR_FREE_PAGES);
+       VMCOREINFO_NUMBER(PG_lru);
+       VMCOREINFO_NUMBER(PG_private);
+       VMCOREINFO_NUMBER(PG_swapcache);
+       VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+       VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+       VMCOREINFO_NUMBER(PG_head_mask);
+       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+       VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+       VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+       arch_crash_save_vmcoreinfo();
+       update_vmcoreinfo_note();
+
+       return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+       int error = 0;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+       if (!kexec_image) {
+               error = -EINVAL;
+               goto Unlock;
+       }
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               lock_system_sleep();
+               pm_prepare_console();
+               error = freeze_processes();
+               if (error) {
+                       error = -EBUSY;
+                       goto Restore_console;
+               }
+               suspend_console();
+               error = dpm_suspend_start(PMSG_FREEZE);
+               if (error)
+                       goto Resume_console;
+               /* At this point, dpm_suspend_start() has been called,
+                * but *not* dpm_suspend_end(). We *must* call
+                * dpm_suspend_end() now.  Otherwise, drivers for
+                * some devices (e.g. interrupt controllers) become
+                * desynchronized with the actual state of the
+                * hardware at resume time, and evil weirdness ensues.
+                */
+               error = dpm_suspend_end(PMSG_FREEZE);
+               if (error)
+                       goto Resume_devices;
+               error = disable_nonboot_cpus();
+               if (error)
+                       goto Enable_cpus;
+               local_irq_disable();
+               error = syscore_suspend();
+               if (error)
+                       goto Enable_irqs;
+       } else
+#endif
+       {
+               kexec_in_progress = true;
+               kernel_restart_prepare(NULL);
+               migrate_to_reboot_cpu();
+
+               /*
+                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+                * no further code needs to use CPU hotplug (which is true in
+                * the reboot case). However, the kexec path depends on using
+                * CPU hotplug again; so re-enable it here.
+                */
+               cpu_hotplug_enable();
+               pr_emerg("Starting new kernel\n");
+               machine_shutdown();
+       }
+
+       machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               syscore_resume();
+ Enable_irqs:
+               local_irq_enable();
+ Enable_cpus:
+               enable_nonboot_cpus();
+               dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+               dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+               resume_console();
+               thaw_processes();
+ Restore_console:
+               pm_restore_console();
+               unlock_system_sleep();
+       }
+#endif
+
+ Unlock:
+       mutex_unlock(&kexec_mutex);
+       return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644 (file)
index 0000000..6a9a3f2
--- /dev/null
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ *      Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+       struct fd f = fdget(fd);
+       int ret;
+       struct kstat stat;
+       loff_t pos;
+       ssize_t bytes = 0;
+
+       if (!f.file)
+               return -EBADF;
+
+       ret = vfs_getattr(&f.file->f_path, &stat);
+       if (ret)
+               goto out;
+
+       if (stat.size > INT_MAX) {
+               ret = -EFBIG;
+               goto out;
+       }
+
+       /* Don't hand 0 to vmalloc, it whines. */
+       if (stat.size == 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       *buf = vmalloc(stat.size);
+       if (!*buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       pos = 0;
+       while (pos < stat.size) {
+               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+                                   stat.size - pos);
+               if (bytes < 0) {
+                       vfree(*buf);
+                       ret = bytes;
+                       goto out;
+               }
+
+               if (bytes == 0)
+                       break;
+               pos += bytes;
+       }
+
+       if (pos != stat.size) {
+               ret = -EBADF;
+               vfree(*buf);
+               goto out;
+       }
+
+       *buf_len = pos;
+out:
+       fdput(f);
+       return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len)
+{
+       return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+       return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+       return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len)
+{
+       return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                unsigned int relsec)
+{
+       pr_err("RELA relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                            unsigned int relsec)
+{
+       pr_err("REL relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and command line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       vfree(image->kernel_buf);
+       image->kernel_buf = NULL;
+
+       vfree(image->initrd_buf);
+       image->initrd_buf = NULL;
+
+       kfree(image->cmdline_buf);
+       image->cmdline_buf = NULL;
+
+       vfree(pi->purgatory_buf);
+       pi->purgatory_buf = NULL;
+
+       vfree(pi->sechdrs);
+       pi->sechdrs = NULL;
+
+       /* See if architecture has anything to cleanup post load */
+       arch_kimage_file_post_load_cleanup(image);
+
+       /*
+        * Above call should have called into bootloader to free up
+        * any data stored in kimage->image_loader_data. It should
+        * be ok now to free it up.
+        */
+       kfree(image->image_loader_data);
+       image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+                            const char __user *cmdline_ptr,
+                            unsigned long cmdline_len, unsigned flags)
+{
+       int ret = 0;
+       void *ldata;
+
+       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+                               &image->kernel_buf_len);
+       if (ret)
+               return ret;
+
+       /* Call arch image probe handlers */
+       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+                                           image->kernel_buf_len);
+
+       if (ret)
+               goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+                                          image->kernel_buf_len);
+       if (ret) {
+               pr_debug("kernel signature verification failed.\n");
+               goto out;
+       }
+       pr_debug("kernel signature verification successful.\n");
+#endif
+       /* It is possible that there no initramfs is being loaded */
+       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+                                       &image->initrd_buf_len);
+               if (ret)
+                       goto out;
+       }
+
+       if (cmdline_len) {
+               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+               if (!image->cmdline_buf) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+                                    cmdline_len);
+               if (ret) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               image->cmdline_buf_len = cmdline_len;
+
+               /* command line should be a string with last byte null */
+               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /* Call arch image load handlers */
+       ldata = arch_kexec_kernel_image_load(image);
+
+       if (IS_ERR(ldata)) {
+               ret = PTR_ERR(ldata);
+               goto out;
+       }
+
+       image->image_loader_data = ldata;
+out:
+       /* In case of error, free up all allocated memory in this function */
+       if (ret)
+               kimage_file_post_load_cleanup(image);
+       return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+                      int initrd_fd, const char __user *cmdline_ptr,
+                      unsigned long cmdline_len, unsigned long flags)
+{
+       int ret;
+       struct kimage *image;
+       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+       image = do_kimage_alloc_init();
+       if (!image)
+               return -ENOMEM;
+
+       image->file_mode = 1;
+
+       if (kexec_on_panic) {
+               /* Enable special crash kernel control page alloc policy. */
+               image->control_page = crashk_res.start;
+               image->type = KEXEC_TYPE_CRASH;
+       }
+
+       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+                                          cmdline_ptr, cmdline_len, flags);
+       if (ret)
+               goto out_free_image;
+
+       ret = sanity_check_segment_list(image);
+       if (ret)
+               goto out_free_post_load_bufs;
+
+       ret = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
+       if (!image->control_code_page) {
+               pr_err("Could not allocate control_code_buffer\n");
+               goto out_free_post_load_bufs;
+       }
+
+       if (!kexec_on_panic) {
+               image->swap_page = kimage_alloc_control_pages(image, 0);
+               if (!image->swap_page) {
+                       pr_err("Could not allocate swap buffer\n");
+                       goto out_free_control_pages;
+               }
+       }
+
+       *rimage = image;
+       return 0;
+out_free_control_pages:
+       kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+       kimage_file_post_load_cleanup(image);
+out_free_image:
+       kfree(image);
+       return ret;
+}
+
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+               unsigned long, flags)
+{
+       int ret = 0, i;
+       struct kimage **dest_image, *image;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
+
+       /* Make sure we have a legal set of flags */
+       if (flags != (flags & KEXEC_FILE_FLAGS))
+               return -EINVAL;
+
+       image = NULL;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+
+       dest_image = &kexec_image;
+       if (flags & KEXEC_FILE_ON_CRASH)
+               dest_image = &kexec_crash_image;
+
+       if (flags & KEXEC_FILE_UNLOAD)
+               goto exchange;
+
+       /*
+        * In case of crash, new kernel gets loaded in reserved region. It is
+        * same memory where old crash kernel might be loaded. Free any
+        * current crash dump kernel before we corrupt it.
+        */
+       if (flags & KEXEC_FILE_ON_CRASH)
+               kimage_free(xchg(&kexec_crash_image, NULL));
+
+       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+                                    cmdline_len, flags);
+       if (ret)
+               goto out;
+
+       ret = machine_kexec_prepare(image);
+       if (ret)
+               goto out;
+
+       ret = kexec_calculate_store_digests(image);
+       if (ret)
+               goto out;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+                        ksegment->memsz);
+
+               ret = kimage_load_segment(image, &image->segment[i]);
+               if (ret)
+                       goto out;
+       }
+
+       kimage_terminate(image);
+
+       /*
+        * Free up any temporary buffers allocated which are not needed
+        * after image has been loaded
+        */
+       kimage_file_post_load_cleanup(image);
+exchange:
+       image = xchg(dest_image, image);
+out:
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+       return ret;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+                                   struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_end = min(end, kbuf->buf_max);
+       temp_start = temp_end - kbuf->memsz;
+
+       do {
+               /* align down start */
+               temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+               if (temp_start < start || temp_start < kbuf->buf_min)
+                       return 0;
+
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               /*
+                * Make sure this does not conflict with any of existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start - PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+                                    struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_start = max(start, kbuf->buf_min);
+
+       do {
+               temp_start = ALIGN(temp_start, kbuf->buf_align);
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               if (temp_end > end || temp_end > kbuf->buf_max)
+                       return 0;
+               /*
+                * Make sure this does not conflict with any of existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start + PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+       unsigned long sz = end - start + 1;
+
+       /* Returning 0 will take to next memory range */
+       if (sz < kbuf->memsz)
+               return 0;
+
+       if (end < kbuf->buf_min || start > kbuf->buf_max)
+               return 0;
+
+       /*
+        * Allocate memory top down with-in ram range. Otherwise bottom up
+        * allocation.
+        */
+       if (kbuf->top_down)
+               return locate_mem_hole_top_down(start, end, kbuf);
+       return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+                    unsigned long memsz, unsigned long buf_align,
+                    unsigned long buf_min, unsigned long buf_max,
+                    bool top_down, unsigned long *load_addr)
+{
+
+       struct kexec_segment *ksegment;
+       struct kexec_buf buf, *kbuf;
+       int ret;
+
+       /* Currently adding segment this way is allowed only in file mode */
+       if (!image->file_mode)
+               return -EINVAL;
+
+       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       /*
+        * Make sure we are not trying to add buffer after allocating
+        * control pages. All segments need to be placed first before
+        * any control pages are allocated. As control page allocation
+        * logic goes through list of segments to make sure there are
+        * no destination overlaps.
+        */
+       if (!list_empty(&image->control_pages)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       memset(&buf, 0, sizeof(struct kexec_buf));
+       kbuf = &buf;
+       kbuf->image = image;
+       kbuf->buffer = buffer;
+       kbuf->bufsz = bufsz;
+
+       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+       kbuf->buf_align = max(buf_align, PAGE_SIZE);
+       kbuf->buf_min = buf_min;
+       kbuf->buf_max = buf_max;
+       kbuf->top_down = top_down;
+
+       /* Walk the RAM ranges and allocate a suitable range for the buffer */
+       if (image->type == KEXEC_TYPE_CRASH)
+               ret = walk_iomem_res("Crash kernel",
+                                    IORESOURCE_MEM | IORESOURCE_BUSY,
+                                    crashk_res.start, crashk_res.end, kbuf,
+                                    locate_mem_hole_callback);
+       else
+               ret = walk_system_ram_res(0, -1, kbuf,
+                                         locate_mem_hole_callback);
+       if (ret != 1) {
+               /* A suitable memory range could not be found for buffer */
+               return -EADDRNOTAVAIL;
+       }
+
+       /* Found a suitable memory range */
+       ksegment = &image->segment[image->nr_segments];
+       ksegment->kbuf = kbuf->buffer;
+       ksegment->bufsz = kbuf->bufsz;
+       ksegment->mem = kbuf->mem;
+       ksegment->memsz = kbuf->memsz;
+       image->nr_segments++;
+       *load_addr = ksegment->mem;
+       return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+       struct crypto_shash *tfm;
+       struct shash_desc *desc;
+       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+       size_t desc_size, nullsz;
+       char *digest;
+       void *zero_buf;
+       struct kexec_sha_region *sha_regions;
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+       zero_buf_sz = PAGE_SIZE;
+
+       tfm = crypto_alloc_shash("sha256", 0, 0);
+       if (IS_ERR(tfm)) {
+               ret = PTR_ERR(tfm);
+               goto out;
+       }
+
+       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+       desc = kzalloc(desc_size, GFP_KERNEL);
+       if (!desc) {
+               ret = -ENOMEM;
+               goto out_free_tfm;
+       }
+
+       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+       sha_regions = vzalloc(sha_region_sz);
+       if (!sha_regions)
+               goto out_free_desc;
+
+       desc->tfm   = tfm;
+       desc->flags = 0;
+
+       ret = crypto_shash_init(desc);
+       if (ret < 0)
+               goto out_free_sha_regions;
+
+       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+       if (!digest) {
+               ret = -ENOMEM;
+               goto out_free_sha_regions;
+       }
+
+       for (j = i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               /*
+                * Skip purgatory as it will be modified once we put digest
+                * info in purgatory.
+                */
+               if (ksegment->kbuf == pi->purgatory_buf)
+                       continue;
+
+               ret = crypto_shash_update(desc, ksegment->kbuf,
+                                         ksegment->bufsz);
+               if (ret)
+                       break;
+
+               /*
+                * Assume rest of the buffer is filled with zero and
+                * update digest accordingly.
+                */
+               nullsz = ksegment->memsz - ksegment->bufsz;
+               while (nullsz) {
+                       unsigned long bytes = nullsz;
+
+                       if (bytes > zero_buf_sz)
+                               bytes = zero_buf_sz;
+                       ret = crypto_shash_update(desc, zero_buf, bytes);
+                       if (ret)
+                               break;
+                       nullsz -= bytes;
+               }
+
+               if (ret)
+                       break;
+
+               sha_regions[j].start = ksegment->mem;
+               sha_regions[j].len = ksegment->memsz;
+               j++;
+       }
+
+       if (!ret) {
+               ret = crypto_shash_final(desc, digest);
+               if (ret)
+                       goto out_free_digest;
+               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+                                               sha_regions, sha_region_sz, 0);
+               if (ret)
+                       goto out_free_digest;
+
+               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+                                               digest, SHA256_DIGEST_SIZE, 0);
+               if (ret)
+                       goto out_free_digest;
+       }
+
+out_free_digest:
+       kfree(digest);
+out_free_sha_regions:
+       vfree(sha_regions);
+out_free_desc:
+       kfree(desc);
+out_free_tfm:
+       kfree(tfm);
+out:
+       return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+                                 unsigned long max, int top_down)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+       unsigned char *buf_addr, *src;
+       int i, ret = 0, entry_sidx = -1;
+       const Elf_Shdr *sechdrs_c;
+       Elf_Shdr *sechdrs = NULL;
+       void *purgatory_buf = NULL;
+
+       /*
+        * sechdrs_c points to section headers in purgatory and are read
+        * only. No modifications allowed.
+        */
+       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+       /*
+        * We can not modify sechdrs_c[] and its fields. It is read only.
+        * Copy it over to a local copy where one can store some temporary
+        * data and free it at the end. We need to modify ->sh_addr and
+        * ->sh_offset fields to keep track of permanent and temporary
+        * locations of sections.
+        */
+       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+       if (!sechdrs)
+               return -ENOMEM;
+
+       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+       /*
+        * We seem to have multiple copies of sections. First copy is which
+        * is embedded in kernel in read only section. Some of these sections
+        * will be copied to a temporary buffer and relocated. And these
+        * sections will finally be copied to their final destination at
+        * segment load time.
+        *
+        * Use ->sh_offset to reflect section address in memory. It will
+        * point to original read only copy if section is not allocatable.
+        * Otherwise it will point to temporary copy which will be relocated.
+        *
+        * Use ->sh_addr to contain final address of the section where it
+        * will go during execution time.
+        */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type == SHT_NOBITS)
+                       continue;
+
+               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+                                               sechdrs[i].sh_offset;
+       }
+
+       /*
+        * Identify entry point section and make entry relative to section
+        * start.
+        */
+       entry = pi->ehdr->e_entry;
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               /* Make entry section relative */
+               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+                    pi->ehdr->e_entry)) {
+                       entry_sidx = i;
+                       entry -= sechdrs[i].sh_addr;
+                       break;
+               }
+       }
+
+       /* Determine how much memory is needed to load relocatable object. */
+       buf_align = 1;
+       bss_align = 1;
+       buf_sz = 0;
+       bss_sz = 0;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       if (buf_align < align)
+                               buf_align = align;
+                       buf_sz = ALIGN(buf_sz, align);
+                       buf_sz += sechdrs[i].sh_size;
+               } else {
+                       /* bss section */
+                       if (bss_align < align)
+                               bss_align = align;
+                       bss_sz = ALIGN(bss_sz, align);
+                       bss_sz += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Determine the bss padding required to align bss properly */
+       bss_pad = 0;
+       if (buf_sz & (bss_align - 1))
+               bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+       memsz = buf_sz + bss_pad + bss_sz;
+
+       /* Allocate buffer for purgatory */
+       purgatory_buf = vzalloc(buf_sz);
+       if (!purgatory_buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       if (buf_align < bss_align)
+               buf_align = bss_align;
+
+       /* Add buffer to segment list */
+       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+                               buf_align, min, max, top_down,
+                               &pi->purgatory_load_addr);
+       if (ret)
+               goto out;
+
+       /* Load SHF_ALLOC sections */
+       buf_addr = purgatory_buf;
+       load_addr = curr_load_addr = pi->purgatory_load_addr;
+       bss_addr = load_addr + buf_sz + bss_pad;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       curr_load_addr = ALIGN(curr_load_addr, align);
+                       offset = curr_load_addr - load_addr;
+                       /* We already modifed ->sh_offset to keep src addr */
+                       src = (char *) sechdrs[i].sh_offset;
+                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+                       /* Store load address and source address of section */
+                       sechdrs[i].sh_addr = curr_load_addr;
+
+                       /*
+                        * This section got copied to temporary buffer. Update
+                        * ->sh_offset accordingly.
+                        */
+                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+                       /* Advance to the next address */
+                       curr_load_addr += sechdrs[i].sh_size;
+               } else {
+                       bss_addr = ALIGN(bss_addr, align);
+                       sechdrs[i].sh_addr = bss_addr;
+                       bss_addr += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Update entry point based on load address of text section */
+       if (entry_sidx >= 0)
+               entry += sechdrs[entry_sidx].sh_addr;
+
+       /* Make kernel jump to purgatory after shutdown */
+       image->start = entry;
+
+       /* Used later to get/set symbol values */
+       pi->sechdrs = sechdrs;
+
+       /*
+        * Used later to identify which section is purgatory and skip it
+        * from checksumming.
+        */
+       pi->purgatory_buf = purgatory_buf;
+       return ret;
+out:
+       vfree(sechdrs);
+       vfree(purgatory_buf);
+       return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+       int i, ret;
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Shdr *sechdrs = pi->sechdrs;
+
+       /* Apply relocations */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               Elf_Shdr *section, *symtab;
+
+               if (sechdrs[i].sh_type != SHT_RELA &&
+                   sechdrs[i].sh_type != SHT_REL)
+                       continue;
+
+               /*
+                * For section of type SHT_RELA/SHT_REL,
+                * ->sh_link contains section header index of associated
+                * symbol table. And ->sh_info contains section header
+                * index of section to which relocations apply.
+                */
+               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+                       return -ENOEXEC;
+
+               section = &sechdrs[sechdrs[i].sh_info];
+               symtab = &sechdrs[sechdrs[i].sh_link];
+
+               if (!(section->sh_flags & SHF_ALLOC))
+                       continue;
+
+               /*
+                * symtab->sh_link contain section header index of associated
+                * string table.
+                */
+               if (symtab->sh_link >= pi->ehdr->e_shnum)
+                       /* Invalid section number? */
+                       continue;
+
+               /*
+                * Respective architecture needs to provide support for applying
+                * relocations of type SHT_RELA/SHT_REL.
+                */
+               if (sechdrs[i].sh_type == SHT_RELA)
+                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
+                                                              sechdrs, i);
+               else if (sechdrs[i].sh_type == SHT_REL)
+                       ret = arch_kexec_apply_relocations(pi->ehdr,
+                                                          sechdrs, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+                        unsigned long max, int top_down,
+                        unsigned long *load_addr)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       int ret;
+
+       if (kexec_purgatory_size <= 0)
+               return -EINVAL;
+
+       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+               return -ENOEXEC;
+
+       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+           || pi->ehdr->e_type != ET_REL
+           || !elf_check_arch(pi->ehdr)
+           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+               return -ENOEXEC;
+
+       if (pi->ehdr->e_shoff >= kexec_purgatory_size
+           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+           kexec_purgatory_size - pi->ehdr->e_shoff))
+               return -ENOEXEC;
+
+       ret = __kexec_load_purgatory(image, min, max, top_down);
+       if (ret)
+               return ret;
+
+       ret = kexec_apply_relocations(image);
+       if (ret)
+               goto out;
+
+       *load_addr = pi->purgatory_load_addr;
+       return 0;
+out:
+       vfree(pi->sechdrs);
+       vfree(pi->purgatory_buf);
+       return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+                                           const char *name)
+{
+       Elf_Sym *syms;
+       Elf_Shdr *sechdrs;
+       Elf_Ehdr *ehdr;
+       int i, k;
+       const char *strtab;
+
+       if (!pi->sechdrs || !pi->ehdr)
+               return NULL;
+
+       sechdrs = pi->sechdrs;
+       ehdr = pi->ehdr;
+
+       for (i = 0; i < ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type != SHT_SYMTAB)
+                       continue;
+
+               if (sechdrs[i].sh_link >= ehdr->e_shnum)
+                       /* Invalid strtab section number */
+                       continue;
+               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+               syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+               /* Go through symbols for a match */
+               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+                               continue;
+
+                       if (strcmp(strtab + syms[k].st_name, name) != 0)
+                               continue;
+
+                       if (syms[k].st_shndx == SHN_UNDEF ||
+                           syms[k].st_shndx >= ehdr->e_shnum) {
+                               pr_debug("Symbol: %s has bad section index %d.\n",
+                                               name, syms[k].st_shndx);
+                               return NULL;
+                       }
+
+                       /* Found the symbol we are looking for */
+                       return &syms[k];
+               }
+       }
+
+       return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Sym *sym;
+       Elf_Shdr *sechdr;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return ERR_PTR(-EINVAL);
+
+       sechdr = &pi->sechdrs[sym->st_shndx];
+
+       /*
+        * Returns the address where symbol will finally be loaded after
+        * kexec_load_segment()
+        */
+       return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+                                  void *buf, unsigned int size, bool get_value)
+{
+       Elf_Sym *sym;
+       Elf_Shdr *sechdrs;
+       struct purgatory_info *pi = &image->purgatory_info;
+       char *sym_buf;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return -EINVAL;
+
+       if (sym->st_size != size) {
+               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+                      name, (unsigned long)sym->st_size, size);
+               return -EINVAL;
+       }
+
+       sechdrs = pi->sechdrs;
+
+       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+                      get_value ? "get" : "set");
+               return -EINVAL;
+       }
+
+       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+                                       sym->st_value;
+
+       if (get_value)
+               memcpy((void *)buf, sym_buf, size);
+       else
+               memcpy((void *)sym_buf, buf, size);
+
+       return 0;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644 (file)
index 0000000..e4392a6
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+                               unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
index 2777f40..da98d05 100644 (file)
@@ -45,8 +45,6 @@
 
 extern int max_threads;
 
-static struct workqueue_struct *khelper_wq;
-
 #define CAP_BSET       (void *)1
 #define CAP_PI         (void *)2
 
@@ -114,10 +112,11 @@ out:
  * @...: arguments as specified in the format string
  *
  * Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available not blindly
+ * invoke it.
  *
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
 /*
  * This is the task which runs the usermode application
  */
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
 {
        struct subprocess_info *sub_info = data;
        struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
        flush_signal_handlers(current, 1);
        spin_unlock_irq(&current->sighand->siglock);
 
-       /* We can run anywhere, unlike our parent keventd(). */
-       set_cpus_allowed_ptr(current, cpu_all_mask);
-
        /*
-        * Our parent is keventd, which runs with elevated scheduling priority.
-        * Avoid propagating that into the userspace child.
+        * Our parent (unbound workqueue) runs with elevated scheduling
+        * priority. Avoid propagating that into the userspace child.
         */
        set_user_nice(current, 0);
 
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
                           (const char __user *const __user *)sub_info->envp);
 out:
        sub_info->retval = retval;
-       /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+       /*
+        * call_usermodehelper_exec_sync() will call umh_complete
+        * if UHM_WAIT_PROC.
+        */
        if (!(sub_info->wait & UMH_WAIT_PROC))
                umh_complete(sub_info);
        if (!retval)
@@ -266,15 +265,14 @@ out:
        do_exit(0);
 }
 
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC.  */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 {
-       struct subprocess_info *sub_info = data;
        pid_t pid;
 
        /* If SIGCLD is ignored sys_wait4 won't populate the status. */
        kernel_sigaction(SIGCHLD, SIG_DFL);
-       pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+       pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
        if (pid < 0) {
                sub_info->retval = pid;
        } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
                /*
                 * Normally it is bogus to call wait4() from in-kernel because
                 * wait4() wants to write the exit code to a userspace address.
-                * But wait_for_helper() always runs as keventd, and put_user()
-                * to a kernel address works OK for kernel threads, due to their
-                * having an mm_segment_t which spans the entire address space.
+                * But call_usermodehelper_exec_sync() always runs as kernel
+                * thread (workqueue) and put_user() to a kernel address works
+                * OK for kernel threads, due to their having an mm_segment_t
+                * which spans the entire address space.
                 *
                 * Thus the __user pointer cast is valid here.
                 */
                sys_wait4(pid, (int __user *)&ret, 0, NULL);
 
                /*
-                * If ret is 0, either ____call_usermodehelper failed and the
-                * real error code is already in sub_info->retval or
+                * If ret is 0, either call_usermodehelper_exec_async failed and
+                * the real error code is already in sub_info->retval or
                 * sub_info->retval is 0 anyway, so don't mess with it then.
                 */
                if (ret)
                        sub_info->retval = ret;
        }
 
+       /* Restore default kernel sig handler */
+       kernel_sigaction(SIGCHLD, SIG_IGN);
+
        umh_complete(sub_info);
-       do_exit(0);
 }
 
-/* This is run by khelper thread  */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit a widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow to block on
+ * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
 {
        struct subprocess_info *sub_info =
                container_of(work, struct subprocess_info, work);
-       pid_t pid;
 
-       if (sub_info->wait & UMH_WAIT_PROC)
-               pid = kernel_thread(wait_for_helper, sub_info,
-                                   CLONE_FS | CLONE_FILES | SIGCHLD);
-       else
-               pid = kernel_thread(____call_usermodehelper, sub_info,
-                                   SIGCHLD);
+       if (sub_info->wait & UMH_WAIT_PROC) {
+               call_usermodehelper_exec_sync(sub_info);
+       } else {
+               pid_t pid;
 
-       if (pid < 0) {
-               sub_info->retval = pid;
-               umh_complete(sub_info);
+               pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+                                   SIGCHLD);
+               if (pid < 0) {
+                       sub_info->retval = pid;
+                       umh_complete(sub_info);
+               }
        }
 }
 
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
        if (!sub_info)
                goto out;
 
-       INIT_WORK(&sub_info->work, __call_usermodehelper);
+       INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
        sub_info->path = path;
        sub_info->argv = argv;
        sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  *        from interrupt context.
  *
  * Runs a user-space application.  The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
                return -EINVAL;
        }
        helper_lock();
-       if (!khelper_wq || usermodehelper_disabled) {
+       if (usermodehelper_disabled) {
                retval = -EBUSY;
                goto out;
        }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
        sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
        sub_info->wait = wait;
 
-       queue_work(khelper_wq, &sub_info->work);
+       queue_work(system_unbound_wq, &sub_info->work);
        if (wait == UMH_NO_WAIT)        /* task has freed sub_info */
                goto unlock;
 
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
        },
        { }
 };
-
-void __init usermodehelper_init(void)
-{
-       khelper_wq = create_singlethread_workqueue("khelper");
-       BUG_ON(!khelper_wq);
-}
index 6683cce..e83b264 100644 (file)
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
 KERNEL_ATTR_RW(profiling);
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static ssize_t kexec_loaded_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_PROFILING
        &profiling_attr.attr,
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        &kexec_loaded_attr.attr,
        &kexec_crash_loaded_attr.attr,
        &kexec_crash_size_attr.attr,
index 337c881..87e9ce6 100644 (file)
@@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
        if (pv_enabled())
                goto queue;
 
-       if (virt_queued_spin_lock(lock))
+       if (virt_spin_lock(lock))
                return;
 
        /*
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644 (file)
index 0000000..536c727
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, or if the command argument is invalid,
+ * this system call returns -EINVAL. For a given command, with flags argument
+ * set to 0, this system call is guaranteed to always return the same value
+ * until reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+       if (unlikely(flags))
+               return -EINVAL;
+       switch (cmd) {
+       case MEMBARRIER_CMD_QUERY:
+               return MEMBARRIER_CMD_BITMASK;
+       case MEMBARRIER_CMD_SHARED:
+               if (num_online_cpus() > 1)
+                       synchronize_sched();
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
index cf8c242..8f0324e 100644 (file)
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
        .release = devkmsg_release,
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This appends the listed symbols to /proc/vmcore
  *
index d20c85d..bd30a97 100644 (file)
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                kernel_restart(buffer);
                break;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        case LINUX_REBOOT_CMD_KEXEC:
                ret = kernel_kexec();
                break;
index 3595403..97d276f 100644 (file)
@@ -621,18 +621,21 @@ int get_nohz_timer_target(void)
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
 
-       if (!idle_cpu(cpu))
+       if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
-                       if (!idle_cpu(i)) {
+                       if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
+
+       if (!is_housekeeping_cpu(cpu))
+               cpu = housekeeping_any_cpu();
 unlock:
        rcu_read_unlock();
        return cpu;
@@ -5178,24 +5181,47 @@ static void migrate_tasks(struct rq *dead_rq)
                        break;
 
                /*
-                * Ensure rq->lock covers the entire task selection
-                * until the migration.
+                * pick_next_task assumes pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
+               /*
+                * Rules for changing task_struct::cpus_allowed are holding
+                * both pi_lock and rq->lock, such that holding either
+                * stabilizes the mask.
+                *
+                * Drop rq->lock is not quite as disastrous as it usually is
+                * because !cpu_active at this point, which means load-balance
+                * will not interfere. Also, stop-machine.
+                */
+               lockdep_unpin_lock(&rq->lock);
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&next->pi_lock);
+               raw_spin_lock(&rq->lock);
+
+               /*
+                * Since we're inside stop-machine, _nothing_ should have
+                * changed the task, WARN if weird stuff happened, because in
+                * that case the above rq->lock drop is a fail too.
+                */
+               if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+                       raw_spin_unlock(&next->pi_lock);
+                       continue;
+               }
+
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-               lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
+               raw_spin_unlock(&next->pi_lock);
        }
 
        rq->stop = stop;
index 03c3875..a02decf 100644 (file)
@@ -245,3 +245,6 @@ cond_syscall(sys_bpf);
 
 /* execveat */
 cond_syscall(sys_execveat);
+
+/* membarrier */
+cond_syscall(sys_membarrier);
index 19b62b5..e69201d 100644 (file)
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        {
                .procname       = "kexec_load_disabled",
                .data           = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
index 50eb107..a9b76a4 100644 (file)
@@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 static int __clockevents_switch_state(struct clock_event_device *dev,
                                      enum clock_event_state state)
 {
-       /* Transition with legacy set_mode() callback */
-       if (dev->set_mode) {
-               /* Legacy callback doesn't support new modes */
-               if (state > CLOCK_EVT_STATE_ONESHOT)
-                       return -ENOSYS;
-               /*
-                * 'clock_event_state' and 'clock_event_mode' have 1-to-1
-                * mapping until *_ONESHOT, and so a simple cast will work.
-                */
-               dev->set_mode((enum clock_event_mode)state, dev);
-               dev->mode = (enum clock_event_mode)state;
-               return 0;
-       }
-
        if (dev->features & CLOCK_EVT_FEAT_DUMMY)
                return 0;
 
@@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
 {
        int ret = 0;
 
-       if (dev->set_mode) {
-               dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
-               dev->mode = CLOCK_EVT_MODE_RESUME;
-       } else if (dev->tick_resume) {
+       if (dev->tick_resume)
                ret = dev->tick_resume(dev);
-       }
 
        return ret;
 }
@@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
 }
 EXPORT_SYMBOL_GPL(clockevents_unbind_device);
 
-/* Sanity check of state transition callbacks */
-static int clockevents_sanity_check(struct clock_event_device *dev)
-{
-       /* Legacy set_mode() callback */
-       if (dev->set_mode) {
-               /* We shouldn't be supporting new modes now */
-               WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-                       dev->set_state_shutdown || dev->tick_resume ||
-                       dev->set_state_oneshot_stopped);
-
-               BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
-               return 0;
-       }
-
-       if (dev->features & CLOCK_EVT_FEAT_DUMMY)
-               return 0;
-
-       return 0;
-}
-
 /**
  * clockevents_register_device - register a clock event device
  * @dev:       device to register
@@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)
 {
        unsigned long flags;
 
-       BUG_ON(clockevents_sanity_check(dev));
-
        /* Initialize state to DETACHED */
        clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
index d11c55b..4fcd99e 100644 (file)
@@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu)
                 * the set mode function!
                 */
                clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
-               dev->mode = CLOCK_EVT_MODE_UNUSED;
                clockevents_exchange_device(dev, NULL);
                dev->event_handler = clockevents_handle_noop;
                td->evtdev = NULL;
index 3319e16..7c7ec45 100644 (file)
@@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
 __setup("nohz_full=", tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
-                                                unsigned long action,
-                                                void *hcpu)
+                                      unsigned long action,
+                                      void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
 
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                /*
-                * If we handle the timekeeping duty for full dynticks CPUs,
-                * we can't safely shutdown that CPU.
+                * The boot CPU handles housekeeping duty (unbound timers,
+                * workqueues, timekeeping, ...) on behalf of full dynticks
+                * CPUs. It must remain online when nohz full is enabled.
                 */
                if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
                        return NOTIFY_BAD;
@@ -370,6 +371,12 @@ void __init tick_nohz_init(void)
        cpu_notifier(tick_nohz_cpu_down_callback, 0);
        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
                cpumask_pr_args(tick_nohz_full_mask));
+
+       /*
+        * We need at least one CPU to handle housekeeping work such
+        * as timekeeping, unbound timers, workqueues, ...
+        */
+       WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 #endif
 
index f6ee2e6..3739ac6 100644 (file)
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
        negative = (tick_error < 0);
 
        /* Sort out the magnitude of the correction */
-       tick_error = abs(tick_error);
+       tick_error = abs64(tick_error);
        for (adj = 0; tick_error > interval; adj++)
                tick_error >>= 1;
 
index 129c960..f75e35b 100644 (file)
@@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
                   (unsigned long long) dev->min_delta_ns);
        SEQ_printf(m, " mult:           %u\n", dev->mult);
        SEQ_printf(m, " shift:          %u\n", dev->shift);
-       SEQ_printf(m, " mode:           %d\n", dev->mode);
+       SEQ_printf(m, " mode:           %d\n", clockevent_get_state(dev));
        SEQ_printf(m, " next_event:     %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));
 
@@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
        print_name_offset(m, dev->set_next_event);
        SEQ_printf(m, "\n");
 
-       if (dev->set_mode) {
-               SEQ_printf(m, " set_mode:       ");
-               print_name_offset(m, dev->set_mode);
+       if (dev->set_state_shutdown) {
+               SEQ_printf(m, " shutdown: ");
+               print_name_offset(m, dev->set_state_shutdown);
                SEQ_printf(m, "\n");
-       } else {
-               if (dev->set_state_shutdown) {
-                       SEQ_printf(m, " shutdown: ");
-                       print_name_offset(m, dev->set_state_shutdown);
-                       SEQ_printf(m, "\n");
-               }
+       }
 
-               if (dev->set_state_periodic) {
-                       SEQ_printf(m, " periodic: ");
-                       print_name_offset(m, dev->set_state_periodic);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_periodic) {
+               SEQ_printf(m, " periodic: ");
+               print_name_offset(m, dev->set_state_periodic);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->set_state_oneshot) {
-                       SEQ_printf(m, " oneshot:  ");
-                       print_name_offset(m, dev->set_state_oneshot);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_oneshot) {
+               SEQ_printf(m, " oneshot:  ");
+               print_name_offset(m, dev->set_state_oneshot);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->set_state_oneshot_stopped) {
-                       SEQ_printf(m, " oneshot stopped: ");
-                       print_name_offset(m, dev->set_state_oneshot_stopped);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_oneshot_stopped) {
+               SEQ_printf(m, " oneshot stopped: ");
+               print_name_offset(m, dev->set_state_oneshot_stopped);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->tick_resume) {
-                       SEQ_printf(m, " resume:   ");
-                       print_name_offset(m, dev->tick_resume);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->tick_resume) {
+               SEQ_printf(m, " resume:   ");
+               print_name_offset(m, dev->tick_resume);
+               SEQ_printf(m, "\n");
        }
 
        SEQ_printf(m, " event_handler:  ");
index a578a01..8148143 100644 (file)
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
 
        nchunks = nbits = totaldigits = c = 0;
        do {
-               chunk = ndigits = 0;
+               chunk = 0;
+               ndigits = totaldigits;
 
                /* Get the next chunk of the bitmap */
                while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
                                return -EOVERFLOW;
 
                        chunk = (chunk << 4) | hex_to_bin(c);
-                       ndigits++; totaldigits++;
+                       totaldigits++;
                }
-               if (ndigits == 0)
+               if (ndigits == totaldigits)
                        return -EINVAL;
                if (nchunks == 0 && chunk == 0)
                        continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                int nmaskbits)
 {
        unsigned a, b;
-       int c, old_c, totaldigits;
+       int c, old_c, totaldigits, ndigits;
        const char __user __force *ubuf = (const char __user __force *)buf;
        int at_start, in_range;
 
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                at_start = 1;
                in_range = 0;
                a = b = 0;
+               ndigits = totaldigits;
 
                /* Get the next cpu# or a range of cpu#'s */
                while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        if (isspace(c))
                                continue;
 
-                       /*
-                        * If the last character was a space and the current
-                        * character isn't '\0', we've got embedded whitespace.
-                        * This is a no-no, so throw an error.
-                        */
-                       if (totaldigits && c && isspace(old_c))
-                               return -EINVAL;
-
                        /* A '\0' or a ',' signal the end of a cpu# or range */
                        if (c == '\0' || c == ',')
                                break;
+                       /*
+                       * whitespaces between digits are not allowed,
+                       * but it's ok if whitespaces are on head or tail.
+                       * when old_c is whilespace,
+                       * if totaldigits == ndigits, whitespace is on head.
+                       * if whitespace is on tail, it should not run here.
+                       * as c was ',' or '\0',
+                       * the last code line has broken the current loop.
+                       */
+                       if ((totaldigits != ndigits) && isspace(old_c))
+                               return -EINVAL;
 
                        if (c == '-') {
                                if (at_start || in_range)
                                        return -EINVAL;
                                b = 0;
                                in_range = 1;
+                               at_start = 1;
                                continue;
                        }
 
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        at_start = 0;
                        totaldigits++;
                }
+               if (ndigits == totaldigits)
+                       continue;
+               /* if no digit is after '-', it's wrong*/
+               if (at_start && in_range)
+                       return -EINVAL;
                if (!(a <= b))
                        return -EINVAL;
                if (b >= nmaskbits)
                        return -ERANGE;
-               if (!at_start) {
-                       while (a <= b) {
-                               set_bit(a, maskp);
-                               a++;
-                       }
+               while (a <= b) {
+                       set_bit(a, maskp);
+                       a++;
                }
        } while (buflen && c == ',');
        return 0;
index 6dd0335..0234361 100644 (file)
@@ -743,12 +743,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long len,
+STATIC int INIT __decompress(unsigned char *buf, long len,
                        long (*fill)(void*, unsigned long),
                        long (*flush)(void*, unsigned long),
-                       unsigned char *outbuf,
+                       unsigned char *outbuf, long olen,
                        long *pos,
-                       void(*error)(char *x))
+                       void (*error)(char *x))
 {
        return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
 }
index d4c7891..555c06b 100644 (file)
@@ -1,4 +1,5 @@
 #ifdef STATIC
+#define PREBOOT
 /* Pre-boot environment: included */
 
 /* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
@@ -33,23 +34,23 @@ static long INIT nofill(void *buffer, unsigned long len)
 }
 
 /* Included from initramfs et al code */
-STATIC int INIT gunzip(unsigned char *buf, long len,
+STATIC int INIT __gunzip(unsigned char *buf, long len,
                       long (*fill)(void*, unsigned long),
                       long (*flush)(void*, unsigned long),
-                      unsigned char *out_buf,
+                      unsigned char *out_buf, long out_len,
                       long *pos,
                       void(*error)(char *x)) {
        u8 *zbuf;
        struct z_stream_s *strm;
        int rc;
-       size_t out_len;
 
        rc = -1;
        if (flush) {
                out_len = 0x8000; /* 32 K */
                out_buf = malloc(out_len);
        } else {
-               out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
+               if (!out_len)
+                       out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
        }
        if (!out_buf) {
                error("Out of memory while allocating output buffer");
@@ -181,4 +182,24 @@ gunzip_nomem1:
        return rc; /* returns Z_OK (0) if successful */
 }
 
-#define decompress gunzip
+#ifndef PREBOOT
+STATIC int INIT gunzip(unsigned char *buf, long len,
+                      long (*fill)(void*, unsigned long),
+                      long (*flush)(void*, unsigned long),
+                      unsigned char *out_buf,
+                      long *pos,
+                      void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
+}
+#else
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long out_len,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, out_len, pos, error);
+}
+#endif
index 40f66eb..036fc88 100644 (file)
@@ -196,12 +196,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
+                             void (*error)(char *x)
        )
 {
        return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
index 0be83af..ed7a1fd 100644 (file)
@@ -620,7 +620,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, long in_len,
 
        num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
        p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
-       if (p == 0)
+       if (p == NULL)
                goto exit_2;
        num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
        for (i = 0; i < num_probs; i++)
@@ -667,13 +667,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
-       )
+                             void (*error)(char *x))
 {
        return unlzma(buf, in_len - 4, fill, flush, output, posp, error);
 }
index b94a31b..f4c158e 100644 (file)
@@ -31,6 +31,7 @@
  */
 
 #ifdef STATIC
+#define PREBOOT
 #include "lzo/lzo1x_decompress_safe.c"
 #else
 #include <linux/decompress/unlzo.h>
@@ -287,4 +288,14 @@ exit:
        return ret;
 }
 
-#define decompress unlzo
+#ifdef PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unlzo(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index b07a783..25d59a9 100644 (file)
@@ -394,4 +394,14 @@ error_alloc_state:
  * This macro is used by architecture-specific files to decompress
  * the kernel image.
  */
-#define decompress unxz
+#ifdef XZ_PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unxz(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index ec8da78..94be244 100644 (file)
@@ -152,7 +152,7 @@ int kstrtoll(const char *s, unsigned int base, long long *res)
                rv = _kstrtoull(s + 1, base, &tmp);
                if (rv < 0)
                        return rv;
-               if ((long long)(-tmp) >= 0)
+               if ((long long)-tmp > 0)
                        return -ERANGE;
                *res = -tmp;
        } else {
index c98ae81..54036ce 100644 (file)
@@ -410,7 +410,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * @dst:       destination buffer (escaped)
  * @osz:       destination buffer size
  * @flags:     combination of the flags (bitwise OR):
- *     %ESCAPE_SPACE:
+ *     %ESCAPE_SPACE: (special white space, not space itself)
  *             '\f' - form feed
  *             '\n' - new line
  *             '\r' - carriage return
@@ -432,16 +432,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *             all previous together
  *     %ESCAPE_HEX:
  *             '\xHH' - byte with hexadecimal value HH (2 digits)
- * @esc:       NULL-terminated string of characters any of which, if found in
- *             the source, has to be escaped
+ * @only:      NULL-terminated string containing characters used to limit
+ *             the selected escape class. If characters are included in @only
+ *             that would not normally be escaped by the classes selected
+ *             in @flags, they will be copied to @dst unescaped.
  *
  * Description:
  * The process of escaping byte buffer includes several parts. They are applied
  * in the following sequence.
  *     1. The character is matched to the printable class, if asked, and in
  *        case of match it passes through to the output.
- *     2. The character is not matched to the one from @esc string and thus
- *        must go as is to the output.
+ *     2. The character is not matched to the one from @only string and thus
+ *        must go as-is to the output.
  *     3. The character is checked if it falls into the class given by @flags.
  *        %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
  *        character. Note that they actually can't go together, otherwise
@@ -458,11 +460,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * dst for a '\0' terminator if and only if ret < osz.
  */
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-                     unsigned int flags, const char *esc)
+                     unsigned int flags, const char *only)
 {
        char *p = dst;
        char *end = p + osz;
-       bool is_dict = esc && *esc;
+       bool is_dict = only && *only;
 
        while (isz--) {
                unsigned char c = *src++;
@@ -471,7 +473,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * Apply rules in the following sequence:
                 *      - the character is printable, when @flags has
                 *        %ESCAPE_NP bit set
-                *      - the @esc string is supplied and does not contain a
+                *      - the @only string is supplied and does not contain a
                 *        character under question
                 *      - the character doesn't fall into a class of symbols
                 *        defined by given @flags
@@ -479,7 +481,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * output buffer.
                 */
                if ((flags & ESCAPE_NP && isprint(c)) ||
-                   (is_dict && !strchr(esc, c))) {
+                   (is_dict && !strchr(only, c))) {
                        /* do nothing */
                } else {
                        if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
index 4137bca..f355f67 100644 (file)
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
                {"4294967297",  10,     4294967297LL},
                {"9223372036854775807", 10,     9223372036854775807LL},
 
+               {"-0",  10,     0LL},
                {"-1",  10,     -1LL},
                {"-2",  10,     -2LL},
                {"-9223372036854775808",        10,     LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
                {"-9223372036854775809",        10},
                {"-18446744073709551614",       10},
                {"-18446744073709551615",       10},
-               /* negative zero isn't an integer in Linux */
-               {"-0",  0},
-               {"-0",  8},
-               {"-0",  10},
-               {"-0",  16},
                /* sign is first character if any */
                {"-+1", 0},
                {"-+1", 8},
index 098c08e..c1efb1b 100644 (file)
@@ -65,7 +65,7 @@ static noinline void __init kmalloc_node_oob_right(void)
        kfree(ptr);
 }
 
-static noinline void __init kmalloc_large_oob_rigth(void)
+static noinline void __init kmalloc_large_oob_right(void)
 {
        char *ptr;
        size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
@@ -114,7 +114,7 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
                kfree(ptr1);
                return;
        }
-       ptr2[size1] = 'x';
+       ptr2[size2] = 'x';
        kfree(ptr2);
 }
 
@@ -259,7 +259,7 @@ static int __init kmalloc_tests_init(void)
        kmalloc_oob_right();
        kmalloc_oob_left();
        kmalloc_node_oob_right();
-       kmalloc_large_oob_rigth();
+       kmalloc_large_oob_right();
        kmalloc_oob_krealloc_more();
        kmalloc_oob_krealloc_less();
        kmalloc_oob_16();
index ddf3482..9b1756b 100644 (file)
@@ -35,6 +35,7 @@
 /* #include "deflate.h" */
 
 #include <linux/zutil.h>
+#include <linux/bitrev.h>
 #include "defutil.h"
 
 #ifdef DEBUG_ZLIB
@@ -146,7 +147,6 @@ static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
 static void compress_block (deflate_state *s, ct_data *ltree,
                            ct_data *dtree);
 static void set_data_type  (deflate_state *s);
-static unsigned bi_reverse (unsigned value, int length);
 static void bi_windup      (deflate_state *s);
 static void bi_flush       (deflate_state *s);
 static void copy_block     (deflate_state *s, char *buf, unsigned len,
@@ -284,7 +284,7 @@ static void tr_static_init(void)
     /* The static distance tree is trivial: */
     for (n = 0; n < D_CODES; n++) {
         static_dtree[n].Len = 5;
-        static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+        static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
     }
     static_init_done = 1;
 }
@@ -520,7 +520,7 @@ static void gen_codes(
         int len = tree[n].Len;
         if (len == 0) continue;
         /* Now reverse the bits */
-        tree[n].Code = bi_reverse(next_code[len]++, len);
+        tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
 
         Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
              n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
index b640b64..a8c3708 100644 (file)
@@ -292,22 +292,6 @@ void zlib_tr_stored_type_only (deflate_state *);
     put_byte(s, (uch)((ush)(w) >> 8)); \
 }
 
-/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-static inline unsigned bi_reverse(unsigned code, /* the value to invert */
-                                 int len)       /* its bit length */
-{
-    register unsigned res = 0;
-    do {
-        res |= code & 1;
-        code >>= 1, res <<= 1;
-    } while (--len > 0);
-    return res >> 1;
-}
-
 /* ===========================================================================
  * Flush the bit buffer, keeping at most 7 bits in it.
  */
index 3a4070f..0d9fdcd 100644 (file)
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT
          processes running early in the lifetime of the systemm until kswapd
          finishes the initialisation.
 
+config IDLE_PAGE_TRACKING
+       bool "Enable idle page tracking"
+       depends on SYSFS && MMU
+       select PAGE_EXTENSION if !64BIT
+       help
+         This feature allows to estimate the amount of user pages that have
+         not been touched during a given period of time. This information can
+         be useful to tune memory cgroup limits and/or for job placement
+         within a compute cluster.
+
+         See Documentation/vm/idle_page_tracking.txt for more details.
+
 config ZONE_DEVICE
        bool "Device memory (pmem, etc...) hotplug support" if EXPERT
        default !ZONE_DMA
@@ -665,3 +677,6 @@ config ZONE_DEVICE
          mapping in an O_DIRECT operation, among other things.
 
          If FS_DAX is enabled, then say Y.
+
+config FRAME_VECTOR
+       bool
index b424d5e..2ed4319 100644 (file)
@@ -79,3 +79,5 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
index ee8d7fd..2df8ddc 100644 (file)
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
        int ret = 0;
 
        memcg = mem_cgroup_from_css(memcg_css);
-       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        blkcg = css_to_blkcg(blkcg_css);
        memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
        blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 
                        /* see whether the blkcg association has changed */
                        blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-                                                    &blkio_cgrp_subsys);
+                                                    &io_cgrp_subsys);
                        if (unlikely(wb->blkcg_css != blkcg_css ||
                                     !wb_tryget(wb)))
                                wb = NULL;
index 76089dd..6c1b3ea 100644 (file)
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        {1UL << PG_compound_lock,       "compound_lock" },
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       {1UL << PG_young,               "young"         },
+       {1UL << PG_idle,                "idle"          },
+#endif
 };
 
 static void dump_flags(unsigned long flags,
index 23f744d..17ae14b 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <asm/fixmap.h>
+#include <asm/early_ioremap.h>
 
 #ifdef CONFIG_MMU
 static int early_ioremap_debug __initdata;
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
new file mode 100644 (file)
index 0000000..cdabcb9
--- /dev/null
@@ -0,0 +1,230 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+/*
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start:     starting user address
+ * @nr_frames: number of pages / pfns from start to map
+ * @write:     whether pages will be written to by the caller
+ * @force:     whether to force write access even if user mapping is
+ *             readonly. See description of the same argument of
+               get_user_pages().
+ * @vec:       structure which receives pages / pfns of the addresses mapped.
+ *             It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ * When the function isn't able to map a single page, it returns error.
+ *
+ * This function takes care of grabbing mmap_sem as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+                    bool write, bool force, struct frame_vector *vec)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       int ret = 0;
+       int err;
+       int locked;
+
+       if (nr_frames == 0)
+               return 0;
+
+       if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+               nr_frames = vec->nr_allocated;
+
+       down_read(&mm->mmap_sem);
+       locked = 1;
+       vma = find_vma_intersection(mm, start, start + 1);
+       if (!vma) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+               vec->got_ref = true;
+               vec->is_pfns = false;
+               ret = get_user_pages_locked(current, mm, start, nr_frames,
+                       write, force, (struct page **)(vec->ptrs), &locked);
+               goto out;
+       }
+
+       vec->got_ref = false;
+       vec->is_pfns = true;
+       do {
+               unsigned long *nums = frame_vector_pfns(vec);
+
+               while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
+                       err = follow_pfn(vma, start, &nums[ret]);
+                       if (err) {
+                               if (ret == 0)
+                                       ret = err;
+                               goto out;
+                       }
+                       start += PAGE_SIZE;
+                       ret++;
+               }
+               /*
+                * We stop if we have enough pages or if VMA doesn't completely
+                * cover the tail page.
+                */
+               if (ret >= nr_frames || start < vma->vm_end)
+                       break;
+               vma = find_vma_intersection(mm, start, start + 1);
+       } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+out:
+       if (locked)
+               up_read(&mm->mmap_sem);
+       if (!ret)
+               ret = -EFAULT;
+       if (ret > 0)
+               vec->nr_frames = ret;
+       return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ *                     them
+ * @vec:       frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+       int i;
+       struct page **pages;
+
+       if (!vec->got_ref)
+               goto out;
+       pages = frame_vector_pages(vec);
+       /*
+        * frame_vector_pages() might needed to do a conversion when
+        * get_vaddr_frames() got pages but vec was later converted to pfns.
+        * But it shouldn't really fail to convert pfns back...
+        */
+       if (WARN_ON(IS_ERR(pages)))
+               goto out;
+       for (i = 0; i < vec->nr_frames; i++)
+               put_page(pages[i]);
+       vec->got_ref = false;
+out:
+       vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec:       frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers.  If the conversion is
+ * successful, return 0. Otherwise return an error. Note that we do not grab
+ * page references for the page structures.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+       int i;
+       unsigned long *nums;
+       struct page **pages;
+
+       if (!vec->is_pfns)
+               return 0;
+       nums = frame_vector_pfns(vec);
+       for (i = 0; i < vec->nr_frames; i++)
+               if (!pfn_valid(nums[i]))
+                       return -EINVAL;
+       pages = (struct page **)nums;
+       for (i = 0; i < vec->nr_frames; i++)
+               pages[i] = pfn_to_page(nums[i]);
+       vec->is_pfns = false;
+       return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec:       frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+       int i;
+       unsigned long *nums;
+       struct page **pages;
+
+       if (vec->is_pfns)
+               return;
+       pages = (struct page **)(vec->ptrs);
+       nums = (unsigned long *)pages;
+       for (i = 0; i < vec->nr_frames; i++)
+               nums[i] = page_to_pfn(pages[i]);
+       vec->is_pfns = true;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames: number of pfns slots we should reserve
+ *
+ * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+       struct frame_vector *vec;
+       int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+       if (WARN_ON_ONCE(nr_frames == 0))
+               return NULL;
+       /*
+        * This is absurdly high. It's here just to avoid strange effects when
+        * arithmetics overflows.
+        */
+       if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
+               return NULL;
+       /*
+        * Avoid higher order allocations, use vmalloc instead. It should
+        * be rare anyway.
+        */
+       if (size <= PAGE_SIZE)
+               vec = kmalloc(size, GFP_KERNEL);
+       else
+               vec = vmalloc(size);
+       if (!vec)
+               return NULL;
+       vec->nr_allocated = nr_frames;
+       vec->nr_frames = 0;
+       return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec:       Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+       /* Make sure put_vaddr_frames() got called properly... */
+       VM_BUG_ON(vec->nr_frames > 0);
+       kvfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
index b16279c..4b06b8d 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
                /* clear PageTail before overwriting first_page */
                smp_wmb();
 
+               if (page_is_young(page))
+                       set_page_young(page_tail);
+               if (page_is_idle(page))
+                       set_page_idle(page_tail);
+
                /*
                 * __split_huge_page_splitting() already set the
                 * splitting bit in all pmd that could map this
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
                /* If there is no mapped pte young don't collapse the page */
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page))
                        goto out_unmap;
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
index aeba0ed..9d26fd9 100644 (file)
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
        /*
         * do a racy check with elevated page count, to make sure PG_hwpoison
         * will only be set for the targeted owner (or on a free page).
-        * We temporarily take page lock for try_get_mem_cgroup_from_page().
         * memory_failure() will redo the check reliably inside page lock.
         */
-       lock_page(hpage);
        err = hwpoison_filter(hpage);
-       unlock_page(hpage);
        if (err)
                goto put_out;
 
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
        if (!dentry)
                goto fail;
 
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
        dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
                                    hwpoison_dir, &hwpoison_filter_memcg);
        if (!dentry)
index f532f6a..77191ec 100644 (file)
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
                            struct kmemleak_object *object)
 {
        const u8 *ptr = (const u8 *)object->pointer;
-       int i, len, remaining;
-       unsigned char linebuf[HEX_ROW_SIZE * 5];
+       size_t len;
 
        /* limit the number of lines to HEX_MAX_LINES */
-       remaining = len =
-               min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
-
-       seq_printf(seq, "  hex dump (first %d bytes):\n", len);
-       for (i = 0; i < len; i += HEX_ROW_SIZE) {
-               int linelen = min(remaining, HEX_ROW_SIZE);
-
-               remaining -= HEX_ROW_SIZE;
-               hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
-                                  HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
-                                  HEX_ASCII);
-               seq_printf(seq, "    %s\n", linebuf);
-       }
+       len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+       seq_printf(seq, "  hex dump (first %zu bytes):\n", len);
+       seq_hex_dump(seq, "    ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+                    HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
 }
 
 /*
index 1742a2d..6ddaeba 100644 (file)
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
        return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+       struct mem_cgroup *memcg;
+       unsigned long ino = 0;
+
+       rcu_read_lock();
+       memcg = READ_ONCE(page->mem_cgroup);
+       while (memcg && !(memcg->css.flags & CSS_ONLINE))
+               memcg = parent_mem_cgroup(memcg);
+       if (memcg)
+               ino = cgroup_ino(memcg->css.cgroup);
+       rcu_read_unlock();
+       return ino;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -2071,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       struct mem_cgroup *memcg;
-       unsigned short id;
-       swp_entry_t ent;
-
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-       memcg = page->mem_cgroup;
-       if (memcg) {
-               if (!css_tryget_online(&memcg->css))
-                       memcg = NULL;
-       } else if (PageSwapCache(page)) {
-               ent.val = page_private(page);
-               id = lookup_swap_cgroup_id(ent);
-               rcu_read_lock();
-               memcg = mem_cgroup_from_id(id);
-               if (memcg && !css_tryget_online(&memcg->css))
-                       memcg = NULL;
-               rcu_read_unlock();
-       }
-       return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
        struct zone *zone = page_zone(page);
@@ -5301,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * the page lock, which serializes swap cache removal, which
                 * in turn serializes uncharging.
                 */
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
                if (page->mem_cgroup)
                        goto out;
+
+               if (do_swap_account) {
+                       swp_entry_t ent = { .val = page_private(page), };
+                       unsigned short id = lookup_swap_cgroup_id(ent);
+
+                       rcu_read_lock();
+                       memcg = mem_cgroup_from_id(id);
+                       if (memcg && !css_tryget_online(&memcg->css))
+                               memcg = NULL;
+                       rcu_read_unlock();
+               }
        }
 
        if (PageTransHuge(page)) {
@@ -5310,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }
 
-       if (do_swap_account && PageSwapCache(page))
-               memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
index eeda648..9588269 100644 (file)
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
  * can only guarantee that the page either belongs to the memcg tasks, or is
  * a freed page.
  */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
 u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-       struct mem_cgroup *mem;
-       struct cgroup_subsys_state *css;
-       unsigned long ino;
-
        if (!hwpoison_filter_memcg)
                return 0;
 
-       mem = try_get_mem_cgroup_from_page(p);
-       if (!mem)
-               return -EINVAL;
-
-       css = &mem->css;
-       ino = cgroup_ino(css->cgroup);
-       css_put(css);
-
-       if (ino != hwpoison_filter_memcg)
+       if (page_cgroup_ino(p) != hwpoison_filter_memcg)
                return -EINVAL;
 
        return 0;
index 6cd0b21..9cb2747 100644 (file)
@@ -3233,7 +3233,7 @@ out:
 static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, unsigned int flags)
 {
-       if (!vma->vm_ops)
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
        if (vma->vm_ops->pmd_fault)
                return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
@@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
                        unsigned int flags)
 {
-       if (!vma->vm_ops)
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
        if (vma->vm_ops->pmd_fault)
                return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
index 02ce25d..c3cb566 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
                        __set_page_dirty_nobuffers(newpage);
        }
 
+       if (page_is_young(page))
+               set_page_young(newpage);
+       if (page_is_idle(page))
+               set_page_idle(newpage);
+
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
index b6be324..971dd2c 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
+       WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops");
+
        /* Update tracking information for the gap following the new vma. */
        if (vma->vm_next)
                vma_gap_update(vma->vm_next);
@@ -1260,14 +1262,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff,
-                       unsigned long *populate)
+                       unsigned long flags, vm_flags_t vm_flags,
+                       unsigned long pgoff, unsigned long *populate)
 {
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
 
        *populate = 0;
 
@@ -1311,7 +1311,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
-       vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+       vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
        if (flags & MAP_LOCKED)
@@ -1638,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                 */
                WARN_ON_ONCE(addr != vma->vm_start);
 
+               /* All file mapping must have ->vm_ops set */
+               if (!vma->vm_ops) {
+                       static const struct vm_operations_struct dummy_ops = {};
+                       vma->vm_ops = &dummy_ops;
+               }
+
                addr = vma->vm_start;
                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
index 3b9b3d0..5fbdd36 100644 (file)
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+                              unsigned long start,
+                              unsigned long end)
+{
+       struct mmu_notifier *mn;
+       int young = 0, id;
+
+       id = srcu_read_lock(&srcu);
+       hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->clear_young)
+                       young |= mn->ops->clear_young(mn, mm, start, end);
+       }
+       srcu_read_unlock(&srcu, id);
+
+       return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
 {
index 1cc0709..ab14a20 100644 (file)
@@ -1233,18 +1233,19 @@ enomem:
 /*
  * handle mapping creation for uClinux
  */
-unsigned long do_mmap_pgoff(struct file *file,
-                           unsigned long addr,
-                           unsigned long len,
-                           unsigned long prot,
-                           unsigned long flags,
-                           unsigned long pgoff,
-                           unsigned long *populate)
+unsigned long do_mmap(struct file *file,
+                       unsigned long addr,
+                       unsigned long len,
+                       unsigned long prot,
+                       unsigned long flags,
+                       vm_flags_t vm_flags,
+                       unsigned long pgoff,
+                       unsigned long *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *rb;
-       unsigned long capabilities, vm_flags, result;
+       unsigned long capabilities, result;
        int ret;
 
        *populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        /* we've determined that we can make the mapping, now translate what we
         * now know into VMA flags */
-       vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+       vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
 
        /* we're going to need to record the mapping */
        region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
index 5cccc12..0a931cd 100644 (file)
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
        wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
 }
 
 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                 * do a reset, as it may be a light dirtier.
                 */
                if (pause < min_pause) {
-                       trace_balance_dirty_pages(bdi,
+                       trace_balance_dirty_pages(wb,
                                                  sdtc->thresh,
                                                  sdtc->bg_thresh,
                                                  sdtc->dirty,
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                }
 
 pause:
-               trace_balance_dirty_pages(bdi,
+               trace_balance_dirty_pages(wb,
                                          sdtc->thresh,
                                          sdtc->bg_thresh,
                                          sdtc->dirty,
index d86fd2f..292ca7b 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
+#include <linux/page_idle.h>
 
 /*
  * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       &page_idle_ops,
+#endif
 };
 
 static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644 (file)
index 0000000..d5dd790
--- /dev/null
@@ -0,0 +1,232 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE      sizeof(u64)
+#define BITMAP_CHUNK_BITS      (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+       struct page *page;
+       struct zone *zone;
+
+       if (!pfn_valid(pfn))
+               return NULL;
+
+       page = pfn_to_page(pfn);
+       if (!page || !PageLRU(page) ||
+           !get_page_unless_zero(page))
+               return NULL;
+
+       zone = page_zone(page);
+       spin_lock_irq(&zone->lru_lock);
+       if (unlikely(!PageLRU(page))) {
+               put_page(page);
+               page = NULL;
+       }
+       spin_unlock_irq(&zone->lru_lock);
+       return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+                                       struct vm_area_struct *vma,
+                                       unsigned long addr, void *arg)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte;
+       bool referenced = false;
+
+       if (unlikely(PageTransHuge(page))) {
+               pmd = page_check_address_pmd(page, mm, addr,
+                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               if (pmd) {
+                       referenced = pmdp_clear_young_notify(vma, addr, pmd);
+                       spin_unlock(ptl);
+               }
+       } else {
+               pte = page_check_address(page, mm, addr, &ptl, 0);
+               if (pte) {
+                       referenced = ptep_clear_young_notify(vma, addr, pte);
+                       pte_unmap_unlock(pte, ptl);
+               }
+       }
+       if (referenced) {
+               clear_page_idle(page);
+               /*
+                * We cleared the referenced bit in a mapping to this page. To
+                * avoid interference with page reclaim, mark it young so that
+                * page_referenced() will return > 0.
+                */
+               set_page_young(page);
+       }
+       return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+       /*
+        * Since rwc.arg is unused, rwc is effectively immutable, so we
+        * can make it static const to save some cycles and stack.
+        */
+       static const struct rmap_walk_control rwc = {
+               .rmap_one = page_idle_clear_pte_refs_one,
+               .anon_lock = page_lock_anon_vma_read,
+       };
+       bool need_lock;
+
+       if (!page_mapped(page) ||
+           !page_rmapping(page))
+               return;
+
+       need_lock = !PageAnon(page) || PageKsm(page);
+       if (need_lock && !trylock_page(page))
+               return;
+
+       rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+       if (need_lock)
+               unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+                                    struct bin_attribute *attr, char *buf,
+                                    loff_t pos, size_t count)
+{
+       u64 *out = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return 0;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if (!bit)
+                       *out = 0ULL;
+               page = page_idle_get_page(pfn);
+               if (page) {
+                       if (page_is_idle(page)) {
+                               /*
+                                * The page might have been referenced via a
+                                * pte, in which case it is not idle. Clear
+                                * refs and recheck.
+                                */
+                               page_idle_clear_pte_refs(page);
+                               if (page_is_idle(page))
+                                       *out |= 1ULL << bit;
+                       }
+                       put_page(page);
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       out++;
+               cond_resched();
+       }
+       return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+                                     struct bin_attribute *attr, char *buf,
+                                     loff_t pos, size_t count)
+{
+       const u64 *in = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return -ENXIO;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if ((*in >> bit) & 1) {
+                       page = page_idle_get_page(pfn);
+                       if (page) {
+                               page_idle_clear_pte_refs(page);
+                               set_page_idle(page);
+                               put_page(page);
+                       }
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       in++;
+               cond_resched();
+       }
+       return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+               __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+                          page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+       &page_idle_bitmap_attr,
+       NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+       .bin_attrs = page_idle_bin_attrs,
+       .name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+       return true;
+}
+struct page_ext_operations page_idle_ops = {
+       .need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+       int err;
+
+       err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+       if (err) {
+               pr_err("page_idle: register sysfs failed\n");
+               return err;
+       }
+       return 0;
+}
+subsys_initcall(page_idle_init);
index 0db38e7..f5b5c1f 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
+       if (referenced)
+               clear_page_idle(page);
+       if (test_and_clear_page_young(page))
+               referenced++;
+
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
index a3a0a2f..983f692 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/gfp.h>
 #include <linux/uio.h>
 #include <linux/hugetlb.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
+       if (page_is_idle(page))
+               clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
index 68d2dd8..8f670d3 100644 (file)
@@ -99,6 +99,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
        module_put(driver->owner);
 }
 
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type       The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available.  This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling.  If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle the @zpool_create_pool() returning failure.  However if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+       struct zpool_driver *driver = zpool_get_driver(type);
+
+       if (!driver) {
+               request_module("zpool-%s", type);
+               driver = zpool_get_driver(type);
+       }
+
+       if (!driver)
+               return false;
+
+       zpool_put_driver(driver);
+       return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
 /**
  * zpool_create_pool() - Create a new zpool
  * @type       The type of the zpool to create (e.g. zbud, zsmalloc)
index 48a1d08..4043df7 100644 (file)
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
 static bool zswap_enabled;
 module_param_named(enabled, zswap_enabled, bool, 0644);
 
-/* Compressor to be used by zswap (fixed at boot for now) */
+/* Crypto compressor to use */
 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0444);
-
-/* The maximum percentage of memory that the compressed pool can occupy */
-static unsigned int zswap_max_pool_percent = 20;
-module_param_named(max_pool_percent,
-                       zswap_max_pool_percent, uint, 0644);
+static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
+static struct kparam_string zswap_compressor_kparam = {
+       .string =       zswap_compressor,
+       .maxlen =       sizeof(zswap_compressor),
+};
+static int zswap_compressor_param_set(const char *,
+                                     const struct kernel_param *);
+static struct kernel_param_ops zswap_compressor_param_ops = {
+       .set =          zswap_compressor_param_set,
+       .get =          param_get_string,
+};
+module_param_cb(compressor, &zswap_compressor_param_ops,
+               &zswap_compressor_kparam, 0644);
 
-/* Compressed storage to use */
+/* Compressed storage zpool to use */
 #define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-module_param_named(zpool, zswap_zpool_type, charp, 0444);
+static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
+static struct kparam_string zswap_zpool_kparam = {
+       .string =       zswap_zpool_type,
+       .maxlen =       sizeof(zswap_zpool_type),
+};
+static int zswap_zpool_param_set(const char *, const struct kernel_param *);
+static struct kernel_param_ops zswap_zpool_param_ops = {
+       .set =  zswap_zpool_param_set,
+       .get =  param_get_string,
+};
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
 
-/* zpool is shared by all of zswap backend  */
-static struct zpool *zswap_pool;
+/* The maximum percentage of memory that the compressed pool can occupy */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
 
 /*********************************
-* compression functions
+* data structures
 **********************************/
-/* per-cpu compression transforms */
-static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
 
-enum comp_op {
-       ZSWAP_COMPOP_COMPRESS,
-       ZSWAP_COMPOP_DECOMPRESS
+struct zswap_pool {
+       struct zpool *zpool;
+       struct crypto_comp * __percpu *tfm;
+       struct kref kref;
+       struct list_head list;
+       struct rcu_head rcu_head;
+       struct notifier_block notifier;
+       char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
 
-static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
-                               u8 *dst, unsigned int *dlen)
-{
-       struct crypto_comp *tfm;
-       int ret;
-
-       tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
-       switch (op) {
-       case ZSWAP_COMPOP_COMPRESS:
-               ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
-               break;
-       case ZSWAP_COMPOP_DECOMPRESS:
-               ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       put_cpu();
-       return ret;
-}
-
-static int __init zswap_comp_init(void)
-{
-       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
-               pr_info("%s compressor not available\n", zswap_compressor);
-               /* fall back to default compressor */
-               zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-               if (!crypto_has_comp(zswap_compressor, 0, 0))
-                       /* can't even load the default compressor */
-                       return -ENODEV;
-       }
-       pr_info("using %s compressor\n", zswap_compressor);
-
-       /* alloc percpu transforms */
-       zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
-       if (!zswap_comp_pcpu_tfms)
-               return -ENOMEM;
-       return 0;
-}
-
-static void __init zswap_comp_exit(void)
-{
-       /* free percpu transforms */
-       free_percpu(zswap_comp_pcpu_tfms);
-}
-
-/*********************************
-* data structures
-**********************************/
 /*
  * struct zswap_entry
  *
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
  * page within zswap.
  *
  * rbnode - links the entry into red-black tree for the appropriate swap type
+ * offset - the swap offset for the entry.  Index into the red-black tree.
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
  *            concurrent calls to load, invalidate, and writeback.  The lock
  *            for the zswap_tree structure that contains the entry must
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
- * offset - the swap offset for the entry.  Index into the red-black tree.
- * handle - zpool allocation handle that stores the compressed page data
  * length - the length in bytes of the compressed page data.  Needed during
  *          decompression
+ * pool - the zswap_pool the entry's data is in
+ * handle - zpool allocation handle that stores the compressed page data
  */
 struct zswap_entry {
        struct rb_node rbnode;
        pgoff_t offset;
        int refcount;
        unsigned int length;
+       struct zswap_pool *pool;
        unsigned long handle;
 };
 
@@ -201,6 +172,51 @@ struct zswap_tree {
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 
+/* RCU-protected iteration */
+static LIST_HEAD(zswap_pools);
+/* protects zswap_pools list modification */
+static DEFINE_SPINLOCK(zswap_pools_lock);
+
+/* used by param callback function */
+static bool zswap_init_started;
+
+/*********************************
+* helpers and fwd declarations
+**********************************/
+
+#define zswap_pool_debug(msg, p)                               \
+       pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
+                zpool_get_type((p)->zpool))
+
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_pool_get(struct zswap_pool *pool);
+static void zswap_pool_put(struct zswap_pool *pool);
+
+static const struct zpool_ops zswap_zpool_ops = {
+       .evict = zswap_writeback_entry
+};
+
+static bool zswap_is_full(void)
+{
+       return totalram_pages * zswap_max_pool_percent / 100 <
+               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static void zswap_update_total_size(void)
+{
+       struct zswap_pool *pool;
+       u64 total = 0;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               total += zpool_get_total_size(pool->zpool);
+
+       rcu_read_unlock();
+
+       zswap_pool_total_size = total;
+}
+
 /*********************************
 * zswap entry functions
 **********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-       zpool_free(zswap_pool, entry->handle);
+       zpool_free(entry->pool->zpool, entry->handle);
+       zswap_pool_put(entry->pool);
        zswap_entry_cache_free(entry);
        atomic_dec(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 }
 
 /* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 **********************************/
 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
 
-static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
 {
-       struct crypto_comp *tfm;
        u8 *dst;
 
        switch (action) {
        case CPU_UP_PREPARE:
-               tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
-               if (IS_ERR(tfm)) {
-                       pr_err("can't allocate compressor transform\n");
-                       return NOTIFY_BAD;
-               }
-               *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
                dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
                if (!dst) {
                        pr_err("can't allocate compressor buffer\n");
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
                        return NOTIFY_BAD;
                }
                per_cpu(zswap_dstmem, cpu) = dst;
                break;
        case CPU_DEAD:
        case CPU_UP_CANCELED:
-               tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
-               if (tfm) {
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
-               }
                dst = per_cpu(zswap_dstmem, cpu);
                kfree(dst);
                per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
        return NOTIFY_OK;
 }
 
-static int zswap_cpu_notifier(struct notifier_block *nb,
-                               unsigned long action, void *pcpu)
+static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *pcpu)
 {
-       unsigned long cpu = (unsigned long)pcpu;
-       return __zswap_cpu_notifier(action, cpu);
+       return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
 }
 
-static struct notifier_block zswap_cpu_notifier_block = {
-       .notifier_call = zswap_cpu_notifier
+static struct notifier_block zswap_dstmem_notifier = {
+       .notifier_call =        zswap_cpu_dstmem_notifier,
 };
 
-static int __init zswap_cpu_init(void)
+static int __init zswap_cpu_dstmem_init(void)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
+                       goto cleanup;
+       __register_cpu_notifier(&zswap_dstmem_notifier);
+       cpu_notifier_register_done();
+       return 0;
+
+cleanup:
+       for_each_online_cpu(cpu)
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+       cpu_notifier_register_done();
+       return -ENOMEM;
+}
+
+static void zswap_cpu_dstmem_destroy(void)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&zswap_dstmem_notifier);
+       cpu_notifier_register_done();
+}
+
+static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
+                                    unsigned long action, unsigned long cpu)
+{
+       struct crypto_comp *tfm;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+               if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+                       break;
+               tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+               if (IS_ERR_OR_NULL(tfm)) {
+                       pr_err("could not alloc crypto comp %s : %ld\n",
+                              pool->tfm_name, PTR_ERR(tfm));
+                       return NOTIFY_BAD;
+               }
+               *per_cpu_ptr(pool->tfm, cpu) = tfm;
+               break;
+       case CPU_DEAD:
+       case CPU_UP_CANCELED:
+               tfm = *per_cpu_ptr(pool->tfm, cpu);
+               if (!IS_ERR_OR_NULL(tfm))
+                       crypto_free_comp(tfm);
+               *per_cpu_ptr(pool->tfm, cpu) = NULL;
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int zswap_cpu_comp_notifier(struct notifier_block *nb,
+                                  unsigned long action, void *pcpu)
+{
+       unsigned long cpu = (unsigned long)pcpu;
+       struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
+
+       return __zswap_cpu_comp_notifier(pool, action, cpu);
+}
+
+static int zswap_cpu_comp_init(struct zswap_pool *pool)
 {
        unsigned long cpu;
 
+       memset(&pool->notifier, 0, sizeof(pool->notifier));
+       pool->notifier.notifier_call = zswap_cpu_comp_notifier;
+
        cpu_notifier_register_begin();
        for_each_online_cpu(cpu)
-               if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+               if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
                        goto cleanup;
-       __register_cpu_notifier(&zswap_cpu_notifier_block);
+       __register_cpu_notifier(&pool->notifier);
        cpu_notifier_register_done();
        return 0;
 
 cleanup:
        for_each_online_cpu(cpu)
-               __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
        cpu_notifier_register_done();
        return -ENOMEM;
 }
 
+static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&pool->notifier);
+       cpu_notifier_register_done();
+}
+
 /*********************************
-* helpers
+* pool functions
 **********************************/
-static bool zswap_is_full(void)
+
+static struct zswap_pool *__zswap_pool_current(void)
 {
-       return totalram_pages * zswap_max_pool_percent / 100 <
-               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+       struct zswap_pool *pool;
+
+       pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+       WARN_ON(!pool);
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+       assert_spin_locked(&zswap_pools_lock);
+
+       return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+       struct zswap_pool *pool;
+
+       rcu_read_lock();
+
+       pool = __zswap_pool_current();
+       if (!pool || !zswap_pool_get(pool))
+               pool = NULL;
+
+       rcu_read_unlock();
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+       struct zswap_pool *pool, *last = NULL;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               last = pool;
+       if (!WARN_ON(!last) && !zswap_pool_get(last))
+               last = NULL;
+
+       rcu_read_unlock();
+
+       return last;
+}
+
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+
+       assert_spin_locked(&zswap_pools_lock);
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list) {
+               if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+                       continue;
+               if (strncmp(zpool_get_type(pool->zpool), type,
+                           sizeof(zswap_zpool_type)))
+                       continue;
+               /* if we can't get it, it's about to be destroyed */
+               if (!zswap_pool_get(pool))
+                       continue;
+               return pool;
+       }
+
+       return NULL;
+}
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+
+       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       if (!pool) {
+               pr_err("pool alloc failed\n");
+               return NULL;
+       }
+
+       pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
+       if (!pool->zpool) {
+               pr_err("%s zpool not available\n", type);
+               goto error;
+       }
+       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+
+       strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+       pool->tfm = alloc_percpu(struct crypto_comp *);
+       if (!pool->tfm) {
+               pr_err("percpu alloc failed\n");
+               goto error;
+       }
+
+       if (zswap_cpu_comp_init(pool))
+               goto error;
+       pr_debug("using %s compressor\n", pool->tfm_name);
+
+       /* being the current pool takes 1 ref; this func expects the
+        * caller to always add the new pool as the current pool
+        */
+       kref_init(&pool->kref);
+       INIT_LIST_HEAD(&pool->list);
+
+       zswap_pool_debug("created", pool);
+
+       return pool;
+
+error:
+       free_percpu(pool->tfm);
+       if (pool->zpool)
+               zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+       return NULL;
+}
+
+static struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+               pr_err("compressor %s not available, using default %s\n",
+                      zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+               strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
+                       sizeof(zswap_compressor));
+       }
+       if (!zpool_has_pool(zswap_zpool_type)) {
+               pr_err("zpool %s not available, using default %s\n",
+                      zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+               strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
+                       sizeof(zswap_zpool_type));
+       }
+
+       return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+       zswap_pool_debug("destroying", pool);
+
+       zswap_cpu_comp_destroy(pool);
+       free_percpu(pool->tfm);
+       zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+       return kref_get_unless_zero(&pool->kref);
+}
+
+static void __zswap_pool_release(struct rcu_head *head)
+{
+       struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+
+       /* nobody should have been able to get a kref... */
+       WARN_ON(kref_get_unless_zero(&pool->kref));
+
+       /* pool is now off zswap_pools list and has no references. */
+       zswap_pool_destroy(pool);
+}
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+       struct zswap_pool *pool;
+
+       pool = container_of(kref, typeof(*pool), kref);
+
+       spin_lock(&zswap_pools_lock);
+
+       WARN_ON(pool == zswap_pool_current());
+
+       list_del_rcu(&pool->list);
+       call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+       spin_unlock(&zswap_pools_lock);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+       kref_put(&pool->kref, __zswap_pool_empty);
+}
+
+/*********************************
+* param callbacks
+**********************************/
+
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+                            char *type, char *compressor)
+{
+       struct zswap_pool *pool, *put_pool = NULL;
+       char str[kp->str->maxlen], *s;
+       int ret;
+
+       /*
+        * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
+        * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
+        * 32 (arbitrary).
+        */
+       strlcpy(str, val, kp->str->maxlen);
+       s = strim(str);
+
+       /* if this is load-time (pre-init) param setting,
+        * don't create a pool; that's done during init.
+        */
+       if (!zswap_init_started)
+               return param_set_copystring(s, kp);
+
+       /* no change required */
+       if (!strncmp(kp->str->string, s, kp->str->maxlen))
+               return 0;
+
+       if (!type) {
+               type = s;
+               if (!zpool_has_pool(type)) {
+                       pr_err("zpool %s not available\n", type);
+                       return -ENOENT;
+               }
+       } else if (!compressor) {
+               compressor = s;
+               if (!crypto_has_comp(compressor, 0, 0)) {
+                       pr_err("compressor %s not available\n", compressor);
+                       return -ENOENT;
+               }
+       }
+
+       spin_lock(&zswap_pools_lock);
+
+       pool = zswap_pool_find_get(type, compressor);
+       if (pool) {
+               zswap_pool_debug("using existing", pool);
+               list_del_rcu(&pool->list);
+       } else {
+               spin_unlock(&zswap_pools_lock);
+               pool = zswap_pool_create(type, compressor);
+               spin_lock(&zswap_pools_lock);
+       }
+
+       if (pool)
+               ret = param_set_copystring(s, kp);
+       else
+               ret = -EINVAL;
+
+       if (!ret) {
+               put_pool = zswap_pool_current();
+               list_add_rcu(&pool->list, &zswap_pools);
+       } else if (pool) {
+               /* add the possibly pre-existing pool to the end of the pools
+                * list; if it's new (and empty) then it'll be removed and
+                * destroyed by the put after we drop the lock
+                */
+               list_add_tail_rcu(&pool->list, &zswap_pools);
+               put_pool = pool;
+       }
+
+       spin_unlock(&zswap_pools_lock);
+
+       /* drop the ref from either the old current pool,
+        * or the new pool we failed to add
+        */
+       if (put_pool)
+               zswap_pool_put(put_pool);
+
+       return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+                                     const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+                                const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, NULL, zswap_compressor);
 }
 
 /*********************************
@@ -477,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        pgoff_t offset;
        struct zswap_entry *entry;
        struct page *page;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -517,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
                /* decompress */
                dlen = PAGE_SIZE;
-               src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+               src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                                ZPOOL_MM_RO) + sizeof(struct zswap_header);
                dst = kmap_atomic(page);
-               ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
-                               entry->length, dst, &dlen);
+               tfm = *get_cpu_ptr(entry->pool->tfm);
+               ret = crypto_comp_decompress(tfm, src, entry->length,
+                                            dst, &dlen);
+               put_cpu_ptr(entry->pool->tfm);
                kunmap_atomic(dst);
-               zpool_unmap_handle(zswap_pool, entry->handle);
+               zpool_unmap_handle(entry->pool->zpool, entry->handle);
                BUG_ON(ret);
                BUG_ON(dlen != PAGE_SIZE);
 
@@ -572,6 +933,22 @@ end:
        return ret;
 }
 
+static int zswap_shrink(void)
+{
+       struct zswap_pool *pool;
+       int ret;
+
+       pool = zswap_pool_last_get();
+       if (!pool)
+               return -ENOENT;
+
+       ret = zpool_shrink(pool->zpool, 1, NULL);
+
+       zswap_pool_put(pool);
+
+       return ret;
+}
+
 /*********************************
 * frontswap hooks
 **********************************/
@@ -581,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry, *dupentry;
+       struct crypto_comp *tfm;
        int ret;
        unsigned int dlen = PAGE_SIZE, len;
        unsigned long handle;
@@ -596,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        /* reclaim space if needed */
        if (zswap_is_full()) {
                zswap_pool_limit_hit++;
-               if (zpool_shrink(zswap_pool, 1, NULL)) {
+               if (zswap_shrink()) {
                        zswap_reject_reclaim_fail++;
                        ret = -ENOMEM;
                        goto reject;
@@ -611,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                goto reject;
        }
 
+       /* if entry is successfully added, it keeps the reference */
+       entry->pool = zswap_pool_current_get();
+       if (!entry->pool) {
+               ret = -EINVAL;
+               goto freepage;
+       }
+
        /* compress */
        dst = get_cpu_var(zswap_dstmem);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
        src = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+       ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
        kunmap_atomic(src);
+       put_cpu_ptr(entry->pool->tfm);
        if (ret) {
                ret = -EINVAL;
-               goto freepage;
+               goto put_dstmem;
        }
 
        /* store */
        len = dlen + sizeof(struct zswap_header);
-       ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
-               &handle);
+       ret = zpool_malloc(entry->pool->zpool, len,
+                          __GFP_NORETRY | __GFP_NOWARN, &handle);
        if (ret == -ENOSPC) {
                zswap_reject_compress_poor++;
-               goto freepage;
+               goto put_dstmem;
        }
        if (ret) {
                zswap_reject_alloc_fail++;
-               goto freepage;
+               goto put_dstmem;
        }
-       zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
+       zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
        zhdr->swpentry = swp_entry(type, offset);
        buf = (u8 *)(zhdr + 1);
        memcpy(buf, dst, dlen);
-       zpool_unmap_handle(zswap_pool, handle);
+       zpool_unmap_handle(entry->pool->zpool, handle);
        put_cpu_var(zswap_dstmem);
 
        /* populate entry */
@@ -660,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
        /* update stats */
        atomic_inc(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 
        return 0;
 
-freepage:
+put_dstmem:
        put_cpu_var(zswap_dstmem);
+       zswap_pool_put(entry->pool);
+freepage:
        zswap_entry_cache_free(entry);
 reject:
        return ret;
@@ -680,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -696,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
        /* decompress */
        dlen = PAGE_SIZE;
-       src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+       src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                        ZPOOL_MM_RO) + sizeof(struct zswap_header);
        dst = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
-               dst, &dlen);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
+       ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+       put_cpu_ptr(entry->pool->tfm);
        kunmap_atomic(dst);
-       zpool_unmap_handle(zswap_pool, entry->handle);
+       zpool_unmap_handle(entry->pool->zpool, entry->handle);
        BUG_ON(ret);
 
        spin_lock(&tree->lock);
@@ -755,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
        zswap_trees[type] = NULL;
 }
 
-static const struct zpool_ops zswap_zpool_ops = {
-       .evict = zswap_writeback_entry
-};
-
 static void zswap_frontswap_init(unsigned type)
 {
        struct zswap_tree *tree;
@@ -839,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
 **********************************/
 static int __init init_zswap(void)
 {
-       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+       struct zswap_pool *pool;
 
-       pr_info("loading zswap\n");
-
-       zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
-               pr_info("%s zpool not available\n", zswap_zpool_type);
-               zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-               zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       }
-       if (!zswap_pool) {
-               pr_err("%s zpool not available\n", zswap_zpool_type);
-               pr_err("zpool creation failed\n");
-               goto error;
-       }
-       pr_info("using %s pool\n", zswap_zpool_type);
+       zswap_init_started = true;
 
        if (zswap_entry_cache_create()) {
                pr_err("entry cache creation failed\n");
-               goto cachefail;
+               goto cache_fail;
        }
-       if (zswap_comp_init()) {
-               pr_err("compressor initialization failed\n");
-               goto compfail;
+
+       if (zswap_cpu_dstmem_init()) {
+               pr_err("dstmem alloc failed\n");
+               goto dstmem_fail;
        }
-       if (zswap_cpu_init()) {
-               pr_err("per-cpu initialization failed\n");
-               goto pcpufail;
+
+       pool = __zswap_pool_create_fallback();
+       if (!pool) {
+               pr_err("pool creation failed\n");
+               goto pool_fail;
        }
+       pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+               zpool_get_type(pool->zpool));
+
+       list_add(&pool->list, &zswap_pools);
 
        frontswap_register_ops(&zswap_frontswap_ops);
        if (zswap_debugfs_init())
                pr_warn("debugfs initialization failed\n");
        return 0;
-pcpufail:
-       zswap_comp_exit();
-compfail:
+
+pool_fail:
+       zswap_cpu_dstmem_destroy();
+dstmem_fail:
        zswap_entry_cache_destroy();
-cachefail:
-       zpool_destroy_pool(zswap_pool);
-error:
+cache_fail:
        return -ENOMEM;
 }
 /* must be late so crypto has time to come up */
index af5e187..ea748c9 100644 (file)
@@ -16,7 +16,6 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
-#include <net/switchdev.h>
 #include <uapi/linux/if_bridge.h>
 
 #include "br_private.h"
index 3cd8cc9..5f5a02b 100644 (file)
@@ -117,10 +117,11 @@ out_filt:
        return err;
 }
 
-static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
-                          u16 vid)
+static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
+                         u16 vid)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
+       int err = 0;
 
        /* If driver uses VLAN ndo ops, use 8021q to delete vid
         * on device, otherwise try switchdev ops to delete vid.
@@ -137,8 +138,12 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
                        },
                };
 
-               switchdev_port_obj_del(dev, &vlan_obj);
+               err = switchdev_port_obj_del(dev, &vlan_obj);
+               if (err == -EOPNOTSUPP)
+                       err = 0;
        }
+
+       return err;
 }
 
 static int __vlan_del(struct net_port_vlans *v, u16 vid)
@@ -151,7 +156,11 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid)
 
        if (v->port_idx) {
                struct net_bridge_port *p = v->parent.port;
-               __vlan_vid_del(p->dev, p->br, vid);
+               int err;
+
+               err = __vlan_vid_del(p->dev, p->br, vid);
+               if (err)
+                       return err;
        }
 
        clear_bit(vid, v->vlan_bitmap);
index 69a4d30..54a00d6 100644 (file)
@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
        opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
        opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
        opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+       opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
 
        /* get mon ip(s) */
        /* ip1[:port1][,ip2[:port2]...] */
index 790fe89..4440edc 100644 (file)
@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
        return 0;
 }
 
-
-
-#define AES_KEY_SIZE 16
-
 static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
 {
        return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
index e3be1d2..525f454 100644 (file)
@@ -163,6 +163,7 @@ static struct kmem_cache    *ceph_msg_data_cache;
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
 
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
@@ -176,7 +177,7 @@ static struct lock_class_key socket_class;
 
 static void queue_con(struct ceph_connection *con);
 static void cancel_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
+static void ceph_con_workfn(struct work_struct *);
 static void con_fault(struct ceph_connection *con);
 
 /*
@@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void)
                ceph_msgr_wq = NULL;
        }
 
-       ceph_msgr_slab_exit();
-
        BUG_ON(zero_page == NULL);
        page_cache_release(zero_page);
        zero_page = NULL;
+
+       ceph_msgr_slab_exit();
 }
 
 int ceph_msgr_init(void)
 {
+       if (ceph_msgr_slab_init())
+               return -ENOMEM;
+
        BUG_ON(zero_page != NULL);
        zero_page = ZERO_PAGE(0);
        page_cache_get(zero_page);
 
-       if (ceph_msgr_slab_init())
-               return -ENOMEM;
-
        /*
         * The number of active work items is limited by the number of
         * connections, so leave @max_active at default.
@@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
        mutex_init(&con->mutex);
        INIT_LIST_HEAD(&con->out_queue);
        INIT_LIST_HEAD(&con->out_sent);
-       INIT_DELAYED_WORK(&con->work, con_work);
+       INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
 
        con->state = CON_STATE_CLOSED;
 }
@@ -1351,7 +1352,15 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 {
        dout("prepare_write_keepalive %p\n", con);
        con_out_kvec_reset(con);
-       con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+       if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+               struct timespec ts = CURRENT_TIME;
+               struct ceph_timespec ceph_ts;
+               ceph_encode_timespec(&ceph_ts, &ts);
+               con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+               con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
+       } else {
+               con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+       }
        con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
@@ -1625,6 +1634,12 @@ static void prepare_read_tag(struct ceph_connection *con)
        con->in_tag = CEPH_MSGR_TAG_READY;
 }
 
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+       dout("prepare_read_keepalive_ack %p\n", con);
+       con->in_base_pos = 0;
+}
+
 /*
  * Prepare to read a message.
  */
@@ -2322,13 +2337,6 @@ static int read_partial_message(struct ceph_connection *con)
                        return ret;
 
                BUG_ON(!con->in_msg ^ skip);
-               if (con->in_msg && data_len > con->in_msg->data_length) {
-                       pr_warn("%s skipping long message (%u > %zd)\n",
-                               __func__, data_len, con->in_msg->data_length);
-                       ceph_msg_put(con->in_msg);
-                       con->in_msg = NULL;
-                       skip = 1;
-               }
                if (skip) {
                        /* skip this message */
                        dout("alloc_msg said skip message\n");
@@ -2457,6 +2465,17 @@ static void process_message(struct ceph_connection *con)
        mutex_lock(&con->mutex);
 }
 
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+       struct ceph_timespec ceph_ts;
+       size_t size = sizeof(ceph_ts);
+       int ret = read_partial(con, size, size, &ceph_ts);
+       if (ret <= 0)
+               return ret;
+       ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+       prepare_read_tag(con);
+       return 1;
+}
 
 /*
  * Write something to the socket.  Called in a worker thread when the
@@ -2526,6 +2545,10 @@ more_kvec:
 
 do_next:
        if (con->state == CON_STATE_OPEN) {
+               if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+                       prepare_write_keepalive(con);
+                       goto more;
+               }
                /* is anything else pending? */
                if (!list_empty(&con->out_queue)) {
                        prepare_write_message(con);
@@ -2535,10 +2558,6 @@ do_next:
                        prepare_write_ack(con);
                        goto more;
                }
-               if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
-                       prepare_write_keepalive(con);
-                       goto more;
-               }
        }
 
        /* Nothing to do! */
@@ -2641,6 +2660,9 @@ more:
                case CEPH_MSGR_TAG_ACK:
                        prepare_read_ack(con);
                        break;
+               case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+                       prepare_read_keepalive_ack(con);
+                       break;
                case CEPH_MSGR_TAG_CLOSE:
                        con_close_socket(con);
                        con->state = CON_STATE_CLOSED;
@@ -2684,6 +2706,12 @@ more:
                process_ack(con);
                goto more;
        }
+       if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+               ret = read_keepalive_ack(con);
+               if (ret <= 0)
+                       goto out;
+               goto more;
+       }
 
 out:
        dout("try_read done on %p ret %d\n", con, ret);
@@ -2799,7 +2827,7 @@ static void con_fault_finish(struct ceph_connection *con)
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
-static void con_work(struct work_struct *work)
+static void ceph_con_workfn(struct work_struct *work)
 {
        struct ceph_connection *con = container_of(work, struct ceph_connection,
                                                   work.work);
@@ -3101,6 +3129,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+                              unsigned long interval)
+{
+       if (interval > 0 &&
+           (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+               struct timespec now = CURRENT_TIME;
+               struct timespec ts;
+               jiffies_to_timespec(interval, &ts);
+               ts = timespec_add(con->last_keepalive_ack, ts);
+               return timespec_compare(&now, &ts) >= 0;
+       }
+       return false;
+}
+
 static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
 {
        struct ceph_msg_data *data;
index 9d6ff12..edda016 100644 (file)
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
                              CEPH_ENTITY_TYPE_MON, monc->cur_mon,
                              &monc->monmap->mon_inst[monc->cur_mon].addr);
 
+               /* send an initial keepalive to ensure our timestamp is
+                * valid by the time we are in an OPENED state */
+               ceph_con_keepalive(&monc->con);
+
                /* initiatiate authentication handshake */
                ret = ceph_auth_build_hello(monc->auth,
                                            monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-       unsigned int delay;
+       struct ceph_options *opt = monc->client->options;
+       unsigned long delay;
 
-       if (monc->cur_mon < 0 || __sub_expired(monc))
+       if (monc->cur_mon < 0 || __sub_expired(monc)) {
                delay = 10 * HZ;
-       else
+       } else {
                delay = 20 * HZ;
-       dout("__schedule_delayed after %u\n", delay);
-       schedule_delayed_work(&monc->delayed_work, delay);
+               if (opt->monc_ping_timeout > 0)
+                       delay = min(delay, opt->monc_ping_timeout / 3);
+       }
+       dout("__schedule_delayed after %lu\n", delay);
+       schedule_delayed_work(&monc->delayed_work,
+                             round_jiffies_relative(delay));
 }
 
 /*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
                __close_session(monc);
                __open_session(monc);  /* continue hunting */
        } else {
-               ceph_con_keepalive(&monc->con);
+               struct ceph_options *opt = monc->client->options;
+               int is_auth = ceph_auth_is_authenticated(monc->auth);
+               if (ceph_con_keepalive_expired(&monc->con,
+                                              opt->monc_ping_timeout)) {
+                       dout("monc keepalive timeout\n");
+                       is_auth = 0;
+                       __close_session(monc);
+                       monc->hunting = true;
+                       __open_session(monc);
+               }
 
-               __validate_auth(monc);
+               if (!monc->hunting) {
+                       ceph_con_keepalive(&monc->con);
+                       __validate_auth(monc);
+               }
 
-               if (ceph_auth_is_authenticated(monc->auth))
+               if (is_auth)
                        __send_subscribe(monc);
        }
        __schedule_delayed(monc);
index 5003367..80b94e3 100644 (file)
@@ -2817,8 +2817,9 @@ out:
 }
 
 /*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
+ * Lookup and return message for incoming reply.  Don't try to do
+ * anything about a larger than preallocated data portion of the
+ * message at the moment - for now, just skip the message.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
                                  struct ceph_msg_header *hdr,
@@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        mutex_lock(&osdc->request_mutex);
        req = __lookup_request(osdc, tid);
        if (!req) {
-               *skip = 1;
+               pr_warn("%s osd%d tid %llu unknown, skipping\n",
+                       __func__, osd->o_osd, tid);
                m = NULL;
-               dout("get_reply unknown tid %llu from osd%d\n", tid,
-                    osd->o_osd);
+               *skip = 1;
                goto out;
        }
 
@@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        ceph_msg_revoke_incoming(req->r_reply);
 
        if (front_len > req->r_reply->front_alloc_len) {
-               pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
-                       front_len, req->r_reply->front_alloc_len,
-                       (unsigned int)con->peer_name.type,
-                       le64_to_cpu(con->peer_name.num));
+               pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
+                       __func__, osd->o_osd, req->r_tid, front_len,
+                       req->r_reply->front_alloc_len);
                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
                                 false);
                if (!m)
@@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
        }
-       m = ceph_msg_get(req->r_reply);
-
-       if (data_len > 0) {
-               struct ceph_osd_data *osd_data;
 
-               /*
-                * XXX This is assuming there is only one op containing
-                * XXX page data.  Probably OK for reads, but this
-                * XXX ought to be done more generally.
-                */
-               osd_data = osd_req_op_extent_osd_data(req, 0);
-               if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
-                       if (osd_data->pages &&
-                               unlikely(osd_data->length < data_len)) {
-
-                               pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
-                                       tid, data_len, osd_data->length);
-                               *skip = 1;
-                               ceph_msg_put(m);
-                               m = NULL;
-                               goto out;
-                       }
-               }
+       if (data_len > req->r_reply->data_length) {
+               pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
+                       __func__, osd->o_osd, req->r_tid, data_len,
+                       req->r_reply->data_length);
+               m = NULL;
+               *skip = 1;
+               goto out;
        }
-       *skip = 0;
+
+       m = ceph_msg_get(req->r_reply);
        dout("get_reply tid %lld %p\n", tid, m);
 
 out:
        mutex_unlock(&osdc->request_mutex);
        return m;
-
 }
 
 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
index 4a31258..7d8f581 100644 (file)
@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                ceph_decode_addr(&addr);
                pr_info("osd%d up\n", osd);
                BUG_ON(osd >= map->max_osd);
-               map->osd_state[osd] |= CEPH_OSD_UP;
+               map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
                map->osd_addr[osd] = addr;
        }
 
index ae8306e..bf77e36 100644 (file)
@@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 }
 EXPORT_SYMBOL(fib_default_rule_add);
 
-u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 {
        struct list_head *pos;
        struct fib_rule *rule;
@@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 
        return 0;
 }
-EXPORT_SYMBOL(fib_default_rule_pref);
 
 static void notify_rule_change(int event, struct fib_rule *rule,
                               struct fib_rules_ops *ops, struct nlmsghdr *nlh,
@@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
        }
        rule->fr_net = net;
 
-       if (tb[FRA_PRIORITY])
-               rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+       rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
+                                     : fib_default_rule_pref(ops);
 
        if (tb[FRA_IIFNAME]) {
                struct net_device *dev;
@@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
        else
                rule->suppress_ifgroup = -1;
 
-       if (!tb[FRA_PRIORITY] && ops->default_pref)
-               rule->pref = ops->default_pref(ops);
-
        err = -EINVAL;
        if (tb[FRA_GOTO]) {
                if (rule->action != FR_ACT_GOTO)
index 9d66a0f..295bbd6 100644 (file)
@@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
        .configure      = dn_fib_rule_configure,
        .compare        = dn_fib_rule_compare,
        .fill           = dn_fib_rule_fill,
-       .default_pref   = fib_default_rule_pref,
        .flush_cache    = dn_fib_rule_flush_cache,
        .nlgroup        = RTNLGRP_DECnet_RULE,
        .policy         = dn_fib_rule_policy,
index 18123d5..f2bda9e 100644 (file)
@@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
        .delete         = fib4_rule_delete,
        .compare        = fib4_rule_compare,
        .fill           = fib4_rule_fill,
-       .default_pref   = fib_default_rule_pref,
        .nlmsg_payload  = fib4_rule_nlmsg_payload,
        .flush_cache    = fib4_rule_flush_cache,
        .nlgroup        = RTNLGRP_IPV4_RULE,
index 3a2c016..866ee89 100644 (file)
@@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
        .match          = ipmr_rule_match,
        .configure      = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
-       .default_pref   = fib_default_rule_pref,
        .fill           = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .policy         = ipmr_rule_policy,
index 28011fb..c6ded6b 100644 (file)
@@ -151,6 +151,21 @@ static void bictcp_init(struct sock *sk)
                tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+       if (event == CA_EVENT_TX_START) {
+               s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime;
+               struct bictcp *ca = inet_csk_ca(sk);
+
+               /* We were application limited (idle) for a while.
+                * Shift epoch_start to keep cwnd growth to cubic curve.
+                */
+               if (ca->epoch_start && delta > 0)
+                       ca->epoch_start += delta;
+               return;
+       }
+}
+
 /* calculate the cubic root of x using a table lookup followed by one
  * Newton-Raphson iteration.
  * Avg err ~= 0.195%
@@ -450,6 +465,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
        .cong_avoid     = bictcp_cong_avoid,
        .set_state      = bictcp_state,
        .undo_cwnd      = bictcp_undo_cwnd,
+       .cwnd_event     = bictcp_cwnd_event,
        .pkts_acked     = bictcp_acked,
        .owner          = THIS_MODULE,
        .name           = "cubic",
index 1188e4f..f9a8a12 100644 (file)
@@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
        struct inet_connection_sock *icsk = inet_csk(sk);
        const u32 now = tcp_time_stamp;
 
+       if (tcp_packets_in_flight(tp) == 0)
+               tcp_ca_event(sk, CA_EVENT_TX_START);
+
        tp->lsndtime = now;
 
        /* If it is a reply for ato after last received
@@ -940,9 +943,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                                                           &md5);
        tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
-       if (tcp_packets_in_flight(tp) == 0)
-               tcp_ca_event(sk, CA_EVENT_TX_START);
-
        /* if no packet is in qdisc/device queue, then allow XPS to select
         * another queue. We can be called from tcp_tsq_handler()
         * which holds one reference to sk_wmem_alloc.
index 2367a16..9f777ec 100644 (file)
@@ -258,11 +258,6 @@ nla_put_failure:
        return -ENOBUFS;
 }
 
-static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
-{
-       return 0x3FFF;
-}
-
 static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
 {
        return nla_total_size(16) /* dst */
@@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
        .configure              = fib6_rule_configure,
        .compare                = fib6_rule_compare,
        .fill                   = fib6_rule_fill,
-       .default_pref           = fib6_rule_default_pref,
        .nlmsg_payload          = fib6_rule_nlmsg_payload,
        .nlgroup                = RTNLGRP_IPV6_RULE,
        .policy                 = fib6_rule_policy,
index 74ceb73..0e004cc 100644 (file)
@@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
        .match          = ip6mr_rule_match,
        .configure      = ip6mr_rule_configure,
        .compare        = ip6mr_rule_compare,
-       .default_pref   = fib_default_rule_pref,
        .fill           = ip6mr_rule_fill,
        .nlgroup        = RTNLGRP_IPV6_RULE,
        .policy         = ip6mr_rule_policy,
@@ -550,7 +549,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
        if (it->cache == &mrt->mfc6_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
-       else if (it->cache == mrt->mfc6_cache_array)
+       else if (it->cache == &mrt->mfc6_cache_array[it->ct])
                read_unlock(&mrt_lock);
 }
 
index f45cac6..53617d7 100644 (file)
@@ -1748,7 +1748,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
        return -EINVAL;
 }
 
-int ip6_route_add(struct fib6_config *cfg)
+int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
 {
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1756,7 +1756,6 @@ int ip6_route_add(struct fib6_config *cfg)
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
-       struct mx6_config mxc = { .mx = NULL, };
        int addr_type;
 
        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
@@ -1981,6 +1980,32 @@ install_route:
 
        cfg->fc_nlinfo.nl_net = dev_net(dev);
 
+       *rt_ret = rt;
+
+       return 0;
+out:
+       if (dev)
+               dev_put(dev);
+       if (idev)
+               in6_dev_put(idev);
+       if (rt)
+               dst_free(&rt->dst);
+
+       *rt_ret = NULL;
+
+       return err;
+}
+
+int ip6_route_add(struct fib6_config *cfg)
+{
+       struct mx6_config mxc = { .mx = NULL, };
+       struct rt6_info *rt = NULL;
+       int err;
+
+       err = ip6_route_info_create(cfg, &rt);
+       if (err)
+               goto out;
+
        err = ip6_convert_metrics(&mxc, cfg);
        if (err)
                goto out;
@@ -1988,14 +2013,12 @@ install_route:
        err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
 
        kfree(mxc.mx);
+
        return err;
 out:
-       if (dev)
-               dev_put(dev);
-       if (idev)
-               in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
+
        return err;
 }
 
@@ -2776,19 +2799,78 @@ errout:
        return err;
 }
 
-static int ip6_route_multipath(struct fib6_config *cfg, int add)
+struct rt6_nh {
+       struct rt6_info *rt6_info;
+       struct fib6_config r_cfg;
+       struct mx6_config mxc;
+       struct list_head next;
+};
+
+static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
+{
+       struct rt6_nh *nh;
+
+       list_for_each_entry(nh, rt6_nh_list, next) {
+               pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
+                       &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
+                       nh->r_cfg.fc_ifindex);
+       }
+}
+
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
+                                struct rt6_info *rt, struct fib6_config *r_cfg)
+{
+       struct rt6_nh *nh;
+       struct rt6_info *rtnh;
+       int err = -EEXIST;
+
+       list_for_each_entry(nh, rt6_nh_list, next) {
+               /* check if rt6_info already exists */
+               rtnh = nh->rt6_info;
+
+               if (rtnh->dst.dev == rt->dst.dev &&
+                   rtnh->rt6i_idev == rt->rt6i_idev &&
+                   ipv6_addr_equal(&rtnh->rt6i_gateway,
+                                   &rt->rt6i_gateway))
+                       return err;
+       }
+
+       nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+       if (!nh)
+               return -ENOMEM;
+       nh->rt6_info = rt;
+       err = ip6_convert_metrics(&nh->mxc, r_cfg);
+       if (err) {
+               kfree(nh);
+               return err;
+       }
+       memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
+       list_add_tail(&nh->next, rt6_nh_list);
+
+       return 0;
+}
+
+static int ip6_route_multipath_add(struct fib6_config *cfg)
 {
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
+       struct rt6_info *rt;
+       struct rt6_nh *err_nh;
+       struct rt6_nh *nh, *nh_safe;
        int remaining;
        int attrlen;
-       int err = 0, last_err = 0;
+       int err = 1;
+       int nhn = 0;
+       int replace = (cfg->fc_nlinfo.nlh &&
+                      (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
+       LIST_HEAD(rt6_nh_list);
 
        remaining = cfg->fc_mp_len;
-beginning:
        rtnh = (struct rtnexthop *)cfg->fc_mp;
 
-       /* Parse a Multipath Entry */
+       /* Parse a Multipath Entry and build a list (rt6_nh_list) of
+        * rt6_info structs per nexthop
+        */
        while (rtnh_ok(rtnh, remaining)) {
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
@@ -2808,22 +2890,32 @@ beginning:
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }
-               err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+
+               err = ip6_route_info_create(&r_cfg, &rt);
+               if (err)
+                       goto cleanup;
+
+               err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
                if (err) {
-                       last_err = err;
-                       /* If we are trying to remove a route, do not stop the
-                        * loop when ip6_route_del() fails (because next hop is
-                        * already gone), we should try to remove all next hops.
-                        */
-                       if (add) {
-                               /* If add fails, we should try to delete all
-                                * next hops that have been already added.
-                                */
-                               add = 0;
-                               remaining = cfg->fc_mp_len - remaining;
-                               goto beginning;
-                       }
+                       dst_free(&rt->dst);
+                       goto cleanup;
+               }
+
+               rtnh = rtnh_next(rtnh, &remaining);
+       }
+
+       err_nh = NULL;
+       list_for_each_entry(nh, &rt6_nh_list, next) {
+               err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
+               /* nh->rt6_info is used or freed at this point, reset to NULL*/
+               nh->rt6_info = NULL;
+               if (err) {
+                       if (replace && nhn)
+                               ip6_print_replace_route_err(&rt6_nh_list);
+                       err_nh = nh;
+                       goto add_errout;
                }
+
                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
@@ -2833,6 +2925,62 @@ beginning:
                 */
                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                     NLM_F_REPLACE);
+               nhn++;
+       }
+
+       goto cleanup;
+
+add_errout:
+       /* Delete routes that were already added */
+       list_for_each_entry(nh, &rt6_nh_list, next) {
+               if (err_nh == nh)
+                       break;
+               ip6_route_del(&nh->r_cfg);
+       }
+
+cleanup:
+       list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
+               if (nh->rt6_info)
+                       dst_free(&nh->rt6_info->dst);
+               kfree(nh->mxc.mx);
+               list_del(&nh->next);
+               kfree(nh);
+       }
+
+       return err;
+}
+
+static int ip6_route_multipath_del(struct fib6_config *cfg)
+{
+       struct fib6_config r_cfg;
+       struct rtnexthop *rtnh;
+       int remaining;
+       int attrlen;
+       int err = 1, last_err = 0;
+
+       remaining = cfg->fc_mp_len;
+       rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+       /* Parse a Multipath Entry */
+       while (rtnh_ok(rtnh, remaining)) {
+               memcpy(&r_cfg, cfg, sizeof(*cfg));
+               if (rtnh->rtnh_ifindex)
+                       r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+               attrlen = rtnh_attrlen(rtnh);
+               if (attrlen > 0) {
+                       struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+                       nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+                       if (nla) {
+                               nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+                               r_cfg.fc_flags |= RTF_GATEWAY;
+                       }
+               }
+               err = ip6_route_del(&r_cfg);
+               if (err)
+                       last_err = err;
+
                rtnh = rtnh_next(rtnh, &remaining);
        }
 
@@ -2849,7 +2997,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
                return err;
 
        if (cfg.fc_mp)
-               return ip6_route_multipath(&cfg, 0);
+               return ip6_route_multipath_del(&cfg);
        else
                return ip6_route_del(&cfg);
 }
@@ -2864,7 +3012,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
                return err;
 
        if (cfg.fc_mp)
-               return ip6_route_multipath(&cfg, 1);
+               return ip6_route_multipath_add(&cfg);
        else
                return ip6_route_add(&cfg);
 }
index 685ec13..17b1fe9 100644 (file)
@@ -2468,6 +2468,10 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
            rssi_hyst == bss_conf->cqm_rssi_hyst)
                return 0;
 
+       if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER &&
+           !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
+               return -EOPNOTSUPP;
+
        bss_conf->cqm_rssi_thold = rssi_thold;
        bss_conf->cqm_rssi_hyst = rssi_hyst;
 
index 705ef1d..cd7e55e 100644 (file)
@@ -4267,6 +4267,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
        struct ieee80211_supported_band *sband;
        struct cfg80211_chan_def chandef;
        int ret;
+       u32 i;
+       bool have_80mhz;
 
        sband = local->hw.wiphy->bands[cbss->channel->band];
 
@@ -4317,6 +4319,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
                }
        }
 
+       /* Allow VHT if at least one channel on the sband supports 80 MHz */
+       have_80mhz = false;
+       for (i = 0; i < sband->n_channels; i++) {
+               if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+                                               IEEE80211_CHAN_NO_80MHZ))
+                       continue;
+
+               have_80mhz = true;
+               break;
+       }
+
+       if (!have_80mhz)
+               ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
        ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
                                                     cbss->channel,
                                                     ht_cap, ht_oper, vht_oper,
index 9857693..9ce8883 100644 (file)
@@ -716,7 +716,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata,
 
                /* Filter out rates that the STA does not support */
                *mask &= sta->supp_rates[sband->band];
-               for (i = 0; i < sizeof(mcs_mask); i++)
+               for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
                        mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i];
 
                sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map;
index aee701a..4e202d0 100644 (file)
@@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata)
        mutex_unlock(&local->chanctx_mtx);
 }
 
+static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata)
+{
+       struct sta_info *sta;
+       bool result = false;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+               if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+                   !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+                   !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) ||
+                   !sta->sta.ht_cap.ht_supported)
+                       continue;
+               result = true;
+               break;
+       }
+       rcu_read_unlock();
+
+       return result;
+}
+
+static void
+iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
+                                  struct sta_info *sta)
+{
+       struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+       bool tdls_ht;
+       u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED |
+                        IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+                        IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
+       u16 opmode;
+
+       /* Nothing to do if the BSS connection uses HT */
+       if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+               return;
+
+       tdls_ht = (sta && sta->sta.ht_cap.ht_supported) ||
+                 iee80211_tdls_have_ht_peers(sdata);
+
+       opmode = sdata->vif.bss_conf.ht_operation_mode;
+
+       if (tdls_ht)
+               opmode |= protection;
+       else
+               opmode &= ~protection;
+
+       if (opmode == sdata->vif.bss_conf.ht_operation_mode)
+               return;
+
+       sdata->vif.bss_conf.ht_operation_mode = opmode;
+       ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+}
+
 int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, enum nl80211_tdls_operation oper)
 {
@@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                return -ENOTSUPP;
        }
 
+       /* protect possible bss_conf changes and avoid concurrency in
+        * ieee80211_bss_info_change_notify()
+        */
+       sdata_lock(sdata);
        mutex_lock(&local->mtx);
        tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
 
@@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 
                iee80211_tdls_recalc_chanctx(sdata);
 
-               rcu_read_lock();
+               mutex_lock(&local->sta_mtx);
                sta = sta_info_get(sdata, peer);
                if (!sta) {
-                       rcu_read_unlock();
+                       mutex_unlock(&local->sta_mtx);
                        ret = -ENOLINK;
                        break;
                }
 
+               iee80211_tdls_recalc_ht_protection(sdata, sta);
+
                set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
-               rcu_read_unlock();
+               mutex_unlock(&local->sta_mtx);
 
                WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
                             !ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
@@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                ieee80211_flush_queues(local, sdata, false);
 
                ret = sta_info_destroy_addr(sdata, peer);
+
+               mutex_lock(&local->sta_mtx);
+               iee80211_tdls_recalc_ht_protection(sdata, NULL);
+               mutex_unlock(&local->sta_mtx);
+
                iee80211_tdls_recalc_chanctx(sdata);
                break;
        default:
@@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                                     &sdata->u.mgd.request_smps_work);
 
        mutex_unlock(&local->mtx);
+       sdata_unlock(sdata);
        return ret;
 }
 
index 834ccdb..ff1c798 100644 (file)
@@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
        struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
        struct ieee80211_sta_vht_cap own_cap;
        u32 cap_info, i;
+       bool have_80mhz;
 
        memset(vht_cap, 0, sizeof(*vht_cap));
 
@@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
        if (!vht_cap_ie || !sband->vht_cap.vht_supported)
                return;
 
+       /* Allow VHT if at least one channel on the sband supports 80 MHz */
+       have_80mhz = false;
+       for (i = 0; i < sband->n_channels; i++) {
+               if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+                                               IEEE80211_CHAN_NO_80MHZ))
+                       continue;
+
+               have_80mhz = true;
+               break;
+       }
+
+       if (!have_80mhz)
+               return;
+
        /*
         * A VHT STA must support 40 MHz, but if we verify that here
         * then we break a few things - some APs (e.g. Netgear R6300v2
index afe905c..691b54f 100644 (file)
@@ -152,9 +152,13 @@ htable_bits(u32 hashsize)
 #define SET_HOST_MASK(family)  (family == AF_INET ? 32 : 128)
 
 #ifdef IP_SET_HASH_WITH_NET0
+/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
 #define NLEN(family)           (SET_HOST_MASK(family) + 1)
+#define CIDR_POS(c)            ((c) - 1)
 #else
+/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
 #define NLEN(family)           SET_HOST_MASK(family)
+#define CIDR_POS(c)            ((c) - 2)
 #endif
 
 #else
@@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
                } else if (h->nets[i].cidr[n] < cidr) {
                        j = i;
                } else if (h->nets[i].cidr[n] == cidr) {
-                       h->nets[cidr - 1].nets[n]++;
+                       h->nets[CIDR_POS(cidr)].nets[n]++;
                        return;
                }
        }
@@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
                        h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
        }
        h->nets[i].cidr[n] = cidr;
-       h->nets[cidr - 1].nets[n] = 1;
+       h->nets[CIDR_POS(cidr)].nets[n] = 1;
 }
 
 static void
@@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
        for (i = 0; i < nets_length; i++) {
                if (h->nets[i].cidr[n] != cidr)
                        continue;
-               h->nets[cidr - 1].nets[n]--;
-               if (h->nets[cidr - 1].nets[n] > 0)
+               h->nets[CIDR_POS(cidr)].nets[n]--;
+               if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
                        return;
                for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
                        h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
index 3c862c0..a93dfeb 100644 (file)
@@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next,
 #define HOST_MASK      32
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netnet4_init(struct hash_netnet4_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
@@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netnet4_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 ip = 0, ip_to = 0, last;
        u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
@@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netnet4_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
@@ -357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next,
 #define IP_SET_EMIT_CREATE
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netnet6_init(struct hash_netnet6_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
@@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
                  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        int ret;
 
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netnet6_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
index 0c68734..9a14c23 100644 (file)
@@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
 #define HOST_MASK      32
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netportnet4_init(struct hash_netportnet4_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
                      const struct xt_action_param *par,
@@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netportnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netportnet4_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
        u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
@@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netportnet4_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
 #define IP_SET_EMIT_CREATE
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netportnet6_init(struct hash_netportnet6_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
                      const struct xt_action_param *par,
@@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netportnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netportnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 port, port_to;
        bool with_ports = false;
@@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netportnet6_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
index eedf049..c09d6c7 100644 (file)
@@ -313,12 +313,13 @@ out_free:
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
-static void nf_ct_tmpl_free(struct nf_conn *tmpl)
+void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
        nf_ct_ext_destroy(tmpl);
        nf_ct_ext_free(tmpl);
        kfree(tmpl);
 }
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
index 888b955..c8a4a48 100644 (file)
@@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net)
 err3:
        free_percpu(snet->stats);
 err2:
-       nf_conntrack_free(ct);
+       nf_ct_tmpl_free(ct);
 err1:
        return err;
 }
index 0c0e8ec..70277b1 100644 (file)
@@ -444,6 +444,7 @@ done:
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
+       u_int16_t res_id;
        int msglen;
 
        if (nlh->nlmsg_len < NLMSG_HDRLEN ||
@@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 
                nfgenmsg = nlmsg_data(nlh);
                skb_pull(skb, msglen);
-               nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+               /* Work around old nft using host byte order */
+               if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+                       res_id = NFNL_SUBSYS_NFTABLES;
+               else
+                       res_id = ntohs(nfgenmsg->res_id);
+               nfnetlink_rcv_batch(skb, nlh, res_id);
        } else {
                netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
        }
index 685cc6a..a5cd6d9 100644 (file)
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                           __be32 **packet_id_ptr)
 {
        size_t size;
-       size_t data_len = 0, cap_len = 0;
+       size_t data_len = 0, cap_len = 0, rem_len = 0;
        unsigned int hlen = 0;
        struct sk_buff *skb;
        struct nlattr *nla;
@@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                hlen = min_t(unsigned int, hlen, data_len);
                size += sizeof(struct nlattr) + hlen;
                cap_len = entskb->len;
+               rem_len = data_len - hlen;
                break;
        }
 
@@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                        size += nla_total_size(seclen);
        }
 
-       skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+       skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
                                  GFP_ATOMIC);
        if (!skb) {
                skb_tx_error(entskb);
index 8e52489..faf32d8 100644 (file)
@@ -255,7 +255,7 @@ out:
        return 0;
 
 err3:
-       nf_conntrack_free(ct);
+       nf_ct_tmpl_free(ct);
 err2:
        nf_ct_l3proto_module_put(par->family);
 err1:
index 50889be..7f86d3b 100644 (file)
@@ -674,12 +674,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 
        mask = datagram_poll(file, sock, wait);
 
-       spin_lock_bh(&sk->sk_receive_queue.lock);
-       if (nlk->rx_ring.pg_vec) {
-               if (netlink_has_valid_frame(&nlk->rx_ring))
-                       mask |= POLLIN | POLLRDNORM;
+       /* We could already have received frames in the normal receive
+        * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
+        * so if mask contains pollin/etc already, there's no point
+        * walking the ring.
+        */
+       if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
+               spin_lock_bh(&sk->sk_receive_queue.lock);
+               if (nlk->rx_ring.pg_vec) {
+                       if (netlink_has_valid_frame(&nlk->rx_ring))
+                               mask |= POLLIN | POLLRDNORM;
+               }
+               spin_unlock_bh(&sk->sk_receive_queue.lock);
        }
-       spin_unlock_bh(&sk->sk_receive_queue.lock);
 
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (nlk->tx_ring.pg_vec) {
@@ -1837,15 +1844,16 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
-struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-                                 u32 dst_portid, gfp_t gfp_mask)
+struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+                                   unsigned int ldiff, u32 dst_portid,
+                                   gfp_t gfp_mask)
 {
 #ifdef CONFIG_NETLINK_MMAP
+       unsigned int maxlen, linear_size;
        struct sock *sk = NULL;
        struct sk_buff *skb;
        struct netlink_ring *ring;
        struct nl_mmap_hdr *hdr;
-       unsigned int maxlen;
 
        sk = netlink_getsockbyportid(ssk, dst_portid);
        if (IS_ERR(sk))
@@ -1856,7 +1864,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
        if (ring->pg_vec == NULL)
                goto out_put;
 
-       if (ring->frame_size - NL_MMAP_HDRLEN < size)
+       /* We need to account the full linear size needed as a ring
+        * slot cannot have non-linear parts.
+        */
+       linear_size = size + ldiff;
+       if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
                goto out_put;
 
        skb = alloc_skb_head(gfp_mask);
@@ -1870,13 +1882,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
 
        /* check again under lock */
        maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-       if (maxlen < size)
+       if (maxlen < linear_size)
                goto out_free;
 
        netlink_forward_ring(ring);
        hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
        if (hdr == NULL)
                goto err2;
+
        netlink_ring_setup_skb(skb, sk, ring, hdr);
        netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
        atomic_inc(&ring->pending);
@@ -1902,7 +1915,7 @@ out:
 #endif
        return alloc_skb(size, gfp_mask);
 }
-EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
 
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
index af7cdef..2a071f4 100644 (file)
@@ -5,6 +5,7 @@
 config OPENVSWITCH
        tristate "Open vSwitch"
        depends on INET
+       depends on (!NF_CONNTRACK || NF_CONNTRACK)
        select LIBCRC32C
        select MPLS
        select NET_MPLS_GSO
@@ -31,17 +32,6 @@ config OPENVSWITCH
 
          If unsure, say N.
 
-config OPENVSWITCH_CONNTRACK
-       bool "Open vSwitch conntrack action support"
-       depends on OPENVSWITCH
-       depends on NF_CONNTRACK
-       default OPENVSWITCH
-       ---help---
-         If you say Y here, then Open vSwitch module will be able to pass
-         packets through conntrack.
-
-         Say N to exclude this support and reduce the binary size.
-
 config OPENVSWITCH_GRE
        tristate "Open vSwitch GRE tunneling support"
        depends on OPENVSWITCH
index 5b5913b..60f8090 100644 (file)
@@ -15,7 +15,9 @@ openvswitch-y := \
        vport-internal_dev.o \
        vport-netdev.o
 
-openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o
+ifneq ($(CONFIG_NF_CONNTRACK),)
+openvswitch-y += conntrack.o
+endif
 
 obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
 obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
index 3cb3066..43f5dd7 100644 (file)
@@ -19,7 +19,7 @@
 struct ovs_conntrack_info;
 enum ovs_key_attr;
 
-#if defined(CONFIG_OPENVSWITCH_CONNTRACK)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 void ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
@@ -82,5 +82,5 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key,
 }
 
 static inline void ovs_ct_free_action(const struct nlattr *a) { }
-#endif
+#endif /* CONFIG_NF_CONNTRACK */
 #endif /* ovs_conntrack.h */
index a50e652..49adeef 100644 (file)
@@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 } while (0)
 
 /* rcu read lock must be held or the connection spinlock */
-static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+static struct rds_connection *rds_conn_lookup(struct net *net,
+                                             struct hlist_head *head,
                                              __be32 laddr, __be32 faddr,
                                              struct rds_transport *trans)
 {
@@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 
        hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
-                               conn->c_trans == trans) {
+                   conn->c_trans == trans && net == rds_conn_net(conn)) {
                        ret = conn;
                        break;
                }
@@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
                goto new_conn;
        rcu_read_lock();
-       conn = rds_conn_lookup(head, laddr, faddr, trans);
+       conn = rds_conn_lookup(net, head, laddr, faddr, trans);
        if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
            laddr == faddr && !is_outgoing) {
                /* This is a looped back IB connection, and we're
@@ -189,6 +190,12 @@ new_conn:
                }
        }
 
+       if (trans == NULL) {
+               kmem_cache_free(rds_conn_slab, conn);
+               conn = ERR_PTR(-ENODEV);
+               goto out;
+       }
+
        conn->c_trans = trans;
 
        ret = trans->conn_alloc(conn, gfp);
@@ -239,7 +246,7 @@ new_conn:
                if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
                        found = NULL;
                else
-                       found = rds_conn_lookup(head, laddr, faddr, trans);
+                       found = rds_conn_lookup(net, head, laddr, faddr, trans);
                if (found) {
                        trans->conn_free(conn->c_transport_data);
                        kmem_cache_free(rds_conn_slab, conn);
index f12149a..b41e9ea 100644 (file)
@@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
 {
        struct rfkill *rfkill;
 
-       rfkill_global_states[type].cur = blocked;
+       if (type == RFKILL_TYPE_ALL) {
+               int i;
+
+               for (i = 0; i < NUM_RFKILL_TYPES; i++)
+                       rfkill_global_states[i].cur = blocked;
+       } else {
+               rfkill_global_states[type].cur = blocked;
+       }
+
        list_for_each_entry(rfkill, &rfkill_list, node) {
                if (rfkill->type != type && type != RFKILL_TYPE_ALL)
                        continue;
index 4345790..b714333 100644 (file)
@@ -506,14 +506,22 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
                if (IS_ERR(rt))
                        continue;
 
+               if (!dst)
+                       dst = &rt->dst;
+
                /* Ensure the src address belongs to the output
                 * interface.
                 */
                odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
                                     false);
-               if (!odev || odev->ifindex != fl4->flowi4_oif)
+               if (!odev || odev->ifindex != fl4->flowi4_oif) {
+                       if (&rt->dst != dst)
+                               dst_release(&rt->dst);
                        continue;
+               }
 
+               if (dst != &rt->dst)
+                       dst_release(dst);
                dst = &rt->dst;
                break;
        }
index 16c1c43..fda38f8 100644 (file)
@@ -853,12 +853,8 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                .cb = cb,
                .idx = idx,
        };
-       int err;
-
-       err = switchdev_port_obj_dump(dev, &dump.obj);
-       if (err)
-               return err;
 
+       switchdev_port_obj_dump(dev, &dump.obj);
        return dump.idx;
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
index 8b010c9..41042de 100644 (file)
@@ -169,6 +169,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
        }
 }
 
+/**
+ * bclink_prepare_wakeup - prepare users for wakeup after congestion
+ * @bcl: broadcast link
+ * @resultq: queue for users which can be woken up
+ * Move a number of waiting users, as permitted by available space in
+ * the send queue, from link wait queue to specified queue for wakeup
+ */
+static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq)
+{
+       int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
+       int imp, lim;
+       struct sk_buff *skb, *tmp;
+
+       skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) {
+               imp = TIPC_SKB_CB(skb)->chain_imp;
+               lim = bcl->window + bcl->backlog[imp].limit;
+               pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
+               if ((pnd[imp] + bcl->backlog[imp].len) >= lim)
+                       continue;
+               skb_unlink(skb, &bcl->wakeupq);
+               skb_queue_tail(resultq, skb);
+       }
+}
+
 /**
  * tipc_bclink_wakeup_users - wake up pending users
  *
@@ -177,8 +201,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
 void tipc_bclink_wakeup_users(struct net *net)
 {
        struct tipc_net *tn = net_generic(net, tipc_net_id);
+       struct tipc_link *bcl = tn->bcl;
+       struct sk_buff_head resultq;
 
-       tipc_sk_rcv(net, &tn->bclink->link.wakeupq);
+       skb_queue_head_init(&resultq);
+       bclink_prepare_wakeup(bcl, &resultq);
+       tipc_sk_rcv(net, &resultq);
 }
 
 /**
index b144485..2510b23 100644 (file)
@@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user)
         * settings, user regulatory settings takes precedence.
         */
        if (is_an_alpha2(alpha2))
-               regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER);
+               regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER);
 
        spin_lock(&reg_requests_lock);
        list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
index a51ca0e..f2a1131 100755 (executable)
@@ -264,6 +264,7 @@ our $Sparse = qr{
                        __kernel|
                        __force|
                        __iomem|
+                       __pmem|
                        __must_check|
                        __init_refok|
                        __kprobes|
@@ -584,7 +585,7 @@ our $LvalOrFunc     = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
 our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
 
 our $declaration_macros = qr{(?x:
-       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
        (?:$Storage\s+)?LIST_HEAD\s*\(|
        (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
 )};
@@ -1953,9 +1954,9 @@ sub process {
        our $clean = 1;
        my $signoff = 0;
        my $is_patch = 0;
-
        my $in_header_lines = $file ? 0 : 1;
        my $in_commit_log = 0;          #Scanning lines before patch
+       my $commit_log_possible_stack_dump = 0;
        my $commit_log_long_line = 0;
        my $commit_log_has_diff = 0;
        my $reported_maintainer_file = 0;
@@ -2166,11 +2167,15 @@ sub process {
                if ($showfile) {
                        $prefix = "$realfile:$realline: "
                } elsif ($emacs) {
-                       $prefix = "$filename:$linenr: ";
+                       if ($file) {
+                               $prefix = "$filename:$realline: ";
+                       } else {
+                               $prefix = "$filename:$linenr: ";
+                       }
                }
 
                if ($found_file) {
-                       if ($realfile =~ m@^(drivers/net/|net/)@) {
+                       if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
                                $check = 1;
                        } else {
                                $check = $check_orig;
@@ -2310,16 +2315,42 @@ sub process {
 
 # Check for line lengths > 75 in commit log, warn once
                if ($in_commit_log && !$commit_log_long_line &&
-                   length($line) > 75) {
+                   length($line) > 75 &&
+                   !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+                                       # file delta changes
+                     $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+                                       # filename then :
+                     $line =~ /^\s*(?:Fixes:|Link:)/i ||
+                                       # A Fixes: or Link: line
+                     $commit_log_possible_stack_dump)) {
                        WARN("COMMIT_LOG_LONG_LINE",
                             "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
                        $commit_log_long_line = 1;
                }
 
+# Check if the commit log is in a possible stack dump
+               if ($in_commit_log && !$commit_log_possible_stack_dump &&
+                   ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+                    $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+                               # timestamp
+                    $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+                               # stack dump address
+                       $commit_log_possible_stack_dump = 1;
+               }
+
+# Reset possible stack dump if a blank line is found
+               if ($in_commit_log && $commit_log_possible_stack_dump &&
+                   $line =~ /^\s*$/) {
+                       $commit_log_possible_stack_dump = 0;
+               }
+
 # Check for git id commit length and improperly formed commit descriptions
-               if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) {
-                       my $init_char = $1;
-                       my $orig_commit = lc($2);
+               if ($in_commit_log &&
+                   ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
+                    ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+                     $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+                       my $init_char = "c";
+                       my $orig_commit = "";
                        my $short = 1;
                        my $long = 0;
                        my $case = 1;
@@ -2330,6 +2361,13 @@ sub process {
                        my $orig_desc = "commit description";
                        my $description = "";
 
+                       if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
+                               $init_char = $1;
+                               $orig_commit = lc($2);
+                       } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
+                               $orig_commit = lc($1);
+                       }
+
                        $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
                        $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
                        $space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2776,8 @@ sub process {
                        }
                }
 
+# Block comment styles
+# Networking with an initial /*
                if ($realfile =~ m@^(drivers/net/|net/)@ &&
                    $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
                    $rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2786,23 @@ sub process {
                             "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $prevrawline =~ /^\+[ \t]*\/\*/ &&          #starting /*
+# Block comments use * on subsequent lines
+               if ($prevline =~ /$;[ \t]*$/ &&                 #ends in comment
+                   $prevrawline =~ /^\+.*?\/\*/ &&             #starting /*
                    $prevrawline !~ /\*\/[ \t]*$/ &&            #no trailing */
                    $rawline =~ /^\+/ &&                        #line is new
                    $rawline !~ /^\+[ \t]*\*/) {                #no leading *
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments start with * on subsequent lines\n" . $hereprev);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use * on subsequent lines\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
+# Block comments use */ on trailing lines
+               if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
                    $rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ &&      #inline /*...*/
                    $rawline !~ m@^\+.*\*{2,}/[ \t]*$@ &&       #trailing **/
                    $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) {    #non blank */
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use a trailing */ on a separate line\n" . $herecurr);
                }
 
 # check for missing blank lines after struct/union declarations
@@ -3067,15 +3108,22 @@ sub process {
 
                        substr($s, 0, length($c), '');
 
-                       # Make sure we remove the line prefixes as we have
-                       # none on the first line, and are going to readd them
-                       # where necessary.
-                       $s =~ s/\n./\n/gs;
+                       # remove inline comments
+                       $s =~ s/$;/ /g;
+                       $c =~ s/$;/ /g;
 
                        # Find out how long the conditional actually is.
                        my @newlines = ($c =~ /\n/gs);
                        my $cond_lines = 1 + $#newlines;
 
+                       # Make sure we remove the line prefixes as we have
+                       # none on the first line, and are going to readd them
+                       # where necessary.
+                       $s =~ s/\n./\n/gs;
+                       while ($s =~ /\n\s+\\\n/) {
+                               $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
+                       }
+
                        # We want to check the first line inside the block
                        # starting at the end of the conditional, so remove:
                        #  1) any blank line termination
@@ -3141,8 +3189,10 @@ sub process {
 
                        #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
 
-                       if ($check && (($sindent % 8) != 0 ||
-                           ($sindent <= $indent && $s ne ''))) {
+                       if ($check && $s ne '' &&
+                           (($sindent % 8) != 0 ||
+                            ($sindent < $indent) ||
+                            ($sindent > $indent + 8))) {
                                WARN("SUSPECT_CODE_INDENT",
                                     "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
                        }
@@ -3439,13 +3489,15 @@ sub process {
                        }
                }
 
-# # no BUG() or BUG_ON()
-#              if ($line =~ /\b(BUG|BUG_ON)\b/) {
-#                      print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n";
-#                      print "$herecurr";
-#                      $clean = 0;
-#              }
+# avoid BUG() or BUG_ON()
+               if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+                       my $msg_type = \&WARN;
+                       $msg_type = \&CHK if ($file);
+                       &{$msg_type}("AVOID_BUG",
+                                    "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+               }
 
+# avoid LINUX_VERSION_CODE
                if ($line =~ /\bLINUX_VERSION_CODE\b/) {
                        WARN("LINUX_VERSION_CODE",
                             "LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -3520,7 +3572,7 @@ sub process {
 # function brace can't be on same line, except for #defines of do while,
 # or if closed on same line
                if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
-                   !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) {
+                   !($line=~/\#\s*define.*do\s\{/) and !($line=~/}/)) {
                        if (ERROR("OPEN_BRACE",
                                  "open brace '{' following function declarations go on the next line\n" . $herecurr) &&
                            $fix) {
@@ -4032,8 +4084,8 @@ sub process {
 ##             }
 
 #need space before brace following if, while, etc
-               if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) ||
-                   $line =~ /do{/) {
+               if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
+                   $line =~ /do\{/) {
                        if (ERROR("SPACING",
                                  "space required before the open brace '{'\n" . $herecurr) &&
                            $fix) {
@@ -4179,6 +4231,35 @@ sub process {
                        }
                }
 
+# comparisons with a constant or upper case identifier on the left
+#      avoid cases like "foo + BAR < baz"
+#      only fix matches surrounded by parentheses to avoid incorrect
+#      conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
+               if ($^V && $^V ge 5.10.0 &&
+                   $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
+                       my $lead = $1;
+                       my $const = $2;
+                       my $comp = $3;
+                       my $to = $4;
+                       my $newcomp = $comp;
+                       if ($lead !~ /$Operators\s*$/ &&
+                           $to !~ /^(?:Constant|[A-Z_][A-Z0-9_]*)$/ &&
+                           WARN("CONSTANT_COMPARISON",
+                                "Comparisons should place the constant on the right side of the test\n" . $herecurr) &&
+                           $fix) {
+                               if ($comp eq "<") {
+                                       $newcomp = ">";
+                               } elsif ($comp eq "<=") {
+                                       $newcomp = ">=";
+                               } elsif ($comp eq ">") {
+                                       $newcomp = "<";
+                               } elsif ($comp eq ">=") {
+                                       $newcomp = "<=";
+                               }
+                               $fixed[$fixlinenr] =~ s/\(\s*\Q$const\E\s*$Compare\s*\Q$to\E\s*\)/($to $newcomp $const)/;
+                       }
+               }
+
 # Return of what appears to be an errno should normally be negative
                if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
                        my $name = $1;
@@ -4480,7 +4561,7 @@ sub process {
                            $dstat !~ /^for\s*$Constant$/ &&                            # for (...)
                            $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ &&   # for (...) bar()
                            $dstat !~ /^do\s*{/ &&                                      # do {...
-                           $dstat !~ /^\({/ &&                                         # ({...
+                           $dstat !~ /^\(\{/ &&                                                # ({...
                            $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
                        {
                                $ctx =~ s/\n*$//;
@@ -4789,16 +4870,20 @@ sub process {
                             "Consecutive strings are generally better as a single string\n" . $herecurr);
                }
 
-# check for %L{u,d,i} in strings
+# check for %L{u,d,i} and 0x%[udi] in strings
                my $string;
                while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
                        $string = substr($rawline, $-[1], $+[1] - $-[1]);
                        $string =~ s/%%/__/g;
-                       if ($string =~ /(?<!%)%L[udi]/) {
+                       if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
                                WARN("PRINTF_L",
                                     "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
                                last;
                        }
+                       if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
+                               ERROR("PRINTF_0xDECIMAL",
+                                     "Prefixing 0x with decimal output is defective\n" . $herecurr);
+                       }
                }
 
 # check for line continuations in quoted strings with odd counts of "
@@ -4816,10 +4901,34 @@ sub process {
 
 # check for needless "if (<foo>) fn(<foo>)" uses
                if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
-                       my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
-                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
-                               WARN('NEEDLESS_IF',
-                                    "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+                       my $tested = quotemeta($1);
+                       my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+                               my $func = $1;
+                               if (WARN('NEEDLESS_IF',
+                                        "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+                                   $fix) {
+                                       my $do_fix = 1;
+                                       my $leading_tabs = "";
+                                       my $new_leading_tabs = "";
+                                       if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+                                               $leading_tabs = $1;
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+                                               $new_leading_tabs = $1;
+                                               if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+                                                       $do_fix = 0;
+                                               }
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($do_fix) {
+                                               fix_delete_line($fixlinenr - 1, $prevrawline);
+                                               $fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+                                       }
+                               }
                        }
                }
 
index fd0db01..6ce5945 100644 (file)
@@ -1,15 +1,15 @@
 /* Extract X.509 certificate in DER form from PKCS#11 or PEM.
  *
- * Copyright © 2014 Red Hat, Inc. All Rights Reserved.
- * Copyright © 2015 Intel Corporation.
+ * Copyright © 2014-2015 Red Hat, Inc. All Rights Reserved.
+ * Copyright © 2015      Intel Corporation.
  *
  * Authors: David Howells <dhowells@redhat.com>
  *          David Woodhouse <dwmw2@infradead.org>
  *
  * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the licence, or (at your option) any later version.
  */
 #define _GNU_SOURCE
 #include <stdio.h>
@@ -86,7 +86,7 @@ static void write_cert(X509 *x509)
                ERR(!wb, "%s", cert_dst);
        }
        X509_NAME_oneline(X509_get_subject_name(x509), buf, sizeof(buf));
-       ERR(!i2d_X509_bio(wb, x509), cert_dst);
+       ERR(!i2d_X509_bio(wb, x509), "%s", cert_dst);
        if (kbuild_verbose)
                fprintf(stderr, "Extracted cert: %s\n", buf);
 }
index 058bba3..c3899ca 100755 (executable)
@@ -1,12 +1,15 @@
 /* Sign a module file using the given key.
  *
- * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
+ * Copyright © 2014-2015 Red Hat, Inc. All Rights Reserved.
+ * Copyright © 2015      Intel Corporation.
+ *
+ * Authors: David Howells <dhowells@redhat.com>
+ *          David Woodhouse <dwmw2@infradead.org>
  *
  * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the licence, or (at your option) any later version.
  */
 #define _GNU_SOURCE
 #include <stdio.h>
index 7345508..03c1652 100644 (file)
@@ -401,7 +401,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
        bool match = false;
 
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-                        lockdep_is_held(&devcgroup_mutex),
+                        !lockdep_is_held(&devcgroup_mutex),
                         "device_cgroup:verify_new_ex called without proper synchronization");
 
        if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
index 3d22014..5bed771 100644 (file)
@@ -472,7 +472,7 @@ static int sel_mmap_policy_fault(struct vm_area_struct *vma,
        return 0;
 }
 
-static struct vm_operations_struct sel_mmap_policy_ops = {
+static const struct vm_operations_struct sel_mmap_policy_ops = {
        .fault = sel_mmap_policy_fault,
        .page_mkwrite = sel_mmap_policy_fault,
 };
index 4e6b090..a75b561 100644 (file)
@@ -1135,7 +1135,7 @@ static const struct hda_fixup alc880_fixups[] = {
                /* override all pins as BIOS on old Amilo is broken */
                .type = HDA_FIXUP_PINS,
                .v.pins = (const struct hda_pintbl[]) {
-                       { 0x14, 0x0121411f }, /* HP */
+                       { 0x14, 0x0121401f }, /* HP */
                        { 0x15, 0x99030120 }, /* speaker */
                        { 0x16, 0x99030130 }, /* bass speaker */
                        { 0x17, 0x411111f0 }, /* N/A */
@@ -1155,7 +1155,7 @@ static const struct hda_fixup alc880_fixups[] = {
                /* almost compatible with FUJITSU, but no bass and SPDIF */
                .type = HDA_FIXUP_PINS,
                .v.pins = (const struct hda_pintbl[]) {
-                       { 0x14, 0x0121411f }, /* HP */
+                       { 0x14, 0x0121401f }, /* HP */
                        { 0x15, 0x99030120 }, /* speaker */
                        { 0x16, 0x411111f0 }, /* N/A */
                        { 0x17, 0x411111f0 }, /* N/A */
@@ -1364,7 +1364,7 @@ static const struct snd_pci_quirk alc880_fixup_tbl[] = {
        SND_PCI_QUIRK(0x161f, 0x203d, "W810", ALC880_FIXUP_W810),
        SND_PCI_QUIRK(0x161f, 0x205d, "Medion Rim 2150", ALC880_FIXUP_MEDION_RIM),
        SND_PCI_QUIRK(0x1631, 0xe011, "PB 13201056", ALC880_FIXUP_6ST_AUTOMUTE),
-       SND_PCI_QUIRK(0x1734, 0x107c, "FSC F1734", ALC880_FIXUP_F1734),
+       SND_PCI_QUIRK(0x1734, 0x107c, "FSC Amilo M1437", ALC880_FIXUP_FUJITSU),
        SND_PCI_QUIRK(0x1734, 0x1094, "FSC Amilo M1451G", ALC880_FIXUP_FUJITSU),
        SND_PCI_QUIRK(0x1734, 0x10ac, "FSC AMILO Xi 1526", ALC880_FIXUP_F1734),
        SND_PCI_QUIRK(0x1734, 0x10b0, "FSC Amilo Pi1556", ALC880_FIXUP_FUJITSU),
@@ -5189,8 +5189,11 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x06c7, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x06d9, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x06da, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
-       SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x1028, 0x06db, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06dd, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),
@@ -6381,6 +6384,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x05db, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x05fe, "Dell XPS 15", ALC668_FIXUP_DELL_XPS13),
        SND_PCI_QUIRK(0x1028, 0x060a, "Dell XPS 13", ALC668_FIXUP_DELL_XPS13),
+       SND_PCI_QUIRK(0x1028, 0x060d, "Dell M3800", ALC668_FIXUP_DELL_XPS13),
        SND_PCI_QUIRK(0x1028, 0x0625, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x0626, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x0696, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
index 784ceb8..35c1f6a 100644 (file)
@@ -1064,6 +1064,7 @@ static const struct of_device_id amd7930_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, amd7930_match);
 
 static struct platform_driver amd7930_sbus_driver = {
        .driver = {
index 310a382..9700860 100644 (file)
@@ -377,7 +377,15 @@ int snd_usb_add_audio_stream(struct snd_usb_audio *chip,
 
        snd_usb_init_substream(as, stream, fp);
 
-       list_add(&as->list, &chip->pcm_list);
+       /*
+        * Keep using head insertion for M-Audio Audiophile USB (tm) which has a
+        * fix to swap capture stream order in conf/cards/USB-audio.conf
+        */
+       if (chip->usb_id == USB_ID(0x0763, 0x2003))
+               list_add(&as->list, &chip->pcm_list);
+       else
+               list_add_tail(&as->list, &chip->pcm_list);
+
        chip->pcm_devs++;
 
        snd_usb_proc_pcm_format_add(as);
index eb51325..284a76e 100644 (file)
@@ -768,8 +768,8 @@ static int process_exit_event(struct perf_tool *tool,
        if (!evsel->attr.sample_id_all) {
                sample->cpu = 0;
                sample->time = 0;
-               sample->tid = event->comm.tid;
-               sample->pid = event->comm.pid;
+               sample->tid = event->fork.tid;
+               sample->pid = event->fork.pid;
        }
        print_sample_start(sample, thread, evsel);
        perf_event__fprintf(event, stdout);
index 1aa21c9..5b83f56 100644 (file)
@@ -34,6 +34,8 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
                .disabled = 1,
                .freq = 1,
        };
+       struct cpu_map *cpus;
+       struct thread_map *threads;
 
        attr.sample_freq = 500;
 
@@ -50,14 +52,19 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
        }
        perf_evlist__add(evlist, evsel);
 
-       evlist->cpus = cpu_map__dummy_new();
-       evlist->threads = thread_map__new_by_tid(getpid());
-       if (!evlist->cpus || !evlist->threads) {
+       cpus = cpu_map__dummy_new();
+       threads = thread_map__new_by_tid(getpid());
+       if (!cpus || !threads) {
                err = -ENOMEM;
                pr_debug("Not enough memory to create thread/cpu maps\n");
-               goto out_delete_evlist;
+               goto out_free_maps;
        }
 
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       cpus    = NULL;
+       threads = NULL;
+
        if (perf_evlist__open(evlist)) {
                const char *knob = "/proc/sys/kernel/perf_event_max_sample_rate";
 
@@ -107,6 +114,9 @@ next_event:
                err = -1;
        }
 
+out_free_maps:
+       cpu_map__put(cpus);
+       thread_map__put(threads);
 out_delete_evlist:
        perf_evlist__delete(evlist);
        return err;
index 3a8fede..add1638 100644 (file)
@@ -43,6 +43,8 @@ int test__task_exit(void)
        };
        const char *argv[] = { "true", NULL };
        char sbuf[STRERR_BUFSIZE];
+       struct cpu_map *cpus;
+       struct thread_map *threads;
 
        signal(SIGCHLD, sig_handler);
 
@@ -58,14 +60,19 @@ int test__task_exit(void)
         * perf_evlist__prepare_workload we'll fill in the only thread
         * we're monitoring, the one forked there.
         */
-       evlist->cpus = cpu_map__dummy_new();
-       evlist->threads = thread_map__new_by_tid(-1);
-       if (!evlist->cpus || !evlist->threads) {
+       cpus = cpu_map__dummy_new();
+       threads = thread_map__new_by_tid(-1);
+       if (!cpus || !threads) {
                err = -ENOMEM;
                pr_debug("Not enough memory to create thread/cpu maps\n");
-               goto out_delete_evlist;
+               goto out_free_maps;
        }
 
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       cpus    = NULL;
+       threads = NULL;
+
        err = perf_evlist__prepare_workload(evlist, &target, argv, false,
                                            workload_exec_failed_signal);
        if (err < 0) {
@@ -114,6 +121,9 @@ retry:
                err = -1;
        }
 
+out_free_maps:
+       cpu_map__put(cpus);
+       thread_map__put(threads);
 out_delete_evlist:
        perf_evlist__delete(evlist);
        return err;
index cf86f2d..c04c60d 100644 (file)
@@ -1968,7 +1968,8 @@ skip_annotation:
                                          &options[nr_options], dso);
                nr_options += add_map_opt(browser, &actions[nr_options],
                                          &options[nr_options],
-                                         browser->selection->map);
+                                         browser->selection ?
+                                               browser->selection->map : NULL);
 
                /* perf script support */
                if (browser->he_selection) {
@@ -1976,6 +1977,15 @@ skip_annotation:
                                                     &actions[nr_options],
                                                     &options[nr_options],
                                                     thread, NULL);
+                       /*
+                        * Note that browser->selection != NULL
+                        * when browser->he_selection is not NULL,
+                        * so we don't need to check browser->selection
+                        * before fetching browser->selection->sym like what
+                        * we do before fetching browser->selection->map.
+                        *
+                        * See hist_browser__show_entry.
+                        */
                        nr_options += add_script_opt(browser,
                                                     &actions[nr_options],
                                                     &options[nr_options],
index d51a520..c8fc8a2 100644 (file)
@@ -124,6 +124,33 @@ void perf_evlist__delete(struct perf_evlist *evlist)
        free(evlist);
 }
 
+static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
+                                         struct perf_evsel *evsel)
+{
+       /*
+        * We already have cpus for evsel (via PMU sysfs) so
+        * keep it, if there's no target cpu list defined.
+        */
+       if (!evsel->own_cpus || evlist->has_user_cpus) {
+               cpu_map__put(evsel->cpus);
+               evsel->cpus = cpu_map__get(evlist->cpus);
+       } else if (evsel->cpus != evsel->own_cpus) {
+               cpu_map__put(evsel->cpus);
+               evsel->cpus = cpu_map__get(evsel->own_cpus);
+       }
+
+       thread_map__put(evsel->threads);
+       evsel->threads = thread_map__get(evlist->threads);
+}
+
+static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel)
+               __perf_evlist__propagate_maps(evlist, evsel);
+}
+
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
 {
        entry->evlist = evlist;
@@ -133,18 +160,19 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
 
        if (!evlist->nr_entries++)
                perf_evlist__set_id_pos(evlist);
+
+       __perf_evlist__propagate_maps(evlist, entry);
 }
 
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
-                                  struct list_head *list,
-                                  int nr_entries)
+                                  struct list_head *list)
 {
-       bool set_id_pos = !evlist->nr_entries;
+       struct perf_evsel *evsel, *temp;
 
-       list_splice_tail(list, &evlist->entries);
-       evlist->nr_entries += nr_entries;
-       if (set_id_pos)
-               perf_evlist__set_id_pos(evlist);
+       __evlist__for_each_safe(list, temp, evsel) {
+               list_del_init(&evsel->node);
+               perf_evlist__add(evlist, evsel);
+       }
 }
 
 void __perf_evlist__set_leader(struct list_head *list)
@@ -210,7 +238,7 @@ static int perf_evlist__add_attrs(struct perf_evlist *evlist,
                list_add_tail(&evsel->node, &head);
        }
 
-       perf_evlist__splice_list_tail(evlist, &head, nr_attrs);
+       perf_evlist__splice_list_tail(evlist, &head);
 
        return 0;
 
@@ -1103,71 +1131,56 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
        return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
 }
 
-static int perf_evlist__propagate_maps(struct perf_evlist *evlist,
-                                      bool has_user_cpus)
-{
-       struct perf_evsel *evsel;
-
-       evlist__for_each(evlist, evsel) {
-               /*
-                * We already have cpus for evsel (via PMU sysfs) so
-                * keep it, if there's no target cpu list defined.
-                */
-               if (evsel->cpus && has_user_cpus)
-                       cpu_map__put(evsel->cpus);
-
-               if (!evsel->cpus || has_user_cpus)
-                       evsel->cpus = cpu_map__get(evlist->cpus);
-
-               evsel->threads = thread_map__get(evlist->threads);
-
-               if ((evlist->cpus && !evsel->cpus) ||
-                   (evlist->threads && !evsel->threads))
-                       return -ENOMEM;
-       }
-
-       return 0;
-}
-
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
 {
-       evlist->threads = thread_map__new_str(target->pid, target->tid,
-                                             target->uid);
+       struct cpu_map *cpus;
+       struct thread_map *threads;
+
+       threads = thread_map__new_str(target->pid, target->tid, target->uid);
 
-       if (evlist->threads == NULL)
+       if (!threads)
                return -1;
 
        if (target__uses_dummy_map(target))
-               evlist->cpus = cpu_map__dummy_new();
+               cpus = cpu_map__dummy_new();
        else
-               evlist->cpus = cpu_map__new(target->cpu_list);
+               cpus = cpu_map__new(target->cpu_list);
 
-       if (evlist->cpus == NULL)
+       if (!cpus)
                goto out_delete_threads;
 
-       return perf_evlist__propagate_maps(evlist, !!target->cpu_list);
+       evlist->has_user_cpus = !!target->cpu_list;
+
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       return 0;
 
 out_delete_threads:
-       thread_map__put(evlist->threads);
-       evlist->threads = NULL;
+       thread_map__put(threads);
        return -1;
 }
 
-int perf_evlist__set_maps(struct perf_evlist *evlist,
-                         struct cpu_map *cpus,
-                         struct thread_map *threads)
+void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
+                          struct thread_map *threads)
 {
-       if (evlist->cpus)
+       /*
+        * Allow for the possibility that one or another of the maps isn't being
+        * changed i.e. don't put it.  Note we are assuming the maps that are
+        * being applied are brand new and evlist is taking ownership of the
+        * original reference count of 1.  If that is not the case it is up to
+        * the caller to increase the reference count.
+        */
+       if (cpus != evlist->cpus) {
                cpu_map__put(evlist->cpus);
+               evlist->cpus = cpus;
+       }
 
-       evlist->cpus = cpus;
-
-       if (evlist->threads)
+       if (threads != evlist->threads) {
                thread_map__put(evlist->threads);
+               evlist->threads = threads;
+       }
 
-       evlist->threads = threads;
-
-       return perf_evlist__propagate_maps(evlist, false);
+       perf_evlist__propagate_maps(evlist);
 }
 
 int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel)
@@ -1387,6 +1400,8 @@ void perf_evlist__close(struct perf_evlist *evlist)
 
 static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist)
 {
+       struct cpu_map    *cpus;
+       struct thread_map *threads;
        int err = -ENOMEM;
 
        /*
@@ -1398,20 +1413,19 @@ static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist)
         * error, and we may not want to do that fallback to a
         * default cpu identity map :-\
         */
-       evlist->cpus = cpu_map__new(NULL);
-       if (evlist->cpus == NULL)
+       cpus = cpu_map__new(NULL);
+       if (!cpus)
                goto out;
 
-       evlist->threads = thread_map__new_dummy();
-       if (evlist->threads == NULL)
-               goto out_free_cpus;
+       threads = thread_map__new_dummy();
+       if (!threads)
+               goto out_put;
 
-       err = 0;
+       perf_evlist__set_maps(evlist, cpus, threads);
 out:
        return err;
-out_free_cpus:
-       cpu_map__put(evlist->cpus);
-       evlist->cpus = NULL;
+out_put:
+       cpu_map__put(cpus);
        goto out;
 }
 
index b39a619..115d8b5 100644 (file)
@@ -42,6 +42,7 @@ struct perf_evlist {
        int              nr_mmaps;
        bool             overwrite;
        bool             enabled;
+       bool             has_user_cpus;
        size_t           mmap_len;
        int              id_pos;
        int              is_pos;
@@ -155,9 +156,8 @@ int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
 void perf_evlist__set_selected(struct perf_evlist *evlist,
                               struct perf_evsel *evsel);
 
-int perf_evlist__set_maps(struct perf_evlist *evlist,
-                         struct cpu_map *cpus,
-                         struct thread_map *threads);
+void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
+                          struct thread_map *threads);
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target);
 int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel);
 
@@ -179,8 +179,7 @@ bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist);
 bool perf_evlist__valid_read_format(struct perf_evlist *evlist);
 
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
-                                  struct list_head *list,
-                                  int nr_entries);
+                                  struct list_head *list);
 
 static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
 {
index c53f791..5410483 100644 (file)
@@ -1033,6 +1033,7 @@ void perf_evsel__exit(struct perf_evsel *evsel)
        perf_evsel__free_config_terms(evsel);
        close_cgroup(evsel->cgrp);
        cpu_map__put(evsel->cpus);
+       cpu_map__put(evsel->own_cpus);
        thread_map__put(evsel->threads);
        zfree(&evsel->group_name);
        zfree(&evsel->name);
index 298e6bb..ef8925f 100644 (file)
@@ -98,6 +98,7 @@ struct perf_evsel {
        struct cgroup_sel       *cgrp;
        void                    *handler;
        struct cpu_map          *cpus;
+       struct cpu_map          *own_cpus;
        struct thread_map       *threads;
        unsigned int            sample_size;
        int                     id_pos;
index 4181454..fce6634 100644 (file)
@@ -1438,7 +1438,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused,
        if (ph->needs_swap)
                nr = bswap_32(nr);
 
-       ph->env.nr_cpus_online = nr;
+       ph->env.nr_cpus_avail = nr;
 
        ret = readn(fd, &nr, sizeof(nr));
        if (ret != sizeof(nr))
@@ -1447,7 +1447,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused,
        if (ph->needs_swap)
                nr = bswap_32(nr);
 
-       ph->env.nr_cpus_avail = nr;
+       ph->env.nr_cpus_online = nr;
        return 0;
 }
 
index ea76862..eb0e7f8 100644 (file)
@@ -623,7 +623,7 @@ static int intel_bts_process_event(struct perf_session *session,
        if (err)
                return err;
        if (event->header.type == PERF_RECORD_EXIT) {
-               err = intel_bts_process_tid_exit(bts, event->comm.tid);
+               err = intel_bts_process_tid_exit(bts, event->fork.tid);
                if (err)
                        return err;
        }
index bb41c20..535d86f 100644 (file)
@@ -1494,7 +1494,7 @@ static int intel_pt_process_event(struct perf_session *session,
        if (pt->timeless_decoding) {
                if (event->header.type == PERF_RECORD_EXIT) {
                        err = intel_pt_process_timeless_queues(pt,
-                                                              event->comm.tid,
+                                                              event->fork.tid,
                                                               sample->time);
                }
        } else if (timestamp) {
index d826e6f..21ed6ee 100644 (file)
@@ -287,8 +287,8 @@ __add_event(struct list_head *list, int *idx,
        if (!evsel)
                return NULL;
 
-       if (cpus)
-               evsel->cpus = cpu_map__get(cpus);
+       evsel->cpus     = cpu_map__get(cpus);
+       evsel->own_cpus = cpu_map__get(cpus);
 
        if (name)
                evsel->name = strdup(name);
@@ -1140,10 +1140,9 @@ int parse_events(struct perf_evlist *evlist, const char *str,
        ret = parse_events__scanner(str, &data, PE_START_EVENTS);
        perf_pmu__parse_cleanup();
        if (!ret) {
-               int entries = data.idx - evlist->nr_entries;
                struct perf_evsel *last;
 
-               perf_evlist__splice_list_tail(evlist, &data.list, entries);
+               perf_evlist__splice_list_tail(evlist, &data.list);
                evlist->nr_groups += data.nr_groups;
                last = perf_evlist__last(evlist);
                last->cmdline_group_boundary = true;
index 591905a..9cd7081 100644 (file)
@@ -255,7 +255,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
        list_add_tail(&term->list, head);
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head));
+       ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
        parse_events__free_terms(head);
        $$ = list;
 }
index 0501511..89b05e2 100644 (file)
@@ -6,6 +6,7 @@ TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
 TARGETS += kcmp
+TARGETS += membarrier
 TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mount
diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore
new file mode 100644 (file)
index 0000000..020c44f
--- /dev/null
@@ -0,0 +1 @@
+membarrier_test
diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
new file mode 100644 (file)
index 0000000..877a503
--- /dev/null
@@ -0,0 +1,11 @@
+CFLAGS += -g -I../../../../usr/include/
+
+all:
+       $(CC) $(CFLAGS) membarrier_test.c -o membarrier_test
+
+TEST_PROGS := membarrier_test
+
+include ../lib.mk
+
+clean:
+       $(RM) membarrier_test
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
new file mode 100644 (file)
index 0000000..dde3125
--- /dev/null
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <linux/membarrier.h>
+#include <asm-generic/unistd.h>
+#include <sys/syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+enum test_membarrier_status {
+       TEST_MEMBARRIER_PASS = 0,
+       TEST_MEMBARRIER_FAIL,
+       TEST_MEMBARRIER_SKIP,
+};
+
+static int sys_membarrier(int cmd, int flags)
+{
+       return syscall(__NR_membarrier, cmd, flags);
+}
+
+static enum test_membarrier_status test_membarrier_cmd_fail(void)
+{
+       int cmd = -1, flags = 0;
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               printf("membarrier: Wrong command should fail but passed.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_flags_fail(void)
+{
+       int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               printf("membarrier: Wrong flags should fail but passed.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_success(void)
+{
+       int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               printf("membarrier: Executing MEMBARRIER_CMD_SHARED failed. %s.\n",
+                               strerror(errno));
+               return TEST_MEMBARRIER_FAIL;
+       }
+
+       printf("membarrier: MEMBARRIER_CMD_SHARED success.\n");
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier(void)
+{
+       enum test_membarrier_status status;
+
+       status = test_membarrier_cmd_fail();
+       if (status)
+               return status;
+       status = test_membarrier_flags_fail();
+       if (status)
+               return status;
+       status = test_membarrier_success();
+       if (status)
+               return status;
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_query(void)
+{
+       int flags = 0, ret;
+
+       printf("membarrier MEMBARRIER_CMD_QUERY ");
+       ret = sys_membarrier(MEMBARRIER_CMD_QUERY, flags);
+       if (ret < 0) {
+               printf("failed. %s.\n", strerror(errno));
+               switch (errno) {
+               case ENOSYS:
+                       /*
+                        * It is valid to build a kernel with
+                        * CONFIG_MEMBARRIER=n. However, this skips the tests.
+                        */
+                       return TEST_MEMBARRIER_SKIP;
+               case EINVAL:
+               default:
+                       return TEST_MEMBARRIER_FAIL;
+               }
+       }
+       if (!(ret & MEMBARRIER_CMD_SHARED)) {
+               printf("command MEMBARRIER_CMD_SHARED is not supported.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       printf("syscall available.\n");
+       return TEST_MEMBARRIER_PASS;
+}
+
+int main(int argc, char **argv)
+{
+       switch (test_membarrier_query()) {
+       case TEST_MEMBARRIER_FAIL:
+               return ksft_exit_fail();
+       case TEST_MEMBARRIER_SKIP:
+               return ksft_exit_skip();
+       }
+       switch (test_membarrier()) {
+       case TEST_MEMBARRIER_FAIL:
+               return ksft_exit_fail();
+       case TEST_MEMBARRIER_SKIP:
+               return ksft_exit_skip();
+       }
+
+       printf("membarrier: tests done!\n");
+       return ksft_exit_pass();
+}
index 9a43a59..421c607 100644 (file)
@@ -116,8 +116,9 @@ static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
        v86->regs.eip = eip;
        ret = vm86(VM86_ENTER, v86);
 
-       if (ret == -1 && errno == ENOSYS) {
-               printf("[SKIP]\tvm86 not supported\n");
+       if (ret == -1 && (errno == ENOSYS || errno == EPERM)) {
+               printf("[SKIP]\tvm86 %s\n",
+                      errno == ENOSYS ? "not supported" : "not allowed");
                return false;
        }
 
index 98c95f2..76e38d2 100644 (file)
@@ -64,10 +64,10 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
        int ret;
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-       timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-       ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-                                 timer->irq->irq,
-                                 timer->irq->level);
+       kvm_vgic_set_phys_irq_active(timer->map, true);
+       ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+                                        timer->map,
+                                        timer->irq->level);
        WARN_ON(ret);
 }
 
@@ -117,7 +117,8 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
        cycle_t cval, now;
 
        if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+           !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
+           kvm_vgic_get_phys_irq_active(timer->map))
                return false;
 
        cval = timer->cntv_cval;
@@ -184,10 +185,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
        timer_arm(timer, ns);
 }
 
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq)
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       struct irq_phys_map *map;
 
        /*
         * The vcpu timer irq number cannot be determined in
@@ -196,6 +198,17 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
         * vcpu timer irq number when the vcpu is reset.
         */
        timer->irq = irq;
+
+       /*
+        * Tell the VGIC that the virtual interrupt is tied to a
+        * physical interrupt. We do that once per VCPU.
+        */
+       map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
+       if (WARN_ON(IS_ERR(map)))
+               return PTR_ERR(map);
+
+       timer->map = map;
+       return 0;
 }
 
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
@@ -335,6 +348,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
        timer_disarm(timer);
+       if (timer->map)
+               kvm_vgic_unmap_phys_irq(vcpu, timer->map);
 }
 
 void kvm_timer_enable(struct kvm *kvm)
index f9b9c7c..8d7b04d 100644 (file)
@@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & GICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & GICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
+       }
 
        return lr_desc;
 }
@@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
                           struct vgic_lr lr_desc)
 {
-       u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
+       u32 lr_val;
+
+       lr_val = lr_desc.irq;
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= GICH_LR_PENDING_BIT;
@@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= GICH_LR_EOI;
 
+       if (lr_desc.state & LR_HW) {
+               lr_val |= GICH_LR_HW;
+               lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
+       }
+
+       if (lr_desc.irq < VGIC_NR_SGIS)
+               lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
+
        vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 }
 
index dff0602..afbf925 100644 (file)
@@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & ICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & ICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
+       }
 
        return lr_desc;
 }
@@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
         * Eventually we want to make this configurable, so we may revisit
         * this in the future.
         */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+       switch (vcpu->kvm->arch.vgic.vgic_model) {
+       case KVM_DEV_TYPE_ARM_VGIC_V3:
                lr_val |= ICH_LR_GROUP;
-       else
-               lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       case  KVM_DEV_TYPE_ARM_VGIC_V2:
+               if (lr_desc.irq < VGIC_NR_SGIS)
+                       lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       default:
+               BUG();
+       }
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= ICH_LR_PENDING_BIT;
@@ -95,6 +106,10 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
                lr_val |= ICH_LR_ACTIVE_BIT;
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= ICH_LR_EOI;
+       if (lr_desc.state & LR_HW) {
+               lr_val |= ICH_LR_HW;
+               lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
+       }
 
        vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 }
index bc40137..9eb489a 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 
 #include <asm/kvm_emulate.h>
  *   cause the interrupt to become inactive in such a situation.
  *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
  *   inactive as long as the external input line is held high.
+ *
+ *
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *   [to be renamed to kvm_vgic_init??]
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_cpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
  */
 
 #include "vgic.h"
@@ -82,6 +105,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq);
 
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
@@ -375,7 +400,7 @@ void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
 
 static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
 {
-       return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
+       return !vgic_irq_is_queued(vcpu, irq);
 }
 
 /**
@@ -1115,6 +1140,39 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
        if (!vgic_irq_is_edge(vcpu, irq))
                vlr.state |= LR_EOI_INT;
 
+       if (vlr.irq >= VGIC_NR_SGIS) {
+               struct irq_phys_map *map;
+               map = vgic_irq_map_search(vcpu, irq);
+
+               /*
+                * If we have a mapping, and the virtual interrupt is
+                * being injected, then we must set the state to
+                * active in the physical world. Otherwise the
+                * physical interrupt will fire and the guest will
+                * exit before processing the virtual interrupt.
+                */
+               if (map) {
+                       int ret;
+
+                       BUG_ON(!map->active);
+                       vlr.hwirq = map->phys_irq;
+                       vlr.state |= LR_HW;
+                       vlr.state &= ~LR_EOI_INT;
+
+                       ret = irq_set_irqchip_state(map->irq,
+                                                   IRQCHIP_STATE_ACTIVE,
+                                                   true);
+                       WARN_ON(ret);
+
+                       /*
+                        * Make sure we're not going to sample this
+                        * again, as a HW-backed interrupt cannot be
+                        * in the PENDING_ACTIVE stage.
+                        */
+                       vgic_irq_set_queued(vcpu, irq);
+               }
+       }
+
        vgic_set_lr(vcpu, lr_nr, vlr);
        vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
@@ -1339,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
        return level_pending;
 }
 
+/*
+ * Save the physical active state, and reset it to inactive.
+ *
+ * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
+ */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+       struct irq_phys_map *map;
+       int ret;
+
+       if (!(vlr.state & LR_HW))
+               return 0;
+
+       map = vgic_irq_map_search(vcpu, vlr.irq);
+       BUG_ON(!map || !map->active);
+
+       ret = irq_get_irqchip_state(map->irq,
+                                   IRQCHIP_STATE_ACTIVE,
+                                   &map->active);
+
+       WARN_ON(ret);
+
+       if (map->active) {
+               ret = irq_set_irqchip_state(map->irq,
+                                           IRQCHIP_STATE_ACTIVE,
+                                           false);
+               WARN_ON(ret);
+               return 0;
+       }
+
+       return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -1353,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
        elrsr = vgic_get_elrsr(vcpu);
        elrsr_ptr = u64_to_bitmask(&elrsr);
 
-       /* Clear mappings for empty LRs */
-       for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+       /* Deal with HW interrupts, and clear mappings for empty LRs */
+       for (lr = 0; lr < vgic->nr_lr; lr++) {
                struct vgic_lr vlr;
 
-               if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+               if (!test_bit(lr, vgic_cpu->lr_used))
                        continue;
 
                vlr = vgic_get_lr(vcpu, lr);
+               if (vgic_sync_hwirq(vcpu, vlr)) {
+                       /*
+                        * So this is a HW interrupt that the guest
+                        * EOI-ed. Clean the LR state and allow the
+                        * interrupt to be sampled again.
+                        */
+                       vlr.state = 0;
+                       vlr.hwirq = 0;
+                       vgic_set_lr(vcpu, lr, vlr);
+                       vgic_irq_clear_queued(vcpu, vlr.irq);
+                       set_bit(lr, elrsr_ptr);
+               }
+
+               if (!test_bit(lr, elrsr_ptr))
+                       continue;
+
+               clear_bit(lr, vgic_cpu->lr_used);
 
                BUG_ON(vlr.irq >= dist->nr_irqs);
                vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
@@ -1447,7 +1555,8 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                 unsigned int irq_num, bool level)
+                                  struct irq_phys_map *map,
+                                  unsigned int irq_num, bool level)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
@@ -1455,6 +1564,9 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        int enabled;
        bool ret = true, can_inject = true;
 
+       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+               return -EINVAL;
+
        spin_lock(&dist->lock);
 
        vcpu = kvm_get_vcpu(kvm, cpuid);
@@ -1517,18 +1629,46 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 out:
        spin_unlock(&dist->lock);
 
-       return ret ? cpuid : -EINVAL;
+       if (ret) {
+               /* kick the specified vcpu */
+               kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
+       }
+
+       return 0;
+}
+
+static int vgic_lazy_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (unlikely(!vgic_initialized(kvm))) {
+               /*
+                * We only provide the automatic initialization of the VGIC
+                * for the legacy case of a GICv2. Any other type must
+                * be explicitly initialized once setup with the respective
+                * KVM device call.
+                */
+               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+                       return -EBUSY;
+
+               mutex_lock(&kvm->lock);
+               ret = vgic_init(kvm);
+               mutex_unlock(&kvm->lock);
+       }
+
+       return ret;
 }
 
 /**
  * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device
+ * @irq_num: The IRQ number that is assigned to the device. This IRQ
+ *           must not be mapped to a HW interrupt.
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *                           false: to ignore the call
- *          Level-sensitive  true:  activates an interrupt
- *                           false: deactivates an interrupt
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
  *
  * The GIC is not concerned with devices being active-LOW or active-HIGH for
  * level-sensitive interrupts.  You can think of the level parameter as 1
@@ -1537,39 +1677,44 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level)
 {
-       int ret = 0;
-       int vcpu_id;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
-                       ret = -EBUSY;
-                       goto out;
-               }
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
+       struct irq_phys_map *map;
+       int ret;
 
-               if (ret)
-                       goto out;
-       }
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
 
-       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+       map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
+       if (map)
                return -EINVAL;
 
-       vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-       if (vcpu_id >= 0) {
-               /* kick the specified vcpu */
-               kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
-       }
+       return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+}
 
-out:
-       return ret;
+/**
+ * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @map:     Pointer to a irq_phys_map structure describing the mapping
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                           false: to ignore the call
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
+ *
+ * The GIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level)
+{
+       int ret;
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
 }
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1583,6 +1728,188 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
        return IRQ_HANDLED;
 }
 
+static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
+                                                   int virt_irq)
+{
+       if (virt_irq < VGIC_NR_PRIVATE_IRQS)
+               return &vcpu->arch.vgic_cpu.irq_phys_map_list;
+       else
+               return &vcpu->kvm->arch.vgic.irq_phys_map_list;
+}
+
+/**
+ * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
+ * @vcpu: The VCPU pointer
+ * @virt_irq: The virtual irq number
+ * @irq: The Linux IRQ number
+ *
+ * Establish a mapping between a guest visible irq (@virt_irq) and a
+ * Linux irq (@irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @irq. This mapping can be
+ * established multiple times as long as the parameters are the same.
+ *
+ * Returns a valid pointer on success, and an error pointer otherwise
+ */
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map *map;
+       struct irq_phys_map_entry *entry;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       int phys_irq;
+
+       desc = irq_to_desc(irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return ERR_PTR(-EINVAL);
+       }
+
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       phys_irq = data->hwirq;
+
+       /* Create a new mapping */
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       /* Try to match an existing mapping */
+       map = vgic_irq_map_search(vcpu, virt_irq);
+       if (map) {
+               /* Make sure this mapping matches */
+               if (map->phys_irq != phys_irq   ||
+                   map->irq      != irq)
+                       map = ERR_PTR(-EINVAL);
+
+               /* Found an existing, valid mapping */
+               goto out;
+       }
+
+       map           = &entry->map;
+       map->virt_irq = virt_irq;
+       map->phys_irq = phys_irq;
+       map->irq      = irq;
+
+       list_add_tail_rcu(&entry->entry, root);
+
+out:
+       spin_unlock(&dist->irq_phys_map_lock);
+       /* If we've found a hit in the existing list, free the useless
+        * entry */
+       if (IS_ERR(map) || map != &entry->map)
+               kfree(entry);
+       return map;
+}
+
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq)
+{
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map_entry *entry;
+       struct irq_phys_map *map;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(entry, root, entry) {
+               map = &entry->map;
+               if (map->virt_irq == virt_irq) {
+                       rcu_read_unlock();
+                       return map;
+               }
+       }
+
+       rcu_read_unlock();
+
+       return NULL;
+}
+
+static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
+{
+       struct irq_phys_map_entry *entry;
+
+       entry = container_of(rcu, struct irq_phys_map_entry, rcu);
+       kfree(entry);
+}
+
+/**
+ * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
+ *
+ * Return the logical active state of a mapped interrupt. This doesn't
+ * necessarily reflects the current HW state.
+ */
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
+{
+       BUG_ON(!map);
+       return map->active;
+}
+
+/**
+ * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
+ *
+ * Set the logical active state of a mapped interrupt. This doesn't
+ * immediately affects the HW state.
+ */
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
+{
+       BUG_ON(!map);
+       map->active = active;
+}
+
+/**
+ * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
+ * @vcpu: The VCPU pointer
+ * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ *
+ * Remove an existing mapping between virtual and physical interrupts.
+ */
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+       struct list_head *root;
+
+       if (!map)
+               return -EINVAL;
+
+       root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               if (&entry->map == map) {
+                       list_del_rcu(&entry->entry);
+                       call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+                       break;
+               }
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+
+       return 0;
+}
+
+static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               list_del_rcu(&entry->entry);
+               call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+}
+
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1591,6 +1918,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        kfree(vgic_cpu->active_shared);
        kfree(vgic_cpu->pend_act_shared);
        kfree(vgic_cpu->vgic_irq_lr_map);
+       vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
        vgic_cpu->pending_shared = NULL;
        vgic_cpu->active_shared = NULL;
        vgic_cpu->pend_act_shared = NULL;
@@ -1627,6 +1955,17 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
        return 0;
 }
 
+/**
+ * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
+}
+
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
  *
@@ -1664,6 +2003,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
        kfree(dist->irq_spi_target);
        kfree(dist->irq_pending_on_cpu);
        kfree(dist->irq_active_on_cpu);
+       vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
        dist->irq_sgi_sources = NULL;
        dist->irq_spi_cpu = NULL;
        dist->irq_spi_target = NULL;
@@ -1787,6 +2127,18 @@ static int init_vgic_model(struct kvm *kvm, int type)
        return 0;
 }
 
+/**
+ * kvm_vgic_early_init - Earliest possible vgic initialization stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+       spin_lock_init(&kvm->arch.vgic.lock);
+       spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
+       INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
+}
+
 int kvm_vgic_create(struct kvm *kvm, u32 type)
 {
        int i, vcpu_lock_idx = -1, ret;
@@ -1832,7 +2184,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
        if (ret)
                goto out_unlock;
 
-       spin_lock_init(&kvm->arch.vgic.lock);
        kvm->arch.vgic.in_kernel = true;
        kvm->arch.vgic.vgic_model = type;
        kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
index 21c1424..d7ea8e2 100644 (file)
@@ -213,11 +213,15 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        goto out;
 
                r = -EINVAL;
-               if (ue->flags)
+               if (ue->flags) {
+                       kfree(e);
                        goto out;
+               }
                r = setup_routing_entry(new, e, ue);
-               if (r)
+               if (r) {
+                       kfree(e);
                        goto out;
+               }
                ++ue;
        }
 
index d8db2f8..a25a731 100644 (file)
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu halt_poll_ns . */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -217,6 +226,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
+       vcpu->halt_poll_ns = 0;
        init_waitqueue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
 
@@ -387,6 +397,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young, idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       /*
+        * Even though we do not flush TLB, this will still adversely
+        * affect performance on pre-Haswell Intel EPT, where there is
+        * no EPT Access Bit to clear so that we have to tear down EPT
+        * tables instead. If we find this unacceptable, we can always
+        * add a parameter to kvm_age_hva so that it effectively doesn't
+        * do anything on clear_young.
+        *
+        * Also note that currently we never issue secondary TLB flushes
+        * from clear_young, leaving this job up to the regular system
+        * cadence. If we find this inaccurate, we might come up with a
+        * more sophisticated heuristic later.
+        */
+       young = kvm_age_hva(kvm, start, end);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
@@ -419,6 +459,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
@@ -1906,6 +1947,35 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       /* 10us base */
+       if (val == 0 && halt_poll_ns_grow)
+               val = 10000;
+       else
+               val *= halt_poll_ns_grow;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       if (halt_poll_ns_shrink == 0)
+               val = 0;
+       else
+               val /= halt_poll_ns_shrink;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
+}
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
        if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1928,10 +1998,11 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        ktime_t start, cur;
        DEFINE_WAIT(wait);
        bool waited = false;
+       u64 block_ns;
 
        start = cur = ktime_get();
-       if (halt_poll_ns) {
-               ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+       if (vcpu->halt_poll_ns) {
+               ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
                do {
                        /*
@@ -1960,7 +2031,21 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        cur = ktime_get();
 
 out:
-       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+       block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+       if (halt_poll_ns) {
+               if (block_ns <= vcpu->halt_poll_ns)
+                       ;
+               /* we had a long block, shrink polling */
+               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                       shrink_halt_poll_ns(vcpu);
+               /* we had a short halt and our poll time is too small */
+               else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                       block_ns < halt_poll_ns)
+                       grow_halt_poll_ns(vcpu);
+       }
+
+       trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);