Merge branch 'for-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetoot...
author     David S. Miller <davem@davemloft.net>
           Sat, 8 Oct 2016 12:24:37 +0000 (08:24 -0400)
committer  David S. Miller <davem@davemloft.net>
           Sat, 8 Oct 2016 12:24:37 +0000 (08:24 -0400)
Johan Hedberg says:

====================
pull request: bluetooth 2016-10-08

Here are a couple of Bluetooth fixes for the 4.9 kernel:

 - Firmware download fix for Atheros controllers
 - Fixes to the content of LE scan response
 - New USB ID for a Marvell chipset

Please let me know if there are any issues pulling. Thanks.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
449 files changed:
Documentation/devicetree/bindings/net/mediatek-net.txt
Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
Documentation/devicetree/bindings/net/renesas,ravb.txt
Documentation/filesystems/f2fs.txt
Documentation/filesystems/xfs.txt
Documentation/kernel-parameters.txt
Documentation/sysctl/README
Documentation/sysctl/fs.txt
Documentation/sysctl/user.txt [new file with mode: 0644]
Documentation/trace/ftrace.txt
Documentation/trace/hwlat_detector.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/arm-vgic-its.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/arm-vgic-v3.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/arm-vgic.txt
Documentation/virtual/kvm/devices/vcpu.txt
Documentation/xtensa/mmu.txt
MAINTAINERS
arch/arm/Makefile
arch/arm/boot/Makefile
arch/arm/common/sa1111.c
arch/arm/include/asm/arch_gicv3.h
arch/arm/include/asm/assembler.h
arch/arm/include/asm/cacheflush.h
arch/arm/include/asm/cachetype.h
arch/arm/include/asm/cp15.h
arch/arm/include/asm/cputype.h
arch/arm/include/asm/delay.h
arch/arm/include/asm/flat.h
arch/arm/include/asm/glue-cache.h
arch/arm/include/asm/hardware/cache-l2x0.h
arch/arm/include/asm/hardware/sa1111.h
arch/arm/include/asm/hw_breakpoint.h
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_emulate.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_hyp.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/module.h
arch/arm/include/asm/v7m.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kernel/cpuidle.c
arch/arm/kernel/head-nommu.S
arch/arm/kernel/module-plts.c
arch/arm/kernel/module.lds
arch/arm/kernel/setup.c
arch/arm/kernel/smp.c
arch/arm/kernel/vdso.c
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/coproc.c
arch/arm/kvm/emulate.c
arch/arm/kvm/handle_exit.c
arch/arm/kvm/hyp/Makefile
arch/arm/kvm/hyp/entry.S
arch/arm/kvm/hyp/hyp-entry.S
arch/arm/kvm/hyp/switch.c
arch/arm/kvm/hyp/tlb.c
arch/arm/kvm/mmio.c
arch/arm/kvm/mmu.c
arch/arm/lib/delay.c
arch/arm/mach-footbridge/include/mach/hardware.h
arch/arm/mach-rpc/include/mach/hardware.h
arch/arm/mach-sa1100/include/mach/hardware.h
arch/arm/mm/Kconfig
arch/arm/mm/Makefile
arch/arm/mm/cache-l2x0-pmu.c [new file with mode: 0644]
arch/arm/mm/cache-l2x0.c
arch/arm/mm/cache-v7m.S [new file with mode: 0644]
arch/arm/mm/dma-mapping.c
arch/arm/mm/mmu.c
arch/arm/mm/proc-macros.S
arch/arm/mm/proc-v7m.S
arch/arm64/include/asm/arch_gicv3.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/emulate.c [deleted file]
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/debug-sr.c
arch/arm64/kvm/hyp/entry.S
arch/arm64/kvm/hyp/hyp-entry.S
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/hyp/tlb.c
arch/arm64/kvm/hyp/vgic-v3-sr.c [deleted file]
arch/arm64/kvm/inject_fault.c
arch/m68k/Kconfig
arch/m68k/Kconfig.cpu
arch/m68k/coldfire/clk.c
arch/m68k/coldfire/head.S
arch/m68k/coldfire/m528x.c
arch/m68k/coldfire/m53xx.c
arch/m68k/coldfire/m54xx.c
arch/m68k/include/asm/bootinfo.h
arch/m68k/include/asm/m5441xsim.h
arch/m68k/include/asm/m54xxacr.h
arch/m68k/include/asm/m54xxsim.h
arch/m68k/include/asm/mcfmmu.h
arch/m68k/include/asm/nettel.h
arch/m68k/include/uapi/asm/bootinfo.h
arch/m68k/kernel/Makefile
arch/m68k/kernel/process.c
arch/m68k/kernel/setup_mm.c
arch/m68k/kernel/setup_no.c
arch/m68k/kernel/uboot.c [new file with mode: 0644]
arch/m68k/mm/mcfmmu.c
arch/mips/include/asm/kvm_host.h
arch/mips/kvm/emulate.c
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/trap_emul.c
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/io.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/pnv-pci.h
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xics.h
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/trace_hv.h
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/pci-ioda.c
arch/s390/include/asm/kvm_host.h
arch/s390/kernel/asm-offsets.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/guestdbg.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/sh/kernel/ftrace.c
arch/sparc/include/asm/hypervisor.h
arch/sparc/kernel/kprobes.c
arch/sparc/kernel/pci_sun4v.c
arch/sparc/kernel/traps_64.c
arch/sparc/kernel/unaligned_64.c
arch/sparc/mm/fault_64.c
arch/sparc/mm/init_64.c
arch/sparc/prom/ranges.c
arch/x86/configs/kvm_guest.config [deleted file]
arch/x86/entry/vdso/vclock_gettime.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/pvclock.h
arch/x86/include/asm/xen/events.h
arch/x86/kernel/pvclock.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/debugfs.c [new file with mode: 0644]
arch/x86/kvm/hyperv.c
arch/x86/kvm/hyperv.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/pci/xen.c
arch/x86/xen/enlighten.c
arch/x86/xen/grant-table.c
arch/x86/xen/platform-pci-unplug.c
arch/x86/xen/pmu.c
arch/x86/xen/smp.c
arch/x86/xen/smp.h
arch/x86/xen/time.c
arch/xtensa/Kconfig
arch/xtensa/boot/boot-elf/boot.lds.S
arch/xtensa/boot/boot-elf/bootstrap.S
arch/xtensa/boot/boot-uboot/Makefile
arch/xtensa/boot/dts/csp.dts [new file with mode: 0644]
arch/xtensa/boot/dts/xtfpga.dtsi
arch/xtensa/configs/audio_kc705_defconfig
arch/xtensa/configs/cadence_csp_defconfig [new file with mode: 0644]
arch/xtensa/configs/common_defconfig
arch/xtensa/configs/generic_kc705_defconfig
arch/xtensa/configs/iss_defconfig
arch/xtensa/configs/nommu_kc705_defconfig
arch/xtensa/configs/smp_lx200_defconfig
arch/xtensa/include/asm/bitops.h
arch/xtensa/include/asm/cacheasm.h
arch/xtensa/include/asm/fixmap.h
arch/xtensa/include/asm/highmem.h
arch/xtensa/include/asm/initialize_mmu.h
arch/xtensa/include/asm/kmem_layout.h [new file with mode: 0644]
arch/xtensa/include/asm/page.h
arch/xtensa/include/asm/pgtable.h
arch/xtensa/include/asm/platform.h
arch/xtensa/include/asm/processor.h
arch/xtensa/include/asm/sysmem.h
arch/xtensa/include/asm/vectors.h
arch/xtensa/include/uapi/asm/types.h
arch/xtensa/include/uapi/asm/unistd.h
arch/xtensa/kernel/entry.S
arch/xtensa/kernel/head.S
arch/xtensa/kernel/setup.c
arch/xtensa/kernel/time.c
arch/xtensa/kernel/vmlinux.lds.S
arch/xtensa/mm/init.c
arch/xtensa/platforms/iss/include/platform/simcall.h
arch/xtensa/platforms/iss/setup.c
arch/xtensa/platforms/iss/simdisk.c
arch/xtensa/platforms/xt2000/setup.c
arch/xtensa/platforms/xtfpga/setup.c
arch/xtensa/variants/csp/include/variant/core.h [new file with mode: 0644]
arch/xtensa/variants/csp/include/variant/tie-asm.h [new file with mode: 0644]
arch/xtensa/variants/csp/include/variant/tie.h [new file with mode: 0644]
drivers/acpi/apei/erst.c
drivers/amba/bus.c
drivers/cpufreq/sa1110-cpufreq.c
drivers/firmware/efi/efi-pstore.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_proto.h
drivers/iommu/amd_iommu_types.h
drivers/net/ethernet/broadcom/bgmac.c
drivers/net/ethernet/cadence/macb.c
drivers/net/ethernet/freescale/fman/Makefile
drivers/net/ethernet/freescale/fman/fman.c
drivers/net/ethernet/freescale/fman/fman_mac.h
drivers/net/ethernet/freescale/fman/fman_memac.c
drivers/net/ethernet/freescale/fman/fman_muram.c
drivers/net/ethernet/freescale/fman/fman_muram.h
drivers/net/ethernet/freescale/fman/fman_port.c
drivers/net/ethernet/freescale/fman/fman_sp.c
drivers/net/ethernet/freescale/fman/mac.c
drivers/net/ethernet/freescale/fman/mac.h
drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
drivers/net/ethernet/mediatek/mtk_eth_soc.c
drivers/net/ethernet/mediatek/mtk_eth_soc.h
drivers/net/ethernet/qualcomm/Kconfig
drivers/net/ethernet/ti/cpsw-phy-sel.c
drivers/net/ethernet/toshiba/ps3_gelic_net.c
drivers/net/ethernet/xilinx/xilinx_axienet_main.c
drivers/net/phy/Kconfig
drivers/net/phy/mscc.c
drivers/net/wan/fsl_ucc_hdlc.c
drivers/net/xen-netback/Makefile
drivers/net/xen-netback/common.h
drivers/net/xen-netback/interface.c
drivers/net/xen-netback/netback.c
drivers/net/xen-netback/rx.c [new file with mode: 0644]
drivers/net/xen-netback/xenbus.c
drivers/xen/events/events_base.c
drivers/xen/events/events_fifo.c
drivers/xen/platform-pci.c
drivers/xen/sys-hypervisor.c
drivers/xen/xen-pciback/pci_stub.c
fs/Kconfig.binfmt
fs/afs/rxrpc.c
fs/autofs4/waitq.c
fs/dax.c
fs/ext2/Kconfig
fs/ext2/ext2.h
fs/ext2/file.c
fs/ext2/inode.c
fs/f2fs/acl.c
fs/f2fs/acl.h
fs/f2fs/checkpoint.c
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/inline.c
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/node.h
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/super.c
fs/f2fs/xattr.c
fs/internal.h
fs/iomap.c
fs/mount.h
fs/namespace.c
fs/nsfs.c
fs/orangefs/dcache.c
fs/orangefs/devorangefs-req.c
fs/orangefs/downcall.h
fs/orangefs/file.c
fs/orangefs/namei.c
fs/orangefs/orangefs-cache.c
fs/orangefs/orangefs-debugfs.c
fs/orangefs/orangefs-debugfs.h
fs/orangefs/orangefs-dev-proto.h
fs/orangefs/orangefs-kernel.h
fs/orangefs/orangefs-mod.c
fs/orangefs/orangefs-sysfs.c
fs/orangefs/orangefs-utils.c
fs/orangefs/protocol.h
fs/orangefs/super.c
fs/orangefs/upcall.h
fs/orangefs/waitqueue.c
fs/pnode.c
fs/pnode.h
fs/proc/proc_sysctl.c
fs/pstore/platform.c
fs/pstore/pmsg.c
fs/pstore/ram.c
fs/pstore/ram_core.c
fs/xfs/Makefile
fs/xfs/libxfs/xfs_ag_resv.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_ag_resv.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc.h
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_btree.h
fs/xfs/libxfs/xfs_defer.c
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_extent_busy.c
fs/xfs/xfs_file.c
fs/xfs/xfs_filestream.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_rmap_item.c
fs/xfs/xfs_rmap_item.h
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_xattr.c
include/kvm/arm_vgic.h
include/linux/amd-iommu.h
include/linux/cgroup.h
include/linux/cpuhotplug.h
include/linux/dax.h
include/linux/f2fs_fs.h
include/linux/ftrace.h
include/linux/ftrace_irq.h
include/linux/if_team.h
include/linux/iomap.h
include/linux/ipc_namespace.h
include/linux/kvm_host.h
include/linux/mount.h
include/linux/pid_namespace.h
include/linux/proc_ns.h
include/linux/pstore.h
include/linux/pstore_ram.h
include/linux/sysctl.h
include/linux/user_namespace.h
include/linux/utsname.h
include/net/net_namespace.h
include/trace/events/f2fs.h
include/uapi/linux/nsfs.h [new file with mode: 0644]
include/xen/xen.h
ipc/namespace.c
kernel/Makefile
kernel/cgroup.c
kernel/configs/kvm_guest.config [new file with mode: 0644]
kernel/fork.c
kernel/pid_namespace.c
kernel/sysctl.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ftrace.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_entries.h
kernel/trace/trace_events_trigger.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_hwlat.c [new file with mode: 0644]
kernel/trace/trace_output.c
kernel/trace/trace_uprobe.c
kernel/ucount.c [new file with mode: 0644]
kernel/user_namespace.c
kernel/utsname.c
mm/filemap.c
net/core/net_namespace.c
net/ipv6/addrconf.c
net/netlink/af_netlink.c
net/packet/af_packet.c
net/rxrpc/af_rxrpc.c
net/rxrpc/ar-internal.h
net/rxrpc/call_accept.c
net/rxrpc/call_event.c
net/rxrpc/call_object.c
net/rxrpc/input.c
net/rxrpc/misc.c
net/rxrpc/output.c
net/rxrpc/recvmsg.c
net/rxrpc/rxkad.c
net/rxrpc/sendmsg.c
net/sysctl_net.c
scripts/tracing/ftrace-bisect.sh [new file with mode: 0755]
tools/testing/selftests/Makefile
tools/testing/selftests/nsfs/Makefile [new file with mode: 0644]
tools/testing/selftests/nsfs/owner.c [new file with mode: 0644]
tools/testing/selftests/nsfs/pidns.c [new file with mode: 0644]
virt/kvm/arm/aarch32.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/hyp/vgic-v3-sr.c [new file with mode: 0644]
virt/kvm/arm/pmu.c
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/arm/vgic/vgic-irqfd.c
virt/kvm/arm/vgic/vgic-kvm-device.c
virt/kvm/arm/vgic/vgic-mmio-v3.c
virt/kvm/arm/vgic/vgic-mmio.c
virt/kvm/arm/vgic/vgic-mmio.h
virt/kvm/arm/vgic/vgic-v2.c
virt/kvm/arm/vgic/vgic.c
virt/kvm/arm/vgic/vgic.h
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index f095257..c010faf 100644 (file)
@@ -24,7 +24,6 @@ Required properties:
 Optional properties:
 - interrupt-parent: Should be the phandle for the interrupt controller
   that services interrupts for this device
-- mediatek,hwlro: the capability if the hardware supports LRO functions
 
 * Ethernet MAC node
 
@@ -54,7 +53,6 @@ eth: ethernet@1b100000 {
        reset-names = "eth";
        mediatek,ethsys = <&ethsys>;
        mediatek,pctl = <&syscfg_pctl_a>;
-       mediatek,hwlro;
        #address-cells = <1>;
        #size-cells = <0>;
 
index f9c32ad..c35b5b4 100644 (file)
@@ -34,16 +34,17 @@ KSZ9031:
 
   All skew control options are specified in picoseconds. The minimum
   value is 0, and the maximum is property-dependent. The increment
-  step is 60ps.
+  step is 60ps. The default value is the neutral setting, so setting
+  rxc-skew-ps=<0> actually results in a -900 picosecond adjustment.
 
   Optional properties:
 
-    Maximum value of 1860:
+    Maximum value of 1860, default value 900:
 
       - rxc-skew-ps : Skew control of RX clock pad
       - txc-skew-ps : Skew control of TX clock pad
 
-    Maximum value of 900:
+    Maximum value of 900, default value 420:
 
       - rxdv-skew-ps : Skew control of RX CTL pad
       - txen-skew-ps : Skew control of TX CTL pad
index c8ac222..b519503 100644 (file)
@@ -10,6 +10,7 @@ Required properties:
              "renesas,etheravb-r8a7793" if the device is a part of R8A7793 SoC.
              "renesas,etheravb-r8a7794" if the device is a part of R8A7794 SoC.
              "renesas,etheravb-r8a7795" if the device is a part of R8A7795 SoC.
+             "renesas,etheravb-r8a7796" if the device is a part of R8A7796 SoC.
              "renesas,etheravb-rcar-gen2" for generic R-Car Gen 2 compatible interface.
              "renesas,etheravb-rcar-gen3" for generic R-Car Gen 3 compatible interface.
 
@@ -33,7 +34,7 @@ Optional properties:
 - interrupt-parent: the phandle for the interrupt controller that services
                    interrupts for this device.
 - interrupt-names: A list of interrupt names.
-                  For the R8A7795 SoC this property is mandatory;
+                  For the R8A779[56] SoCs this property is mandatory;
                   it should include one entry per channel, named "ch%u",
                   where %u is the channel number ranging from 0 to 24.
                   For other SoCs this property is optional; if present
index ecd8080..753dd4f 100644 (file)
@@ -131,6 +131,7 @@ inline_dentry          Enable the inline dir feature: data in new created
                        directory entries can be written into inode block. The
                        space of inode block which is used to store inline
                        dentries is limited to ~3.4k.
+noinline_dentry        Disable the inline dentry feature.
 flush_merge           Merge concurrent cache_flush commands as much as possible
                        to eliminate redundant command issues. If the underlying
                       device handles the cache_flush command relatively slowly,
index 8146e9f..c2d44e6 100644 (file)
@@ -348,3 +348,126 @@ Removed Sysctls
   ----                         -------
   fs.xfs.xfsbufd_centisec      v4.0
   fs.xfs.age_buffer_centisecs  v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+       Defines how fast XFS should propagate an error upwards when a specific
+       error is found during the filesystem operation. It can propagate
+       immediately, after a defined number of retries, after a set time period,
+       or simply retry forever.
+
+ -error classes:
+       Specifies the subsystem the error configuration will apply to, such as
+       metadata IO or memory allocation. Different subsystems will have
+       different error handlers for which behaviour can be configured.
+
+ -error handlers:
+       Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via sysfs files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can do with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+  <dev>
+       The short device name of the mounted filesystem. This is the same device
+       name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+  <class>
+       The subsystem the error configuration belongs to. As of 4.9, the defined
+       classes are:
+
+               - "metadata": applies metadata buffer write IO
+
+  <error>
+       The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in its top
+level directory:
+
+  /sys/fs/xfs/<dev>/error/
+
+  fail_at_unmount              (Min:  0  Default:  1  Max: 1)
+       Defines the filesystem error behavior at unmount time.
+
+       If set to a value of 1, XFS will override all other error configurations
+       during unmount and replace them with "immediate fail" characteristics.
+       i.e. no retries, no retry timeout. This will always allow unmount to
+       succeed when there are persistent errors present.
+
+       If set to 0, the configured retry behaviour will continue until all
+       retries and/or timeouts have been exhausted. This will delay unmount
+       completion when there are persistent errors, and it may prevent the
+       filesystem from ever unmounting fully in the case of "retry forever"
+       handler configurations.
+
+       Note: there is no guarantee that fail_at_unmount can be set whilst an
+       unmount is in progress. It is possible that the sysfs entries are
+       removed by the unmounting filesystem before a "retry forever" error
+       handler configuration causes unmount to hang, and hence the filesystem
+       must be configured appropriately before unmount begins to prevent
+       unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configured for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+  max_retries                  (Min: -1  Default: Varies  Max: INTMAX)
+       Defines the allowed number of retries of a specific error before
+       the filesystem will propagate the error. The retry count for a given
+       error context (e.g. a specific metadata buffer) is reset every time
+       there is a successful completion of the operation.
+
+       Setting the value to "-1" will cause XFS to retry forever for this
+       specific error.
+
+       Setting the value to "0" will cause XFS to fail immediately when the
+       specific error is reported.
+
+       Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+       operation "N" times before propagating the error.
+
+  retry_timeout_seconds                (Min:  -1  Default:  Varies  Max: 1 day)
+       Define the amount of time (in seconds) that the filesystem is
+       allowed to retry its operations when the specific error is
+       found.
+
+       Setting the value to "-1" will allow XFS to retry forever for this
+       specific error.
+
+       Setting the value to "0" will cause XFS to fail immediately when the
+       specific error is reported.
+
+       Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+       operation for up to "N" seconds before propagating the error.
+
+Note: The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.
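
As a rough illustration of the configuration interface described above (a sketch, not part of this series), the following C program drives the sysfs knobs; the device name "sda1" and the "metadata/default" handler path are assumptions for the example and should be checked under /sys/fs/xfs/<dev>/error/ on the running system:

	#include <stdio.h>

	/* Write a single value into one of the XFS error sysfs files. */
	static int write_sysfs(const char *path, const char *value)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return -1;
		}
		fprintf(f, "%s\n", value);
		return fclose(f);
	}

	int main(void)
	{
		/* Fail all pending retries immediately at unmount time. */
		write_sysfs("/sys/fs/xfs/sda1/error/fail_at_unmount", "1");

		/* Retry metadata buffer write errors up to 5 times ... */
		write_sysfs("/sys/fs/xfs/sda1/error/metadata/default/max_retries", "5");
		/* ... or for at most 30 seconds, whichever constraint expires first. */
		write_sysfs("/sys/fs/xfs/sda1/error/metadata/default/retry_timeout_seconds", "30");

		return 0;
	}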
index 6fa1d8a..ec8d814 100644 (file)
@@ -460,6 +460,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        driver will print ACPI tables for AMD IOMMU during
                        IOMMU initialization.
 
+       amd_iommu_intr= [HW,X86-64]
+                       Specifies one of the following AMD IOMMU interrupt
+                       remapping modes:
+                       legacy     - Use legacy interrupt remapping mode.
+                       vapic      - Use virtual APIC mode, which allows IOMMU
+                                    to inject interrupts directly into guest.
+                                    This mode requires kvm-amd.avic=1.
+                                    (Default when IOMMU HW support is present.)
+
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
                        Format: <a>,<b>
index 8c3306e..91f54ff 100644 (file)
@@ -69,6 +69,7 @@ proc/         <empty>
 sunrpc/                SUN Remote Procedure Call (NFS)
 vm/            memory management tuning
                buffer and cache management
+user/          Per user per user namespace limits
 
 These are the subdirs I have on my system. There might be more
 or other subdirs in another setup. If you see another dir, I'd
index 302b5ed..35e17f7 100644 (file)
@@ -265,6 +265,13 @@ aio-nr can grow to.
 
 ==============================================================
 
+mount-max:
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+==============================================================
+
 
 2. /proc/sys/fs/binfmt_misc
 ----------------------------------------------------------
diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt
new file mode 100644 (file)
index 0000000..1291c49
--- /dev/null
@@ -0,0 +1,66 @@
+Documentation for /proc/sys/user/*     kernel version 4.9.0
+       (c) 2016                Eric Biederman <ebiederm@xmission.com>
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem.  It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects is charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created the user namespaces that the creation of the object happens
+in (user namespaces can be nested) and verified to be below the per user
+limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+- max_cgroup_namespaces
+
+  The maximum number of cgroup namespaces that any user in the current
+  user namespace may create.
+
+- max_ipc_namespaces
+
+  The maximum number of ipc namespaces that any user in the current
+  user namespace may create.
+
+- max_mnt_namespaces
+
+  The maximum number of mount namespaces that any user in the current
+  user namespace may create.
+
+- max_net_namespaces
+
+  The maximum number of network namespaces that any user in the
+  current user namespace may create.
+
+- max_pid_namespaces
+
+  The maximum number of pid namespaces that any user in the current
+  user namespace may create.
+
+- max_user_namespaces
+
+  The maximum number of user namespaces that any user in the current
+  user namespace may create.
+
+- max_uts_namespaces
+
+  The maximum number of uts namespaces that any user in the current
+  user namespace may create.
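
As a rough illustration of these limits (a sketch, not part of this series), the following C program reads one of the files listed above and then tightens it; the new limit of 100 is an arbitrary example, and lowering a limit requires appropriate privilege in the owning user namespace:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/proc/sys/user/max_user_namespaces";
		long cur;
		FILE *f;

		/* Read the current per user limit in this user namespace. */
		f = fopen(path, "r");
		if (!f || fscanf(f, "%ld", &cur) != 1) {
			perror(path);
			return 1;
		}
		fclose(f);
		printf("current %s = %ld\n", path, cur);

		/* Tighten the limit; this write only succeeds with sufficient
		 * privilege in the user namespace that owns the sysctl. */
		f = fopen(path, "w");
		if (f) {
			fprintf(f, "%d\n", 100);
			fclose(f);
		}
		return 0;
	}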
index a6b3705..185c39f 100644 (file)
@@ -858,11 +858,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
               When enabled, it will account time the task has been
               scheduled out as part of the function call.
 
-  graph-time - When running function graph tracer, to include the
-              time to call nested functions. When this is not set,
-              the time reported for the function will only include
-              the time the function itself executed for, not the time
-              for functions that it called.
+  graph-time - When running function profiler with function graph tracer,
+              to include the time to call nested functions. When this is
+              not set, the time reported for the function will only
+              include the time the function itself executed for, not the
+              time for functions that it called.
 
   record-cmd - When any event or tracer is enabled, a hook is enabled
               in the sched_switch trace point to fill comm cache
diff --git a/Documentation/trace/hwlat_detector.txt b/Documentation/trace/hwlat_detector.txt
new file mode 100644 (file)
index 0000000..3207717
--- /dev/null
@@ -0,0 +1,79 @@
+Introduction:
+-------------
+
+The tracer hwlat_detector is a special purpose tracer that is used to
+detect large system latencies induced by the behavior of certain underlying
+hardware or firmware, independent of Linux itself. The code was developed
+originally to detect SMIs (System Management Interrupts) on x86 systems,
+however there is nothing x86 specific about this patchset. It was
+originally written for use by the "RT" patch since the Real Time
+kernel is highly latency sensitive.
+
+SMIs are not serviced by the Linux kernel, which means that it does not
+even know that they are occurring. SMIs are instead set up by BIOS code
+and are serviced by BIOS code, usually for "critical" events such as
+management of thermal sensors and fans. Sometimes though, SMIs are used for
+other tasks and those tasks can spend an inordinate amount of time in the
+handler (sometimes measured in milliseconds). Obviously this is a problem if
+you are trying to keep event service latencies down in the microsecond range.
+
+The hardware latency detector works by hogging one of the cpus for configurable
+amounts of time (with interrupts disabled), polling the CPU Time Stamp Counter
+for some period, then looking for gaps in the TSC data. Any gap indicates a
+time when the polling was interrupted and since the interrupts are disabled,
+the only thing that could do that would be an SMI or other hardware hiccup
+(or an NMI, but those can be tracked).
+
+Note that the hwlat detector should *NEVER* be used in a production environment.
+It is intended to be run manually to determine if the hardware platform has a
+problem with long system firmware service routines.
+
+Usage:
+------
+
+Write the ASCII text "hwlat" into the current_tracer file of the tracing system
+(mounted at /sys/kernel/tracing or /sys/kernel/debug/tracing). It is possible to
+redefine the threshold in microseconds (us) above which latency spikes will
+be taken into account.
+
+Example:
+
+       # echo hwlat > /sys/kernel/tracing/current_tracer
+       # echo 100 > /sys/kernel/tracing/tracing_thresh
+
+The /sys/kernel/tracing/hwlat_detector interface contains the following files:
+
+width                  - time period to sample with CPUs held (usecs)
+                         must be less than the total window size (enforced)
+window                 - total period of sampling, width being inside (usecs)
+
+By default the width is set to 500,000 and window to 1,000,000, meaning that
+for every 1,000,000 usecs (1s) the hwlat detector will spin for 500,000 usecs
+(0.5s). If tracing_thresh contains zero when hwlat tracer is enabled, it will
+change to a default of 10 usecs. If any latencies that exceed the threshold are
+observed, the data will be written to the tracing ring buffer.
+
+The minimum sleep time between periods is 1 millisecond, even if the
+difference between window and width is less than 1 millisecond, so that
+the system is not totally starved.
+
+If tracing_thresh was zero when hwlat detector was started, it will be set
+back to zero if another tracer is loaded. Note, the last value in
+tracing_thresh that hwlat detector had will be saved and this value will
+be restored in tracing_thresh if it is still zero when hwlat detector is
+started again.
+
+The following tracing directory files are used by the hwlat_detector:
+
+in /sys/kernel/tracing:
+
+ tracing_thresh        - minimum latency value to be considered (usecs)
+ tracing_max_latency   - maximum hardware latency actually observed (usecs)
+ tracing_cpumask       - the CPUs to move the hwlat thread across
+ hwlat_detector/width  - specified amount of time to spin within window (usecs)
+ hwlat_detector/window - amount of time between (width) runs (usecs)
+
+The hwlat detector's kernel thread will migrate across each CPU specified in
+tracing_cpumask between each window. To limit the migration, either modify
+tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU
+affinity directly, and the migration will stop.
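
For reference, the shell commands above can equally be driven from C (a sketch, not part of this series; the tracefs mount point /sys/kernel/tracing and the example width/window values are assumptions):

	#include <stdio.h>

	/* Write a string into a tracefs control file. */
	static void write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return;
		}
		fputs(val, f);
		fclose(f);
	}

	int main(void)
	{
		/* Report any gap longer than 100 usecs. */
		write_str("/sys/kernel/tracing/tracing_thresh", "100\n");
		/* Spin for 250,000 usecs out of every 1,000,000 usec window. */
		write_str("/sys/kernel/tracing/hwlat_detector/width", "250000\n");
		write_str("/sys/kernel/tracing/hwlat_detector/window", "1000000\n");
		/* Finally enable the tracer itself. */
		write_str("/sys/kernel/tracing/current_tracer", "hwlat\n");

		return 0;
	}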
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
new file mode 100644 (file)
index 0000000..6081a5b
--- /dev/null
@@ -0,0 +1,38 @@
+ARM Virtual Interrupt Translation Service (ITS)
+===============================================
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
+
+The ITS allows MSI(-X) interrupts to be injected into guests. This extension is
+optional.  Creating a virtual ITS controller also requires a host GICv3 (see
+arm-vgic-v3.txt), but does not depend on having physical ITS controllers.
+
+There can be multiple ITS controllers per guest, each of them has to have
+a separate, non-overlapping MMIO region.
+
+
+Groups:
+  KVM_DEV_ARM_VGIC_GRP_ADDR
+  Attributes:
+    KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 ITS
+      control register frame.
+      This address needs to be 64K aligned and the region covers 128K.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -EFAULT: Invalid user pointer for attr->addr.
+    -ENODEV: Incorrect attribute or the ITS is not supported.
+
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+    KVM_DEV_ARM_VGIC_CTRL_INIT
+      request the initialization of the ITS, no additional parameter in
+      kvm_device_attr.addr.
+  Errors:
+    -ENXIO:  ITS not properly configured as required prior to setting
+             this attribute
+    -ENOMEM: Memory shortage when allocating ITS internal data
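
As an illustration of the attribute flow described above (a sketch, not part of this series), a VMM on an arm64 host might create and initialise a virtual ITS roughly as follows; the VM file descriptor and the 64K-aligned base address are assumed to come from the caller:

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int create_and_init_its(int vm_fd, uint64_t its_base)
	{
		struct kvm_create_device cd = {
			.type = KVM_DEV_TYPE_ARM_VGIC_ITS,
		};
		struct kvm_device_attr attr;

		/* Create the ITS device; cd.fd is filled in by the kernel. */
		if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
			return -1;

		/* KVM_VGIC_ITS_ADDR_TYPE: guest physical base of the 128K control frame. */
		memset(&attr, 0, sizeof(attr));
		attr.group = KVM_DEV_ARM_VGIC_GRP_ADDR;
		attr.attr  = KVM_VGIC_ITS_ADDR_TYPE;
		attr.addr  = (uint64_t)(uintptr_t)&its_base;
		if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
			return -1;

		/* KVM_DEV_ARM_VGIC_CTRL_INIT takes no payload in attr.addr. */
		memset(&attr, 0, sizeof(attr));
		attr.group = KVM_DEV_ARM_VGIC_GRP_CTRL;
		attr.attr  = KVM_DEV_ARM_VGIC_CTRL_INIT;
		return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
	}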
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
new file mode 100644 (file)
index 0000000..9348b3c
--- /dev/null
@@ -0,0 +1,206 @@
+ARM Virtual Generic Interrupt Controller v3 and later (VGICv3)
+==============================================================
+
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
+
+Only one VGIC instance may be instantiated through this API.  The created VGIC
+will act as the VM interrupt controller, requiring emulated user-space devices
+to inject interrupts to the VGIC instead of directly to CPUs.  It is not
+possible to create both a GICv3 and GICv2 on the same VM.
+
+Creating a guest GICv3 device requires a host GICv3 as well.
+
+
+Groups:
+  KVM_DEV_ARM_VGIC_GRP_ADDR
+  Attributes:
+    KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 distributor
+      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned and the region covers 64 KByte.
+
+    KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3
+      redistributor register mappings. There are two 64K pages for each
+      VCPU and all of the redistributor pages are contiguous.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
+      This address needs to be 64K aligned.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -ENXIO:  The group or attribute is unknown/unsupported for this device
+             or hardware support is missing.
+    -EFAULT: Invalid user pointer for attr->addr.
+
+
+
+  KVM_DEV_ARM_VGIC_GRP_DIST_REGS
+  KVM_DEV_ARM_VGIC_GRP_REDIST_REGS
+  Attributes:
+    The attr field of kvm_device_attr encodes two values:
+    bits:     | 63   ....  32  |  31   ....    0 |
+    values:   |      mpidr     |      offset     |
+
+    All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a
+    __u32 value.  64-bit registers must be accessed by separately accessing the
+    lower and higher word.
+
+    Writes to read-only registers are ignored by the kernel.
+
+    KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers.
+    KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU
+    specified by the mpidr.
+
+    The offset is relative to the "[Re]Distributor base address" as defined
+    in the GICv3/4 specs.  Getting or setting such a register has the same
+    effect as reading or writing the register on real hardware, except for the
+    following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR,
+    GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0.  These registers behave
+    differently when accessed via this interface compared to their
+    architecturally defined behavior to allow software a full view of the
+    VGIC's internal state.
+
+    The mpidr field is used to specify which
+    redistributor is accessed.  The mpidr is ignored for the distributor.
+
+    The mpidr encoding is based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
+
+    Note that distributor fields are not banked, but return the same value
+    regardless of the mpidr used to access the register.
+
+    The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such
+    that a write of a clear bit has no effect, whereas a write with a set bit
+    clears that value.  To allow userspace to freely set the values of these two
+    registers, setting the attributes with the register offsets for these two
+    registers simply sets the non-reserved bits to the value written.
+
+
+    Accesses (reads and writes) to the GICD_ISPENDR register region and
+    GICR_ISPENDR0 registers get/set the value of the latched pending state for
+    the interrupts.
+
+    This is identical to the value returned by a guest read from ISPENDR for an
+    edge triggered interrupt, but may differ for level triggered interrupts.
+    For edge triggered interrupts, once an interrupt becomes pending (whether
+    because of an edge detected on the input line or because of a guest write
+    to ISPENDR) this state is "latched", and only cleared when either the
+    interrupt is activated or when the guest writes to ICPENDR. A level
+    triggered interrupt may be pending either because the level input is held
+    high by a device, or because of a guest write to the ISPENDR register. Only
+    ISPENDR writes are latched; if the device lowers the line level then the
+    interrupt is no longer pending unless the guest also wrote to ISPENDR, and
+    conversely writes to ICPENDR or activations of the interrupt do not clear
+    the pending status if the line level is still being held high.  (These
+    rules are documented in the GICv3 specification descriptions of the ICPENDR
+    and ISPENDR registers.) For a level triggered interrupt the value accessed
+    here is that of the latch which is set by ISPENDR and cleared by ICPENDR or
+    interrupt activation, whereas the value returned by a guest read from
+    ISPENDR is the logical OR of the latch value and the input line level.
+
+    Raw access to the latch state is provided to userspace so that it can save
+    and restore the entire GIC internal state (which is defined by the
+    combination of the current input line level and the latch state, and cannot
+    be deduced from purely the line level and the value of the ISPENDR
+    registers).
+
+    Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have
+    RAZ/WI semantics, meaning that reads always return 0 and writes are always
+    ignored.
+
+  Errors:
+    -ENXIO: Getting or setting this register is not yet supported
+    -EBUSY: One or more VCPUs are running
+
+
+  KVM_DEV_ARM_VGIC_CPU_SYSREGS
+  Attributes:
+    The attr field of kvm_device_attr encodes two values:
+    bits:     | 63      ....       32 | 31  ....  16 | 15  ....  0 |
+    values:   |         mpidr         |      RES     |    instr    |
+
+    The mpidr field encodes the CPU ID based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
+
+    The instr field encodes the system register to access based on the fields
+    defined in the A64 instruction set encoding for system register access
+    (RES means the bits are reserved for future use and should be zero):
+
+      | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 |
+      |   Op 0    |    Op1    |    CRn   |   CRm   |   Op2   |
+
+    All system regs accessed through this API are (rw, 64-bit) and
+    kvm_device_attr.addr points to a __u64 value.
+
+    KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the
+    CPU specified by the mpidr field.
+
+  Errors:
+    -ENXIO: Getting or setting this register is not yet supported
+    -EBUSY: VCPU is running
+    -EINVAL: Invalid mpidr supplied
+
+
+  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
+  Attributes:
+    A value describing the number of interrupts (SGI, PPI and SPI) for
+    this GIC instance, ranging from 64 to 1024, in increments of 32.
+
+    kvm_device_attr.addr points to a __u32 value.
+
+  Errors:
+    -EINVAL: Value set is out of the expected range
+    -EBUSY: Value has already been set.
+
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+    KVM_DEV_ARM_VGIC_CTRL_INIT
+      request the initialization of the VGIC, no additional parameter in
+      kvm_device_attr.addr.
+  Errors:
+    -ENXIO: VGIC not properly configured as required prior to calling
+     this attribute
+    -ENODEV: no online VCPU
+    -ENOMEM: memory shortage when allocating vgic internal data
+
+
+  KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO
+  Attributes:
+    The attr field of kvm_device_attr encodes the following values:
+    bits:     | 63      ....       32 | 31   ....    10 | 9  ....  0 |
+    values:   |         mpidr         |      info       |   vINTID   |
+
+    The vINTID specifies which set of IRQs is reported on.
+
+    The info field specifies which information userspace wants to get or set
+    using this interface.  Currently we support the following info values:
+
+      VGIC_LEVEL_INFO_LINE_LEVEL:
+       Get/Set the input level of the IRQ line for a set of 32 contiguously
+       numbered interrupts.
+       vINTID must be a multiple of 32.
+
+       kvm_device_attr.addr points to a __u32 value which will contain a
+       bitmap where a set bit means the interrupt level is asserted.
+
+       Bit[n] indicates the status for interrupt vINTID + n.
+
+    SGIs and any interrupt with a higher ID than the number of interrupts
+    supported will be RAZ/WI.  LPIs are always edge-triggered and are
+    therefore not supported by this interface.
+
+    PPIs are reported per VCPU as specified in the mpidr field, and SPIs are
+    reported with the same value regardless of the mpidr specified.
+
+    The mpidr field encodes the CPU ID based on the affinity information in the
+    architecture defined MPIDR, and the field is encoded as follows:
+      | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
+      |    Aff3    |    Aff2    |    Aff1    |    Aff0    |
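
As an illustration of the attr encoding described above (a sketch, not part of this series), the following C helper packs the mpidr affinity fields and the register offset into kvm_device_attr.attr and reads a register through a vgic-v3 device fd; GICD_CTLR at distributor offset 0x0000 is used as the example register, and the group constants are assumed to come from the asm/kvm.h uapi header matching this series:

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int vgic_v3_read_reg(int vgic_fd, uint32_t group,
			     uint8_t aff3, uint8_t aff2, uint8_t aff1, uint8_t aff0,
			     uint32_t offset, uint32_t *val)
	{
		struct kvm_device_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.group = group;
		/* bits 63..32: Aff3..Aff0 of the target mpidr, bits 31..0: offset */
		attr.attr  = ((uint64_t)aff3 << 56) | ((uint64_t)aff2 << 48) |
			     ((uint64_t)aff1 << 40) | ((uint64_t)aff0 << 32) |
			     offset;
		attr.addr  = (uint64_t)(uintptr_t)val;

		return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
	}

	/* Example: read GICD_CTLR (distributor offset 0x0000; mpidr is ignored). */
	int read_gicd_ctlr(int vgic_fd, uint32_t *ctlr)
	{
		return vgic_v3_read_reg(vgic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
					0, 0, 0, 0, 0x0000, ctlr);
	}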
index 89182f8..76e61c8 100644 (file)
@@ -1,24 +1,19 @@
-ARM Virtual Generic Interrupt Controller (VGIC)
-===============================================
+ARM Virtual Generic Interrupt Controller v2 (VGIC)
+==================================================
 
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
-  KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
-  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
 
-Only one VGIC instance of the V2/V3 types above may be instantiated through
-either this API or the legacy KVM_CREATE_IRQCHIP api.  The created VGIC will
-act as the VM interrupt controller, requiring emulated user-space devices to
-inject interrupts to the VGIC instead of directly to CPUs.
+Only one VGIC instance may be instantiated through either this API or the
+legacy KVM_CREATE_IRQCHIP API.  The created VGIC will act as the VM interrupt
+controller, requiring emulated user-space devices to inject interrupts to the
+VGIC instead of directly to CPUs.
 
-Creating a guest GICv3 device requires a host GICv3 as well.
-GICv3 implementations with hardware compatibility support allow a guest GICv2
-as well.
+GICv3 implementations with hardware compatibility support allow creating a
+guest GICv2 through this interface.  For information on creating a guest GICv3
+device and guest ITS devices, see arm-vgic-v3.txt.  It is not possible to
+create both a GICv3 and GICv2 device on the same VM.
 
-Creating a virtual ITS controller requires a host GICv3 (but does not depend
-on having physical ITS controllers).
-There can be multiple ITS controllers per guest, each of them has to have
-a separate, non-overlapping MMIO region.
 
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
@@ -32,26 +27,13 @@ Groups:
       Base address in the guest physical address space of the GIC virtual cpu
       interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2.
       This address needs to be 4K aligned and the region covers 4 KByte.
-
-    KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3 distributor
-      register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
-      This address needs to be 64K aligned and the region covers 64 KByte.
-
-    KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3
-      redistributor register mappings. There are two 64K pages for each
-      VCPU and all of the redistributor pages are contiguous.
-      Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
-      This address needs to be 64K aligned.
-
-    KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
-      Base address in the guest physical address space of the GICv3 ITS
-      control register frame. The ITS allows MSI(-X) interrupts to be
-      injected into guests. This extension is optional. If the kernel
-      does not support the ITS, the call returns -ENODEV.
-      Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
-      This address needs to be 64K aligned and the region covers 128K.
+  Errors:
+    -E2BIG:  Address outside of addressable IPA range
+    -EINVAL: Incorrectly aligned address
+    -EEXIST: Address already configured
+    -ENXIO:  The group or attribute is unknown/unsupported for this device
+             or hardware support is missing.
+    -EFAULT: Invalid user pointer for attr->addr.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
index c041658..02f5068 100644 (file)
@@ -30,4 +30,6 @@ Returns: -ENODEV: PMUv3 not supported
                  attribute
          -EBUSY: PMUv3 already initialized
 
-Request the initialization of the PMUv3.
+Request the initialization of the PMUv3.  This must be done after creating the
+in-kernel irqchip.  Creating a PMU with a userspace irqchip is currently not
+supported.
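
As an illustration (a sketch, not part of this series), a VMM might perform the PMUv3 initialisation described above roughly as follows; the PPI number 23 and the vcpu file descriptor are assumptions, and the KVM_ARM_VCPU_PMU_V3_IRQ attribute set first here is documented earlier in vcpu.txt:

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int vcpu_init_pmu(int vcpu_fd)
	{
		uint32_t irq = 23;	/* PPI chosen for the PMU overflow interrupt (assumption) */
		struct kvm_device_attr attr;

		/* Tell KVM which interrupt the virtual PMU should use. */
		memset(&attr, 0, sizeof(attr));
		attr.group = KVM_ARM_VCPU_PMU_V3_CTRL;
		attr.attr  = KVM_ARM_VCPU_PMU_V3_IRQ;
		attr.addr  = (uint64_t)(uintptr_t)&irq;
		if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
			return -1;

		/* Initialise the PMU; the in-kernel irqchip must already exist. */
		memset(&attr, 0, sizeof(attr));
		attr.group = KVM_ARM_VCPU_PMU_V3_CTRL;
		attr.attr  = KVM_ARM_VCPU_PMU_V3_INIT;
		return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
	}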
index 0312fe6..222a2c6 100644 (file)
@@ -3,15 +3,8 @@ MMUv3 initialization sequence.
 The code in the initialize_mmu macro sets up MMUv3 memory mapping
 identically to MMUv2 fixed memory mapping. Depending on
 CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX symbol this code is
-located in one of the following address ranges:
-
-    0xF0000000..0xFFFFFFFF (will keep same address in MMU v2 layout;
-                        typically ROM)
-    0x00000000..0x07FFFFFF (system RAM; this code is actually linked
-                        at 0xD0000000..0xD7FFFFFF [cached]
-                        or 0xD8000000..0xDFFFFFFF [uncached];
-                        in any case, initially runs elsewhere
-                        than linked, so have to be careful)
+located at the addresses it was linked for (symbol undefined), or not
+(symbol defined), so it needs to be position-independent.
 
 The code has the following assumptions:
   This code fragment is run only on an MMU v3.
@@ -28,24 +21,26 @@ TLB setup proceeds along the following steps.
     PA = physical address (two upper nibbles of it);
     pc = physical range that contains this code;
 
-After step 2, we jump to virtual address in 0x40000000..0x5fffffff
-that corresponds to next instruction to execute in this code.
-After step 4, we jump to intended (linked) address of this code.
-
-    Step 0     Step1     Step 2     Step3     Step 4     Step5
- ============  =====  ============  =====  ============  =====
-   VA      PA     PA    VA      PA     PA    VA      PA     PA
- ------    --     --  ------    --     --  ------    --     --
- E0..FF -> E0  -> E0  E0..FF -> E0         F0..FF -> F0  -> F0
- C0..DF -> C0  -> C0  C0..DF -> C0         E0..EF -> F0  -> F0
- A0..BF -> A0  -> A0  A0..BF -> A0         D8..DF -> 00  -> 00
- 80..9F -> 80  -> 80  80..9F -> 80         D0..D7 -> 00  -> 00
- 60..7F -> 60  -> 60  60..7F -> 60
- 40..5F -> 40         40..5F -> pc  -> pc  40..5F -> pc
- 20..3F -> 20  -> 20  20..3F -> 20
- 00..1F -> 00  -> 00  00..1F -> 00
-
-The default location of IO peripherals is above 0xf0000000. This may change
+After step 2, we jump to virtual address in the range 0x40000000..0x5fffffff
+or 0x00000000..0x1fffffff, depending on whether the kernel was loaded below
+0x40000000 or above. That address corresponds to next instruction to execute
+in this code. After step 4, we jump to intended (linked) address of this code.
+The scheme below assumes that the kernel is loaded below 0x40000000.
+
+        Step0  Step1  Step2  Step3          Step4  Step5
+        =====  =====  =====  =====          =====  =====
+   VA      PA     PA     PA     PA     VA      PA     PA
+ ------    --     --     --     --   ------    --     --
+ E0..FF -> E0  -> E0  -> E0          F0..FF -> F0  -> F0
+ C0..DF -> C0  -> C0  -> C0          E0..EF -> F0  -> F0
+ A0..BF -> A0  -> A0  -> A0          D8..DF -> 00  -> 00
+ 80..9F -> 80  -> 80  -> 80          D0..D7 -> 00  -> 00
+ 60..7F -> 60  -> 60  -> 60
+ 40..5F -> 40         -> pc  -> pc   40..5F -> pc
+ 20..3F -> 20  -> 20  -> 20
+ 00..1F -> 00  -> 00  -> 00
+
+The default location of IO peripherals is above 0xf0000000. This may be changed
 using a "ranges" property in a device tree simple-bus node. See ePAPR 1.1, Â§6.5
 for details on the syntax and semantic of simple-bus nodes. The following
 limitations apply:
@@ -62,3 +57,127 @@ limitations apply:
 
 6. The IO area covers the entire 256MB segment of parent-bus-address; the
    "ranges" triplet length field is ignored
+
+
+MMUv3 address space layouts.
+============================
+
+Default MMUv2-compatible layout.
+
+                      Symbol                   VADDR       Size
++------------------+
+| Userspace        |                           0x00000000  TASK_SIZE
++------------------+                           0x40000000
++------------------+
+| Page table       |                           0x80000000
++------------------+                           0x80400000
++------------------+
+| KMAP area        |  PKMAP_BASE                           PTRS_PER_PTE *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
+|                  |                                       (4MB * DCACHE_N_COLORS)
++------------------+
+| Atomic KMAP area |  FIXADDR_START                        KM_TYPE_NR *
+|                  |                                       NR_CPUS *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
++------------------+  FIXADDR_TOP              0xbffff000
++------------------+
+| VMALLOC area     |  VMALLOC_START            0xc0000000  128MB - 64KB
++------------------+  VMALLOC_END
+| Cache aliasing   |  TLBTEMP_BASE_1           0xc7ff0000  DCACHE_WAY_SIZE
+| remap area 1     |
++------------------+
+| Cache aliasing   |  TLBTEMP_BASE_2                       DCACHE_WAY_SIZE
+| remap area 2     |
++------------------+
++------------------+
+| Cached KSEG      |  XCHAL_KSEG_CACHED_VADDR  0xd0000000  128MB
++------------------+
+| Uncached KSEG    |  XCHAL_KSEG_BYPASS_VADDR  0xd8000000  128MB
++------------------+
+| Cached KIO       |  XCHAL_KIO_CACHED_VADDR   0xe0000000  256MB
++------------------+
+| Uncached KIO     |  XCHAL_KIO_BYPASS_VADDR   0xf0000000  256MB
++------------------+
+
+
+256MB cached + 256MB uncached layout.
+
+                      Symbol                   VADDR       Size
++------------------+
+| Userspace        |                           0x00000000  TASK_SIZE
++------------------+                           0x40000000
++------------------+
+| Page table       |                           0x80000000
++------------------+                           0x80400000
++------------------+
+| KMAP area        |  PKMAP_BASE                           PTRS_PER_PTE *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
+|                  |                                       (4MB * DCACHE_N_COLORS)
++------------------+
+| Atomic KMAP area |  FIXADDR_START                        KM_TYPE_NR *
+|                  |                                       NR_CPUS *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
++------------------+  FIXADDR_TOP              0x9ffff000
++------------------+
+| VMALLOC area     |  VMALLOC_START            0xa0000000  128MB - 64KB
++------------------+  VMALLOC_END
+| Cache aliasing   |  TLBTEMP_BASE_1           0xa7ff0000  DCACHE_WAY_SIZE
+| remap area 1     |
++------------------+
+| Cache aliasing   |  TLBTEMP_BASE_2                       DCACHE_WAY_SIZE
+| remap area 2     |
++------------------+
++------------------+
+| Cached KSEG      |  XCHAL_KSEG_CACHED_VADDR  0xb0000000  256MB
++------------------+
+| Uncached KSEG    |  XCHAL_KSEG_BYPASS_VADDR  0xc0000000  256MB
++------------------+
++------------------+
+| Cached KIO       |  XCHAL_KIO_CACHED_VADDR   0xe0000000  256MB
++------------------+
+| Uncached KIO     |  XCHAL_KIO_BYPASS_VADDR   0xf0000000  256MB
++------------------+
+
+
+512MB cached + 512MB uncached layout.
+
+                      Symbol                   VADDR       Size
++------------------+
+| Userspace        |                           0x00000000  TASK_SIZE
++------------------+                           0x40000000
++------------------+
+| Page table       |                           0x80000000
++------------------+                           0x80400000
++------------------+
+| KMAP area        |  PKMAP_BASE                           PTRS_PER_PTE *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
+|                  |                                       (4MB * DCACHE_N_COLORS)
++------------------+
+| Atomic KMAP area |  FIXADDR_START                        KM_TYPE_NR *
+|                  |                                       NR_CPUS *
+|                  |                                       DCACHE_N_COLORS *
+|                  |                                       PAGE_SIZE
++------------------+  FIXADDR_TOP              0x8ffff000
++------------------+
+| VMALLOC area     |  VMALLOC_START            0x90000000  128MB - 64KB
++------------------+  VMALLOC_END
+| Cache aliasing   |  TLBTEMP_BASE_1           0x97ff0000  DCACHE_WAY_SIZE
+| remap area 1     |
++------------------+
+| Cache aliasing   |  TLBTEMP_BASE_2                       DCACHE_WAY_SIZE
+| remap area 2     |
++------------------+
++------------------+
+| Cached KSEG      |  XCHAL_KSEG_CACHED_VADDR  0xa0000000  512MB
++------------------+
+| Uncached KSEG    |  XCHAL_KSEG_BYPASS_VADDR  0xc0000000  512MB
++------------------+
+| Cached KIO       |  XCHAL_KIO_CACHED_VADDR   0xe0000000  256MB
++------------------+
+| Uncached KIO     |  XCHAL_KIO_BYPASS_VADDR   0xf0000000  256MB
++------------------+
index 841ffa3..274d2a0 100644 (file)
@@ -5005,6 +5005,13 @@ F:       drivers/net/ethernet/freescale/fec_ptp.c
 F:     drivers/net/ethernet/freescale/fec.h
 F:     Documentation/devicetree/bindings/net/fsl-fec.txt
 
+FREESCALE QORIQ DPAA FMAN DRIVER
+M:     Madalin Bucur <madalin.bucur@nxp.com>
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     drivers/net/ethernet/freescale/fman
+F:     Documentation/devicetree/bindings/powerpc/fsl/fman.txt
+
 FREESCALE QUICC ENGINE LIBRARY
 L:     linuxppc-dev@lists.ozlabs.org
 S:     Orphan
@@ -5105,10 +5112,9 @@ F:       include/linux/fscrypto.h
 
 F2FS FILE SYSTEM
 M:     Jaegeuk Kim <jaegeuk@kernel.org>
-M:     Changman Lee <cm224.lee@samsung.com>
-R:     Chao Yu <yuchao0@huawei.com>
+M:     Chao Yu <yuchao0@huawei.com>
 L:     linux-f2fs-devel@lists.sourceforge.net
-W:     http://en.wikipedia.org/wiki/F2FS
+W:     https://f2fs.wiki.kernel.org/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git
 S:     Maintained
 F:     Documentation/filesystems/f2fs.txt
@@ -13062,6 +13068,7 @@ F:      arch/arm64/include/asm/xen/
 
 XEN NETWORK BACKEND DRIVER
 M:     Wei Liu <wei.liu2@citrix.com>
+M:     Paul Durrant <paul.durrant@citrix.com>
 L:     xen-devel@lists.xenproject.org (moderated for non-subscribers)
 L:     netdev@vger.kernel.org
 S:     Supported
@@ -13099,11 +13106,10 @@ F:    arch/x86/xen/*swiotlb*
 F:     drivers/xen/*swiotlb*
 
 XFS FILESYSTEM
-P:     Silicon Graphics Inc
 M:     Dave Chinner <david@fromorbit.com>
-M:     xfs@oss.sgi.com
-L:     xfs@oss.sgi.com
-W:     http://oss.sgi.com/projects/xfs
+M:     linux-xfs@vger.kernel.org
+L:     linux-xfs@vger.kernel.org
+W:     http://xfs.org/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git
 S:     Supported
 F:     Documentation/filesystems/xfs.txt
index 61f6ccc..6be9ee1 100644 (file)
@@ -23,7 +23,6 @@ ifeq ($(CONFIG_ARM_MODULE_PLTS),y)
 LDFLAGS_MODULE += -T $(srctree)/arch/arm/kernel/module.lds
 endif
 
-OBJCOPYFLAGS   :=-O binary -R .comment -S
 GZFLAGS                :=-9
 #KBUILD_CFLAGS +=-pipe
 
index bdc1d5a..50f8d1b 100644 (file)
@@ -11,6 +11,8 @@
 # Copyright (C) 1995-2002 Russell King
 #
 
+OBJCOPYFLAGS   :=-O binary -R .comment -S
+
 ifneq ($(MACHINE),)
 include $(MACHINE)/Makefile.boot
 endif
index 2e076c4..4ecd512 100644 (file)
@@ -15,6 +15,7 @@
  * from machine specific code with proper arguments when required.
  */
 #include <linux/module.h>
+#include <linux/gpio/driver.h>
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
@@ -107,6 +108,7 @@ struct sa1111 {
        spinlock_t      lock;
        void __iomem    *base;
        struct sa1111_platform_data *pdata;
+       struct gpio_chip gc;
 #ifdef CONFIG_PM
        void            *saved_state;
 #endif
@@ -231,132 +233,44 @@ static void sa1111_irq_handler(struct irq_desc *desc)
 #define SA1111_IRQMASK_LO(x)   (1 << (x - sachip->irq_base))
 #define SA1111_IRQMASK_HI(x)   (1 << (x - sachip->irq_base - 32))
 
-static void sa1111_ack_irq(struct irq_data *d)
-{
-}
-
-static void sa1111_mask_lowirq(struct irq_data *d)
+static u32 sa1111_irqmask(struct irq_data *d)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned long ie0;
 
-       ie0 = sa1111_readl(mapbase + SA1111_INTEN0);
-       ie0 &= ~SA1111_IRQMASK_LO(d->irq);
-       writel(ie0, mapbase + SA1111_INTEN0);
+       return BIT((d->irq - sachip->irq_base) & 31);
 }
 
-static void sa1111_unmask_lowirq(struct irq_data *d)
+static int sa1111_irqbank(struct irq_data *d)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned long ie0;
-
-       ie0 = sa1111_readl(mapbase + SA1111_INTEN0);
-       ie0 |= SA1111_IRQMASK_LO(d->irq);
-       sa1111_writel(ie0, mapbase + SA1111_INTEN0);
-}
-
-/*
- * Attempt to re-trigger the interrupt.  The SA1111 contains a register
- * (INTSET) which claims to do this.  However, in practice no amount of
- * manipulation of INTEN and INTSET guarantees that the interrupt will
- * be triggered.  In fact, its very difficult, if not impossible to get
- * INTSET to re-trigger the interrupt.
- */
-static int sa1111_retrigger_lowirq(struct irq_data *d)
-{
-       struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_LO(d->irq);
-       unsigned long ip0;
-       int i;
-
-       ip0 = sa1111_readl(mapbase + SA1111_INTPOL0);
-       for (i = 0; i < 8; i++) {
-               sa1111_writel(ip0 ^ mask, mapbase + SA1111_INTPOL0);
-               sa1111_writel(ip0, mapbase + SA1111_INTPOL0);
-               if (sa1111_readl(mapbase + SA1111_INTSTATCLR0) & mask)
-                       break;
-       }
 
-       if (i == 8)
-               pr_err("Danger Will Robinson: failed to re-trigger IRQ%d\n",
-                      d->irq);
-       return i == 8 ? -1 : 0;
+       return ((d->irq - sachip->irq_base) / 32) * 4;
 }
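
(For illustration: both helpers are derived purely from the SA1111-relative
IRQ number, which is what lets a single irq_chip replace the old low/high
pair below.  For the 35th chip interrupt, i.e. d->irq == sachip->irq_base + 35,
sa1111_irqbank() returns (35 / 32) * 4 = 4 and sa1111_irqmask() returns
BIT(35 & 31) = BIT(3), so the mask/unmask code ends up touching bit 3 of the
second register bank; this relies on the bank-1 registers such as INTEN1
sitting 4 bytes after their bank-0 counterparts.)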
 
-static int sa1111_type_lowirq(struct irq_data *d, unsigned int flags)
-{
-       struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_LO(d->irq);
-       unsigned long ip0;
-
-       if (flags == IRQ_TYPE_PROBE)
-               return 0;
-
-       if ((!(flags & IRQ_TYPE_EDGE_RISING) ^ !(flags & IRQ_TYPE_EDGE_FALLING)) == 0)
-               return -EINVAL;
-
-       ip0 = sa1111_readl(mapbase + SA1111_INTPOL0);
-       if (flags & IRQ_TYPE_EDGE_RISING)
-               ip0 &= ~mask;
-       else
-               ip0 |= mask;
-       sa1111_writel(ip0, mapbase + SA1111_INTPOL0);
-       sa1111_writel(ip0, mapbase + SA1111_WAKEPOL0);
-
-       return 0;
-}
-
-static int sa1111_wake_lowirq(struct irq_data *d, unsigned int on)
+static void sa1111_ack_irq(struct irq_data *d)
 {
-       struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_LO(d->irq);
-       unsigned long we0;
-
-       we0 = sa1111_readl(mapbase + SA1111_WAKEEN0);
-       if (on)
-               we0 |= mask;
-       else
-               we0 &= ~mask;
-       sa1111_writel(we0, mapbase + SA1111_WAKEEN0);
-
-       return 0;
 }
 
-static struct irq_chip sa1111_low_chip = {
-       .name           = "SA1111-l",
-       .irq_ack        = sa1111_ack_irq,
-       .irq_mask       = sa1111_mask_lowirq,
-       .irq_unmask     = sa1111_unmask_lowirq,
-       .irq_retrigger  = sa1111_retrigger_lowirq,
-       .irq_set_type   = sa1111_type_lowirq,
-       .irq_set_wake   = sa1111_wake_lowirq,
-};
-
-static void sa1111_mask_highirq(struct irq_data *d)
+static void sa1111_mask_irq(struct irq_data *d)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned long ie1;
+       void __iomem *mapbase = sachip->base + SA1111_INTC + sa1111_irqbank(d);
+       u32 ie;
 
-       ie1 = sa1111_readl(mapbase + SA1111_INTEN1);
-       ie1 &= ~SA1111_IRQMASK_HI(d->irq);
-       sa1111_writel(ie1, mapbase + SA1111_INTEN1);
+       ie = sa1111_readl(mapbase + SA1111_INTEN0);
+       ie &= ~sa1111_irqmask(d);
+       sa1111_writel(ie, mapbase + SA1111_INTEN0);
 }
 
-static void sa1111_unmask_highirq(struct irq_data *d)
+static void sa1111_unmask_irq(struct irq_data *d)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned long ie1;
+       void __iomem *mapbase = sachip->base + SA1111_INTC + sa1111_irqbank(d);
+       u32 ie;
 
-       ie1 = sa1111_readl(mapbase + SA1111_INTEN1);
-       ie1 |= SA1111_IRQMASK_HI(d->irq);
-       sa1111_writel(ie1, mapbase + SA1111_INTEN1);
+       ie = sa1111_readl(mapbase + SA1111_INTEN0);
+       ie |= sa1111_irqmask(d);
+       sa1111_writel(ie, mapbase + SA1111_INTEN0);
 }
 
 /*
@@ -366,19 +280,18 @@ static void sa1111_unmask_highirq(struct irq_data *d)
  * be triggered.  In fact, it's very difficult, if not impossible to get
  * INTSET to re-trigger the interrupt.
  */
-static int sa1111_retrigger_highirq(struct irq_data *d)
+static int sa1111_retrigger_irq(struct irq_data *d)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_HI(d->irq);
-       unsigned long ip1;
+       void __iomem *mapbase = sachip->base + SA1111_INTC + sa1111_irqbank(d);
+       u32 ip, mask = sa1111_irqmask(d);
        int i;
 
-       ip1 = sa1111_readl(mapbase + SA1111_INTPOL1);
+       ip = sa1111_readl(mapbase + SA1111_INTPOL0);
        for (i = 0; i < 8; i++) {
-               sa1111_writel(ip1 ^ mask, mapbase + SA1111_INTPOL1);
-               sa1111_writel(ip1, mapbase + SA1111_INTPOL1);
-               if (sa1111_readl(mapbase + SA1111_INTSTATCLR1) & mask)
+               sa1111_writel(ip ^ mask, mapbase + SA1111_INTPOL0);
+               sa1111_writel(ip, mapbase + SA1111_INTPOL0);
+               if (sa1111_readl(mapbase + SA1111_INTSTATCLR0) & mask)
                        break;
        }
 
@@ -388,12 +301,11 @@ static int sa1111_retrigger_highirq(struct irq_data *d)
        return i == 8 ? -1 : 0;
 }
 
-static int sa1111_type_highirq(struct irq_data *d, unsigned int flags)
+static int sa1111_type_irq(struct irq_data *d, unsigned int flags)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_HI(d->irq);
-       unsigned long ip1;
+       void __iomem *mapbase = sachip->base + SA1111_INTC + sa1111_irqbank(d);
+       u32 ip, mask = sa1111_irqmask(d);
 
        if (flags == IRQ_TYPE_PROBE)
                return 0;
@@ -401,42 +313,41 @@ static int sa1111_type_highirq(struct irq_data *d, unsigned int flags)
        if ((!(flags & IRQ_TYPE_EDGE_RISING) ^ !(flags & IRQ_TYPE_EDGE_FALLING)) == 0)
                return -EINVAL;
 
-       ip1 = sa1111_readl(mapbase + SA1111_INTPOL1);
+       ip = sa1111_readl(mapbase + SA1111_INTPOL0);
        if (flags & IRQ_TYPE_EDGE_RISING)
-               ip1 &= ~mask;
+               ip &= ~mask;
        else
-               ip1 |= mask;
-       sa1111_writel(ip1, mapbase + SA1111_INTPOL1);
-       sa1111_writel(ip1, mapbase + SA1111_WAKEPOL1);
+               ip |= mask;
+       sa1111_writel(ip, mapbase + SA1111_INTPOL0);
+       sa1111_writel(ip, mapbase + SA1111_WAKEPOL0);
 
        return 0;
 }
 
-static int sa1111_wake_highirq(struct irq_data *d, unsigned int on)
+static int sa1111_wake_irq(struct irq_data *d, unsigned int on)
 {
        struct sa1111 *sachip = irq_data_get_irq_chip_data(d);
-       void __iomem *mapbase = sachip->base + SA1111_INTC;
-       unsigned int mask = SA1111_IRQMASK_HI(d->irq);
-       unsigned long we1;
+       void __iomem *mapbase = sachip->base + SA1111_INTC + sa1111_irqbank(d);
+       u32 we, mask = sa1111_irqmask(d);
 
-       we1 = sa1111_readl(mapbase + SA1111_WAKEEN1);
+       we = sa1111_readl(mapbase + SA1111_WAKEEN0);
        if (on)
-               we1 |= mask;
+               we |= mask;
        else
-               we1 &= ~mask;
-       sa1111_writel(we1, mapbase + SA1111_WAKEEN1);
+               we &= ~mask;
+       sa1111_writel(we, mapbase + SA1111_WAKEEN0);
 
        return 0;
 }
 
-static struct irq_chip sa1111_high_chip = {
-       .name           = "SA1111-h",
+static struct irq_chip sa1111_irq_chip = {
+       .name           = "SA1111",
        .irq_ack        = sa1111_ack_irq,
-       .irq_mask       = sa1111_mask_highirq,
-       .irq_unmask     = sa1111_unmask_highirq,
-       .irq_retrigger  = sa1111_retrigger_highirq,
-       .irq_set_type   = sa1111_type_highirq,
-       .irq_set_wake   = sa1111_wake_highirq,
+       .irq_mask       = sa1111_mask_irq,
+       .irq_unmask     = sa1111_unmask_irq,
+       .irq_retrigger  = sa1111_retrigger_irq,
+       .irq_set_type   = sa1111_type_irq,
+       .irq_set_wake   = sa1111_wake_irq,
 };
 
 static int sa1111_setup_irq(struct sa1111 *sachip, unsigned irq_base)
@@ -482,16 +393,14 @@ static int sa1111_setup_irq(struct sa1111 *sachip, unsigned irq_base)
 
        for (i = IRQ_GPAIN0; i <= SSPROR; i++) {
                irq = sachip->irq_base + i;
-               irq_set_chip_and_handler(irq, &sa1111_low_chip,
-                                        handle_edge_irq);
+               irq_set_chip_and_handler(irq, &sa1111_irq_chip, handle_edge_irq);
                irq_set_chip_data(irq, sachip);
                irq_clear_status_flags(irq, IRQ_NOREQUEST | IRQ_NOPROBE);
        }
 
        for (i = AUDXMTDMADONEA; i <= IRQ_S1_BVD1_STSCHG; i++) {
                irq = sachip->irq_base + i;
-               irq_set_chip_and_handler(irq, &sa1111_high_chip,
-                                        handle_edge_irq);
+               irq_set_chip_and_handler(irq, &sa1111_irq_chip, handle_edge_irq);
                irq_set_chip_data(irq, sachip);
                irq_clear_status_flags(irq, IRQ_NOREQUEST | IRQ_NOPROBE);
        }
@@ -509,6 +418,181 @@ static int sa1111_setup_irq(struct sa1111 *sachip, unsigned irq_base)
        return 0;
 }
 
+static void sa1111_remove_irq(struct sa1111 *sachip)
+{
+       void __iomem *irqbase = sachip->base + SA1111_INTC;
+
+       /* disable all IRQs */
+       sa1111_writel(0, irqbase + SA1111_INTEN0);
+       sa1111_writel(0, irqbase + SA1111_INTEN1);
+       sa1111_writel(0, irqbase + SA1111_WAKEEN0);
+       sa1111_writel(0, irqbase + SA1111_WAKEEN1);
+
+       if (sachip->irq != NO_IRQ) {
+               irq_set_chained_handler_and_data(sachip->irq, NULL, NULL);
+               irq_free_descs(sachip->irq_base, SA1111_IRQ_NR);
+
+               release_mem_region(sachip->phys + SA1111_INTC, 512);
+       }
+}
+
+enum {
+       SA1111_GPIO_PXDDR = (SA1111_GPIO_PADDR - SA1111_GPIO_PADDR),
+       SA1111_GPIO_PXDRR = (SA1111_GPIO_PADRR - SA1111_GPIO_PADDR),
+       SA1111_GPIO_PXDWR = (SA1111_GPIO_PADWR - SA1111_GPIO_PADDR),
+       SA1111_GPIO_PXSDR = (SA1111_GPIO_PASDR - SA1111_GPIO_PADDR),
+       SA1111_GPIO_PXSSR = (SA1111_GPIO_PASSR - SA1111_GPIO_PADDR),
+};
+
+static struct sa1111 *gc_to_sa1111(struct gpio_chip *gc)
+{
+       return container_of(gc, struct sa1111, gc);
+}
+
+static void __iomem *sa1111_gpio_map_reg(struct sa1111 *sachip, unsigned offset)
+{
+       void __iomem *reg = sachip->base + SA1111_GPIO;
+
+       if (offset < 4)
+               return reg + SA1111_GPIO_PADDR;
+       if (offset < 10)
+               return reg + SA1111_GPIO_PBDDR;
+       if (offset < 18)
+               return reg + SA1111_GPIO_PCDDR;
+       return NULL;
+}
+
+static u32 sa1111_gpio_map_bit(unsigned offset)
+{
+       if (offset < 4)
+               return BIT(offset);
+       if (offset < 10)
+               return BIT(offset - 4);
+       if (offset < 18)
+               return BIT(offset - 10);
+       return 0;
+}
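+
+/*
+ * Worked example of the mapping above (illustrative only): GPIO offset 12
+ * falls in the third range, so sa1111_gpio_map_reg() returns the port C
+ * register block (PCDDR base) and sa1111_gpio_map_bit() returns
+ * BIT(12 - 10) = BIT(2).  The 4 + 6 + 8 split across ports A, B and C
+ * accounts for the 18 GPIOs registered further down.
+ */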
+
+static void sa1111_gpio_modify(void __iomem *reg, u32 mask, u32 set)
+{
+       u32 val;
+
+       val = readl_relaxed(reg);
+       val &= ~mask;
+       val |= mask & set;
+       writel_relaxed(val, reg);
+}
+
+static int sa1111_gpio_get_direction(struct gpio_chip *gc, unsigned offset)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       void __iomem *reg = sa1111_gpio_map_reg(sachip, offset);
+       u32 mask = sa1111_gpio_map_bit(offset);
+
+       return !!(readl_relaxed(reg + SA1111_GPIO_PXDDR) & mask);
+}
+
+static int sa1111_gpio_direction_input(struct gpio_chip *gc, unsigned offset)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       unsigned long flags;
+       void __iomem *reg = sa1111_gpio_map_reg(sachip, offset);
+       u32 mask = sa1111_gpio_map_bit(offset);
+
+       spin_lock_irqsave(&sachip->lock, flags);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXDDR, mask, mask);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXSDR, mask, mask);
+       spin_unlock_irqrestore(&sachip->lock, flags);
+
+       return 0;
+}
+
+static int sa1111_gpio_direction_output(struct gpio_chip *gc, unsigned offset,
+       int value)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       unsigned long flags;
+       void __iomem *reg = sa1111_gpio_map_reg(sachip, offset);
+       u32 mask = sa1111_gpio_map_bit(offset);
+
+       spin_lock_irqsave(&sachip->lock, flags);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXDWR, mask, value ? mask : 0);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXSSR, mask, value ? mask : 0);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXDDR, mask, 0);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXSDR, mask, 0);
+       spin_unlock_irqrestore(&sachip->lock, flags);
+
+       return 0;
+}
+
+static int sa1111_gpio_get(struct gpio_chip *gc, unsigned offset)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       void __iomem *reg = sa1111_gpio_map_reg(sachip, offset);
+       u32 mask = sa1111_gpio_map_bit(offset);
+
+       return !!(readl_relaxed(reg + SA1111_GPIO_PXDRR) & mask);
+}
+
+static void sa1111_gpio_set(struct gpio_chip *gc, unsigned offset, int value)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       unsigned long flags;
+       void __iomem *reg = sa1111_gpio_map_reg(sachip, offset);
+       u32 mask = sa1111_gpio_map_bit(offset);
+
+       spin_lock_irqsave(&sachip->lock, flags);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXDWR, mask, value ? mask : 0);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PXSSR, mask, value ? mask : 0);
+       spin_unlock_irqrestore(&sachip->lock, flags);
+}
+
+static void sa1111_gpio_set_multiple(struct gpio_chip *gc, unsigned long *mask,
+       unsigned long *bits)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+       unsigned long flags;
+       void __iomem *reg = sachip->base + SA1111_GPIO;
+       u32 msk, val;
+
+       msk = *mask;
+       val = *bits;
+
+       spin_lock_irqsave(&sachip->lock, flags);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PADWR, msk & 15, val);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PASSR, msk & 15, val);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PBDWR, (msk >> 4) & 255, val >> 4);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PBSSR, (msk >> 4) & 255, val >> 4);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PCDWR, (msk >> 12) & 255, val >> 12);
+       sa1111_gpio_modify(reg + SA1111_GPIO_PCSSR, (msk >> 12) & 255, val >> 12);
+       spin_unlock_irqrestore(&sachip->lock, flags);
+}
+
+static int sa1111_gpio_to_irq(struct gpio_chip *gc, unsigned offset)
+{
+       struct sa1111 *sachip = gc_to_sa1111(gc);
+
+       return sachip->irq_base + offset;
+}
+
+static int sa1111_setup_gpios(struct sa1111 *sachip)
+{
+       sachip->gc.label = "sa1111";
+       sachip->gc.parent = sachip->dev;
+       sachip->gc.owner = THIS_MODULE;
+       sachip->gc.get_direction = sa1111_gpio_get_direction;
+       sachip->gc.direction_input = sa1111_gpio_direction_input;
+       sachip->gc.direction_output = sa1111_gpio_direction_output;
+       sachip->gc.get = sa1111_gpio_get;
+       sachip->gc.set = sa1111_gpio_set;
+       sachip->gc.set_multiple = sa1111_gpio_set_multiple;
+       sachip->gc.to_irq = sa1111_gpio_to_irq;
+       sachip->gc.base = -1;
+       sachip->gc.ngpio = 18;
+
+       return devm_gpiochip_add_data(sachip->dev, &sachip->gc, sachip);
+}
+
 /*
  * Bring the SA1111 out of reset.  This requires a set procedure:
  *  1. nRESET asserted (by hardware)
@@ -607,7 +691,7 @@ sa1111_configure_smc(struct sa1111 *sachip, int sdram, unsigned int drac,
 
 static void sa1111_dev_release(struct device *_dev)
 {
-       struct sa1111_dev *dev = SA1111_DEV(_dev);
+       struct sa1111_dev *dev = to_sa1111_device(_dev);
 
        kfree(dev);
 }
@@ -696,19 +780,17 @@ static int __sa1111_probe(struct device *me, struct resource *mem, int irq)
        if (!pd)
                return -EINVAL;
 
-       sachip = kzalloc(sizeof(struct sa1111), GFP_KERNEL);
+       sachip = devm_kzalloc(me, sizeof(struct sa1111), GFP_KERNEL);
        if (!sachip)
                return -ENOMEM;
 
-       sachip->clk = clk_get(me, "SA1111_CLK");
-       if (IS_ERR(sachip->clk)) {
-               ret = PTR_ERR(sachip->clk);
-               goto err_free;
-       }
+       sachip->clk = devm_clk_get(me, "SA1111_CLK");
+       if (IS_ERR(sachip->clk))
+               return PTR_ERR(sachip->clk);
 
        ret = clk_prepare(sachip->clk);
        if (ret)
-               goto err_clkput;
+               return ret;
 
        spin_lock_init(&sachip->lock);
 
@@ -757,6 +839,11 @@ static int __sa1111_probe(struct device *me, struct resource *mem, int irq)
                        goto err_clk;
        }
 
+       /* Setup the GPIOs - should really be done after the IRQ setup */
+       ret = sa1111_setup_gpios(sachip);
+       if (ret)
+               goto err_irq;
+
 #ifdef CONFIG_ARCH_SA1100
        {
        unsigned int val;
@@ -799,22 +886,22 @@ static int __sa1111_probe(struct device *me, struct resource *mem, int irq)
 
        return 0;
 
+ err_irq:
+       sa1111_remove_irq(sachip);
  err_clk:
        clk_disable(sachip->clk);
  err_unmap:
        iounmap(sachip->base);
  err_clk_unprep:
        clk_unprepare(sachip->clk);
- err_clkput:
-       clk_put(sachip->clk);
- err_free:
-       kfree(sachip);
        return ret;
 }
 
 static int sa1111_remove_one(struct device *dev, void *data)
 {
-       struct sa1111_dev *sadev = SA1111_DEV(dev);
+       struct sa1111_dev *sadev = to_sa1111_device(dev);
+       if (dev->bus != &sa1111_bus_type)
+               return 0;
        device_del(&sadev->dev);
        release_resource(&sadev->res);
        put_device(&sadev->dev);
@@ -823,29 +910,14 @@ static int sa1111_remove_one(struct device *dev, void *data)
 
 static void __sa1111_remove(struct sa1111 *sachip)
 {
-       void __iomem *irqbase = sachip->base + SA1111_INTC;
-
        device_for_each_child(sachip->dev, NULL, sa1111_remove_one);
 
-       /* disable all IRQs */
-       sa1111_writel(0, irqbase + SA1111_INTEN0);
-       sa1111_writel(0, irqbase + SA1111_INTEN1);
-       sa1111_writel(0, irqbase + SA1111_WAKEEN0);
-       sa1111_writel(0, irqbase + SA1111_WAKEEN1);
+       sa1111_remove_irq(sachip);
 
        clk_disable(sachip->clk);
        clk_unprepare(sachip->clk);
 
-       if (sachip->irq != NO_IRQ) {
-               irq_set_chained_handler_and_data(sachip->irq, NULL, NULL);
-               irq_free_descs(sachip->irq_base, SA1111_IRQ_NR);
-
-               release_mem_region(sachip->phys + SA1111_INTC, 512);
-       }
-
        iounmap(sachip->base);
-       clk_put(sachip->clk);
-       kfree(sachip);
 }
 
 struct sa1111_save_data {
@@ -1285,6 +1357,14 @@ void sa1111_disable_device(struct sa1111_dev *sadev)
 }
 EXPORT_SYMBOL(sa1111_disable_device);
 
+int sa1111_get_irq(struct sa1111_dev *sadev, unsigned num)
+{
+       if (num >= ARRAY_SIZE(sadev->irq))
+               return -EINVAL;
+       return sadev->irq[num];
+}
+EXPORT_SYMBOL_GPL(sa1111_get_irq);
+
 /*
  *     SA1111 "Register Access Bus."
  *
@@ -1293,7 +1373,7 @@ EXPORT_SYMBOL(sa1111_disable_device);
  */
 static int sa1111_match(struct device *_dev, struct device_driver *_drv)
 {
-       struct sa1111_dev *dev = SA1111_DEV(_dev);
+       struct sa1111_dev *dev = to_sa1111_device(_dev);
        struct sa1111_driver *drv = SA1111_DRV(_drv);
 
        return !!(dev->devid & drv->devid);
@@ -1301,7 +1381,7 @@ static int sa1111_match(struct device *_dev, struct device_driver *_drv)
 
 static int sa1111_bus_suspend(struct device *dev, pm_message_t state)
 {
-       struct sa1111_dev *sadev = SA1111_DEV(dev);
+       struct sa1111_dev *sadev = to_sa1111_device(dev);
        struct sa1111_driver *drv = SA1111_DRV(dev->driver);
        int ret = 0;
 
@@ -1312,7 +1392,7 @@ static int sa1111_bus_suspend(struct device *dev, pm_message_t state)
 
 static int sa1111_bus_resume(struct device *dev)
 {
-       struct sa1111_dev *sadev = SA1111_DEV(dev);
+       struct sa1111_dev *sadev = to_sa1111_device(dev);
        struct sa1111_driver *drv = SA1111_DRV(dev->driver);
        int ret = 0;
 
@@ -1326,12 +1406,12 @@ static void sa1111_bus_shutdown(struct device *dev)
        struct sa1111_driver *drv = SA1111_DRV(dev->driver);
 
        if (drv && drv->shutdown)
-               drv->shutdown(SA1111_DEV(dev));
+               drv->shutdown(to_sa1111_device(dev));
 }
 
 static int sa1111_bus_probe(struct device *dev)
 {
-       struct sa1111_dev *sadev = SA1111_DEV(dev);
+       struct sa1111_dev *sadev = to_sa1111_device(dev);
        struct sa1111_driver *drv = SA1111_DRV(dev->driver);
        int ret = -ENODEV;
 
@@ -1342,7 +1422,7 @@ static int sa1111_bus_probe(struct device *dev)
 
 static int sa1111_bus_remove(struct device *dev)
 {
-       struct sa1111_dev *sadev = SA1111_DEV(dev);
+       struct sa1111_dev *sadev = to_sa1111_device(dev);
        struct sa1111_driver *drv = SA1111_DRV(dev->driver);
        int ret = 0;
 
@@ -1407,7 +1487,7 @@ static int sa1111_needs_bounce(struct device *dev, dma_addr_t addr, size_t size)
 static int sa1111_notifier_call(struct notifier_block *n, unsigned long action,
        void *data)
 {
-       struct sa1111_dev *dev = SA1111_DEV(data);
+       struct sa1111_dev *dev = to_sa1111_device(data);
 
        switch (action) {
        case BUS_NOTIFY_ADD_DEVICE:
index dfe4002..a808829 100644 (file)
@@ -22,9 +22,7 @@
 
 #include <linux/io.h>
 #include <asm/barrier.h>
-
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      p15, Op1, %0, CRn, CRm, Op2
-#define __ACCESS_CP15_64(Op1, CRm)             p15, Op1, %Q0, %R0, CRm
+#include <asm/cp15.h>
 
 #define ICC_EOIR1                      __ACCESS_CP15(c12, 0, c12, 1)
 #define ICC_DIR                                __ACCESS_CP15(c12, 0, c11, 1)
 #define ICH_AP1R2                      __AP1Rx(2)
 #define ICH_AP1R3                      __AP1Rx(3)
 
+/* A32-to-A64 mappings used by VGIC save/restore */
+
+#define CPUIF_MAP(a32, a64)                    \
+static inline void write_ ## a64(u32 val)      \
+{                                              \
+       write_sysreg(val, a32);                 \
+}                                              \
+static inline u32 read_ ## a64(void)           \
+{                                              \
+       return read_sysreg(a32);                \
+}                                              \
+
+#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64)     \
+static inline void write_ ## a64(u64 val)      \
+{                                              \
+       write_sysreg(lower_32_bits(val), a32lo);\
+       write_sysreg(upper_32_bits(val), a32hi);\
+}                                              \
+static inline u64 read_ ## a64(void)           \
+{                                              \
+       u64 val = read_sysreg(a32lo);           \
+                                               \
+       val |=  (u64)read_sysreg(a32hi) << 32;  \
+                                               \
+       return val;                             \
+}
+
+CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
+CPUIF_MAP(ICH_VTR, ICH_VTR_EL2)
+CPUIF_MAP(ICH_MISR, ICH_MISR_EL2)
+CPUIF_MAP(ICH_EISR, ICH_EISR_EL2)
+CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2)
+CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2)
+CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2)
+CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2)
+CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2)
+CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2)
+CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2)
+CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2)
+CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2)
+CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2)
+CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2)
+CPUIF_MAP(ICC_SRE, ICC_SRE_EL1)
+
+CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2)
+CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2)
+CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2)
+CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2)
+CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2)
+CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2)
+CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2)
+CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2)
+CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2)
+CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2)
+CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2)
+CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2)
+CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2)
+CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2)
+CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2)
+CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2)
+
+#define read_gicreg(r)                 read_##r()
+#define write_gicreg(v, r)             write_##r(v)
+
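For reference, a single invocation such as CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
expands to roughly the following pair of accessors, so the AArch64-named
helpers used by the shared VGICv3 code end up driving the corresponding
AArch32 CP15 registers:

	/* generated by CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) */
	static inline void write_ICH_HCR_EL2(u32 val)
	{
		write_sysreg(val, ICH_HCR);
	}

	static inline u32 read_ICH_HCR_EL2(void)
	{
		return read_sysreg(ICH_HCR);
	}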
 /* Low-level accessors */
 
 static inline void gic_write_eoir(u32 irq)
 {
-       asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq));
+       write_sysreg(irq, ICC_EOIR1);
        isb();
 }
 
 static inline void gic_write_dir(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val));
+       write_sysreg(val, ICC_DIR);
        isb();
 }
 
 static inline u32 gic_read_iar(void)
 {
-       u32 irqstat;
+       u32 irqstat = read_sysreg(ICC_IAR1);
 
-       asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat));
        dsb(sy);
+
        return irqstat;
 }
 
 static inline void gic_write_pmr(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val));
+       write_sysreg(val, ICC_PMR);
 }
 
 static inline void gic_write_ctlr(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val));
+       write_sysreg(val, ICC_CTLR);
        isb();
 }
 
 static inline void gic_write_grpen1(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val));
+       write_sysreg(val, ICC_IGRPEN1);
        isb();
 }
 
 static inline void gic_write_sgi1r(u64 val)
 {
-       asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val));
+       write_sysreg(val, ICC_SGI1R);
 }
 
 static inline u32 gic_read_sre(void)
 {
-       u32 val;
-
-       asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val));
-       return val;
+       return read_sysreg(ICC_SRE);
 }
 
 static inline void gic_write_sre(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val));
+       write_sysreg(val, ICC_SRE);
        isb();
 }
 
 static inline void gic_write_bpr1(u32 val)
 {
-       asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val));
+       write_sysreg(val, ICC_BPR1);
 }
 
 /*
index 4eaea21..68b06f9 100644 (file)
        .endm
 
        .macro  save_and_disable_irqs_notrace, oldcpsr
+#ifdef CONFIG_CPU_V7M
+       mrs     \oldcpsr, primask
+#else
        mrs     \oldcpsr, cpsr
+#endif
        disable_irq_notrace
        .endm
 
index 9156fc3..bdd283b 100644 (file)
@@ -501,21 +501,4 @@ static inline void set_kernel_text_ro(void) { }
 void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
                             void *kaddr, unsigned long len);
 
-/**
- * secure_flush_area - ensure coherency across the secure boundary
- * @addr: virtual address
- * @size: size of region
- *
- * Ensure that the specified area of memory is coherent across the secure
- * boundary from the non-secure side.  This is used when calling secure
- * firmware where the secure firmware does not ensure coherency.
- */
-static inline void secure_flush_area(const void *addr, size_t size)
-{
-       phys_addr_t phys = __pa(addr);
-
-       __cpuc_flush_dcache_area((void *)addr, size);
-       outer_flush_range(phys, phys + size);
-}
-
 #endif
index 7ea7814..01509ae 100644 (file)
@@ -56,4 +56,43 @@ static inline unsigned int __attribute__((pure)) cacheid_is(unsigned int mask)
               (~__CACHEID_NEVER & __CACHEID_ARCH_MIN & mask & cacheid);
 }
 
+#define CSSELR_ICACHE  1
+#define CSSELR_DCACHE  0
+
+#define CSSELR_L1      (0 << 1)
+#define CSSELR_L2      (1 << 1)
+#define CSSELR_L3      (2 << 1)
+#define CSSELR_L4      (3 << 1)
+#define CSSELR_L5      (4 << 1)
+#define CSSELR_L6      (5 << 1)
+#define CSSELR_L7      (6 << 1)
+
+#ifndef CONFIG_CPU_V7M
+static inline void set_csselr(unsigned int cache_selector)
+{
+       asm volatile("mcr p15, 2, %0, c0, c0, 0" : : "r" (cache_selector));
+}
+
+static inline unsigned int read_ccsidr(void)
+{
+       unsigned int val;
+
+       asm volatile("mrc p15, 1, %0, c0, c0, 0" : "=r" (val));
+       return val;
+}
+#else /* CONFIG_CPU_V7M */
+#include <linux/io.h>
+#include "asm/v7m.h"
+
+static inline void set_csselr(unsigned int cache_selector)
+{
+       writel(cache_selector, BASEADDR_V7M_SCB + V7M_SCB_CSSELR);
+}
+
+static inline unsigned int read_ccsidr(void)
+{
+       return readl(BASEADDR_V7M_SCB + V7M_SCB_CCSIDR);
+}
+#endif
+
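A caller would typically pair the two helpers; a minimal sketch (the local
variable is illustrative only):

	unsigned int ccsidr;

	set_csselr(CSSELR_L1 | CSSELR_DCACHE);	/* select the L1 data cache */
	ccsidr = read_ccsidr();			/* then read its size/geometry */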
 #endif
index c3f1152..dbdbce1 100644 (file)
 
 #ifdef CONFIG_CPU_CP15
 
+#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
+       "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
+#define __ACCESS_CP15_64(Op1, CRm)             \
+       "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
+
+#define __read_sysreg(r, w, c, t) ({                           \
+       t __val;                                                \
+       asm volatile(r " " c : "=r" (__val));                   \
+       __val;                                                  \
+})
+#define read_sysreg(...)               __read_sysreg(__VA_ARGS__)
+
+#define __write_sysreg(v, r, w, c, t)  asm volatile(w " " c : : "r" ((t)(v)))
+#define write_sysreg(v, ...)           __write_sysreg(v, __VA_ARGS__)
+
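With these in place, an accessor such as gic_write_eoir() in arch_gicv3.h
boils down to a single coprocessor instruction; for example,
write_sysreg(irq, ICC_EOIR1), where ICC_EOIR1 is __ACCESS_CP15(c12, 0, c12, 1),
expands to roughly:

	asm volatile("mcr p15, 0, %0, c12, c12, 1" : : "r" ((u32)(irq)));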
 extern unsigned long cr_alignment;     /* defined in entry-armv.S */
 
 static inline unsigned long get_cr(void)
index 1ee94c7..522b5fe 100644 (file)
 
 #define MPIDR_LEVEL_BITS 8
 #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
+#define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level)
 
 #define MPIDR_AFFINITY_LEVEL(mpidr, level) \
        ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)
 
 #define ARM_CPU_IMP_ARM                        0x41
+#define ARM_CPU_IMP_DEC                        0x44
 #define ARM_CPU_IMP_INTEL              0x69
 
 /* ARM implemented processors */
 #define ARM_CPU_PART_CORTEX_A15                0x4100c0f0
 #define ARM_CPU_PART_MASK              0xff00fff0
 
+/* DEC implemented cores */
+#define ARM_CPU_PART_SA1100            0x4400a110
+
+/* Intel implemented cores */
+#define ARM_CPU_PART_SA1110            0x6900b110
+#define ARM_CPU_REV_SA1110_A0          0
+#define ARM_CPU_REV_SA1110_B0          4
+#define ARM_CPU_REV_SA1110_B1          5
+#define ARM_CPU_REV_SA1110_B2          6
+#define ARM_CPU_REV_SA1110_B4          8
+
 #define ARM_CPU_XSCALE_ARCH_MASK       0xe000
 #define ARM_CPU_XSCALE_ARCH_V1         0x2000
 #define ARM_CPU_XSCALE_ARCH_V2         0x4000
@@ -152,6 +165,11 @@ static inline unsigned int __attribute_const__ read_cpuid_id(void)
        return read_cpuid(CPUID_ID);
 }
 
+static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
+{
+       return read_cpuid(CPUID_CACHETYPE);
+}
+
 #elif defined(CONFIG_CPU_V7M)
 
 static inline unsigned int __attribute_const__ read_cpuid_id(void)
@@ -159,6 +177,11 @@ static inline unsigned int __attribute_const__ read_cpuid_id(void)
        return readl(BASEADDR_V7M_SCB + V7M_SCB_CPUID);
 }
 
+static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
+{
+       return readl(BASEADDR_V7M_SCB + V7M_SCB_CTR);
+}
+
 #else /* ifdef CONFIG_CPU_CP15 / elif defined(CONFIG_CPU_V7M) */
 
 static inline unsigned int __attribute_const__ read_cpuid_id(void)
@@ -173,6 +196,11 @@ static inline unsigned int __attribute_const__ read_cpuid_implementor(void)
        return (read_cpuid_id() & 0xFF000000) >> 24;
 }
 
+static inline unsigned int __attribute_const__ read_cpuid_revision(void)
+{
+       return read_cpuid_id() & 0x0000000f;
+}
+
 /*
  * The CPU part number is meaningless without referring to the CPU
  * implementer: implementers are free to define their own part numbers
@@ -193,11 +221,6 @@ static inline unsigned int __attribute_const__ xscale_cpu_arch_version(void)
        return read_cpuid_id() & ARM_CPU_XSCALE_ARCH_MASK;
 }
 
-static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
-{
-       return read_cpuid(CPUID_CACHETYPE);
-}
-
 static inline unsigned int __attribute_const__ read_cpuid_tcmstatus(void)
 {
        return read_cpuid(CPUID_TCM);
@@ -208,6 +231,10 @@ static inline unsigned int __attribute_const__ read_cpuid_mpidr(void)
        return read_cpuid(CPUID_MPIDR);
 }
 
+/* StrongARM-11x0 CPUs */
+#define cpu_is_sa1100() (read_cpuid_part() == ARM_CPU_PART_SA1100)
+#define cpu_is_sa1110() (read_cpuid_part() == ARM_CPU_PART_SA1110)
+
 /*
  * Intel's XScale3 core supports some v6 features (supersections, L2)
  * but advertises itself as v5 as it does not support the v6 ISA.  For
index b7a4281..b1ce037 100644 (file)
@@ -10,7 +10,7 @@
 #include <asm/param.h> /* HZ */
 
 #define MAX_UDELAY_MS  2
-#define UDELAY_MULT    UL(2047 * HZ + 483648 * HZ / 1000000)
+#define UDELAY_MULT    UL(2147 * HZ + 483648 * HZ / 1000000)
 #define UDELAY_SHIFT   31
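
The corrected constant is just the fixed-point form of 2^31 / 10^6 scaled by
HZ: 2^31 = 2147483648, so 2^31 / 10^6 = 2147.483648, i.e.
2147 * HZ + 483648 * HZ / 1000000 with UDELAY_SHIFT = 31.  The old 2047
multiplier made udelay() come out roughly 4.7% shorter than requested.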
 
 #ifndef __ASSEMBLY__
index e847d23..acf1d14 100644 (file)
@@ -8,8 +8,9 @@
 #define        flat_argvp_envp_on_stack()              1
 #define        flat_old_ram_flag(flags)                (flags)
 #define        flat_reloc_valid(reloc, size)           ((reloc) <= (size))
-#define        flat_get_addr_from_rp(rp, relval, flags, persistent) ((void)persistent,get_unaligned(rp))
-#define        flat_put_addr_at_rp(rp, val, relval)    put_unaligned(val,rp)
+#define        flat_get_addr_from_rp(rp, relval, flags, persistent) \
+       ({ unsigned long __val; __get_user_unaligned(__val, rp); __val; })
+#define        flat_put_addr_at_rp(rp, val, relval)    __put_user_unaligned(val, rp)
 #define        flat_get_relocate_addr(rel)             (rel)
 #define        flat_set_persistent(relval, p)          0
 
index cab07f6..01c3d92 100644 (file)
 #endif
 
 #if defined(CONFIG_CPU_V7M)
-# ifdef _CACHE
 #  define MULTI_CACHE 1
-# else
-#  define _CACHE nop
-# endif
 #endif
 
 #if !defined(_CACHE) && !defined(MULTI_CACHE)
index 3a5ec1c..736292b 100644 (file)
 #define L310_CACHE_ID_RTL_R3P2         0x08
 #define L310_CACHE_ID_RTL_R3P3         0x09
 
+#define L2X0_EVENT_CNT_CTRL_ENABLE     BIT(0)
+
+#define L2X0_EVENT_CNT_CFG_SRC_SHIFT   2
+#define L2X0_EVENT_CNT_CFG_SRC_MASK    0xf
+#define L2X0_EVENT_CNT_CFG_SRC_DISABLED        0
+#define L2X0_EVENT_CNT_CFG_INT_DISABLED        0
+#define L2X0_EVENT_CNT_CFG_INT_INCR    1
+#define L2X0_EVENT_CNT_CFG_INT_OVERFLOW        2
+
 /* L2C auxiliary control register - bits common to L2C-210/220/310 */
 #define L2C_AUX_CTRL_WAY_SIZE_SHIFT            17
 #define L2C_AUX_CTRL_WAY_SIZE_MASK             (7 << 17)
@@ -157,6 +166,16 @@ static inline int l2x0_of_init(u32 aux_val, u32 aux_mask)
 }
 #endif
 
+#ifdef CONFIG_CACHE_L2X0_PMU
+void l2x0_pmu_register(void __iomem *base, u32 part);
+void l2x0_pmu_suspend(void);
+void l2x0_pmu_resume(void);
+#else
+static inline void l2x0_pmu_register(void __iomem *base, u32 part) {}
+static inline void l2x0_pmu_suspend(void) {}
+static inline void l2x0_pmu_resume(void) {}
+#endif
+
 struct l2x0_regs {
        unsigned long phy_base;
        unsigned long aux_ctrl;
index 7c2bbc7..8979fa3 100644 (file)
@@ -420,7 +420,7 @@ struct sa1111_dev {
        u64             dma_mask;
 };
 
-#define SA1111_DEV(_d) container_of((_d), struct sa1111_dev, dev)
+#define to_sa1111_device(x)    container_of(x, struct sa1111_dev, dev)
 
 #define sa1111_get_drvdata(d)  dev_get_drvdata(&(d)->dev)
 #define sa1111_set_drvdata(d,p)        dev_set_drvdata(&(d)->dev, p)
@@ -446,6 +446,8 @@ struct sa1111_driver {
 int sa1111_enable_device(struct sa1111_dev *);
 void sa1111_disable_device(struct sa1111_dev *);
 
+int sa1111_get_irq(struct sa1111_dev *, unsigned num);
+
 unsigned int sa1111_pll_clock(struct sa1111_dev *);
 
 #define SA1111_AUDIO_ACLINK    0
index 8e427c7..afcaf8b 100644 (file)
@@ -114,7 +114,6 @@ struct notifier_block;
 struct perf_event;
 struct pmu;
 
-extern struct pmu perf_ops_bp;
 extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
                                  int *gen_len, int *gen_type);
 extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
index 58faff5..d7ea6bc 100644 (file)
 
 #include <asm/virt.h>
 
+#define ARM_EXIT_WITH_ABORT_BIT  31
+#define ARM_EXCEPTION_CODE(x)    ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT))
+#define ARM_ABORT_PENDING(x)     !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT))
+
 #define ARM_EXCEPTION_RESET      0
 #define ARM_EXCEPTION_UNDEFINED   1
 #define ARM_EXCEPTION_SOFTWARE    2
@@ -68,6 +72,9 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 extern void __init_stage2_translation(void);
 
 extern void __kvm_hyp_reset(unsigned long);
+
+extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern void __vgic_v3_init_lrs(void);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index ee5328f..9a8a45a 100644 (file)
@@ -40,18 +40,29 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
        *vcpu_reg(vcpu, reg_num) = val;
 }
 
-bool kvm_condition_valid(struct kvm_vcpu *vcpu);
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr);
+bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
+void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
+{
+       return kvm_condition_valid32(vcpu);
+}
+
+static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+       kvm_skip_instr32(vcpu, is_wide_instr);
+}
+
 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.hcr = HCR_GUEST_MASK;
 }
 
-static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.hcr;
 }
@@ -61,7 +72,7 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
        vcpu->arch.hcr = hcr;
 }
 
-static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
+static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
 {
        return 1;
 }
@@ -71,9 +82,9 @@ static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu)
        return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc;
 }
 
-static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu)
+static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
 {
-       return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
+       return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
 }
 
 static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
@@ -93,11 +104,21 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
        return cpsr_mode > USR_MODE;
 }
 
-static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
+static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.hsr;
 }
 
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+       u32 hsr = kvm_vcpu_get_hsr(vcpu);
+
+       if (hsr & HSR_CV)
+               return (hsr & HSR_COND) >> HSR_COND_SHIFT;
+
+       return -1;
+}
+
 static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.hxfar;
index de338d9..2d19e02 100644 (file)
 
 #include <kvm/arm_vgic.h>
 
+
+#ifdef CONFIG_ARM_GIC_V3
+#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
+#else
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
+#endif
 
 #define KVM_REQ_VCPU_EXIT      8
 
@@ -183,15 +188,15 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 hvc_exit_stat;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 hvc_exit_stat;
        u64 wfe_exit_stat;
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
index 6eaff28..343135e 100644 (file)
 
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
+#include <asm/cp15.h>
 #include <asm/kvm_mmu.h>
 #include <asm/vfp.h>
 
 #define __hyp_text __section(.hyp.text) notrace
 
-#define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
-       "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
-#define __ACCESS_CP15_64(Op1, CRm)             \
-       "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
 #define __ACCESS_VFP(CRn)                      \
        "mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32
 
-#define __write_sysreg(v, r, w, c, t)  asm volatile(w " " c : : "r" ((t)(v)))
-#define write_sysreg(v, ...)           __write_sysreg(v, __VA_ARGS__)
-
-#define __read_sysreg(r, w, c, t) ({                           \
-       t __val;                                                \
-       asm volatile(r " " c : "=r" (__val));                   \
-       __val;                                                  \
-})
-#define read_sysreg(...)               __read_sysreg(__VA_ARGS__)
-
 #define write_special(v, r)                                    \
        asm volatile("msr " __stringify(r) ", %0" : : "r" (v))
 #define read_special(r) ({                                     \
@@ -119,6 +106,9 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 void __sysreg_save_state(struct kvm_cpu_context *ctxt);
 void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
 
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
 void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp);
 void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp);
 static inline bool __vfp_enabled(void)
index 3bb803d..74a4472 100644 (file)
@@ -63,37 +63,13 @@ void kvm_clear_hyp_idmap(void);
 static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
 {
        *pmd = new_pmd;
-       flush_pmd_entry(pmd);
+       dsb(ishst);
 }
 
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
        *pte = new_pte;
-       /*
-        * flush_pmd_entry just takes a void pointer and cleans the necessary
-        * cache entries, so we can reuse the function for ptes.
-        */
-       flush_pmd_entry(pte);
-}
-
-static inline void kvm_clean_pgd(pgd_t *pgd)
-{
-       clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t));
-}
-
-static inline void kvm_clean_pmd(pmd_t *pmd)
-{
-       clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t));
-}
-
-static inline void kvm_clean_pmd_entry(pmd_t *pmd)
-{
-       clean_pmd_entry(pmd);
-}
-
-static inline void kvm_clean_pte(pte_t *pte)
-{
-       clean_pte_table(pte);
+       dsb(ishst);
 }
 
 static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
index 31c07a2..76cbd9c 100644 (file)
  * PFNs are used to describe any physical page; this means
  * PFN 0 == physical address 0.
  */
-#if defined(__virt_to_phys)
-#define PHYS_OFFSET    PLAT_PHYS_OFFSET
-#define PHYS_PFN_OFFSET        ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))
-
-#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)
 
-#elif defined(CONFIG_ARM_PATCH_PHYS_VIRT)
+#if defined(CONFIG_ARM_PATCH_PHYS_VIRT)
 
 /*
  * Constants used to force the right instruction encodings and shifts
@@ -182,10 +177,6 @@ extern const void *__pv_table_begin, *__pv_table_end;
 #define PHYS_OFFSET    ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT)
 #define PHYS_PFN_OFFSET        (__pv_phys_pfn_offset)
 
-#define virt_to_pfn(kaddr) \
-       ((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
-        PHYS_PFN_OFFSET)
-
 #define __pv_stub(from,to,instr,type)                  \
        __asm__("@ __pv_stub\n"                         \
        "1:     " instr "       %0, %1, %2\n"           \
@@ -257,12 +248,12 @@ static inline unsigned long __phys_to_virt(phys_addr_t x)
        return x - PHYS_OFFSET + PAGE_OFFSET;
 }
 
+#endif
+
 #define virt_to_pfn(kaddr) \
        ((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \
         PHYS_PFN_OFFSET)
 
-#endif
-
 /*
  * These are *only* valid on the kernel direct mapped RAM memory.
  * Note: Drivers should NOT use these.  They are the wrong
index e358b79..464748b 100644 (file)
@@ -23,10 +23,8 @@ struct mod_arch_specific {
        struct unwind_table *unwind[ARM_SEC_MAX];
 #endif
 #ifdef CONFIG_ARM_MODULE_PLTS
-       struct elf32_shdr   *core_plt;
-       struct elf32_shdr   *init_plt;
-       int                 core_plt_count;
-       int                 init_plt_count;
+       struct elf32_shdr   *plt;
+       int                 plt_count;
 #endif
 };
 
index 615781c..1fd775c 100644 (file)
@@ -24,6 +24,9 @@
 
 #define V7M_SCB_CCR                    0x14
 #define V7M_SCB_CCR_STKALIGN                   (1 << 9)
+#define V7M_SCB_CCR_DC                         (1 << 16)
+#define V7M_SCB_CCR_IC                         (1 << 17)
+#define V7M_SCB_CCR_BP                         (1 << 18)
 
 #define V7M_SCB_SHPR2                  0x1c
 #define V7M_SCB_SHPR3                  0x20
 #define EXC_RET_STACK_MASK                     0x00000004
 #define EXC_RET_THREADMODE_PROCESSSTACK                0xfffffffd
 
+/* Cache related definitions */
+
+#define        V7M_SCB_CLIDR           0x78    /* Cache Level ID register */
+#define        V7M_SCB_CTR             0x7c    /* Cache Type register */
+#define        V7M_SCB_CCSIDR          0x80    /* Cache size ID register */
+#define        V7M_SCB_CSSELR          0x84    /* Cache size selection register */
+
+/* Cache operations */
+#define        V7M_SCB_ICIALLU         0x250   /* I-cache invalidate all to PoU */
+#define        V7M_SCB_ICIMVAU         0x258   /* I-cache invalidate by MVA to PoU */
+#define        V7M_SCB_DCIMVAC         0x25c   /* D-cache invalidate by MVA to PoC */
+#define        V7M_SCB_DCISW           0x260   /* D-cache invalidate by set-way */
+#define        V7M_SCB_DCCMVAU         0x264   /* D-cache clean by MVA to PoU */
+#define        V7M_SCB_DCCMVAC         0x268   /* D-cache clean by MVA to PoC */
+#define        V7M_SCB_DCCSW           0x26c   /* D-cache clean by set-way */
+#define        V7M_SCB_DCCIMVAC        0x270   /* D-cache clean and invalidate by MVA to PoC */
+#define        V7M_SCB_DCCISW          0x274   /* D-cache clean and invalidate by set-way */
+#define        V7M_SCB_BPIALL          0x278   /* Branch predictor invalidate all */
+
 #ifndef __ASSEMBLY__
 
 enum reboot_mode;
index a2b3eb3..b38c10c 100644 (file)
@@ -84,6 +84,13 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE          0x1000
 #define KVM_VGIC_V2_CPU_SIZE           0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST     2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST   3
+
+#define KVM_VGIC_V3_DIST_SIZE          SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE                (2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2          1 /* CPU uses PSCI v0.2 */
 
index 7dccc96..a3308ad 100644 (file)
@@ -19,7 +19,7 @@ extern struct of_cpuidle_method __cpuidle_method_of_table[];
 static const struct of_cpuidle_method __cpuidle_method_of_table_sentinel
        __used __section(__cpuidle_method_of_table_end);
 
-static struct cpuidle_ops cpuidle_ops[NR_CPUS];
+static struct cpuidle_ops cpuidle_ops[NR_CPUS] __ro_after_init;
 
 /**
  * arm_cpuidle_simple_enter() - a wrapper to cpu_do_idle()
index fb1a69e..6b4eb27 100644 (file)
@@ -158,7 +158,21 @@ __after_proc_init:
        bic     r0, r0, #CR_V
 #endif
        mcr     p15, 0, r0, c1, c0, 0           @ write control reg
-#endif /* CONFIG_CPU_CP15 */
+#elif defined (CONFIG_CPU_V7M)
+       /* For V7M systems we want to modify the CCR similarly to the SCTLR */
+#ifdef CONFIG_CPU_DCACHE_DISABLE
+       bic     r0, r0, #V7M_SCB_CCR_DC
+#endif
+#ifdef CONFIG_CPU_BPREDICT_DISABLE
+       bic     r0, r0, #V7M_SCB_CCR_BP
+#endif
+#ifdef CONFIG_CPU_ICACHE_DISABLE
+       bic     r0, r0, #V7M_SCB_CCR_IC
+#endif
+       movw    r3, #:lower16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+       movt    r3, #:upper16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+       str     r0, [r3]
+#endif /* CONFIG_CPU_CP15 elif CONFIG_CPU_V7M */
        ret     lr
 ENDPROC(__after_proc_init)
        .ltorg
index 0c7efc3..3a5cba9 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 
 #include <asm/cache.h>
 #include <asm/opcodes.h>
@@ -30,154 +31,198 @@ struct plt_entries {
        u32     lit[PLT_ENT_COUNT];
 };
 
-static bool in_init(const struct module *mod, u32 addr)
+u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
 {
-       return addr - (u32)mod->init_layout.base < mod->init_layout.size;
+       struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr;
+       int idx = 0;
+
+       /*
+        * Look for an existing entry pointing to 'val'. Given that the
+        * relocations are sorted, this will be the last entry we allocated.
+        * (if one exists).
+        */
+       if (mod->arch.plt_count > 0) {
+               plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT;
+               idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT;
+
+               if (plt->lit[idx] == val)
+                       return (u32)&plt->ldr[idx];
+
+               idx = (idx + 1) % PLT_ENT_COUNT;
+               if (!idx)
+                       plt++;
+       }
+
+       mod->arch.plt_count++;
+       BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size);
+
+       if (!idx)
+               /* Populate a new set of entries */
+               *plt = (struct plt_entries){
+                       { [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
+                       { val, }
+               };
+       else
+               plt->lit[idx] = val;
+
+       return (u32)&plt->ldr[idx];
 }
 
-u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
+#define cmp_3way(a,b)  ((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rel(const void *a, const void *b)
 {
-       struct plt_entries *plt, *plt_end;
-       int c, *count;
-
-       if (in_init(mod, loc)) {
-               plt = (void *)mod->arch.init_plt->sh_addr;
-               plt_end = (void *)plt + mod->arch.init_plt->sh_size;
-               count = &mod->arch.init_plt_count;
-       } else {
-               plt = (void *)mod->arch.core_plt->sh_addr;
-               plt_end = (void *)plt + mod->arch.core_plt->sh_size;
-               count = &mod->arch.core_plt_count;
-       }
+       const Elf32_Rel *x = a, *y = b;
+       int i;
 
-       /* Look for an existing entry pointing to 'val' */
-       for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) {
-               int i;
-
-               if (!c) {
-                       /* Populate a new set of entries */
-                       *plt = (struct plt_entries){
-                               { [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
-                               { val, }
-                       };
-                       ++*count;
-                       return (u32)plt->ldr;
-               }
-               for (i = 0; i < PLT_ENT_COUNT; i++) {
-                       if (!plt->lit[i]) {
-                               plt->lit[i] = val;
-                               ++*count;
-                       }
-                       if (plt->lit[i] == val)
-                               return (u32)&plt->ldr[i];
-               }
+       /* sort by type and symbol index */
+       i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info));
+       if (i == 0)
+               i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info));
+       return i;
+}
+
+static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel)
+{
+       u32 *tval = (u32 *)(base + rel->r_offset);
+
+       /*
+        * Do a bitwise compare on the raw addend rather than fully decoding
+        * the offset and doing an arithmetic comparison.
+        * Note that a zero-addend jump/call relocation is encoded taking the
+        * PC bias into account, i.e., -8 for ARM and -4 for Thumb2.
+        */
+       switch (ELF32_R_TYPE(rel->r_info)) {
+               u16 upper, lower;
+
+       case R_ARM_THM_CALL:
+       case R_ARM_THM_JUMP24:
+               upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]);
+               lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]);
+
+               return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe;
+
+       case R_ARM_CALL:
+       case R_ARM_PC24:
+       case R_ARM_JUMP24:
+               return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe;
        }
        BUG();
 }
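
To unpack the ARM-mode check above: a 24-bit branch offset field holding
0xfffffe sign-extends to -2, and

	-2 words * 4 bytes = -8 bytes

which is exactly the -8 PC bias that a zero-addend R_ARM_CALL/R_ARM_PC24/
R_ARM_JUMP24 stores; the Thumb2 test is the analogous bit pattern for the
split halfword encoding with its -4 bias.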
 
-static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num,
-                          u32 mask)
+static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num)
 {
-       u32 *loc1, *loc2;
-       int i;
+       const Elf32_Rel *prev;
 
-       for (i = 0; i < num; i++) {
-               if (rel[i].r_info != rel[num].r_info)
-                       continue;
+       /*
+        * Entries are sorted by type and symbol index. That means that,
+        * if a duplicate entry exists, it must be in the preceding
+        * slot.
+        */
+       if (!num)
+               return false;
 
-               /*
-                * Identical relocation types against identical symbols can
-                * still result in different PLT entries if the addend in the
-                * place is different. So resolve the target of the relocation
-                * to compare the values.
-                */
-               loc1 = (u32 *)(base + rel[i].r_offset);
-               loc2 = (u32 *)(base + rel[num].r_offset);
-               if (((*loc1 ^ *loc2) & mask) == 0)
-                       return 1;
-       }
-       return 0;
+       prev = rel + num - 1;
+       return cmp_rel(rel + num, prev) == 0 &&
+              is_zero_addend_relocation(base, prev);
 }
 
 /* Count how many PLT entries we may need */
-static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
+static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base,
+                              const Elf32_Rel *rel, int num)
 {
        unsigned int ret = 0;
+       const Elf32_Sym *s;
        int i;
 
-       /*
-        * Sure, this is order(n^2), but it's usually short, and not
-        * time critical
-        */
-       for (i = 0; i < num; i++)
+       for (i = 0; i < num; i++) {
                switch (ELF32_R_TYPE(rel[i].r_info)) {
                case R_ARM_CALL:
                case R_ARM_PC24:
                case R_ARM_JUMP24:
-                       if (!duplicate_rel(base, rel, i,
-                                          __opcode_to_mem_arm(0x00ffffff)))
-                               ret++;
-                       break;
-#ifdef CONFIG_THUMB2_KERNEL
                case R_ARM_THM_CALL:
                case R_ARM_THM_JUMP24:
-                       if (!duplicate_rel(base, rel, i,
-                                          __opcode_to_mem_thumb32(0x07ff2fff)))
+                       /*
+                        * We only have to consider branch targets that resolve
+                        * to undefined symbols. This is not simply a heuristic,
+                        * it is a fundamental limitation, since the PLT itself
+                        * is part of the module, and needs to be within range
+                        * as well, so modules can never grow beyond that limit.
+                        */
+                       s = syms + ELF32_R_SYM(rel[i].r_info);
+                       if (s->st_shndx != SHN_UNDEF)
+                               break;
+
+                       /*
+                        * Jump relocations with non-zero addends against
+                        * undefined symbols are supported by the ELF spec, but
+                        * do not occur in practice (e.g., 'jump n bytes past
+                        * the entry point of undefined function symbol f').
+                        * So we need to support them, but there is no need to
+                        * take them into consideration when trying to optimize
+                        * this code. So let's only check for duplicates when
+                        * the addend is zero.
+                        */
+                       if (!is_zero_addend_relocation(base, rel + i) ||
+                           !duplicate_rel(base, rel, i))
                                ret++;
-#endif
                }
+       }
        return ret;
 }
 
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
                              char *secstrings, struct module *mod)
 {
-       unsigned long core_plts = 0, init_plts = 0;
+       unsigned long plts = 0;
        Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
+       Elf32_Sym *syms = NULL;
 
        /*
         * To store the PLTs, we expand the .text section for core module code
-        * and the .init.text section for initialization code.
+        * and for initialization code.
         */
-       for (s = sechdrs; s < sechdrs_end; ++s)
-               if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-                       mod->arch.core_plt = s;
-               else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-                       mod->arch.init_plt = s;
-
-       if (!mod->arch.core_plt || !mod->arch.init_plt) {
-               pr_err("%s: sections missing\n", mod->name);
+       for (s = sechdrs; s < sechdrs_end; ++s) {
+               if (strcmp(".plt", secstrings + s->sh_name) == 0)
+                       mod->arch.plt = s;
+               else if (s->sh_type == SHT_SYMTAB)
+                       syms = (Elf32_Sym *)s->sh_addr;
+       }
+
+       if (!mod->arch.plt) {
+               pr_err("%s: module PLT section missing\n", mod->name);
+               return -ENOEXEC;
+       }
+       if (!syms) {
+               pr_err("%s: module symtab section missing\n", mod->name);
                return -ENOEXEC;
        }
 
        for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-               const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
+               Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
                int numrels = s->sh_size / sizeof(Elf32_Rel);
                Elf32_Shdr *dstsec = sechdrs + s->sh_info;
 
                if (s->sh_type != SHT_REL)
                        continue;
 
-               if (strstr(secstrings + s->sh_name, ".init"))
-                       init_plts += count_plts(dstsec->sh_addr, rels, numrels);
-               else
-                       core_plts += count_plts(dstsec->sh_addr, rels, numrels);
+               /* ignore relocations that operate on non-exec sections */
+               if (!(dstsec->sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               /* sort by type and symbol index */
+               sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL);
+
+               plts += count_plts(syms, dstsec->sh_addr, rels, numrels);
        }
 
-       mod->arch.core_plt->sh_type = SHT_NOBITS;
-       mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-       mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
-       mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE,
-                                              sizeof(struct plt_entries));
-       mod->arch.core_plt_count = 0;
-
-       mod->arch.init_plt->sh_type = SHT_NOBITS;
-       mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-       mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
-       mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE,
-                                              sizeof(struct plt_entries));
-       mod->arch.init_plt_count = 0;
-       pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
-                mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
+       mod->arch.plt->sh_type = SHT_NOBITS;
+       mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+       mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+       mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE,
+                                         sizeof(struct plt_entries));
+       mod->arch.plt_count = 0;
+
+       pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size);
        return 0;
 }
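
The relocation path that consumes get_module_plt() lives in arch/arm/kernel/module.c and is not part of this hunk; as a hedged sketch (the wrapper name and exact bounds below are assumptions based on the +/-32 MB reach of a 24-bit ARM branch), an out-of-range branch offset would be redirected through a veneer roughly like this:

    /* Sketch only -- fixup_branch_offset() is a hypothetical wrapper. */
    static s32 fixup_branch_offset(struct module *mod, unsigned long loc, s32 offset)
    {
            if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS) &&
                (offset <= (s32)0xfe000000 || offset >= (s32)0x02000000))
                    offset = get_module_plt(mod, loc, offset + loc + 8) - loc - 8;

            return offset;
    }

The +8 accounts for the ARM PC bias; the veneer returned by get_module_plt() is always in range because the .plt section sized above is part of the module image itself.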
index 3682fa1..05881e2 100644 (file)
@@ -1,4 +1,3 @@
 SECTIONS {
-        .core.plt : { BYTE(0) }
-        .init.plt : { BYTE(0) }
+       .plt : { BYTE(0) }
 }
index df7f2a7..34e3f3c 100644 (file)
@@ -114,19 +114,19 @@ EXPORT_SYMBOL(elf_hwcap2);
 
 
 #ifdef MULTI_CPU
-struct processor processor __read_mostly;
+struct processor processor __ro_after_init;
 #endif
 #ifdef MULTI_TLB
-struct cpu_tlb_fns cpu_tlb __read_mostly;
+struct cpu_tlb_fns cpu_tlb __ro_after_init;
 #endif
 #ifdef MULTI_USER
-struct cpu_user_fns cpu_user __read_mostly;
+struct cpu_user_fns cpu_user __ro_after_init;
 #endif
 #ifdef MULTI_CACHE
-struct cpu_cache_fns cpu_cache __read_mostly;
+struct cpu_cache_fns cpu_cache __ro_after_init;
 #endif
 #ifdef CONFIG_OUTER_CACHE
-struct outer_cache_fns outer_cache __read_mostly;
+struct outer_cache_fns outer_cache __ro_after_init;
 EXPORT_SYMBOL(outer_cache);
 #endif
 
@@ -290,12 +290,9 @@ static int cpu_has_aliasing_icache(unsigned int arch)
        /* arch specifies the register format */
        switch (arch) {
        case CPU_ARCH_ARMv7:
-               asm("mcr        p15, 2, %0, c0, c0, 0 @ set CSSELR"
-                   : /* No output operands */
-                   : "r" (1));
+               set_csselr(CSSELR_ICACHE | CSSELR_L1);
                isb();
-               asm("mrc        p15, 1, %0, c0, c0, 0 @ read CCSIDR"
-                   : "=r" (id_reg));
+               id_reg = read_ccsidr();
                line_size = 4 << ((id_reg & 0x7) + 2);
                num_sets = ((id_reg >> 13) & 0x7fff) + 1;
                aliasing_icache = (line_size * num_sets) > PAGE_SIZE;
@@ -315,11 +312,12 @@ static void __init cacheid_init(void)
 {
        unsigned int arch = cpu_architecture();
 
-       if (arch == CPU_ARCH_ARMv7M) {
-               cacheid = 0;
-       } else if (arch >= CPU_ARCH_ARMv6) {
+       if (arch >= CPU_ARCH_ARMv6) {
                unsigned int cachetype = read_cpuid_cachetype();
-               if ((cachetype & (7 << 29)) == 4 << 29) {
+
+               if ((arch == CPU_ARCH_ARMv7M) && !cachetype) {
+                       cacheid = 0;
+               } else if ((cachetype & (7 << 29)) == 4 << 29) {
                        /* ARMv7 register format */
                        arch = CPU_ARCH_ARMv7;
                        cacheid = CACHEID_VIPT_NONALIASING;
index 8615216..937c892 100644 (file)
@@ -82,7 +82,7 @@ enum ipi_msg_type {
 
 static DECLARE_COMPLETION(cpu_running);
 
-static struct smp_operations smp_ops;
+static struct smp_operations smp_ops __ro_after_init;
 
 void __init smp_set_ops(const struct smp_operations *ops)
 {
index a0affd1..53cf86c 100644 (file)
@@ -17,6 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/cache.h>
 #include <linux/elf.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -39,7 +40,7 @@
 static struct page **vdso_text_pagelist;
 
 /* Total number of pages needed for the data and text portions of the VDSO. */
-unsigned int vdso_total_pages __read_mostly;
+unsigned int vdso_total_pages __ro_after_init;
 
 /*
  * The VDSO data page.
@@ -47,13 +48,13 @@ unsigned int vdso_total_pages __read_mostly;
 static union vdso_data_store vdso_data_store __page_aligned_data;
 static struct vdso_data *vdso_data = &vdso_data_store.data;
 
-static struct page *vdso_data_page;
-static struct vm_special_mapping vdso_data_mapping = {
+static struct page *vdso_data_page __ro_after_init;
+static const struct vm_special_mapping vdso_data_mapping = {
        .name = "[vvar]",
        .pages = &vdso_data_page,
 };
 
-static struct vm_special_mapping vdso_text_mapping = {
+static struct vm_special_mapping vdso_text_mapping __ro_after_init = {
        .name = "[vdso]",
 };
 
@@ -67,7 +68,7 @@ struct elfinfo {
 /* Cached result of boot-time check for whether the arch timer exists,
  * and if so, whether the virtual counter is useable.
  */
-static bool cntvct_ok __read_mostly;
+static bool cntvct_ok __ro_after_init;
 
 static bool __init cntvct_functional(void)
 {
index 10d77a6..f19842e 100644 (file)
@@ -21,13 +21,16 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+obj-y += $(KVM)/arm/aarch32.o
 
 obj-y += $(KVM)/arm/vgic/vgic.o
 obj-y += $(KVM)/arm/vgic/vgic-init.o
 obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
 obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-v3.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
 obj-y += $(KVM)/irqchip.o
 obj-y += $(KVM)/arm/arch_timer.o
index c94b90d..03e9273 100644 (file)
@@ -144,6 +144,16 @@ out_fail_alloc:
        return ret;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
        return VM_FAULT_SIGBUS;
@@ -1176,6 +1186,10 @@ static int init_common_resources(void)
                return -ENOMEM;
        }
 
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
        return 0;
 }
 
@@ -1241,10 +1255,6 @@ static void teardown_hyp_mode(void)
 
 static int init_vhe_mode(void)
 {
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
        kvm_info("VHE mode initialized successfully\n");
        return 0;
 }
@@ -1328,10 +1338,6 @@ static int init_hyp_mode(void)
                }
        }
 
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
        kvm_info("Hyp mode initialized successfully\n");
 
        return 0;
index 1bb2b79..3e5e419 100644 (file)
@@ -228,6 +228,35 @@ bool access_vm_reg(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static bool access_gic_sgi(struct kvm_vcpu *vcpu,
+                          const struct coproc_params *p,
+                          const struct coproc_reg *r)
+{
+       u64 reg;
+
+       if (!p->is_write)
+               return read_from_write_only(vcpu, p);
+
+       reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
+       reg |= *vcpu_reg(vcpu, p->Rt1);
+
+       vgic_v3_dispatch_sgi(vcpu, reg);
+
+       return true;
+}
+
+static bool access_gic_sre(struct kvm_vcpu *vcpu,
+                          const struct coproc_params *p,
+                          const struct coproc_reg *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+
+       return true;
+}
+
 /*
  * We could trap ID_DFR0 and tell the guest we don't support performance
  * monitoring.  Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -361,10 +390,16 @@ static const struct coproc_reg cp15_regs[] = {
        { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
                        access_vm_reg, reset_unknown, c10_AMAIR1},
 
+       /* ICC_SGI1R */
+       { CRm64(12), Op1( 0), is64, access_gic_sgi},
+
        /* VBAR: swapped by interrupt.S. */
        { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
                        NULL, reset_val, c12_VBAR, 0x00000000 },
 
+       /* ICC_SRE */
+       { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre },
+
        /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
        { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
                        access_vm_reg, reset_val, c13_CID, 0x00000000 },
index af93e3f..0064b86 100644 (file)
@@ -161,105 +161,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
        }
 }
 
-/*
- * A conditional instruction is allowed to trap, even though it
- * wouldn't be executed.  So let's re-implement the hardware, in
- * software!
- */
-bool kvm_condition_valid(struct kvm_vcpu *vcpu)
-{
-       unsigned long cpsr, cond, insn;
-
-       /*
-        * Exception Code 0 can only happen if we set HCR.TGE to 1, to
-        * catch undefined instructions, and then we won't get past
-        * the arm_exit_handlers test anyway.
-        */
-       BUG_ON(!kvm_vcpu_trap_get_class(vcpu));
-
-       /* Top two bits non-zero?  Unconditional. */
-       if (kvm_vcpu_get_hsr(vcpu) >> 30)
-               return true;
-
-       cpsr = *vcpu_cpsr(vcpu);
-
-       /* Is condition field valid? */
-       if ((kvm_vcpu_get_hsr(vcpu) & HSR_CV) >> HSR_CV_SHIFT)
-               cond = (kvm_vcpu_get_hsr(vcpu) & HSR_COND) >> HSR_COND_SHIFT;
-       else {
-               /* This can happen in Thumb mode: examine IT state. */
-               unsigned long it;
-
-               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-               /* it == 0 => unconditional. */
-               if (it == 0)
-                       return true;
-
-               /* The cond for this insn works out as the top 4 bits. */
-               cond = (it >> 4);
-       }
-
-       /* Shift makes it look like an ARM-mode instruction */
-       insn = cond << 28;
-       return arm_check_condition(insn, cpsr) != ARM_OPCODE_CONDTEST_FAIL;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:      The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-       unsigned long itbits, cond;
-       unsigned long cpsr = *vcpu_cpsr(vcpu);
-       bool is_arm = !(cpsr & PSR_T_BIT);
-
-       BUG_ON(is_arm && (cpsr & PSR_IT_MASK));
-
-       if (!(cpsr & PSR_IT_MASK))
-               return;
-
-       cond = (cpsr & 0xe000) >> 13;
-       itbits = (cpsr & 0x1c00) >> (10 - 2);
-       itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-       /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */
-       if ((itbits & 0x7) == 0)
-               itbits = cond = 0;
-       else
-               itbits = (itbits << 1) & 0x1f;
-
-       cpsr &= ~PSR_IT_MASK;
-       cpsr |= cond << 13;
-       cpsr |= (itbits & 0x1c) << (10 - 2);
-       cpsr |= (itbits & 0x3) << 25;
-       *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-       bool is_thumb;
-
-       is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT);
-       if (is_thumb && !is_wide_instr)
-               *vcpu_pc(vcpu) += 2;
-       else
-               *vcpu_pc(vcpu) += 4;
-       kvm_adjust_itstate(vcpu);
-}
-
-
 /******************************************************************************
  * Inject exceptions into the guest
  */
@@ -402,3 +303,15 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
 {
        inject_abt(vcpu, true, addr);
 }
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+       vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA);
+}
index 3f1ef0d..4e40d19 100644 (file)
 
 typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
-static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* SVC called from Hyp mode should never get here */
-       kvm_debug("SVC called from Hyp mode shouldn't go here\n");
-       BUG();
-       return -EINVAL; /* Squash warning */
-}
-
 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int ret;
@@ -59,22 +51,6 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
-static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* The hypervisor should never cause aborts */
-       kvm_err("Prefetch Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
-               kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
-       return -EFAULT;
-}
-
-static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       /* This is either an error in the ws. code or an external abort */
-       kvm_err("Data Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n",
-               kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu));
-       return -EFAULT;
-}
-
 /**
  * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests
  * @vcpu:      the vcpu pointer
@@ -112,13 +88,10 @@ static exit_handle_fn arm_exit_handlers[] = {
        [HSR_EC_CP14_64]        = kvm_handle_cp14_access,
        [HSR_EC_CP_0_13]        = kvm_handle_cp_0_13_access,
        [HSR_EC_CP10_ID]        = kvm_handle_cp10_id,
-       [HSR_EC_SVC_HYP]        = handle_svc_hyp,
        [HSR_EC_HVC]            = handle_hvc,
        [HSR_EC_SMC]            = handle_smc,
        [HSR_EC_IABT]           = kvm_handle_guest_abort,
-       [HSR_EC_IABT_HYP]       = handle_pabt_hyp,
        [HSR_EC_DABT]           = kvm_handle_guest_abort,
-       [HSR_EC_DABT_HYP]       = handle_dabt_hyp,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
@@ -144,6 +117,25 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 {
        exit_handle_fn exit_handler;
 
+       if (ARM_ABORT_PENDING(exception_index)) {
+               u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
+
+               /*
+                * HVC/SMC already have an adjusted PC, which we need
+                * to correct in order to return to after having
+                * injected the abort.
+                */
+               if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) {
+                       u32 adj =  kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+                       *vcpu_pc(vcpu) -= adj;
+               }
+
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
+       exception_index = ARM_EXCEPTION_CODE(exception_index);
+
        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
                return 1;
@@ -160,6 +152,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                exit_handler = kvm_get_exit_handler(vcpu);
 
                return exit_handler(vcpu, run);
+       case ARM_EXCEPTION_DATA_ABORT:
+               kvm_inject_vabt(vcpu);
+               return 1;
        default:
                kvm_pr_unimpl("Unsupported exception type: %d",
                              exception_index);
index 8dfa5f7..3023bb5 100644 (file)
@@ -5,6 +5,7 @@
 KVM=../../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
 
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
index 21c2388..60783f3 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
 
        .arch_extension     virt
 
@@ -63,6 +64,36 @@ ENTRY(__guest_exit)
        ldr     lr, [r0, #4]
 
        mov     r0, r1
+       mrs     r1, SPSR
+       mrs     r2, ELR_hyp
+       mrc     p15, 4, r3, c5, c2, 0   @ HSR
+
+       /*
+        * Force loads and stores to complete before unmasking aborts
+        * and forcing the delivery of the exception. This gives us a
+        * single instruction window, which the handler will try to
+        * match.
+        */
+       dsb     sy
+       cpsie   a
+
+       .global abort_guest_exit_start
+abort_guest_exit_start:
+
+       isb
+
+       .global abort_guest_exit_end
+abort_guest_exit_end:
+
+       /*
+        * If we took an abort, r0[31] will be set, and cmp will set
+        * the N bit in PSTATE.
+        */
+       cmp     r0, #0
+       msrmi   SPSR_cxsf, r1
+       msrmi   ELR_hyp, r2
+       mcrmi   p15, 4, r3, c5, c2, 0   @ HSR
+
        bx      lr
 ENDPROC(__guest_exit)
 
index 7809138..96beb53 100644 (file)
@@ -81,7 +81,6 @@ __kvm_hyp_vector:
        invalid_vector  hyp_undef       ARM_EXCEPTION_UNDEFINED
        invalid_vector  hyp_svc         ARM_EXCEPTION_SOFTWARE
        invalid_vector  hyp_pabt        ARM_EXCEPTION_PREF_ABORT
-       invalid_vector  hyp_dabt        ARM_EXCEPTION_DATA_ABORT
        invalid_vector  hyp_fiq         ARM_EXCEPTION_FIQ
 
 ENTRY(__hyp_do_panic)
@@ -164,6 +163,21 @@ hyp_irq:
        load_vcpu r0                    @ Load VCPU pointer to r0
        b       __guest_exit
 
+hyp_dabt:
+       push    {r0, r1}
+       mrs     r0, ELR_hyp
+       ldr     r1, =abort_guest_exit_start
+THUMB( add     r1, r1, #1)
+       cmp     r0, r1
+       ldrne   r1, =abort_guest_exit_end
+THUMB( addne   r1, r1, #1)
+       cmpne   r0, r1
+       pop     {r0, r1}
+       bne     __hyp_panic
+
+       orr     r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT)
+       eret
+
        .ltorg
 
        .popsection
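
In C terms, the window check performed by hyp_dabt above amounts to the following (illustrative sketch only; the +1 the THUMB() macro adds for the Thumb bit is omitted):

    /* Sketch: tolerate the abort only inside the single-instruction window. */
    static bool abort_hit_fixup_window(unsigned long elr)
    {
            extern char abort_guest_exit_start[], abort_guest_exit_end[];

            return elr == (unsigned long)abort_guest_exit_start ||
                   elr == (unsigned long)abort_guest_exit_end;
    }

If the abort hit the window, the exit code has ARM_EXIT_WITH_ABORT_BIT set and handle_exit() injects a virtual abort into the guest; any other asynchronous abort taken in Hyp mode still panics.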
index b13caa9..92678b7 100644 (file)
@@ -14,6 +14,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+#include <linux/jump_label.h>
 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_hyp.h>
@@ -54,6 +55,15 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        u32 val;
 
+       /*
+        * If we pended a virtual abort, preserve it until it gets
+        * cleared. See B1.9.9 (Virtual Abort exception) for details,
+        * but the crucial bit is the zeroing of HCR.VA in the
+        * pseudocode.
+        */
+       if (vcpu->arch.hcr & HCR_VA)
+               vcpu->arch.hcr = read_sysreg(HCR);
+
        write_sysreg(0, HCR);
        write_sysreg(0, HSTR);
        val = read_sysreg(HDCR);
@@ -74,14 +84,21 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
        write_sysreg(read_sysreg(MIDR), VPIDR);
 }
 
+
 static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_v2_save_state(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_save_state(vcpu);
+       else
+               __vgic_v2_save_state(vcpu);
 }
 
 static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_v2_restore_state(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_restore_state(vcpu);
+       else
+               __vgic_v2_restore_state(vcpu);
 }
 
 static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
@@ -134,7 +151,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
        return true;
 }
 
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
@@ -191,8 +208,6 @@ again:
        return exit_code;
 }
 
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
 static const char * const __hyp_panic_string[] = {
        [ARM_EXCEPTION_RESET]      = "\nHYP panic: RST   PC:%08x CPSR:%08x",
        [ARM_EXCEPTION_UNDEFINED]  = "\nHYP panic: UNDEF PC:%08x CPSR:%08x",
index a263600..7296528 100644 (file)
@@ -34,7 +34,7 @@
  * As v7 does not support flushing per IPA, just nuke the whole TLB
  * instead, ignoring the ipa value.
  */
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
 {
        dsb(ishst);
 
@@ -50,21 +50,14 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
        write_sysreg(0, VTTBR);
 }
 
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
-       __tlb_flush_vmid(kvm);
+       __kvm_tlb_flush_vmid(kvm);
 }
 
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
-                                                           phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
 {
        write_sysreg(0, TLBIALLNSNHIS);
        write_sysreg(0, ICIALLUIS);
        dsb(ish);
 }
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
index 10f80a6..b6e715f 100644 (file)
@@ -126,12 +126,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
        int access_size;
        bool sign_extend;
 
-       if (kvm_vcpu_dabt_isextabt(vcpu)) {
-               /* cache operation on I/O addr, tell guest unsupported */
-               kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-               return 1;
-       }
-
        if (kvm_vcpu_dabt_iss1tw(vcpu)) {
                /* page table accesses IO mem: tell guest to fix its TTBR */
                kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
index e9a5c0e..a5265ed 100644 (file)
@@ -744,7 +744,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
        if (!pgd)
                return -ENOMEM;
 
-       kvm_clean_pgd(pgd);
        kvm->arch.pgd = pgd;
        return 0;
 }
@@ -936,7 +935,6 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
                pte = mmu_memory_cache_alloc(cache);
-               kvm_clean_pte(pte);
                pmd_populate_kernel(NULL, pmd, pte);
                get_page(virt_to_page(pmd));
        }
@@ -1434,6 +1432,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
        int ret, idx;
 
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+       if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
 
        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
index 8044591..2cef118 100644 (file)
@@ -29,7 +29,7 @@
 /*
  * Default to the loop-based delay implementation.
  */
-struct arm_delay_ops arm_delay_ops = {
+struct arm_delay_ops arm_delay_ops __ro_after_init = {
        .delay          = __loop_delay,
        .const_udelay   = __loop_const_udelay,
        .udelay         = __loop_udelay,
index 02f6d7a..20d5ad7 100644 (file)
@@ -59,7 +59,7 @@
 #define XBUS_SWITCH_J17_11     ((*XBUS_SWITCH) & (1 << 5))
 #define XBUS_SWITCH_J17_9      ((*XBUS_SWITCH) & (1 << 6))
 
-#define UNCACHEABLE_ADDR       (ARMCSR_BASE + 0x108)
+#define UNCACHEABLE_ADDR       (ARMCSR_BASE + 0x108)   /* CSR_ROMBASEMASK */
 
 
 /* PIC irq control */
index 257166b..aa79fa4 100644 (file)
@@ -40,7 +40,7 @@
 #define SCREEN_END             0xdfc00000
 #define SCREEN_BASE            0xdf800000
 
-#define UNCACHEABLE_ADDR       0xdf010000
+#define UNCACHEABLE_ADDR       (FLUSH_BASE + 0x10000)
 
 /*
  * IO Addresses
index cbedd75..d944fd7 100644 (file)
@@ -13,7 +13,7 @@
 #define __ASM_ARCH_HARDWARE_H
 
 
-#define UNCACHEABLE_ADDR       0xfa050000
+#define UNCACHEABLE_ADDR       0xfa050000      /* ICIP */
 
 
 /*
 #define io_v2p( x )             \
    ( (((x)&0x00ffffff) | (((x)&(0x30000000>>VIO_SHIFT))<<VIO_SHIFT)) + PIO_START )
 
-#define CPU_SA1110_A0  (0)
-#define CPU_SA1110_B0  (4)
-#define CPU_SA1110_B1  (5)
-#define CPU_SA1110_B2  (6)
-#define CPU_SA1110_B4  (8)
-
-#define CPU_SA1100_ID  (0x4401a110)
-#define CPU_SA1100_MASK        (0xfffffff0)
-#define CPU_SA1110_ID  (0x6901b110)
-#define CPU_SA1110_MASK        (0xfffffff0)
-
 #define __MREG(x)      IOMEM(io_p2v(x))
 
 #ifndef __ASSEMBLY__
 
-#include <asm/cputype.h>
-
-#define CPU_REVISION   (read_cpuid_id() & 15)
-
-#define cpu_is_sa1100()        ((read_cpuid_id() & CPU_SA1100_MASK) == CPU_SA1100_ID)
-#define cpu_is_sa1110()        ((read_cpuid_id() & CPU_SA1110_MASK) == CPU_SA1110_ID)
-
 # define __REG(x)      (*((volatile unsigned long __iomem *)io_p2v(x)))
 # define __PREG(x)     (io_v2p((unsigned long)&(x)))
 
index d15a7fe..c1799dd 100644 (file)
@@ -403,6 +403,7 @@ config CPU_V7M
        bool
        select CPU_32v7M
        select CPU_ABRT_NOMMU
+       select CPU_CACHE_V7M
        select CPU_CACHE_NOP
        select CPU_PABRT_LEGACY
        select CPU_THUMBONLY
@@ -518,6 +519,9 @@ config CPU_CACHE_VIPT
 config CPU_CACHE_FA
        bool
 
+config CPU_CACHE_V7M
+       bool
+
 if MMU
 # The copy-page model
 config CPU_COPY_V4WT
@@ -750,14 +754,14 @@ config CPU_HIGH_VECTOR
 
 config CPU_ICACHE_DISABLE
        bool "Disable I-Cache (I-bit)"
-       depends on CPU_CP15 && !(CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)
+       depends on (CPU_CP15 && !(CPU_ARM720T || CPU_ARM740T || CPU_XSCALE || CPU_XSC3)) || CPU_V7M
        help
          Say Y here to disable the processor instruction cache. Unless
          you have a reason not to or are unsure, say N.
 
 config CPU_DCACHE_DISABLE
        bool "Disable D-Cache (C-bit)"
-       depends on CPU_CP15 && !SMP
+       depends on (CPU_CP15 && !SMP) || CPU_V7M
        help
          Say Y here to disable the processor data cache. Unless
          you have a reason not to or are unsure, say N.
@@ -792,7 +796,7 @@ config CPU_CACHE_ROUND_ROBIN
 
 config CPU_BPREDICT_DISABLE
        bool "Disable branch prediction"
-       depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526
+       depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526 || CPU_V7M
        help
          Say Y here to disable branch prediction.  If unsure, say N.
 
@@ -916,6 +920,13 @@ config CACHE_L2X0
        help
          This option enables the L2x0 PrimeCell.
 
+config CACHE_L2X0_PMU
+       bool "L2x0 performance monitor support" if CACHE_L2X0
+       depends on PERF_EVENTS
+       help
+         This option enables support for the performance monitoring features
+         of the L220 and PL310 outer cache controllers.
+
 if CACHE_L2X0
 
 config PL310_ERRATA_588369
index 7f76d96..e869824 100644 (file)
@@ -43,9 +43,11 @@ obj-$(CONFIG_CPU_CACHE_V6)   += cache-v6.o
 obj-$(CONFIG_CPU_CACHE_V7)     += cache-v7.o
 obj-$(CONFIG_CPU_CACHE_FA)     += cache-fa.o
 obj-$(CONFIG_CPU_CACHE_NOP)    += cache-nop.o
+obj-$(CONFIG_CPU_CACHE_V7M)    += cache-v7m.o
 
 AFLAGS_cache-v6.o      :=-Wa,-march=armv6
 AFLAGS_cache-v7.o      :=-Wa,-march=armv7-a
+AFLAGS_cache-v7m.o     :=-Wa,-march=armv7-m
 
 obj-$(CONFIG_CPU_COPY_V4WT)    += copypage-v4wt.o
 obj-$(CONFIG_CPU_COPY_V4WB)    += copypage-v4wb.o
@@ -101,6 +103,7 @@ AFLAGS_proc-v7.o    :=-Wa,-march=armv7-a
 obj-$(CONFIG_OUTER_CACHE)      += l2c-common.o
 obj-$(CONFIG_CACHE_FEROCEON_L2)        += cache-feroceon-l2.o
 obj-$(CONFIG_CACHE_L2X0)       += cache-l2x0.o l2c-l2x0-resume.o
+obj-$(CONFIG_CACHE_L2X0_PMU)   += cache-l2x0-pmu.o
 obj-$(CONFIG_CACHE_XSC3L2)     += cache-xsc3l2.o
 obj-$(CONFIG_CACHE_TAUROS2)    += cache-tauros2.o
 obj-$(CONFIG_CACHE_UNIPHIER)   += cache-uniphier.o
diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c
new file mode 100644 (file)
index 0000000..976d305
--- /dev/null
@@ -0,0 +1,584 @@
+/*
+ * L220/L310 cache controller support
+ *
+ * Copyright (C) 2016 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/hardware/cache-l2x0.h>
+
+#define PMU_NR_COUNTERS 2
+
+static void __iomem *l2x0_base;
+static struct pmu *l2x0_pmu;
+static cpumask_t pmu_cpu;
+
+static const char *l2x0_name;
+
+static ktime_t l2x0_pmu_poll_period;
+static struct hrtimer l2x0_pmu_hrtimer;
+
+/*
+ * The L220/PL310 PMU has two equivalent counters, Counter1 and Counter0.
+ * Registers controlling these are laid out in pairs, in descending order, i.e.
+ * the register for Counter1 comes first, followed by the register for
+ * Counter0.
+ * We ensure that idx 0 -> Counter0, and idx 1 -> Counter1.
+ */
+static struct perf_event *events[PMU_NR_COUNTERS];
+
+/* Find an unused counter */
+static int l2x0_pmu_find_idx(void)
+{
+       int i;
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++) {
+               if (!events[i])
+                       return i;
+       }
+
+       return -1;
+}
+
+/* How many counters are allocated? */
+static int l2x0_pmu_num_active_counters(void)
+{
+       int i, cnt = 0;
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++) {
+               if (events[i])
+                       cnt++;
+       }
+
+       return cnt;
+}
+
+static void l2x0_pmu_counter_config_write(int idx, u32 val)
+{
+       writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT0_CFG - 4 * idx);
+}
+
+static u32 l2x0_pmu_counter_read(int idx)
+{
+       return readl_relaxed(l2x0_base + L2X0_EVENT_CNT0_VAL - 4 * idx);
+}
+
+static void l2x0_pmu_counter_write(int idx, u32 val)
+{
+       writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT0_VAL - 4 * idx);
+}
+
+static void __l2x0_pmu_enable(void)
+{
+       u32 val = readl_relaxed(l2x0_base + L2X0_EVENT_CNT_CTRL);
+       val |= L2X0_EVENT_CNT_CTRL_ENABLE;
+       writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT_CTRL);
+}
+
+static void __l2x0_pmu_disable(void)
+{
+       u32 val = readl_relaxed(l2x0_base + L2X0_EVENT_CNT_CTRL);
+       val &= ~L2X0_EVENT_CNT_CTRL_ENABLE;
+       writel_relaxed(val, l2x0_base + L2X0_EVENT_CNT_CTRL);
+}
+
+static void l2x0_pmu_enable(struct pmu *pmu)
+{
+       if (l2x0_pmu_num_active_counters() == 0)
+               return;
+
+       __l2x0_pmu_enable();
+}
+
+static void l2x0_pmu_disable(struct pmu *pmu)
+{
+       if (l2x0_pmu_num_active_counters() == 0)
+               return;
+
+       __l2x0_pmu_disable();
+}
+
+static void warn_if_saturated(u32 count)
+{
+       if (count != 0xffffffff)
+               return;
+
+       pr_warn_ratelimited("L2X0 counter saturated. Poll period too long\n");
+}
+
+static void l2x0_pmu_event_read(struct perf_event *event)
+{
+       struct hw_perf_event *hw = &event->hw;
+       u64 prev_count, new_count, mask;
+
+       do {
+                prev_count = local64_read(&hw->prev_count);
+                new_count = l2x0_pmu_counter_read(hw->idx);
+       } while (local64_xchg(&hw->prev_count, new_count) != prev_count);
+
+       mask = GENMASK_ULL(31, 0);
+       local64_add((new_count - prev_count) & mask, &event->count);
+
+       warn_if_saturated(new_count);
+}
+
+static void l2x0_pmu_event_configure(struct perf_event *event)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       /*
+        * The L2X0 counters saturate at 0xffffffff rather than wrapping, so we
+        * will *always* lose some number of events when a counter saturates,
+        * and have no way of detecting how many were lost.
+        *
+        * To minimize the impact of this, we try to maximize the period by
+        * always starting counters at zero. To ensure that group ratios are
+        * representative, we poll periodically to avoid counters saturating.
+        * See l2x0_pmu_poll().
+        */
+       local64_set(&hw->prev_count, 0);
+       l2x0_pmu_counter_write(hw->idx, 0);
+}
+
+static enum hrtimer_restart l2x0_pmu_poll(struct hrtimer *hrtimer)
+{
+       unsigned long flags;
+       int i;
+
+       local_irq_save(flags);
+       __l2x0_pmu_disable();
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++) {
+               struct perf_event *event = events[i];
+
+               if (!event)
+                       continue;
+
+               l2x0_pmu_event_read(event);
+               l2x0_pmu_event_configure(event);
+       }
+
+       __l2x0_pmu_enable();
+       local_irq_restore(flags);
+
+       hrtimer_forward_now(hrtimer, l2x0_pmu_poll_period);
+       return HRTIMER_RESTART;
+}
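
Because the counters saturate at 0xffffffff instead of wrapping, the driver always re-arms them at zero and relies on the hrtimer above to read them back before they can fill up; with the 1-second poll period set later in this file, a counter only saturates if the event rate exceeds 2^32 (about 4.3 billion) events per second. A standalone sketch of the masked delta accumulation done in l2x0_pmu_event_read (assumption: the counter only moves forward between reads):

    /* Not kernel code -- mirrors (new_count - prev_count) & GENMASK_ULL(31, 0). */
    #include <stdint.h>

    static uint64_t accumulate(uint64_t total, uint32_t prev, uint32_t curr)
    {
            return total + ((uint64_t)(curr - prev) & 0xffffffffULL);
    }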
+
+
+static void __l2x0_pmu_event_enable(int idx, u32 event)
+{
+       u32 val;
+
+       val = event << L2X0_EVENT_CNT_CFG_SRC_SHIFT;
+       val |= L2X0_EVENT_CNT_CFG_INT_DISABLED;
+       l2x0_pmu_counter_config_write(idx, val);
+}
+
+static void l2x0_pmu_event_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(hw->state & PERF_HES_UPTODATE));
+               l2x0_pmu_event_configure(event);
+       }
+
+       hw->state = 0;
+
+       __l2x0_pmu_event_enable(hw->idx, hw->config_base);
+}
+
+static void __l2x0_pmu_event_disable(int idx)
+{
+       u32 val;
+
+       val = L2X0_EVENT_CNT_CFG_SRC_DISABLED << L2X0_EVENT_CNT_CFG_SRC_SHIFT;
+       val |= L2X0_EVENT_CNT_CFG_INT_DISABLED;
+       l2x0_pmu_counter_config_write(idx, val);
+}
+
+static void l2x0_pmu_event_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       if (WARN_ON_ONCE(event->hw.state & PERF_HES_STOPPED))
+               return;
+
+       __l2x0_pmu_event_disable(hw->idx);
+
+       hw->state |= PERF_HES_STOPPED;
+
+       if (flags & PERF_EF_UPDATE) {
+               l2x0_pmu_event_read(event);
+               hw->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int l2x0_pmu_event_add(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hw = &event->hw;
+       int idx = l2x0_pmu_find_idx();
+
+       if (idx == -1)
+               return -EAGAIN;
+
+       /*
+        * Pin the timer, so that the overflows are handled by the chosen
+        * event->cpu (this is the same one as presented in "cpumask"
+        * attribute).
+        */
+       if (l2x0_pmu_num_active_counters() == 0)
+               hrtimer_start(&l2x0_pmu_hrtimer, l2x0_pmu_poll_period,
+                             HRTIMER_MODE_REL_PINNED);
+
+       events[idx] = event;
+       hw->idx = idx;
+
+       l2x0_pmu_event_configure(event);
+
+       hw->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+       if (flags & PERF_EF_START)
+               l2x0_pmu_event_start(event, 0);
+
+       return 0;
+}
+
+static void l2x0_pmu_event_del(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       l2x0_pmu_event_stop(event, PERF_EF_UPDATE);
+
+       events[hw->idx] = NULL;
+       hw->idx = -1;
+
+       if (l2x0_pmu_num_active_counters() == 0)
+               hrtimer_cancel(&l2x0_pmu_hrtimer);
+}
+
+static bool l2x0_pmu_group_is_valid(struct perf_event *event)
+{
+       struct pmu *pmu = event->pmu;
+       struct perf_event *leader = event->group_leader;
+       struct perf_event *sibling;
+       int num_hw = 0;
+
+       if (leader->pmu == pmu)
+               num_hw++;
+       else if (!is_software_event(leader))
+               return false;
+
+       list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+               if (sibling->pmu == pmu)
+                       num_hw++;
+               else if (!is_software_event(sibling))
+                       return false;
+       }
+
+       return num_hw <= PMU_NR_COUNTERS;
+}
+
+static int l2x0_pmu_event_init(struct perf_event *event)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       if (event->attr.type != l2x0_pmu->type)
+               return -ENOENT;
+
+       if (is_sampling_event(event) ||
+           event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       if (event->attr.config & ~L2X0_EVENT_CNT_CFG_SRC_MASK)
+               return -EINVAL;
+
+       hw->config_base = event->attr.config;
+
+       if (!l2x0_pmu_group_is_valid(event))
+               return -EINVAL;
+
+       event->cpu = cpumask_first(&pmu_cpu);
+
+       return 0;
+}
+
+struct l2x0_event_attribute {
+       struct device_attribute attr;
+       unsigned int config;
+       bool pl310_only;
+};
+
+#define L2X0_EVENT_ATTR(_name, _config, _pl310_only)                           \
+       (&((struct l2x0_event_attribute[]) {{                                   \
+               .attr = __ATTR(_name, S_IRUGO, l2x0_pmu_event_show, NULL),      \
+               .config = _config,                                              \
+               .pl310_only = _pl310_only,                                      \
+       }})[0].attr.attr)
+
+#define L220_PLUS_EVENT_ATTR(_name, _config)                                   \
+       L2X0_EVENT_ATTR(_name, _config, false)
+
+#define PL310_EVENT_ATTR(_name, _config)                                       \
+       L2X0_EVENT_ATTR(_name, _config, true)
+
+static ssize_t l2x0_pmu_event_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct l2x0_event_attribute *lattr;
+
+       lattr = container_of(attr, typeof(*lattr), attr);
+       return snprintf(buf, PAGE_SIZE, "config=0x%x\n", lattr->config);
+}
+
+static umode_t l2x0_pmu_event_attr_is_visible(struct kobject *kobj,
+                                             struct attribute *attr,
+                                             int unused)
+{
+       struct device *dev = kobj_to_dev(kobj);
+       struct pmu *pmu = dev_get_drvdata(dev);
+       struct l2x0_event_attribute *lattr;
+
+       lattr = container_of(attr, typeof(*lattr), attr.attr);
+
+       if (!lattr->pl310_only || strcmp("l2c_310", pmu->name) == 0)
+               return attr->mode;
+
+       return 0;
+}
+
+static struct attribute *l2x0_pmu_event_attrs[] = {
+       L220_PLUS_EVENT_ATTR(co,        0x1),
+       L220_PLUS_EVENT_ATTR(drhit,     0x2),
+       L220_PLUS_EVENT_ATTR(drreq,     0x3),
+       L220_PLUS_EVENT_ATTR(dwhit,     0x4),
+       L220_PLUS_EVENT_ATTR(dwreq,     0x5),
+       L220_PLUS_EVENT_ATTR(dwtreq,    0x6),
+       L220_PLUS_EVENT_ATTR(irhit,     0x7),
+       L220_PLUS_EVENT_ATTR(irreq,     0x8),
+       L220_PLUS_EVENT_ATTR(wa,        0x9),
+       PL310_EVENT_ATTR(ipfalloc,      0xa),
+       PL310_EVENT_ATTR(epfhit,        0xb),
+       PL310_EVENT_ATTR(epfalloc,      0xc),
+       PL310_EVENT_ATTR(srrcvd,        0xd),
+       PL310_EVENT_ATTR(srconf,        0xe),
+       PL310_EVENT_ATTR(epfrcvd,       0xf),
+       NULL
+};
+
+static struct attribute_group l2x0_pmu_event_attrs_group = {
+       .name = "events",
+       .attrs = l2x0_pmu_event_attrs,
+       .is_visible = l2x0_pmu_event_attr_is_visible,
+};
+
+static ssize_t l2x0_pmu_cpumask_show(struct device *dev,
+                                    struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &pmu_cpu);
+}
+
+static struct device_attribute l2x0_pmu_cpumask_attr =
+               __ATTR(cpumask, S_IRUGO, l2x0_pmu_cpumask_show, NULL);
+
+static struct attribute *l2x0_pmu_cpumask_attrs[] = {
+       &l2x0_pmu_cpumask_attr.attr,
+       NULL,
+};
+
+static struct attribute_group l2x0_pmu_cpumask_attr_group = {
+       .attrs = l2x0_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *l2x0_pmu_attr_groups[] = {
+       &l2x0_pmu_event_attrs_group,
+       &l2x0_pmu_cpumask_attr_group,
+       NULL,
+};
+
+static void l2x0_pmu_reset(void)
+{
+       int i;
+
+       __l2x0_pmu_disable();
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++)
+               __l2x0_pmu_event_disable(i);
+}
+
+static int l2x0_pmu_offline_cpu(unsigned int cpu)
+{
+       unsigned int target;
+
+       if (!cpumask_test_and_clear_cpu(cpu, &pmu_cpu))
+               return 0;
+
+       target = cpumask_any_but(cpu_online_mask, cpu);
+       if (target >= nr_cpu_ids)
+               return 0;
+
+       perf_pmu_migrate_context(l2x0_pmu, cpu, target);
+       cpumask_set_cpu(target, &pmu_cpu);
+
+       return 0;
+}
+
+void l2x0_pmu_suspend(void)
+{
+       int i;
+
+       if (!l2x0_pmu)
+               return;
+
+       l2x0_pmu_disable(l2x0_pmu);
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++) {
+               if (events[i])
+                       l2x0_pmu_event_stop(events[i], PERF_EF_UPDATE);
+       }
+
+}
+
+void l2x0_pmu_resume(void)
+{
+       int i;
+
+       if (!l2x0_pmu)
+               return;
+
+       l2x0_pmu_reset();
+
+       for (i = 0; i < PMU_NR_COUNTERS; i++) {
+               if (events[i])
+                       l2x0_pmu_event_start(events[i], PERF_EF_RELOAD);
+       }
+
+       l2x0_pmu_enable(l2x0_pmu);
+}
+
+void __init l2x0_pmu_register(void __iomem *base, u32 part)
+{
+       /*
+        * Determine whether we support the PMU, and choose the name for sysfs.
+        * This is also used by l2x0_pmu_event_attr_is_visible to determine
+        * which events to display, as the PL310 PMU supports a superset of
+        * L220 events.
+        *
+        * The L210 PMU has a different programmer's interface, and is not
+        * supported by this driver.
+        *
+        * We must defer registering the PMU until the perf subsystem is up and
+        * running, so just stash the name and base, and leave that to another
+        * initcall.
+        */
+       switch (part & L2X0_CACHE_ID_PART_MASK) {
+       case L2X0_CACHE_ID_PART_L220:
+               l2x0_name = "l2c_220";
+               break;
+       case L2X0_CACHE_ID_PART_L310:
+               l2x0_name = "l2c_310";
+               break;
+       default:
+               return;
+       }
+
+       l2x0_base = base;
+}
+
+static __init int l2x0_pmu_init(void)
+{
+       int ret;
+
+       if (!l2x0_base)
+               return 0;
+
+       l2x0_pmu = kzalloc(sizeof(*l2x0_pmu), GFP_KERNEL);
+       if (!l2x0_pmu) {
+               pr_warn("Unable to allocate L2x0 PMU\n");
+               return -ENOMEM;
+       }
+
+       *l2x0_pmu = (struct pmu) {
+               .task_ctx_nr = perf_invalid_context,
+               .pmu_enable = l2x0_pmu_enable,
+               .pmu_disable = l2x0_pmu_disable,
+               .read = l2x0_pmu_event_read,
+               .start = l2x0_pmu_event_start,
+               .stop = l2x0_pmu_event_stop,
+               .add = l2x0_pmu_event_add,
+               .del = l2x0_pmu_event_del,
+               .event_init = l2x0_pmu_event_init,
+               .attr_groups = l2x0_pmu_attr_groups,
+       };
+
+       l2x0_pmu_reset();
+
+       /*
+        * We always use a hrtimer rather than an interrupt.
+        * See comments in l2x0_pmu_event_configure and l2x0_pmu_poll.
+        *
+        * Polling once a second allows the counters to fill up to 1/128th on a
+        * quad-core test chip with cores clocked at 400MHz. Hopefully this
+        * leaves sufficient headroom to avoid overflow on production silicon
+        * at higher frequencies.
+        */
+       l2x0_pmu_poll_period = ms_to_ktime(1000);
+       hrtimer_init(&l2x0_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       l2x0_pmu_hrtimer.function = l2x0_pmu_poll;
+
+       cpumask_set_cpu(0, &pmu_cpu);
+       ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE,
+                                       "AP_PERF_ARM_L2X0_ONLINE", NULL,
+                                       l2x0_pmu_offline_cpu);
+       if (ret)
+               goto out_pmu;
+
+       ret = perf_pmu_register(l2x0_pmu, l2x0_name, -1);
+       if (ret)
+               goto out_cpuhp;
+
+       return 0;
+
+out_cpuhp:
+       cpuhp_remove_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE);
+out_pmu:
+       kfree(l2x0_pmu);
+       l2x0_pmu = NULL;
+       return ret;
+}
+device_initcall(l2x0_pmu_init);
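
A hedged usage example (assuming a PL310, which registers as "l2c_310", and a kernel with this driver enabled): the PMU appears under /sys/bus/event_source/devices/l2c_310/ and, since event_init() only accepts system-wide, CPU-bound events, it is driven with something like

    perf stat -a -e l2c_310/drreq/,l2c_310/drhit/ -- sleep 1

All events are rerouted to the CPU advertised in the "cpumask" attribute, and the poll timer keeps the two hardware counters from saturating between reads.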
index cc12905..d1870c7 100644 (file)
@@ -142,6 +142,8 @@ static void l2c_disable(void)
 {
        void __iomem *base = l2x0_base;
 
+       l2x0_pmu_suspend();
+
        outer_cache.flush_all();
        l2c_write_sec(0, base, L2X0_CTRL);
        dsb(st);
@@ -159,6 +161,8 @@ static void l2c_resume(void)
        /* Do not touch the controller if already enabled. */
        if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN))
                l2c_enable(base, l2x0_data->num_lock);
+
+       l2x0_pmu_resume();
 }
 
 /*
@@ -709,9 +713,8 @@ static void __init l2c310_fixup(void __iomem *base, u32 cache_id,
        if (revision >= L310_CACHE_ID_RTL_R3P0 &&
            revision < L310_CACHE_ID_RTL_R3P2) {
                u32 val = l2x0_saved_regs.prefetch_ctrl;
-               /* I don't think bit23 is required here... but iMX6 does so */
-               if (val & (BIT(30) | BIT(23))) {
-                       val &= ~(BIT(30) | BIT(23));
+               if (val & L310_PREFETCH_CTRL_DBL_LINEFILL) {
+                       val &= ~L310_PREFETCH_CTRL_DBL_LINEFILL;
                        l2x0_saved_regs.prefetch_ctrl = val;
                        errata[n++] = "752271";
                }
@@ -892,6 +895,8 @@ static int __init __l2c_init(const struct l2c_init_data *data,
        pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n",
                data->type, cache_id, aux);
 
+       l2x0_pmu_register(l2x0_base, cache_id);
+
        return 0;
 }
 
diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S
new file mode 100644 (file)
index 0000000..816a7e4
--- /dev/null
@@ -0,0 +1,453 @@
+/*
+ *  linux/arch/arm/mm/cache-v7m.S
+ *
+ *  Based on linux/arch/arm/mm/cache-v7.S
+ *
+ *  Copyright (C) 2001 Deep Blue Solutions Ltd.
+ *  Copyright (C) 2005 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This is the "shell" of the ARMv7M processor support.
+ */
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/unwind.h>
+#include <asm/v7m.h>
+
+#include "proc-macros.S"
+
+/* Generic V7M read/write macros for memory mapped cache operations */
+.macro v7m_cache_read, rt, reg
+       movw    \rt, #:lower16:BASEADDR_V7M_SCB + \reg
+       movt    \rt, #:upper16:BASEADDR_V7M_SCB + \reg
+       ldr     \rt, [\rt]
+.endm
+
+.macro v7m_cacheop, rt, tmp, op, c = al
+       movw\c  \tmp, #:lower16:BASEADDR_V7M_SCB + \op
+       movt\c  \tmp, #:upper16:BASEADDR_V7M_SCB + \op
+       str\c   \rt, [\tmp]
+.endm
+
+
+.macro read_ccsidr, rt
+       v7m_cache_read \rt, V7M_SCB_CCSIDR
+.endm
+
+.macro read_clidr, rt
+       v7m_cache_read \rt, V7M_SCB_CLIDR
+.endm
+
+.macro write_csselr, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_CSSELR
+.endm
+
+/*
+ * dcisw: Invalidate data cache by set/way
+ */
+.macro dcisw, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCISW
+.endm
+
+/*
+ * dccisw: Clean and invalidate data cache by set/way
+ */
+.macro dccisw, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCCISW
+.endm
+
+/*
+ * dccimvac: Clean and invalidate data cache line by MVA to PoC.
+ */
+.irp    c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+.macro dccimvac\c, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCCIMVAC, \c
+.endm
+.endr
+
+/*
+ * dcimvac: Invalidate data cache line by MVA to PoC
+ */
+.macro dcimvac, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCIMVAC
+.endm
+
+/*
+ * dccmvau: Clean data cache line by MVA to PoU
+ */
+.macro dccmvau, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCCMVAU
+.endm
+
+/*
+ * dccmvac: Clean data cache line by MVA to PoC
+ */
+.macro dccmvac,  rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_DCCMVAC
+.endm
+
+/*
+ * icimvau: Invalidate instruction caches by MVA to PoU
+ */
+.macro icimvau, rt, tmp
+       v7m_cacheop \rt, \tmp, V7M_SCB_ICIMVAU
+.endm
+
+/*
+ * Invalidate the icache, inner shareable if SMP, invalidate BTB for UP.
+ * rt data ignored by ICIALLU(IS), so can be used for the address
+ */
+.macro invalidate_icache, rt
+       v7m_cacheop \rt, \rt, V7M_SCB_ICIALLU
+       mov \rt, #0
+.endm
+
+/*
+ * Invalidate the BTB, inner shareable if SMP.
+ * rt data ignored by BPIALL, so it can be used for the address
+ */
+.macro invalidate_bp, rt
+       v7m_cacheop \rt, \rt, V7M_SCB_BPIALL
+       mov \rt, #0
+.endm
+
+ENTRY(v7m_invalidate_l1)
+       mov     r0, #0
+
+       write_csselr r0, r1
+       read_ccsidr r0
+
+       movw    r1, #0x7fff
+       and     r2, r1, r0, lsr #13
+
+       movw    r1, #0x3ff
+
+       and     r3, r1, r0, lsr #3      @ NumWays - 1
+       add     r2, r2, #1              @ NumSets
+
+       and     r0, r0, #0x7
+       add     r0, r0, #4      @ SetShift
+
+       clz     r1, r3          @ WayShift
+       add     r4, r3, #1      @ NumWays
+1:     sub     r2, r2, #1      @ NumSets--
+       mov     r3, r4          @ Temp = NumWays
+2:     subs    r3, r3, #1      @ Temp--
+       mov     r5, r3, lsl r1
+       mov     r6, r2, lsl r0
+       orr     r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
+       dcisw   r5, r6
+       bgt     2b
+       cmp     r2, #0
+       bgt     1b
+       dsb     st
+       isb
+       ret     lr
+ENDPROC(v7m_invalidate_l1)
+
+/*
+ *     v7m_flush_icache_all()
+ *
+ *     Flush the whole I-cache.
+ *
+ *     Registers:
+ *     r0 - set to 0
+ */
+ENTRY(v7m_flush_icache_all)
+       invalidate_icache r0
+       ret     lr
+ENDPROC(v7m_flush_icache_all)
+
+/*
+ *     v7m_flush_dcache_all()
+ *
+ *     Flush the whole D-cache.
+ *
+ *     Corrupted registers: r0-r7, r9-r11
+ */
+ENTRY(v7m_flush_dcache_all)
+       dmb                                     @ ensure ordering with previous memory accesses
+       read_clidr r0
+       mov     r3, r0, lsr #23                 @ move LoC into position
+       ands    r3, r3, #7 << 1                 @ extract LoC*2 from clidr
+       beq     finished                        @ if loc is 0, then no need to clean
+start_flush_levels:
+       mov     r10, #0                         @ start clean at cache level 0
+flush_levels:
+       add     r2, r10, r10, lsr #1            @ work out 3x current cache level
+       mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
+       and     r1, r1, #7                      @ mask of the bits for current cache only
+       cmp     r1, #2                          @ see what cache we have at this level
+       blt     skip                            @ skip if no cache, or just i-cache
+#ifdef CONFIG_PREEMPT
+       save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
+#endif
+       write_csselr r10, r1                    @ set current cache level
+       isb                                     @ isb to sync the new cssr&csidr
+       read_ccsidr r1                          @ read the new csidr
+#ifdef CONFIG_PREEMPT
+       restore_irqs_notrace r9
+#endif
+       and     r2, r1, #7                      @ extract the length of the cache lines
+       add     r2, r2, #4                      @ add 4 (line length offset)
+       movw    r4, #0x3ff
+       ands    r4, r4, r1, lsr #3              @ find maximum way index (NumWays - 1)
+       clz     r5, r4                          @ find bit position of way size increment
+       movw    r7, #0x7fff
+       ands    r7, r7, r1, lsr #13             @ extract maximum set index (NumSets - 1)
+loop1:
+       mov     r9, r7                          @ create working copy of max index
+loop2:
+       lsl     r6, r4, r5
+       orr     r11, r10, r6                    @ factor way and cache number into r11
+       lsl     r6, r9, r2
+       orr     r11, r11, r6                    @ factor index number into r11
+       dccisw  r11, r6                         @ clean/invalidate by set/way
+       subs    r9, r9, #1                      @ decrement the index
+       bge     loop2
+       subs    r4, r4, #1                      @ decrement the way
+       bge     loop1
+skip:
+       add     r10, r10, #2                    @ increment cache number
+       cmp     r3, r10
+       bgt     flush_levels
+finished:
+       mov     r10, #0                         @ switch back to cache level 0
+       write_csselr r10, r3                    @ select current cache level in cssr
+       dsb     st
+       isb
+       ret     lr
+ENDPROC(v7m_flush_dcache_all)
+
+/*
+ *     v7m_flush_kern_cache_all()
+ *
+ *     Flush the entire cache system.
+ *  The data cache flush is now achieved using atomic clean / invalidates
+ *  working outwards from L1 cache. This is done using Set/Way based cache
+ *  maintenance instructions.
+ *  The instruction cache can still be invalidated back to the point of
+ *  unification in a single instruction.
+ *
+ */
+ENTRY(v7m_flush_kern_cache_all)
+       stmfd   sp!, {r4-r7, r9-r11, lr}
+       bl      v7m_flush_dcache_all
+       invalidate_icache r0
+       ldmfd   sp!, {r4-r7, r9-r11, lr}
+       ret     lr
+ENDPROC(v7m_flush_kern_cache_all)
+
+/*
+ *     v7m_flush_user_cache_all()
+ *
+ *     Flush all cache entries in a particular address space
+ *
+ *     - mm    - mm_struct describing address space
+ */
+ENTRY(v7m_flush_user_cache_all)
+       /*FALLTHROUGH*/
+
+/*
+ *     v7m_flush_user_cache_range(start, end, flags)
+ *
+ *     Flush a range of cache entries in the specified address space.
+ *
+ *     - start - start address (may not be aligned)
+ *     - end   - end address (exclusive, may not be aligned)
+ *     - flags - vm_area_struct flags describing address space
+ *
+ *     It is assumed that:
+ *     - we have a VIPT cache.
+ */
+ENTRY(v7m_flush_user_cache_range)
+       ret     lr
+ENDPROC(v7m_flush_user_cache_all)
+ENDPROC(v7m_flush_user_cache_range)
+
+/*
+ *     v7m_coherent_kern_range(start,end)
+ *
+ *     Ensure that the I and D caches are coherent within specified
+ *     region.  This is typically used when code has been written to
+ *     a memory region, and will be executed.
+ *
+ *     - start   - virtual start address of region
+ *     - end     - virtual end address of region
+ *
+ *     It is assumed that:
+ *     - the Icache does not read data from the write buffer
+ */
+ENTRY(v7m_coherent_kern_range)
+       /* FALLTHROUGH */
+
+/*
+ *     v7m_coherent_user_range(start,end)
+ *
+ *     Ensure that the I and D caches are coherent within specified
+ *     region.  This is typically used when code has been written to
+ *     a memory region, and will be executed.
+ *
+ *     - start   - virtual start address of region
+ *     - end     - virtual end address of region
+ *
+ *     It is assumed that:
+ *     - the Icache does not read data from the write buffer
+ */
+ENTRY(v7m_coherent_user_range)
+ UNWIND(.fnstart               )
+       dcache_line_size r2, r3
+       sub     r3, r2, #1
+       bic     r12, r0, r3
+1:
+/*
+ * We use an open-coded version of dccmvau here, otherwise USER() would
+ * point at the movw instruction.
+ */
+       dccmvau r12, r3
+       add     r12, r12, r2
+       cmp     r12, r1
+       blo     1b
+       dsb     ishst
+       icache_line_size r2, r3
+       sub     r3, r2, #1
+       bic     r12, r0, r3
+2:
+       icimvau r12, r3
+       add     r12, r12, r2
+       cmp     r12, r1
+       blo     2b
+       invalidate_bp r0
+       dsb     ishst
+       isb
+       ret     lr
+ UNWIND(.fnend         )
+ENDPROC(v7m_coherent_kern_range)
+ENDPROC(v7m_coherent_user_range)
+
+/*
+ *     v7m_flush_kern_dcache_area(void *addr, size_t size)
+ *
+ *     Ensure that the data held in the page kaddr is written back
+ *     to the page in question.
+ *
+ *     - addr  - kernel address
+ *     - size  - region size
+ */
+ENTRY(v7m_flush_kern_dcache_area)
+       dcache_line_size r2, r3
+       add     r1, r0, r1
+       sub     r3, r2, #1
+       bic     r0, r0, r3
+1:
+       dccimvac r0, r3         @ clean & invalidate D line / unified line
+       add     r0, r0, r2
+       cmp     r0, r1
+       blo     1b
+       dsb     st
+       ret     lr
+ENDPROC(v7m_flush_kern_dcache_area)
+
+/*
+ *     v7m_dma_inv_range(start,end)
+ *
+ *     Invalidate the data cache within the specified region; we will
+ *     be performing a DMA operation in this region and we want to
+ *     purge old data in the cache.
+ *
+ *     - start   - virtual start address of region
+ *     - end     - virtual end address of region
+ */
+v7m_dma_inv_range:
+       dcache_line_size r2, r3
+       sub     r3, r2, #1
+       tst     r0, r3
+       bic     r0, r0, r3
+       dccimvacne r0, r3
+       subne   r3, r2, #1      @ restore r3, corrupted by v7m's dccimvac
+       tst     r1, r3
+       bic     r1, r1, r3
+       dccimvacne r1, r3
+1:
+       dcimvac r0, r3
+       add     r0, r0, r2
+       cmp     r0, r1
+       blo     1b
+       dsb     st
+       ret     lr
+ENDPROC(v7m_dma_inv_range)
+
+/*
+ *     v7m_dma_clean_range(start,end)
+ *     - start   - virtual start address of region
+ *     - end     - virtual end address of region
+ */
+v7m_dma_clean_range:
+       dcache_line_size r2, r3
+       sub     r3, r2, #1
+       bic     r0, r0, r3
+1:
+       dccmvac r0, r3                  @ clean D / U line
+       add     r0, r0, r2
+       cmp     r0, r1
+       blo     1b
+       dsb     st
+       ret     lr
+ENDPROC(v7m_dma_clean_range)
+
+/*
+ *     v7m_dma_flush_range(start,end)
+ *     - start   - virtual start address of region
+ *     - end     - virtual end address of region
+ */
+ENTRY(v7m_dma_flush_range)
+       dcache_line_size r2, r3
+       sub     r3, r2, #1
+       bic     r0, r0, r3
+1:
+       dccimvac r0, r3                  @ clean & invalidate D / U line
+       add     r0, r0, r2
+       cmp     r0, r1
+       blo     1b
+       dsb     st
+       ret     lr
+ENDPROC(v7m_dma_flush_range)
+
+/*
+ *     dma_map_area(start, size, dir)
+ *     - start - kernel virtual start address
+ *     - size  - size of region
+ *     - dir   - DMA direction
+ */
+ENTRY(v7m_dma_map_area)
+       add     r1, r1, r0
+       teq     r2, #DMA_FROM_DEVICE
+       beq     v7m_dma_inv_range
+       b       v7m_dma_clean_range
+ENDPROC(v7m_dma_map_area)
+
+/*
+ *     dma_unmap_area(start, size, dir)
+ *     - start - kernel virtual start address
+ *     - size  - size of region
+ *     - dir   - DMA direction
+ */
+ENTRY(v7m_dma_unmap_area)
+       add     r1, r1, r0
+       teq     r2, #DMA_TO_DEVICE
+       bne     v7m_dma_inv_range
+       ret     lr
+ENDPROC(v7m_dma_unmap_area)
+
+       .globl  v7m_flush_kern_cache_louis
+       .equ    v7m_flush_kern_cache_louis, v7m_flush_kern_cache_all
+
+       __INITDATA
+
+       @ define struct cpu_cache_fns (see <asm/cacheflush.h> and proc-macros.S)
+       define_cache_functions v7m
index c6834c0..a2302ab 100644 (file)
@@ -436,7 +436,7 @@ static int __init atomic_pool_init(void)
                gen_pool_set_algo(atomic_pool,
                                gen_pool_first_fit_order_align,
                                (void *)PAGE_SHIFT);
-               pr_info("DMA: preallocated %zd KiB pool for atomic coherent allocations\n",
+               pr_info("DMA: preallocated %zu KiB pool for atomic coherent allocations\n",
                       atomic_pool_size / 1024);
                return 0;
        }
@@ -445,7 +445,7 @@ destroy_genpool:
        gen_pool_destroy(atomic_pool);
        atomic_pool = NULL;
 out:
-       pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n",
+       pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
               atomic_pool_size / 1024);
        return -ENOMEM;
 }
index 30fe03f..4001dd1 100644 (file)
@@ -243,7 +243,7 @@ __setup("noalign", noalign_setup);
 #define PROT_PTE_S2_DEVICE     PROT_PTE_DEVICE
 #define PROT_SECT_DEVICE       PMD_TYPE_SECT|PMD_SECT_AP_WRITE
 
-static struct mem_type mem_types[] = {
+static struct mem_type mem_types[] __ro_after_init = {
        [MT_DEVICE] = {           /* Strongly ordered / ARMv6 shared device */
                .prot_pte       = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
                                  L_PTE_SHARED,
index c671f34..0d40c28 100644 (file)
@@ -7,6 +7,10 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 
+#ifdef CONFIG_CPU_V7M
+#include <asm/v7m.h>
+#endif
+
 /*
  * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
  */
  * on ARMv7.
  */
        .macro  dcache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+       movw    \tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+       movt    \tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+       ldr     \tmp, [\tmp]
+#else
        mrc     p15, 0, \tmp, c0, c0, 1         @ read ctr
+#endif
        lsr     \tmp, \tmp, #16
        and     \tmp, \tmp, #0xf                @ cache line size encoding
        mov     \reg, #4                        @ bytes per word
  * on ARMv7.
  */
        .macro  icache_line_size, reg, tmp
+#ifdef CONFIG_CPU_V7M
+       movw    \tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+       movt    \tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR
+       ldr     \tmp, [\tmp]
+#else
        mrc     p15, 0, \tmp, c0, c0, 1         @ read ctr
+#endif
        and     \tmp, \tmp, #0xf                @ cache line size encoding
        mov     \reg, #4                        @ bytes per word
        mov     \reg, \reg, lsl \tmp            @ actual cache line size
index 7229d8d..f6d333f 100644 (file)
@@ -74,14 +74,42 @@ ENTRY(cpu_v7m_do_resume)
 ENDPROC(cpu_v7m_do_resume)
 #endif
 
+ENTRY(cpu_cm7_dcache_clean_area)
+       dcache_line_size r2, r3
+       movw    r3, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_DCCMVAC
+       movt    r3, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_DCCMVAC
+
+1:     str     r0, [r3]                @ clean D entry
+       add     r0, r0, r2
+       subs    r1, r1, r2
+       bhi     1b
+       dsb
+       ret     lr
+ENDPROC(cpu_cm7_dcache_clean_area)
+
+ENTRY(cpu_cm7_proc_fin)
+       movw    r2, #:lower16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+       movt    r2, #:upper16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+       ldr     r0, [r2]
+       bic     r0, r0, #(V7M_SCB_CCR_DC | V7M_SCB_CCR_IC)
+       str     r0, [r2]
+       ret     lr
+ENDPROC(cpu_cm7_proc_fin)
+
        .section ".text.init", #alloc, #execinstr
 
+__v7m_cm7_setup:
+       mov     r8, #(V7M_SCB_CCR_DC | V7M_SCB_CCR_IC | V7M_SCB_CCR_BP)
+       b       __v7m_setup_cont
 /*
  *     __v7m_setup
  *
  *     This should be able to cover all ARMv7-M cores.
  */
 __v7m_setup:
+       mov     r8, #0
+
+__v7m_setup_cont:
        @ Configure the vector table base address
        ldr     r0, =BASEADDR_V7M_SCB
        ldr     r12, =vector_table
@@ -104,6 +132,7 @@ __v7m_setup:
        badr    r1, 1f
        ldr     r5, [r12, #11 * 4]      @ read the SVC vector entry
        str     r1, [r12, #11 * 4]      @ write the temporary SVC vector entry
+       dsb
        mov     r6, lr                  @ save LR
        ldr     sp, =init_thread_union + THREAD_START_SP
        cpsie   i
@@ -116,15 +145,32 @@ __v7m_setup:
        mov     r1, #1
        msr     control, r1             @ Thread mode has unpriviledged access
 
+       @ Configure caches (if implemented)
+       teq     r8, #0
+       stmneia r12, {r0-r6, lr}        @ v7m_invalidate_l1 touches r0-r6
+       blne    v7m_invalidate_l1
+       teq     r8, #0                  @ re-evaluate condition
+       ldmneia r12, {r0-r6, lr}
+
        @ Configure the System Control Register to ensure 8-byte stack alignment
        @ Note the STKALIGN bit is either RW or RAO.
-       ldr     r12, [r0, V7M_SCB_CCR]  @ system control register
-       orr     r12, #V7M_SCB_CCR_STKALIGN
-       str     r12, [r0, V7M_SCB_CCR]
+       ldr     r0, [r0, V7M_SCB_CCR]   @ system control register
+       orr     r0, #V7M_SCB_CCR_STKALIGN
+       orr     r0, r0, r8
+
        ret     lr
 ENDPROC(__v7m_setup)
 
+/*
+ * Cortex-M7 processor functions
+ */
+       globl_equ       cpu_cm7_proc_init,      cpu_v7m_proc_init
+       globl_equ       cpu_cm7_reset,          cpu_v7m_reset
+       globl_equ       cpu_cm7_do_idle,        cpu_v7m_do_idle
+       globl_equ       cpu_cm7_switch_mm,      cpu_v7m_switch_mm
+
        define_processor_functions v7m, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1
+       define_processor_functions cm7, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1
 
        .section ".rodata"
        string cpu_arch_name, "armv7m"
@@ -133,6 +179,50 @@ ENDPROC(__v7m_setup)
 
        .section ".proc.info.init", #alloc
 
+.macro __v7m_proc name, initfunc, cache_fns = nop_cache_fns, hwcaps = 0,  proc_fns = v7m_processor_functions
+       .long   0                       /* proc_info_list.__cpu_mm_mmu_flags */
+       .long   0                       /* proc_info_list.__cpu_io_mmu_flags */
+       initfn  \initfunc, \name
+       .long   cpu_arch_name
+       .long   cpu_elf_name
+       .long   HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \hwcaps
+       .long   cpu_v7m_name
+       .long   \proc_fns
+       .long   0                       /* proc_info_list.tlb */
+       .long   0                       /* proc_info_list.user */
+       .long   \cache_fns
+.endm
+
+       /*
+        * Match ARM Cortex-M7 processor.
+        */
+       .type   __v7m_cm7_proc_info, #object
+__v7m_cm7_proc_info:
+       .long   0x410fc270              /* ARM Cortex-M7 0xC27 */
+       .long   0xff0ffff0              /* Mask off revision, patch release */
+       __v7m_proc __v7m_cm7_proc_info, __v7m_cm7_setup, hwcaps = HWCAP_EDSP, cache_fns = v7m_cache_fns, proc_fns = cm7_processor_functions
+       .size   __v7m_cm7_proc_info, . - __v7m_cm7_proc_info
+
+       /*
+        * Match ARM Cortex-M4 processor.
+        */
+       .type   __v7m_cm4_proc_info, #object
+__v7m_cm4_proc_info:
+       .long   0x410fc240              /* ARM Cortex-M4 0xC24 */
+       .long   0xff0ffff0              /* Mask off revision, patch release */
+       __v7m_proc __v7m_cm4_proc_info, __v7m_setup, hwcaps = HWCAP_EDSP
+       .size   __v7m_cm4_proc_info, . - __v7m_cm4_proc_info
+
+       /*
+        * Match ARM Cortex-M3 processor.
+        */
+       .type   __v7m_cm3_proc_info, #object
+__v7m_cm3_proc_info:
+       .long   0x410fc230              /* ARM Cortex-M3 0xC23 */
+       .long   0xff0ffff0              /* Mask off revision, patch release */
+       __v7m_proc __v7m_cm3_proc_info, __v7m_setup
+       .size   __v7m_cm3_proc_info, . - __v7m_cm3_proc_info
+
        /*
         * Match any ARMv7-M processor core.
         */
@@ -140,16 +230,6 @@ ENDPROC(__v7m_setup)
 __v7m_proc_info:
        .long   0x000f0000              @ Required ID value
        .long   0x000f0000              @ Mask for ID
-       .long   0                       @ proc_info_list.__cpu_mm_mmu_flags
-       .long   0                       @ proc_info_list.__cpu_io_mmu_flags
-       initfn  __v7m_setup, __v7m_proc_info    @ proc_info_list.__cpu_flush
-       .long   cpu_arch_name
-       .long   cpu_elf_name
-       .long   HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT
-       .long   cpu_v7m_name
-       .long   v7m_processor_functions @ proc_info_list.proc
-       .long   0                       @ proc_info_list.tlb
-       .long   0                       @ proc_info_list.user
-       .long   nop_cache_fns           @ proc_info_list.cache
+       __v7m_proc __v7m_proc_info, __v7m_setup
        .size   __v7m_proc_info, . - __v7m_proc_info
 
index fc2a0cb..f8ae6d6 100644 (file)
 #include <linux/stringify.h>
 #include <asm/barrier.h>
 
+#define read_gicreg(r)                                                 \
+       ({                                                              \
+               u64 reg;                                                \
+               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+               reg;                                                    \
+       })
+
+#define write_gicreg(v,r)                                              \
+       do {                                                            \
+               u64 __val = (v);                                        \
+               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+       } while (0)
+
 /*
  * Low-level accessors
  *
index 4b5c977..2a2752b 100644 (file)
@@ -50,7 +50,7 @@
 #define HCR_BSU                (3 << 10)
 #define HCR_BSU_IS     (UL(1) << 10)
 #define HCR_FB         (UL(1) << 9)
-#define HCR_VA         (UL(1) << 8)
+#define HCR_VSE                (UL(1) << 8)
 #define HCR_VI         (UL(1) << 7)
 #define HCR_VF         (UL(1) << 6)
 #define HCR_AMO                (UL(1) << 5)
@@ -80,7 +80,7 @@
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
                         HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
                         HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
 #define HCR_INT_OVERRIDE   (HCR_FMO | HCR_IMO)
 #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
 
index 7561f63..18f7465 100644 (file)
 
 #include <asm/virt.h>
 
+#define ARM_EXIT_WITH_SERROR_BIT  31
+#define ARM_EXCEPTION_CODE(x)    ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
+#define ARM_SERROR_PENDING(x)    !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT))
+
 #define ARM_EXCEPTION_IRQ        0
-#define ARM_EXCEPTION_TRAP       1
+#define ARM_EXCEPTION_EL1_SERROR  1
+#define ARM_EXCEPTION_TRAP       2
 /* The hyp-stub will return this for any kvm_call_hyp() call */
-#define ARM_EXCEPTION_HYP_GONE   2
+#define ARM_EXCEPTION_HYP_GONE   3
 
 #define KVM_ARM64_DEBUG_DIRTY_SHIFT    0
 #define KVM_ARM64_DEBUG_DIRTY          (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
index 4cdeae3..fd9d5fd 100644 (file)
@@ -38,6 +38,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
 void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
 
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
@@ -147,6 +148,16 @@ static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)
        return vcpu->arch.fault.esr_el2;
 }
 
+static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
+{
+       u32 esr = kvm_vcpu_get_hsr(vcpu);
+
+       if (esr & ESR_ELx_CV)
+               return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
+
+       return -1;
+}
+
 static inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.fault.far_el2;
index 3eda975..bd94e67 100644 (file)
@@ -290,15 +290,15 @@ struct kvm_vcpu_arch {
 #endif
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 hvc_exit_stat;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 hvc_exit_stat;
        u64 wfe_exit_stat;
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
index cff5105..b18e852 100644 (file)
@@ -123,6 +123,7 @@ typeof(orig) * __hyp_text fname(void)                                       \
 
 void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
index dff1098..a79b969 100644 (file)
@@ -162,12 +162,6 @@ void kvm_clear_hyp_idmap(void);
 #define        kvm_set_pte(ptep, pte)          set_pte(ptep, pte)
 #define        kvm_set_pmd(pmdp, pmd)          set_pmd(pmdp, pmd)
 
-static inline void kvm_clean_pgd(pgd_t *pgd) {}
-static inline void kvm_clean_pmd(pmd_t *pmd) {}
-static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
-static inline void kvm_clean_pte(pte_t *pte) {}
-static inline void kvm_clean_pte_entry(pte_t *pte) {}
-
 static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
 {
        pte_val(pte) |= PTE_S2_RDWR;
index 9c9edc9..6eaf12c 100644 (file)
@@ -16,7 +16,7 @@ menuconfig VIRTUALIZATION
 
 if VIRTUALIZATION
 
-config KVM_ARM_VGIC_V3
+config KVM_ARM_VGIC_V3_ITS
        bool
 
 config KVM
@@ -34,7 +34,7 @@ config KVM
        select KVM_VFIO
        select HAVE_KVM_EVENTFD
        select HAVE_KVM_IRQFD
-       select KVM_ARM_VGIC_V3
+       select KVM_ARM_VGIC_V3_ITS
        select KVM_ARM_PMU if HW_PERF_EVENTS
        select HAVE_KVM_MSI
        select HAVE_KVM_IRQCHIP
index 695eb3c..d50a82a 100644 (file)
@@ -16,9 +16,10 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
+kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/emulate.c
deleted file mode 100644 (file)
index f87d8fb..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/kvm_host.h>
-#include <asm/esr.h>
-#include <asm/kvm_emulate.h>
-
-/*
- * stolen from arch/arm/kernel/opcodes.c
- *
- * condition code lookup table
- * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
- *
- * bit position in short is condition code: NZCV
- */
-static const unsigned short cc_map[16] = {
-       0xF0F0,                 /* EQ == Z set            */
-       0x0F0F,                 /* NE                     */
-       0xCCCC,                 /* CS == C set            */
-       0x3333,                 /* CC                     */
-       0xFF00,                 /* MI == N set            */
-       0x00FF,                 /* PL                     */
-       0xAAAA,                 /* VS == V set            */
-       0x5555,                 /* VC                     */
-       0x0C0C,                 /* HI == C set && Z clear */
-       0xF3F3,                 /* LS == C clear || Z set */
-       0xAA55,                 /* GE == (N==V)           */
-       0x55AA,                 /* LT == (N!=V)           */
-       0x0A05,                 /* GT == (!Z && (N==V))   */
-       0xF5FA,                 /* LE == (Z || (N!=V))    */
-       0xFFFF,                 /* AL always              */
-       0                       /* NV                     */
-};
-
-static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
-{
-       u32 esr = kvm_vcpu_get_hsr(vcpu);
-
-       if (esr & ESR_ELx_CV)
-               return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
-
-       return -1;
-}
-
-/*
- * Check if a trapped instruction should have been executed or not.
- */
-bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
-{
-       unsigned long cpsr;
-       u32 cpsr_cond;
-       int cond;
-
-       /* Top two bits non-zero?  Unconditional. */
-       if (kvm_vcpu_get_hsr(vcpu) >> 30)
-               return true;
-
-       /* Is condition field valid? */
-       cond = kvm_vcpu_get_condition(vcpu);
-       if (cond == 0xE)
-               return true;
-
-       cpsr = *vcpu_cpsr(vcpu);
-
-       if (cond < 0) {
-               /* This can happen in Thumb mode: examine IT state. */
-               unsigned long it;
-
-               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-               /* it == 0 => unconditional. */
-               if (it == 0)
-                       return true;
-
-               /* The cond for this insn works out as the top 4 bits. */
-               cond = (it >> 4);
-       }
-
-       cpsr_cond = cpsr >> 28;
-
-       if (!((cc_map[cond] >> cpsr_cond) & 1))
-               return false;
-
-       return true;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:      The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-       unsigned long itbits, cond;
-       unsigned long cpsr = *vcpu_cpsr(vcpu);
-       bool is_arm = !(cpsr & COMPAT_PSR_T_BIT);
-
-       BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK));
-
-       if (!(cpsr & COMPAT_PSR_IT_MASK))
-               return;
-
-       cond = (cpsr & 0xe000) >> 13;
-       itbits = (cpsr & 0x1c00) >> (10 - 2);
-       itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
-       if ((itbits & 0x7) == 0)
-               itbits = cond = 0;
-       else
-               itbits = (itbits << 1) & 0x1f;
-
-       cpsr &= ~COMPAT_PSR_IT_MASK;
-       cpsr |= cond << 13;
-       cpsr |= (itbits & 0x1c) << (10 - 2);
-       cpsr |= (itbits & 0x3) << 25;
-       *vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-       bool is_thumb;
-
-       is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT);
-       if (is_thumb && !is_wide_instr)
-               *vcpu_pc(vcpu) += 2;
-       else
-               *vcpu_pc(vcpu) += 4;
-       kvm_adjust_itstate(vcpu);
-}
index fa96fe2..a204adf 100644 (file)
@@ -170,9 +170,32 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 {
        exit_handle_fn exit_handler;
 
+       if (ARM_SERROR_PENDING(exception_index)) {
+               u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu));
+
+               /*
+                * HVC/SMC already have an adjusted PC, which we need
+                * to correct in order to return to after having
+                * injected the SError.
+                */
+               if (hsr_ec == ESR_ELx_EC_HVC32 || hsr_ec == ESR_ELx_EC_HVC64 ||
+                   hsr_ec == ESR_ELx_EC_SMC32 || hsr_ec == ESR_ELx_EC_SMC64) {
+                       u32 adj =  kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2;
+                       *vcpu_pc(vcpu) -= adj;
+               }
+
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
+
+       exception_index = ARM_EXCEPTION_CODE(exception_index);
+
        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
                return 1;
+       case ARM_EXCEPTION_EL1_SERROR:
+               kvm_inject_vabt(vcpu);
+               return 1;
        case ARM_EXCEPTION_TRAP:
                /*
                 * See ARM ARM B1.14.1: "Hyp traps on instructions
index 0c85feb..aaf42ae 100644 (file)
@@ -5,9 +5,9 @@
 KVM=../../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
 
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
index 33342a7..4ba5c90 100644 (file)
@@ -131,9 +131,7 @@ void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
                vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
 }
 
-static u32 __hyp_text __debug_read_mdcr_el2(void)
+u32 __hyp_text __kvm_get_mdcr_el2(void)
 {
        return read_sysreg(mdcr_el2);
 }
-
-__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void);
index ce9e5e5..12ee62d 100644 (file)
  */
 ENTRY(__guest_enter)
        // x0: vcpu
-       // x1: host/guest context
-       // x2-x18: clobbered by macros
+       // x1: host context
+       // x2-x17: clobbered by macros
+       // x18: guest context
 
        // Store the host regs
        save_callee_saved_regs x1
 
-       // Preserve vcpu & host_ctxt for use at exit time
-       stp     x0, x1, [sp, #-16]!
+       // Store the host_ctxt for use at exit time
+       str     x1, [sp, #-16]!
 
-       add     x1, x0, #VCPU_CONTEXT
+       add     x18, x0, #VCPU_CONTEXT
 
-       // Prepare x0-x1 for later restore by pushing them onto the stack
-       ldp     x2, x3, [x1, #CPU_XREG_OFFSET(0)]
-       stp     x2, x3, [sp, #-16]!
+       // Restore guest regs x0-x17
+       ldp     x0, x1,   [x18, #CPU_XREG_OFFSET(0)]
+       ldp     x2, x3,   [x18, #CPU_XREG_OFFSET(2)]
+       ldp     x4, x5,   [x18, #CPU_XREG_OFFSET(4)]
+       ldp     x6, x7,   [x18, #CPU_XREG_OFFSET(6)]
+       ldp     x8, x9,   [x18, #CPU_XREG_OFFSET(8)]
+       ldp     x10, x11, [x18, #CPU_XREG_OFFSET(10)]
+       ldp     x12, x13, [x18, #CPU_XREG_OFFSET(12)]
+       ldp     x14, x15, [x18, #CPU_XREG_OFFSET(14)]
+       ldp     x16, x17, [x18, #CPU_XREG_OFFSET(16)]
 
-       // x2-x18
-       ldp     x2, x3,   [x1, #CPU_XREG_OFFSET(2)]
-       ldp     x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
-       ldp     x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
-       ldp     x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
-       ldp     x10, x11, [x1, #CPU_XREG_OFFSET(10)]
-       ldp     x12, x13, [x1, #CPU_XREG_OFFSET(12)]
-       ldp     x14, x15, [x1, #CPU_XREG_OFFSET(14)]
-       ldp     x16, x17, [x1, #CPU_XREG_OFFSET(16)]
-       ldr     x18,      [x1, #CPU_XREG_OFFSET(18)]
-
-       // x19-x29, lr
-       restore_callee_saved_regs x1
-
-       // Last bits of the 64bit state
-       ldp     x0, x1, [sp], #16
+       // Restore guest regs x19-x29, lr
+       restore_callee_saved_regs x18
+
+       // Restore guest reg x18
+       ldr     x18,      [x18, #CPU_XREG_OFFSET(18)]
 
        // Do not touch any register after this!
        eret
 ENDPROC(__guest_enter)
 
 ENTRY(__guest_exit)
-       // x0: vcpu
-       // x1: return code
-       // x2-x3: free
-       // x4-x29,lr: vcpu regs
-       // vcpu x0-x3 on the stack
+       // x0: return code
+       // x1: vcpu
+       // x2-x29,lr: vcpu regs
+       // vcpu x0-x1 on the stack
 
-       add     x2, x0, #VCPU_CONTEXT
+       add     x1, x1, #VCPU_CONTEXT
 
-       stp     x4, x5,   [x2, #CPU_XREG_OFFSET(4)]
-       stp     x6, x7,   [x2, #CPU_XREG_OFFSET(6)]
-       stp     x8, x9,   [x2, #CPU_XREG_OFFSET(8)]
-       stp     x10, x11, [x2, #CPU_XREG_OFFSET(10)]
-       stp     x12, x13, [x2, #CPU_XREG_OFFSET(12)]
-       stp     x14, x15, [x2, #CPU_XREG_OFFSET(14)]
-       stp     x16, x17, [x2, #CPU_XREG_OFFSET(16)]
-       str     x18,      [x2, #CPU_XREG_OFFSET(18)]
+       ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 
-       ldp     x6, x7, [sp], #16       // x2, x3
-       ldp     x4, x5, [sp], #16       // x0, x1
+       // Store the guest regs x2 and x3
+       stp     x2, x3,   [x1, #CPU_XREG_OFFSET(2)]
 
-       stp     x4, x5, [x2, #CPU_XREG_OFFSET(0)]
-       stp     x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+       // Retrieve the guest regs x0-x1 from the stack
+       ldp     x2, x3, [sp], #16       // x0, x1
+
+       // Store the guest regs x0-x1 and x4-x18
+       stp     x2, x3,   [x1, #CPU_XREG_OFFSET(0)]
+       stp     x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
+       stp     x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
+       stp     x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
+       stp     x10, x11, [x1, #CPU_XREG_OFFSET(10)]
+       stp     x12, x13, [x1, #CPU_XREG_OFFSET(12)]
+       stp     x14, x15, [x1, #CPU_XREG_OFFSET(14)]
+       stp     x16, x17, [x1, #CPU_XREG_OFFSET(16)]
+       str     x18,      [x1, #CPU_XREG_OFFSET(18)]
+
+       // Store the guest regs x19-x29, lr
+       save_callee_saved_regs x1
 
-       save_callee_saved_regs x2
+       // Restore the host_ctxt from the stack
+       ldr     x2, [sp], #16
 
-       // Restore vcpu & host_ctxt from the stack
-       // (preserving return code in x1)
-       ldp     x0, x2, [sp], #16
        // Now restore the host regs
        restore_callee_saved_regs x2
 
-       mov     x0, x1
-       ret
+       // If we have a pending asynchronous abort, now is the
+       // time to find out. From your VAXorcist book, page 666:
+       // "Threaten me not, oh Evil one!  For I speak with
+       // the power of DEC, and I command thee to show thyself!"
+       mrs     x2, elr_el2
+       mrs     x3, esr_el2
+       mrs     x4, spsr_el2
+       mov     x5, x0
+
+       dsb     sy              // Synchronize against in-flight ld/st
+       msr     daifclr, #4     // Unmask aborts
+
+       // This is our single instruction exception window. A pending
+       // SError is guaranteed to occur at the earliest when we unmask
+       // it, and at the latest just after the ISB.
+       .global abort_guest_exit_start
+abort_guest_exit_start:
+
+       isb
+
+       .global abort_guest_exit_end
+abort_guest_exit_end:
+
+       // If the exception took place, restore the EL1 exception
+       // context so that we can report some information.
+       // Merge the exception code with the SError pending bit.
+       tbz     x0, #ARM_EXIT_WITH_SERROR_BIT, 1f
+       msr     elr_el2, x2
+       msr     esr_el2, x3
+       msr     spsr_el2, x4
+       orr     x0, x0, x5
+1:     ret
 ENDPROC(__guest_exit)
 
 ENTRY(__fpsimd_guest_restore)
+       stp     x2, x3, [sp, #-16]!
        stp     x4, lr, [sp, #-16]!
 
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
index f6d9694..4e92399 100644 (file)
        .text
        .pushsection    .hyp.text, "ax"
 
-.macro save_x0_to_x3
-       stp     x0, x1, [sp, #-16]!
-       stp     x2, x3, [sp, #-16]!
-.endm
-
-.macro restore_x0_to_x3
-       ldp     x2, x3, [sp], #16
-       ldp     x0, x1, [sp], #16
-.endm
-
 .macro do_el2_call
        /*
         * Shuffle the parameters before calling the function
@@ -79,23 +69,23 @@ ENTRY(__kvm_hyp_teardown)
 ENDPROC(__kvm_hyp_teardown)
        
 el1_sync:                              // Guest trapped into EL2
-       save_x0_to_x3
+       stp     x0, x1, [sp, #-16]!
 
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
        mrs     x1, esr_el2
 alternative_else
        mrs     x1, esr_el1
 alternative_endif
-       lsr     x2, x1, #ESR_ELx_EC_SHIFT
+       lsr     x0, x1, #ESR_ELx_EC_SHIFT
 
-       cmp     x2, #ESR_ELx_EC_HVC64
+       cmp     x0, #ESR_ELx_EC_HVC64
        b.ne    el1_trap
 
-       mrs     x3, vttbr_el2           // If vttbr is valid, the 64bit guest
-       cbnz    x3, el1_trap            // called HVC
+       mrs     x1, vttbr_el2           // If vttbr is valid, the 64bit guest
+       cbnz    x1, el1_trap            // called HVC
 
        /* Here, we're pretty sure the host called HVC. */
-       restore_x0_to_x3
+       ldp     x0, x1, [sp], #16
 
        cmp     x0, #HVC_GET_VECTORS
        b.ne    1f
@@ -113,24 +103,51 @@ alternative_endif
 
 el1_trap:
        /*
-        * x1: ESR
-        * x2: ESR_EC
+        * x0: ESR_EC
         */
 
        /* Guest accessed VFP/SIMD registers, save host, restore Guest */
-       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       cmp     x0, #ESR_ELx_EC_FP_ASIMD
        b.eq    __fpsimd_guest_restore
 
-       mrs     x0, tpidr_el2
-       mov     x1, #ARM_EXCEPTION_TRAP
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_TRAP
        b       __guest_exit
 
 el1_irq:
-       save_x0_to_x3
-       mrs     x0, tpidr_el2
-       mov     x1, #ARM_EXCEPTION_IRQ
+       stp     x0, x1, [sp, #-16]!
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_IRQ
+       b       __guest_exit
+
+el1_error:
+       stp     x0, x1, [sp, #-16]!
+       mrs     x1, tpidr_el2
+       mov     x0, #ARM_EXCEPTION_EL1_SERROR
        b       __guest_exit
 
+el2_error:
+       /*
+        * Only two possibilities:
+        * 1) Either we come from the exit path, having just unmasked
+        *    PSTATE.A: change the return code to an EL2 fault, and
+        *    carry on, as we're already in a sane state to handle it.
+        * 2) Or we come from anywhere else, and that's a bug: we panic.
+        *
+        * For (1), x0 contains the original return code and x1 doesn't
+        * contain anything meaningful at that stage. We can reuse them
+        * as temp registers.
+        * For (2), who cares?
+        */
+       mrs     x0, elr_el2
+       adr     x1, abort_guest_exit_start
+       cmp     x0, x1
+       adr     x1, abort_guest_exit_end
+       ccmp    x0, x1, #4, ne
+       b.ne    __hyp_panic
+       mov     x0, #(1 << ARM_EXIT_WITH_SERROR_BIT)
+       eret
+
 ENTRY(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
@@ -155,11 +172,9 @@ ENDPROC(\label)
        invalid_vector  el2h_sync_invalid
        invalid_vector  el2h_irq_invalid
        invalid_vector  el2h_fiq_invalid
-       invalid_vector  el2h_error_invalid
        invalid_vector  el1_sync_invalid
        invalid_vector  el1_irq_invalid
        invalid_vector  el1_fiq_invalid
-       invalid_vector  el1_error_invalid
 
        .ltorg
 
@@ -174,15 +189,15 @@ ENTRY(__kvm_hyp_vector)
        ventry  el2h_sync_invalid               // Synchronous EL2h
        ventry  el2h_irq_invalid                // IRQ EL2h
        ventry  el2h_fiq_invalid                // FIQ EL2h
-       ventry  el2h_error_invalid              // Error EL2h
+       ventry  el2_error                       // Error EL2h
 
        ventry  el1_sync                        // Synchronous 64-bit EL1
        ventry  el1_irq                         // IRQ 64-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
-       ventry  el1_error_invalid               // Error 64-bit EL1
+       ventry  el1_error                       // Error 64-bit EL1
 
        ventry  el1_sync                        // Synchronous 32-bit EL1
        ventry  el1_irq                         // IRQ 32-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 32-bit EL1
-       ventry  el1_error_invalid               // Error 32-bit EL1
+       ventry  el1_error                       // Error 32-bit EL1
 ENDPROC(__kvm_hyp_vector)
index 5a84b45..83037cd 100644 (file)
  */
 
 #include <linux/types.h>
+#include <linux/jump_label.h>
+
 #include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 static bool __hyp_text __fpsimd_enabled_nvhe(void)
@@ -109,6 +112,15 @@ static hyp_alternate_select(__deactivate_traps_arch,
 
 static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
 {
+       /*
+        * If we pended a virtual abort, preserve it until it gets
+        * cleared. See D1.14.3 (Virtual Interrupts) for details, but
+        * the crucial bit is "On taking a vSError interrupt,
+        * HCR_EL2.VSE is cleared to 0."
+        */
+       if (vcpu->arch.hcr_el2 & HCR_VSE)
+               vcpu->arch.hcr_el2 = read_sysreg(hcr_el2);
+
        __deactivate_traps_arch()();
        write_sysreg(0, hstr_el2);
        write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
@@ -126,17 +138,13 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
        write_sysreg(0, vttbr_el2);
 }
 
-static hyp_alternate_select(__vgic_call_save_state,
-                           __vgic_v2_save_state, __vgic_v3_save_state,
-                           ARM64_HAS_SYSREG_GIC_CPUIF);
-
-static hyp_alternate_select(__vgic_call_restore_state,
-                           __vgic_v2_restore_state, __vgic_v3_restore_state,
-                           ARM64_HAS_SYSREG_GIC_CPUIF);
-
 static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
 {
-       __vgic_call_save_state()(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_save_state(vcpu);
+       else
+               __vgic_v2_save_state(vcpu);
+
        write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
 }
 
@@ -149,7 +157,10 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
        val |= vcpu->arch.irq_lines;
        write_sysreg(val, hcr_el2);
 
-       __vgic_call_restore_state()(vcpu);
+       if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+               __vgic_v3_restore_state(vcpu);
+       else
+               __vgic_v2_restore_state(vcpu);
 }
 
 static bool __hyp_text __true_value(void)
@@ -232,7 +243,22 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
        return true;
 }
 
-static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
+{
+       *vcpu_pc(vcpu) = read_sysreg_el2(elr);
+
+       if (vcpu_mode_is_32bit(vcpu)) {
+               vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr);
+               kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+               write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr);
+       } else {
+               *vcpu_pc(vcpu) += 4;
+       }
+
+       write_sysreg_el2(*vcpu_pc(vcpu), elr);
+}
+
+int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
@@ -267,9 +293,43 @@ again:
        exit_code = __guest_enter(vcpu, host_ctxt);
        /* And we're baaack! */
 
+       /*
+        * We're using the raw exception code in order to only process
+        * the trap if no SError is pending. We will come back to the
+        * same PC once the SError has been injected, and replay the
+        * trapping instruction.
+        */
        if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
                goto again;
 
+       if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+           exit_code == ARM_EXCEPTION_TRAP) {
+               bool valid;
+
+               valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
+                       kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT &&
+                       kvm_vcpu_dabt_isvalid(vcpu) &&
+                       !kvm_vcpu_dabt_isextabt(vcpu) &&
+                       !kvm_vcpu_dabt_iss1tw(vcpu);
+
+               if (valid) {
+                       int ret = __vgic_v2_perform_cpuif_access(vcpu);
+
+                       if (ret == 1) {
+                               __skip_instr(vcpu);
+                               goto again;
+                       }
+
+                       if (ret == -1) {
+                               /* Promote an illegal access to an SError */
+                               __skip_instr(vcpu);
+                               exit_code = ARM_EXCEPTION_EL1_SERROR;
+                       }
+
+                       /* 0 falls through to be handled out of EL2 */
+               }
+       }
+
        fp_enabled = __fpsimd_enabled();
 
        __sysreg_save_guest_state(guest_ctxt);
@@ -293,8 +353,6 @@ again:
        return exit_code;
 }
 
-__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
-
 static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
 
 static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
index be8177c..9cc0ea7 100644 (file)
@@ -17,7 +17,7 @@
 
 #include <asm/kvm_hyp.h>
 
-static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
        dsb(ishst);
 
@@ -48,10 +48,7 @@ static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
        write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
-                                                           phys_addr_t ipa);
-
-static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
 {
        dsb(ishst);
 
@@ -67,14 +64,10 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
        write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
-static void __hyp_text __tlb_flush_vm_context(void)
+void __hyp_text __kvm_flush_vm_context(void)
 {
        dsb(ishst);
        asm volatile("tlbi alle1is      \n"
                     "ic ialluis          ": : );
        dsb(ish);
 }
-
-__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
deleted file mode 100644 (file)
index 5f8f80b..0000000
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_hyp.h>
-
-#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
-#define vtr_to_nr_pri_bits(v)          (((u32)(v) >> 29) + 1)
-
-#define read_gicreg(r)                                                 \
-       ({                                                              \
-               u64 reg;                                                \
-               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
-               reg;                                                    \
-       })
-
-#define write_gicreg(v,r)                                              \
-       do {                                                            \
-               u64 __val = (v);                                        \
-               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
-       } while (0)
-
-static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               return read_gicreg(ICH_LR0_EL2);
-       case 1:
-               return read_gicreg(ICH_LR1_EL2);
-       case 2:
-               return read_gicreg(ICH_LR2_EL2);
-       case 3:
-               return read_gicreg(ICH_LR3_EL2);
-       case 4:
-               return read_gicreg(ICH_LR4_EL2);
-       case 5:
-               return read_gicreg(ICH_LR5_EL2);
-       case 6:
-               return read_gicreg(ICH_LR6_EL2);
-       case 7:
-               return read_gicreg(ICH_LR7_EL2);
-       case 8:
-               return read_gicreg(ICH_LR8_EL2);
-       case 9:
-               return read_gicreg(ICH_LR9_EL2);
-       case 10:
-               return read_gicreg(ICH_LR10_EL2);
-       case 11:
-               return read_gicreg(ICH_LR11_EL2);
-       case 12:
-               return read_gicreg(ICH_LR12_EL2);
-       case 13:
-               return read_gicreg(ICH_LR13_EL2);
-       case 14:
-               return read_gicreg(ICH_LR14_EL2);
-       case 15:
-               return read_gicreg(ICH_LR15_EL2);
-       }
-
-       unreachable();
-}
-
-static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
-{
-       switch (lr & 0xf) {
-       case 0:
-               write_gicreg(val, ICH_LR0_EL2);
-               break;
-       case 1:
-               write_gicreg(val, ICH_LR1_EL2);
-               break;
-       case 2:
-               write_gicreg(val, ICH_LR2_EL2);
-               break;
-       case 3:
-               write_gicreg(val, ICH_LR3_EL2);
-               break;
-       case 4:
-               write_gicreg(val, ICH_LR4_EL2);
-               break;
-       case 5:
-               write_gicreg(val, ICH_LR5_EL2);
-               break;
-       case 6:
-               write_gicreg(val, ICH_LR6_EL2);
-               break;
-       case 7:
-               write_gicreg(val, ICH_LR7_EL2);
-               break;
-       case 8:
-               write_gicreg(val, ICH_LR8_EL2);
-               break;
-       case 9:
-               write_gicreg(val, ICH_LR9_EL2);
-               break;
-       case 10:
-               write_gicreg(val, ICH_LR10_EL2);
-               break;
-       case 11:
-               write_gicreg(val, ICH_LR11_EL2);
-               break;
-       case 12:
-               write_gicreg(val, ICH_LR12_EL2);
-               break;
-       case 13:
-               write_gicreg(val, ICH_LR13_EL2);
-               break;
-       case 14:
-               write_gicreg(val, ICH_LR14_EL2);
-               break;
-       case 15:
-               write_gicreg(val, ICH_LR15_EL2);
-               break;
-       }
-}
-
-static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       int i;
-       bool expect_mi;
-
-       expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE);
-
-       for (i = 0; i < nr_lr; i++) {
-               if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
-                               continue;
-
-               expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) &&
-                             (cpu_if->vgic_lr[i] & ICH_LR_EOI));
-       }
-
-       if (expect_mi) {
-               cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
-
-               if (cpu_if->vgic_misr & ICH_MISR_EOI)
-                       cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2);
-               else
-                       cpu_if->vgic_eisr = 0;
-       } else {
-               cpu_if->vgic_misr = 0;
-               cpu_if->vgic_eisr = 0;
-       }
-}
-
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 val;
-
-       /*
-        * Make sure stores to the GIC via the memory mapped interface
-        * are now visible to the system register interface.
-        */
-       if (!cpu_if->vgic_sre)
-               dsb(st);
-
-       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
-
-       if (vcpu->arch.vgic_cpu.live_lrs) {
-               int i;
-               u32 max_lr_idx, nr_pri_bits;
-
-               cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
-
-               write_gicreg(0, ICH_HCR_EL2);
-               val = read_gicreg(ICH_VTR_EL2);
-               max_lr_idx = vtr_to_max_lr_idx(val);
-               nr_pri_bits = vtr_to_nr_pri_bits(val);
-
-               save_maint_int_state(vcpu, max_lr_idx + 1);
-
-               for (i = 0; i <= max_lr_idx; i++) {
-                       if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
-                               continue;
-
-                       if (cpu_if->vgic_elrsr & (1 << i))
-                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-                       else
-                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
-
-                       __gic_v3_set_lr(0, i);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
-                       cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
-               case 6:
-                       cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
-               default:
-                       cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
-                       cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
-               case 6:
-                       cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
-               default:
-                       cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
-               }
-
-               vcpu->arch.vgic_cpu.live_lrs = 0;
-       } else {
-               cpu_if->vgic_misr  = 0;
-               cpu_if->vgic_eisr  = 0;
-               cpu_if->vgic_elrsr = 0xffff;
-               cpu_if->vgic_ap0r[0] = 0;
-               cpu_if->vgic_ap0r[1] = 0;
-               cpu_if->vgic_ap0r[2] = 0;
-               cpu_if->vgic_ap0r[3] = 0;
-               cpu_if->vgic_ap1r[0] = 0;
-               cpu_if->vgic_ap1r[1] = 0;
-               cpu_if->vgic_ap1r[2] = 0;
-               cpu_if->vgic_ap1r[3] = 0;
-       }
-
-       val = read_gicreg(ICC_SRE_EL2);
-       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-
-       if (!cpu_if->vgic_sre) {
-               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-               isb();
-               write_gicreg(1, ICC_SRE_EL1);
-       }
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-       u64 val;
-       u32 max_lr_idx, nr_pri_bits;
-       u16 live_lrs = 0;
-       int i;
-
-       /*
-        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
-        * Group0 interrupt (as generated in GICv2 mode) to be
-        * delivered as a FIQ to the guest, with potentially fatal
-        * consequences. So we must make sure that ICC_SRE_EL1 has
-        * been actually programmed with the value we want before
-        * starting to mess with the rest of the GIC.
-        */
-       if (!cpu_if->vgic_sre) {
-               write_gicreg(0, ICC_SRE_EL1);
-               isb();
-       }
-
-       val = read_gicreg(ICH_VTR_EL2);
-       max_lr_idx = vtr_to_max_lr_idx(val);
-       nr_pri_bits = vtr_to_nr_pri_bits(val);
-
-       for (i = 0; i <= max_lr_idx; i++) {
-               if (cpu_if->vgic_lr[i] & ICH_LR_STATE)
-                       live_lrs |= (1 << i);
-       }
-
-       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
-       if (live_lrs) {
-               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
-               switch (nr_pri_bits) {
-               case 7:
-                       write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
-                       write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
-               case 6:
-                       write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
-               default:
-                       write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
-               }
-
-               switch (nr_pri_bits) {
-               case 7:
-                       write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
-                       write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
-               case 6:
-                       write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
-               default:
-                       write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
-               }
-
-               for (i = 0; i <= max_lr_idx; i++) {
-                       if (!(live_lrs & (1 << i)))
-                               continue;
-
-                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
-               }
-       }
-
-       /*
-        * Ensures that the above will have reached the
-        * (re)distributors. This ensure the guest will read the
-        * correct values from the memory-mapped interface.
-        */
-       if (!cpu_if->vgic_sre) {
-               isb();
-               dsb(sy);
-       }
-       vcpu->arch.vgic_cpu.live_lrs = live_lrs;
-
-       /*
-        * Prevent the guest from touching the GIC system registers if
-        * SRE isn't enabled for GICv3 emulation.
-        */
-       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-                    ICC_SRE_EL2);
-}
-
-void __hyp_text __vgic_v3_init_lrs(void)
-{
-       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
-       int i;
-
-       for (i = 0; i <= max_lr_idx; i++)
-               __gic_v3_set_lr(0, i);
-}
-
-static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
-{
-       return read_gicreg(ICH_VTR_EL2);
-}
-
-__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void);
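
One detail worth calling out in the save path being moved above: ICH_ELSR_EL2 reports, per list register, whether the guest has already drained it, so the hypervisor only reads back LRs that still hold state and merely clears the state bits of the cached copy otherwise, while the live_lrs bitmap skips LRs that were never loaded at all. A minimal sketch of that filtering, with hypothetical sketch_read_lr()/sketch_write_lr() helpers standing in for the real EL2 register accessors:

#include <linux/types.h>

/* Sketch only: save just the list registers that still carry state.
 * SKETCH_LR_STATE mirrors ICH_LR_STATE (pending | active). */
#define SKETCH_LR_STATE         (3ULL << 62)

extern u64 sketch_read_lr(int idx);
extern void sketch_write_lr(u64 val, int idx);

static void sketch_save_live_lrs(u64 *cached_lr, unsigned long live_lrs,
                                 u64 elrsr, int nr_lrs)
{
        int i;

        for (i = 0; i < nr_lrs; i++) {
                if (!(live_lrs & (1UL << i)))
                        continue;               /* never loaded for this guest run */

                if (elrsr & (1UL << i))
                        cached_lr[i] &= ~SKETCH_LR_STATE;  /* guest drained it: clear state bits */
                else
                        cached_lr[i] = sketch_read_lr(i);  /* still pending/active: read it back */

                sketch_write_lr(0, i);          /* scrub before the host runs */
        }
}
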
index 898c0e6..da6a8cf 100644 (file)
@@ -231,3 +231,15 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
        else
                inject_undef64(vcpu);
 }
+
+/**
+ * kvm_inject_vabt - inject an async abort / SError into the guest
+ * @vcpu: The VCPU to receive the exception
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_vabt(struct kvm_vcpu *vcpu)
+{
+       vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE);
+}
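
Since the only state change is setting HCR_VSE, the helper can be called from any exit-handling path that gives up on an access; the sketch below is purely illustrative of such a call site (handle_bad_mmio() and its return convention are made up, not the kernel's actual exit path):

/* Illustrative only: report an unrecoverable external abort to the guest. */
static int handle_bad_mmio(struct kvm_vcpu *vcpu)
{
        /*
         * kvm_inject_vabt() sets HCR_VSE, so the core delivers a virtual
         * SError the next time this vcpu enters the guest.
         */
        kvm_inject_vabt(vcpu);

        return 1;       /* resume the guest; it will observe the SError */
}
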
index 498b567..d140206 100644 (file)
@@ -14,7 +14,6 @@ config M68K
        select GENERIC_IOMAP
        select GENERIC_STRNCPY_FROM_USER if MMU
        select GENERIC_STRNLEN_USER if MMU
-       select FPU if MMU
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE
        select HAVE_FUTEX_CMPXCHG if MMU && FUTEX
index 967260f..d2219f3 100644 (file)
@@ -62,6 +62,7 @@ config MCPU32
 config M68020
        bool "68020 support"
        depends on MMU
+       select FPU
        select CPU_HAS_ADDRESS_SPACES
        help
          If you anticipate running this kernel on a computer with a MC68020
@@ -72,6 +73,7 @@ config M68020
 config M68030
        bool "68030 support"
        depends on MMU && !MMU_SUN3
+       select FPU
        select CPU_HAS_ADDRESS_SPACES
        help
          If you anticipate running this kernel on a computer with a MC68030
@@ -81,6 +83,7 @@ config M68030
 config M68040
        bool "68040 support"
        depends on MMU && !MMU_SUN3
+       select FPU
        select CPU_HAS_ADDRESS_SPACES
        help
          If you anticipate running this kernel on a computer with a MC68LC040
@@ -91,6 +94,7 @@ config M68040
 config M68060
        bool "68060 support"
        depends on MMU && !MMU_SUN3
+       select FPU
        select CPU_HAS_ADDRESS_SPACES
        help
          If you anticipate running this kernel on a computer with a MC68060
@@ -259,6 +263,7 @@ config M547x
        bool "MCF547x"
        select M54xx
        select MMU_COLDFIRE if MMU
+       select FPU if MMU
        select HAVE_CACHE_CB
        select HAVE_MBAR
        select CPU_NO_EFFICIENT_FFS
@@ -268,6 +273,7 @@ config M547x
 config M548x
        bool "MCF548x"
        select MMU_COLDFIRE if MMU
+       select FPU if MMU
        select M54xx
        select HAVE_CACHE_CB
        select HAVE_MBAR
index fddfdcc..1e3c7e9 100644 (file)
@@ -101,6 +101,10 @@ EXPORT_SYMBOL(clk_enable);
 void clk_disable(struct clk *clk)
 {
        unsigned long flags;
+
+       if (!clk)
+               return;
+
        spin_lock_irqsave(&clk_lock, flags);
        if ((--clk->enabled == 0) && clk->clk_ops)
                clk->clk_ops->disable(clk);
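
Tolerating a NULL clk matches the common-clock convention for optional clocks and lets drivers keep their error and remove paths unconditional. A hypothetical driver fragment (struct example_priv and the clock roles are made up) showing the pattern this enables:

#include <linux/clk.h>

struct example_priv {
        struct clk *clk_core;
        struct clk *clk_bus;    /* optional; may be NULL */
};

static void example_shutdown(struct example_priv *priv)
{
        /* Safe even when no "bus" clock was found during probe. */
        clk_disable(priv->clk_bus);
        clk_disable(priv->clk_core);
}
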
index 73d92ea..bdb472c 100644 (file)
@@ -280,10 +280,10 @@ _clear_bss:
        movel   %d0,m68k_cputype                /* Mark us as a ColdFire */
        movel   #MMU_COLDFIRE,%d0
        movel   %d0,m68k_mmutype
-       movel   #FPU_COLDFIRE,%d0
-       movel   %d0,m68k_fputype
-       movel   #MACH_M54XX,%d0
-       movel   %d0,m68k_machtype               /* Mark us as a 54xx machine */
+       movel   #FPUTYPE,%d0
+       movel   %d0,m68k_fputype                /* Mark FPU type */
+       movel   #MACHINE,%d0
+       movel   %d0,m68k_machtype               /* Mark machine type */
        lea     init_task,%a2                   /* Set "current" init task */
 #endif
 
index 45e947a..12f9e37 100644 (file)
@@ -102,14 +102,14 @@ void wildfiremod_halt(void)
        printk(KERN_INFO "WildFireMod hibernating...\n");
 
        /* Set portE.5 to Digital IO */
-       MCF5282_GPIO_PEPAR &= ~(1 << (5 * 2));
+       writew(readw(MCFGPIO_PEPAR) & ~(1 << (5 * 2)), MCFGPIO_PEPAR);
 
        /* Make portE.5 an output */
-       MCF5282_GPIO_DDRE |= (1 << 5);
+       writeb(readb(MCFGPIO_PDDR_E) | (1 << 5), MCFGPIO_PDDR_E);
 
        /* Now toggle portE.5 from low to high */
-       MCF5282_GPIO_PORTE &= ~(1 << 5);
-       MCF5282_GPIO_PORTE |= (1 << 5);
+       writeb(readb(MCFGPIO_PODR_E) & ~(1 << 5), MCFGPIO_PODR_E);
+       writeb(readb(MCFGPIO_PODR_E) | (1 << 5), MCFGPIO_PODR_E);
 
        printk(KERN_EMERG "Failed to hibernate. Halting!\n");
 }
index 80879a7..2502f63 100644 (file)
@@ -271,9 +271,6 @@ void __init config_BSP(char *commandp, int size)
 
 #define NAND_FLASH_ADDRESS     (0xD0000000)
 
-int sys_clk_khz = 0;
-int sys_clk_mhz = 0;
-
 void wtm_init(void);
 void scm_init(void);
 void gpio_init(void);
@@ -286,9 +283,8 @@ int  get_sys_clock (void);
 
 asmlinkage void __init sysinit(void)
 {
-       sys_clk_khz = clock_pll(0, 0);
-       sys_clk_mhz = sys_clk_khz/1000;
-       
+       clock_pll(0, 0);
+
        wtm_init();
        scm_init();
        gpio_init();
index c32f767..386df3b 100644 (file)
@@ -25,7 +25,6 @@
 #include <asm/m54xxgpt.h>
 #ifdef CONFIG_MMU
 #include <asm/mmu_context.h>
-#include <linux/pfn.h>
 #endif
 
 /***************************************************************************/
@@ -78,47 +77,10 @@ static void mcf54xx_reset(void)
 
 /***************************************************************************/
 
-#ifdef CONFIG_MMU
-
-unsigned long num_pages;
-
-static void __init mcf54xx_bootmem_alloc(void)
-{
-       unsigned long start_pfn;
-       unsigned long memstart;
-
-       /* _rambase and _ramend will be naturally page aligned */
-       m68k_memory[0].addr = _rambase;
-       m68k_memory[0].size = _ramend - _rambase;
-
-       /* compute total pages in system */
-       num_pages = PFN_DOWN(_ramend - _rambase);
-
-       /* page numbers */
-       memstart = PAGE_ALIGN(_ramstart);
-       min_low_pfn = PFN_DOWN(_rambase);
-       start_pfn = PFN_DOWN(memstart);
-       max_pfn = max_low_pfn = PFN_DOWN(_ramend);
-       high_memory = (void *)_ramend;
-
-       m68k_virt_to_node_shift = fls(_ramend - _rambase - 1) - 6;
-       module_fixup(NULL, __start_fixup, __stop_fixup);
-
-       /* setup bootmem data */
-       m68k_setup_node(0);
-       memstart += init_bootmem_node(NODE_DATA(0), start_pfn,
-               min_low_pfn, max_low_pfn);
-       free_bootmem_node(NODE_DATA(0), memstart, _ramend - memstart);
-}
-
-#endif /* CONFIG_MMU */
-
-/***************************************************************************/
-
 void __init config_BSP(char *commandp, int size)
 {
 #ifdef CONFIG_MMU
-       mcf54xx_bootmem_alloc();
+       cf_bootmem_alloc();
        mmu_context_init();
 #endif
        mach_reset = mcf54xx_reset;
index 8e21326..81c91af 100644 (file)
@@ -22,6 +22,12 @@ extern void save_bootinfo(const struct bi_record *bi);
 static inline void save_bootinfo(const struct bi_record *bi) {}
 #endif
 
+#ifdef CONFIG_UBOOT
+void process_uboot_commandline(char *commandp, int size);
+#else
+static inline void process_uboot_commandline(char *commandp, int size) {}
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 
index cc798ab..64f60be 100644 (file)
 #define CPU_NAME               "COLDFIRE(m5441x)"
 #define CPU_INSTR_PER_JIFFY    2
 #define MCF_BUSCLK             (MCF_CLK / 2)
+#define MACHINE                        MACH_M5441X
+#define FPUTYPE                        0
+#define IOMEMBASE              0xe0000000
+#define IOMEMSIZE              0x20000000
 
 #include <asm/m54xxacr.h>
 
index 59e1710..c6ac05c 100644 (file)
@@ -94,7 +94,7 @@
  *     register region as non-cacheable. And then we map all our RAM as
  *     cacheable and supervisor access only.
  */
-#define ACR0_MODE      (ACR_BA(CONFIG_MBAR)+ACR_ADMSK(0x1000000)+ \
+#define ACR0_MODE      (ACR_BA(IOMEMBASE)+ACR_ADMSK(IOMEMSIZE)+ \
                         ACR_ENABLE+ACR_SUPER+ACR_CM_OFF_PRE+ACR_SP)
 #if defined(CONFIG_CACHE_COPYBACK)
 #define ACR1_MODE      (ACR_BA(CONFIG_RAMBASE)+ACR_ADMSK(CONFIG_RAMSIZE)+ \
index a5fbd17..73d937f 100644 (file)
@@ -8,6 +8,10 @@
 #define        CPU_NAME                "COLDFIRE(m54xx)"
 #define        CPU_INSTR_PER_JIFFY     2
 #define        MCF_BUSCLK              (MCF_CLK / 2)
+#define        MACHINE                 MACH_M54XX
+#define        FPUTYPE                 FPU_COLDFIRE
+#define        IOMEMBASE               MCF_MBAR
+#define        IOMEMSIZE               0x01000000
 
 #include <asm/m54xxacr.h>
 
index 8824236..10f9930 100644 (file)
@@ -105,6 +105,7 @@ static inline void mmu_write(u32 a, u32 v)
        __asm__ __volatile__ ("nop");
 }
 
+void cf_bootmem_alloc(void);
 int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word);
 
 #endif
index 2a7a766..926375d 100644 (file)
@@ -92,7 +92,7 @@ static __inline__ unsigned int mcf_getppdata(void)
 
 static __inline__ void mcf_setppdata(unsigned int mask, unsigned int bits)
 {
-       write((readw(MCFSIM_PBDAT) & ~mask) | bits, MCFSIM_PBDAT);
+       writew((readw(MCFSIM_PBDAT) & ~mask) | bits, MCFSIM_PBDAT);
 }
 #endif
 
index cdeb26a..a48cf54 100644 (file)
@@ -81,6 +81,7 @@ struct mem_info {
 #define MACH_Q40               10
 #define MACH_SUN3X             11
 #define MACH_M54XX             12
+#define MACH_M5441X            13
 
 
     /*
index e47778f..8a1c4d3 100644 (file)
@@ -24,6 +24,7 @@ obj-$(CONFIG_HAS_DMA) += dma.o
 
 obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_BOOTINFO_PROC)    += bootinfo_proc.o
+obj-$(CONFIG_UBOOT)            += uboot.o
 
 obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
 
index c55ff71..4ba1ae7 100644 (file)
@@ -203,11 +203,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 }
 
 /* Fill in the fpu structure for a core dump.  */
-#ifdef CONFIG_FPU
 int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu)
 {
-       char fpustate[216];
-
        if (FPU_IS_EMU) {
                int i;
 
@@ -222,37 +219,40 @@ int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu)
                return 1;
        }
 
-       /* First dump the fpu context to avoid protocol violation.  */
-       asm volatile ("fsave %0" :: "m" (fpustate[0]) : "memory");
-       if (!CPU_IS_060 ? !fpustate[0] : !fpustate[2])
-               return 0;
+       if (IS_ENABLED(CONFIG_FPU)) {
+               char fpustate[216];
 
-       if (CPU_IS_COLDFIRE) {
-               asm volatile ("fmovel %/fpiar,%0\n\t"
-                             "fmovel %/fpcr,%1\n\t"
-                             "fmovel %/fpsr,%2\n\t"
-                             "fmovemd %/fp0-%/fp7,%3"
-                             :
-                             : "m" (fpu->fpcntl[0]),
-                               "m" (fpu->fpcntl[1]),
-                               "m" (fpu->fpcntl[2]),
-                               "m" (fpu->fpregs[0])
-                             : "memory");
-       } else {
-               asm volatile ("fmovem %/fpiar/%/fpcr/%/fpsr,%0"
-                             :
-                             : "m" (fpu->fpcntl[0])
-                             : "memory");
-               asm volatile ("fmovemx %/fp0-%/fp7,%0"
-                             :
-                             : "m" (fpu->fpregs[0])
-                             : "memory");
+               /* First dump the fpu context to avoid protocol violation.  */
+               asm volatile ("fsave %0" :: "m" (fpustate[0]) : "memory");
+               if (!CPU_IS_060 ? !fpustate[0] : !fpustate[2])
+                       return 0;
+
+               if (CPU_IS_COLDFIRE) {
+                       asm volatile ("fmovel %/fpiar,%0\n\t"
+                                     "fmovel %/fpcr,%1\n\t"
+                                     "fmovel %/fpsr,%2\n\t"
+                                     "fmovemd %/fp0-%/fp7,%3"
+                                     :
+                                     : "m" (fpu->fpcntl[0]),
+                                       "m" (fpu->fpcntl[1]),
+                                       "m" (fpu->fpcntl[2]),
+                                       "m" (fpu->fpregs[0])
+                                     : "memory");
+               } else {
+                       asm volatile ("fmovem %/fpiar/%/fpcr/%/fpsr,%0"
+                                     :
+                                     : "m" (fpu->fpcntl[0])
+                                     : "memory");
+                       asm volatile ("fmovemx %/fp0-%/fp7,%0"
+                                     :
+                                     : "m" (fpu->fpregs[0])
+                                     : "memory");
+               }
        }
 
        return 1;
 }
 EXPORT_SYMBOL(dump_fpu);
-#endif /* CONFIG_FPU */
 
 unsigned long get_wchan(struct task_struct *p)
 {
index 50633c3..7a2c212 100644 (file)
@@ -245,7 +245,7 @@ void __init setup_arch(char **cmdline_p)
         * We should really do our own FPU check at startup.
         * [what do we do with buggy 68LC040s? if we have problems
         *  with them, we should add a test to check_bugs() below] */
-#ifndef CONFIG_M68KFPU_EMU_ONLY
+#if defined(CONFIG_FPU) && !defined(CONFIG_M68KFPU_EMU_ONLY)
        /* clear the fpu if we have one */
        if (m68k_fputype & (FPU_68881|FPU_68882|FPU_68040|FPU_68060|FPU_COLDFIRE)) {
                volatile int zero = 0;
@@ -274,6 +274,7 @@ void __init setup_arch(char **cmdline_p)
        strncpy(m68k_command_line, CONFIG_BOOTPARAM_STRING, CL_SIZE);
        m68k_command_line[CL_SIZE - 1] = 0;
 #endif /* CONFIG_BOOTPARAM */
+       process_uboot_commandline(&m68k_command_line[0], CL_SIZE);
        *cmdline_p = m68k_command_line;
        memcpy(boot_command_line, *cmdline_p, CL_SIZE);
 
@@ -341,6 +342,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_COLDFIRE
        case MACH_M54XX:
+       case MACH_M5441X:
                config_BSP(NULL, 0);
                break;
 #endif
@@ -548,7 +550,7 @@ module_init(proc_hardware_init);
 
 void check_bugs(void)
 {
-#ifndef CONFIG_M68KFPU_EMU
+#if defined(CONFIG_FPU) && !defined(CONFIG_M68KFPU_EMU)
        if (m68k_fputype == 0) {
                pr_emerg("*** YOU DO NOT HAVE A FLOATING POINT UNIT, "
                        "WHICH IS REQUIRED BY LINUX/M68K ***\n");
index 9309789..8afe6f6 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/rtc.h>
 
 #include <asm/setup.h>
+#include <asm/bootinfo.h>
 #include <asm/irq.h>
 #include <asm/machdep.h>
 #include <asm/pgtable.h>
@@ -82,69 +83,6 @@ void (*mach_power_off)(void);
 #define        CPU_INSTR_PER_JIFFY     16
 #endif
 
-#if defined(CONFIG_UBOOT)
-/*
- * parse_uboot_commandline
- *
- * Copies u-boot commandline arguments and store them in the proper linux
- * variables.
- *
- * Assumes:
- *     _init_sp global contains the address in the stack pointer when the
- *     kernel starts (see head.S::_start)
- *
- *     U-Boot calling convention:
- *     (*kernel) (kbd, initrd_start, initrd_end, cmd_start, cmd_end);
- *
- *     _init_sp can be parsed as such
- *
- *     _init_sp+00 = u-boot cmd after jsr into kernel (skip)
- *     _init_sp+04 = &kernel board_info (residual data)
- *     _init_sp+08 = &initrd_start
- *     _init_sp+12 = &initrd_end
- *     _init_sp+16 = &cmd_start
- *     _init_sp+20 = &cmd_end
- *
- *     This also assumes that the memory locations pointed to are still
- *     unmodified. U-boot places them near the end of external SDRAM.
- *
- * Argument(s):
- *     commandp = the linux commandline arg container to fill.
- *     size     = the sizeof commandp.
- *
- * Returns:
- */
-static void __init parse_uboot_commandline(char *commandp, int size)
-{
-       extern unsigned long _init_sp;
-       unsigned long *sp;
-       unsigned long uboot_kbd;
-       unsigned long uboot_initrd_start, uboot_initrd_end;
-       unsigned long uboot_cmd_start, uboot_cmd_end;
-
-
-       sp = (unsigned long *)_init_sp;
-       uboot_kbd = sp[1];
-       uboot_initrd_start = sp[2];
-       uboot_initrd_end = sp[3];
-       uboot_cmd_start = sp[4];
-       uboot_cmd_end = sp[5];
-
-       if (uboot_cmd_start && uboot_cmd_end)
-               strncpy(commandp, (const char *)uboot_cmd_start, size);
-#if defined(CONFIG_BLK_DEV_INITRD)
-       if (uboot_initrd_start && uboot_initrd_end &&
-               (uboot_initrd_end > uboot_initrd_start)) {
-               initrd_start = uboot_initrd_start;
-               initrd_end = uboot_initrd_end;
-               ROOT_DEV = Root_RAM0;
-               printk(KERN_INFO "initrd at 0x%lx:0x%lx\n",
-                       initrd_start, initrd_end);
-       }
-#endif /* if defined(CONFIG_BLK_DEV_INITRD) */
-}
-#endif /* #if defined(CONFIG_UBOOT) */
-
 void __init setup_arch(char **cmdline_p)
 {
        int bootmap_size;
@@ -164,53 +102,38 @@ void __init setup_arch(char **cmdline_p)
        command_line[sizeof(command_line) - 1] = 0;
 #endif /* CONFIG_BOOTPARAM */
 
-#if defined(CONFIG_UBOOT)
-       /* CONFIG_UBOOT and CONFIG_BOOTPARAM defined, concatenate cmdline */
-       #if defined(CONFIG_BOOTPARAM)
-               /* Add the whitespace separator */
-               command_line[strlen(CONFIG_BOOTPARAM_STRING)] = ' ';
-               /* Parse uboot command line into the rest of the buffer */
-               parse_uboot_commandline(
-                       &command_line[(strlen(CONFIG_BOOTPARAM_STRING)+1)],
-                       (sizeof(command_line) -
-                       (strlen(CONFIG_BOOTPARAM_STRING)+1)));
-       /* Only CONFIG_UBOOT defined, create cmdline */
-       #else
-               parse_uboot_commandline(&command_line[0], sizeof(command_line));
-       #endif /* CONFIG_BOOTPARAM */
-       command_line[sizeof(command_line) - 1] = 0;
-#endif /* CONFIG_UBOOT */
+       process_uboot_commandline(&command_line[0], sizeof(command_line));
 
-       printk(KERN_INFO "\x0F\r\n\nuClinux/" CPU_NAME "\n");
+       pr_info("uClinux with CPU " CPU_NAME "\n");
 
 #ifdef CONFIG_UCDIMM
-       printk(KERN_INFO "uCdimm by Lineo, Inc. <www.lineo.com>\n");
+       pr_info("uCdimm by Lineo, Inc. <www.lineo.com>\n");
 #endif
 #ifdef CONFIG_M68VZ328
-       printk(KERN_INFO "M68VZ328 support by Evan Stawnyczy <e@lineo.ca>\n");
+       pr_info("M68VZ328 support by Evan Stawnyczy <e@lineo.ca>\n");
 #endif
 #ifdef CONFIG_COLDFIRE
-       printk(KERN_INFO "COLDFIRE port done by Greg Ungerer, gerg@snapgear.com\n");
+       pr_info("COLDFIRE port done by Greg Ungerer, gerg@snapgear.com\n");
 #ifdef CONFIG_M5307
-       printk(KERN_INFO "Modified for M5307 by Dave Miller, dmiller@intellistor.com\n");
+       pr_info("Modified for M5307 by Dave Miller, dmiller@intellistor.com\n");
 #endif
 #ifdef CONFIG_ELITE
-       printk(KERN_INFO "Modified for M5206eLITE by Rob Scott, rscott@mtrob.fdns.net\n");
+       pr_info("Modified for M5206eLITE by Rob Scott, rscott@mtrob.fdns.net\n");
 #endif
 #endif
-       printk(KERN_INFO "Flat model support (C) 1998,1999 Kenneth Albanowski, D. Jeff Dionne\n");
+       pr_info("Flat model support (C) 1998,1999 Kenneth Albanowski, D. Jeff Dionne\n");
 
 #if defined( CONFIG_PILOT ) && defined( CONFIG_M68328 )
-       printk(KERN_INFO "TRG SuperPilot FLASH card support <info@trgnet.com>\n");
+       pr_info("TRG SuperPilot FLASH card support <info@trgnet.com>\n");
 #endif
 #if defined( CONFIG_PILOT ) && defined( CONFIG_M68EZ328 )
-       printk(KERN_INFO "PalmV support by Lineo Inc. <jeff@uclinux.com>\n");
+       pr_info("PalmV support by Lineo Inc. <jeff@uclinux.com>\n");
 #endif
 #ifdef CONFIG_DRAGEN2
-       printk(KERN_INFO "DragonEngine II board support by Georges Menie\n");
+       pr_info("DragonEngine II board support by Georges Menie\n");
 #endif
 #ifdef CONFIG_M5235EVB
-       printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)\n");
+       pr_info("Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)\n");
 #endif
 
        pr_debug("KERNEL -> TEXT=0x%p-0x%p DATA=0x%p-0x%p BSS=0x%p-0x%p\n",
diff --git a/arch/m68k/kernel/uboot.c b/arch/m68k/kernel/uboot.c
new file mode 100644 (file)
index 0000000..b3536a8
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * uboot.c -- uboot arguments support
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/fb.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/console.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/root_dev.h>
+#include <linux/rtc.h>
+
+#include <asm/setup.h>
+#include <asm/irq.h>
+#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+
+/*
+ * parse_uboot_commandline
+ *
+ * Copies u-boot commandline arguments and stores them in the proper Linux
+ * variables.
+ *
+ * Assumes:
+ *     _init_sp global contains the address in the stack pointer when the
+ *     kernel starts (see head.S::_start)
+ *
+ *     U-Boot calling convention:
+ *     (*kernel) (kbd, initrd_start, initrd_end, cmd_start, cmd_end);
+ *
+ *     _init_sp can be parsed as such
+ *
+ *     _init_sp+00 = u-boot cmd after jsr into kernel (skip)
+ *     _init_sp+04 = &kernel board_info (residual data)
+ *     _init_sp+08 = &initrd_start
+ *     _init_sp+12 = &initrd_end
+ *     _init_sp+16 = &cmd_start
+ *     _init_sp+20 = &cmd_end
+ *
+ *     This also assumes that the memory locations pointed to are still
+ *     unmodified. U-boot places them near the end of external SDRAM.
+ *
+ * Argument(s):
+ *     commandp = the linux commandline arg container to fill.
+ *     size     = the sizeof commandp.
+ *
+ * Returns:
+ */
+static void __init parse_uboot_commandline(char *commandp, int size)
+{
+       extern unsigned long _init_sp;
+       unsigned long *sp;
+       unsigned long uboot_kbd;
+       unsigned long uboot_initrd_start, uboot_initrd_end;
+       unsigned long uboot_cmd_start, uboot_cmd_end;
+
+       sp = (unsigned long *)_init_sp;
+       uboot_kbd = sp[1];
+       uboot_initrd_start = sp[2];
+       uboot_initrd_end = sp[3];
+       uboot_cmd_start = sp[4];
+       uboot_cmd_end = sp[5];
+
+       if (uboot_cmd_start && uboot_cmd_end)
+               strncpy(commandp, (const char *)uboot_cmd_start, size);
+#if defined(CONFIG_BLK_DEV_INITRD)
+       if (uboot_initrd_start && uboot_initrd_end &&
+           (uboot_initrd_end > uboot_initrd_start)) {
+               initrd_start = uboot_initrd_start;
+               initrd_end = uboot_initrd_end;
+               ROOT_DEV = Root_RAM0;
+               printk(KERN_INFO "initrd at 0x%lx:0x%lx\n",
+                       initrd_start, initrd_end);
+       }
+#endif /* if defined(CONFIG_BLK_DEV_INITRD) */
+}
+
+__init void process_uboot_commandline(char *commandp, int size)
+{
+       int len, n;
+
+       n = strnlen(commandp, size);
+       commandp += n;
+       len = size - n;
+       if (len) {
+               /* Add the whitespace separator */
+               *commandp++ = ' ';
+               len--;
+       }
+
+       parse_uboot_commandline(commandp, len);
+       commandp[len - 1] = 0;
+}
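
The helper added above expects a buffer that may already hold the compiled-in parameters; it appends a space and then the U-Boot arguments after whatever is there. A hedged sketch of the call pattern (buffer name and size are illustrative, mirroring what setup_arch() does with its command-line array):

#include <linux/string.h>

static char example_cmdline[256];

static void example_build_cmdline(void)
{
#ifdef CONFIG_BOOTPARAM
        strncpy(example_cmdline, CONFIG_BOOTPARAM_STRING, sizeof(example_cmdline));
        example_cmdline[sizeof(example_cmdline) - 1] = 0;
#endif
        /* Appends " <u-boot args>" after any compiled-in parameters. */
        process_uboot_commandline(example_cmdline, sizeof(example_cmdline));
}
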
index f58fafe..87131cd 100644 (file)
@@ -27,7 +27,7 @@ mm_context_t next_mmu_context;
 unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
 atomic_t nr_free_contexts;
 struct mm_struct *context_mm[LAST_CONTEXT+1];
-extern unsigned long num_pages;
+unsigned long num_pages;
 
 /*
  * ColdFire paging_init derived from sun3.
@@ -150,6 +150,35 @@ int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word)
        return 0;
 }
 
+void __init cf_bootmem_alloc(void)
+{
+       unsigned long start_pfn;
+       unsigned long memstart;
+
+       /* _rambase and _ramend will be naturally page aligned */
+       m68k_memory[0].addr = _rambase;
+       m68k_memory[0].size = _ramend - _rambase;
+
+       /* compute total pages in system */
+       num_pages = PFN_DOWN(_ramend - _rambase);
+
+       /* page numbers */
+       memstart = PAGE_ALIGN(_ramstart);
+       min_low_pfn = PFN_DOWN(_rambase);
+       start_pfn = PFN_DOWN(memstart);
+       max_pfn = max_low_pfn = PFN_DOWN(_ramend);
+       high_memory = (void *)_ramend;
+
+       m68k_virt_to_node_shift = fls(_ramend - _rambase - 1) - 6;
+       module_fixup(NULL, __start_fixup, __stop_fixup);
+
+       /* setup bootmem data */
+       m68k_setup_node(0);
+       memstart += init_bootmem_node(NODE_DATA(0), start_pfn,
+               min_low_pfn, max_low_pfn);
+       free_bootmem_node(NODE_DATA(0), memstart, _ramend - memstart);
+}
+
 /*
  * Initialize the context management stuff.
  * The following was taken from arch/ppc/mmu_context.c
index b54bcad..07f58cf 100644 (file)
 #define KVM_INVALID_INST               0xdeadbeef
 #define KVM_INVALID_ADDR               0xdeadbeef
 
+/*
+ * EVA has overlapping user & kernel address spaces, so user VAs may be >
+ * PAGE_OFFSET. For this reason we can't use the default KVM_HVA_ERR_BAD of
+ * PAGE_OFFSET.
+ */
+
+#define KVM_HVA_ERR_BAD                        (-1UL)
+#define KVM_HVA_ERR_RO_BAD             (-2UL)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+       return IS_ERR_VALUE(addr);
+}
+
 extern atomic_t kvm_mips_instance;
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 wait_exits;
-       u32 cache_exits;
-       u32 signal_exits;
-       u32 int_exits;
-       u32 cop_unusable_exits;
-       u32 tlbmod_exits;
-       u32 tlbmiss_ld_exits;
-       u32 tlbmiss_st_exits;
-       u32 addrerr_st_exits;
-       u32 addrerr_ld_exits;
-       u32 syscall_exits;
-       u32 resvd_inst_exits;
-       u32 break_inst_exits;
-       u32 trap_inst_exits;
-       u32 msa_fpe_exits;
-       u32 fpe_exits;
-       u32 msa_disabled_exits;
-       u32 flush_dcache_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
+       u64 wait_exits;
+       u64 cache_exits;
+       u64 signal_exits;
+       u64 int_exits;
+       u64 cop_unusable_exits;
+       u64 tlbmod_exits;
+       u64 tlbmiss_ld_exits;
+       u64 tlbmiss_st_exits;
+       u64 addrerr_st_exits;
+       u64 addrerr_ld_exits;
+       u64 syscall_exits;
+       u64 resvd_inst_exits;
+       u64 break_inst_exits;
+       u64 trap_inst_exits;
+       u64 msa_fpe_exits;
+       u64 fpe_exits;
+       u64 msa_disabled_exits;
+       u64 flush_dcache_exits;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
 };
 
 struct kvm_arch_memory_slot {
@@ -314,6 +328,9 @@ struct kvm_vcpu_arch {
        u32 guest_kernel_asid[NR_CPUS];
        struct mm_struct guest_kernel_mm, guest_user_mm;
 
+       /* Guest ASID of last user mode execution */
+       unsigned int last_user_gasid;
+
        int last_sched_cpu;
 
        /* WAIT executed */
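
Moving the error sentinels to the very top of the address space works because IS_ERR_VALUE() treats values in the last page (from -MAX_ERRNO upwards) as errors, so a genuine EVA user VA, even one above PAGE_OFFSET, still passes the check. A small illustrative probe of the predicate (the sample VA is made up):

#include <linux/err.h>
#include <linux/printk.h>

static void example_hva_checks(void)
{
        pr_info("%d\n", kvm_is_error_hva(-1UL));         /* 1: KVM_HVA_ERR_BAD */
        pr_info("%d\n", kvm_is_error_hva(-2UL));         /* 1: KVM_HVA_ERR_RO_BAD */
        pr_info("%d\n", kvm_is_error_hva(0x81000000UL)); /* 0: EVA user VA above PAGE_OFFSET */
}
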
index e788515..4db4c03 100644 (file)
@@ -846,6 +846,47 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
        return EMULATE_FAIL;
 }
 
+/**
+ * kvm_mips_invalidate_guest_tlb() - Indicates a change in guest MMU map.
+ * @vcpu:      VCPU with changed mappings.
+ * @tlb:       TLB entry being removed.
+ *
+ * This is called to indicate a single change in guest MMU mappings, so that we
+ * can arrange TLB flushes on this and other CPUs.
+ */
+static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
+                                         struct kvm_mips_tlb *tlb)
+{
+       int cpu, i;
+       bool user;
+
+       /* No need to flush for entries which are already invalid */
+       if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
+               return;
+       /* User address space doesn't need flushing for KSeg2/3 changes */
+       user = tlb->tlb_hi < KVM_GUEST_KSEG0;
+
+       preempt_disable();
+
+       /*
+        * Probe the shadow host TLB for the entry being overwritten, if one
+        * matches, invalidate it
+        */
+       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+       /* Invalidate the whole ASID on other CPUs */
+       cpu = smp_processor_id();
+       for_each_possible_cpu(i) {
+               if (i == cpu)
+                       continue;
+               if (user)
+                       vcpu->arch.guest_user_asid[i] = 0;
+               vcpu->arch.guest_kernel_asid[i] = 0;
+       }
+
+       preempt_enable();
+}
+
 /* Write Guest TLB Entry @ Index */
 enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 {
@@ -865,11 +906,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
        }
 
        tlb = &vcpu->arch.guest_tlb[index];
-       /*
-        * Probe the shadow host TLB for the entry being overwritten, if one
-        * matches, invalidate it
-        */
-       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+
+       kvm_mips_invalidate_guest_tlb(vcpu, tlb);
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -898,11 +936,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 
        tlb = &vcpu->arch.guest_tlb[index];
 
-       /*
-        * Probe the shadow host TLB for the entry being overwritten, if one
-        * matches, invalidate it
-        */
-       kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+       kvm_mips_invalidate_guest_tlb(vcpu, tlb);
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
@@ -1026,6 +1060,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
        enum emulation_result er = EMULATE_DONE;
        u32 rt, rd, sel;
        unsigned long curr_pc;
+       int cpu, i;
 
        /*
         * Update PC and hold onto current PC in case there is
@@ -1127,16 +1162,31 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
                                u32 nasid =
                                        vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
-                               if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
-                                   ((kvm_read_c0_guest_entryhi(cop0) &
+                               if (((kvm_read_c0_guest_entryhi(cop0) &
                                      KVM_ENTRYHI_ASID) != nasid)) {
                                        trace_kvm_asid_change(vcpu,
                                                kvm_read_c0_guest_entryhi(cop0)
                                                        & KVM_ENTRYHI_ASID,
                                                nasid);
 
-                                       /* Blow away the shadow host TLBs */
-                                       kvm_mips_flush_host_tlb(1);
+                                       /*
+                                        * Regenerate/invalidate kernel MMU
+                                        * context.
+                                        * The user MMU context will be
+                                        * regenerated lazily on re-entry to
+                                        * guest user if the guest ASID actually
+                                        * changes.
+                                        */
+                                       preempt_disable();
+                                       cpu = smp_processor_id();
+                                       kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm,
+                                                               cpu, vcpu);
+                                       vcpu->arch.guest_kernel_asid[cpu] =
+                                               vcpu->arch.guest_kernel_mm.context.asid[cpu];
+                                       for_each_possible_cpu(i)
+                                               if (i != cpu)
+                                                       vcpu->arch.guest_kernel_asid[i] = 0;
+                                       preempt_enable();
                                }
                                kvm_write_c0_guest_entryhi(cop0,
                                                           vcpu->arch.gprs[rt]);
index a6ea084..ce96149 100644 (file)
@@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        return 0;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_mips_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
@@ -411,6 +421,31 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        return -ENOIOCTLCMD;
 }
 
+/* Must be called with preemption disabled, just before entering guest */
+static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       int cpu = smp_processor_id();
+       unsigned int gasid;
+
+       /*
+        * Lazy host ASID regeneration for guest user mode.
+        * If the guest ASID has changed since the last guest usermode
+        * execution, regenerate the host ASID so as to invalidate stale TLB
+        * entries.
+        */
+       if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
+               gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
+               if (gasid != vcpu->arch.last_user_gasid) {
+                       kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
+                                               vcpu);
+                       vcpu->arch.guest_user_asid[cpu] =
+                               vcpu->arch.guest_user_mm.context.asid[cpu];
+                       vcpu->arch.last_user_gasid = gasid;
+               }
+       }
+}
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
        int r = 0;
@@ -438,6 +473,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        htw_stop();
 
        trace_kvm_enter(vcpu);
+
+       kvm_mips_check_asids(vcpu);
+
        r = vcpu->arch.vcpu_run(run, vcpu);
        trace_kvm_out(vcpu);
 
@@ -1551,6 +1589,8 @@ skip_emul:
        if (ret == RESUME_GUEST) {
                trace_kvm_reenter(vcpu);
 
+               kvm_mips_check_asids(vcpu);
+
                /*
                 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
                 * is live), restore FCR31 / MSACSR.
index 121008c..03883ba 100644 (file)
@@ -250,15 +250,27 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
                vcpu->arch.guest_kernel_asid[cpu] =
                    vcpu->arch.guest_kernel_mm.context.asid[cpu];
+               newasid++;
+
+               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+                         cpu_context(cpu, current->mm));
+               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
+       }
+
+       if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
+                                               asid_version_mask(cpu)) {
+               u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
+                               KVM_ENTRYHI_ASID;
+
                kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
                vcpu->arch.guest_user_asid[cpu] =
                    vcpu->arch.guest_user_mm.context.asid[cpu];
+               vcpu->arch.last_user_gasid = gasid;
                newasid++;
 
                kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
                          cpu_context(cpu, current->mm));
-               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
                kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
                          vcpu->arch.guest_user_asid[cpu]);
        }
index 0915539..3a5484f 100644 (file)
@@ -175,6 +175,24 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
                        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                        ret = RESUME_HOST;
                }
+       } else if (KVM_GUEST_KERNEL_MODE(vcpu)
+                  && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
+               /*
+                * With EVA we may get a TLB exception instead of an address
+                * error when the guest performs MMIO to KSeg1 addresses.
+                */
+               kvm_debug("Emulate %s MMIO space\n",
+                         store ? "Store to" : "Load from");
+               er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
+               if (er == EMULATE_FAIL) {
+                       kvm_err("Emulate %s MMIO space failed\n",
+                               store ? "Store to" : "Load from");
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       ret = RESUME_HOST;
+               } else {
+                       run->exit_reason = KVM_EXIT_MMIO;
+                       ret = RESUME_HOST;
+               }
        } else {
                kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
                        store ? "ST" : "LD", cause, opc, badvaddr);
index 287a656..e407af2 100644 (file)
@@ -244,6 +244,43 @@ static inline int segment_shift(int ssize)
        return SID_SHIFT_1T;
 }
 
+/*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+                                            bool is_base_size)
+{
+       unsigned int i, lp;
+
+       if (!(h & HPTE_V_LARGE))
+               return 1ul << 12;
+
+       /* Look at the 8 bit LP value */
+       lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+       i = hpte_page_sizes[lp];
+       if (!i)
+               return 0;
+       if (!is_base_size)
+               i >>= 4;
+       return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 1);
+}
+
 /*
  * The current system page and segment sizes
  */
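
Each byte of hpte_page_sizes[] packs two MMU_PAGE_* indices: the actual page size index in the high nibble and the base page size index in the low nibble, with a zero byte marking an LP value that decodes to nothing. A hedged illustration of that packing (the helper names are made up; the real table is presumably populated at boot from mmu_psize_defs):

#include <linux/types.h>

/* Illustrative packing/unpacking of a hpte_page_sizes[] entry. */
static inline u8 example_pack_hpte_sizes(unsigned int base_idx,
                                         unsigned int actual_idx)
{
        return (actual_idx << 4) | base_idx;    /* 0 is reserved for "invalid LP" */
}

static inline void example_unpack_hpte_sizes(u8 entry, unsigned int *base_idx,
                                             unsigned int *actual_idx)
{
        *base_idx   = entry & 0xf;      /* what __hpte_page_size() uses for the base size */
        *actual_idx = entry >> 4;       /* and for the actual size */
}
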
index 2fd1690..f6fda84 100644 (file)
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers; callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+       u32 ret;
+
+       __asm__ __volatile__("lwzcix %0,0, %1"
+                            : "=r" (ret) : "r" (addr) : "memory");
+       return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+       __asm__ __volatile__("stbcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+       __asm__ __volatile__("stwcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
 /*
  * Low level IO stream instructions are defined out of line for now
  */
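
Because these helpers omit the barriers that the DEF_MMIO_* accessors wrap around every access, a real-mode caller has to order things itself; the usual shape is an explicit sync before a store that the device must observe after earlier updates. A sketch of that pattern using the accessor introduced above (the function name and register offset are illustrative):

#include <linux/types.h>

/* Sketch: order prior stores before a cache-inhibited store issued in
 * hypervisor real mode. The offset below is purely illustrative. */
static void example_real_mode_mmio_write(unsigned long dev_base, u32 val)
{
        __asm__ __volatile__("sync" : : : "memory");    /* caller-supplied barrier */
        _stwcix(dev_base + 0x4, val);
}
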
index 5bca220..05cabed 100644 (file)
 #define BOOK3S_INTERRUPT_FAC_UNAVAIL   0xf60
 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
 
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD    0x5555
+
 #define BOOK3S_IRQPRIO_SYSTEM_RESET            0
 #define BOOK3S_IRQPRIO_DATA_SEGMENT            1
 #define BOOK3S_IRQPRIO_INST_SEGMENT            2
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 #define RESUME_FLAG_ARCH1      (1<<2)
+#define RESUME_FLAG_ARCH2      (1<<3)
 
 #define RESUME_GUEST            0
 #define RESUME_GUEST_NV         RESUME_FLAG_NV
index 8f39796..5cf306a 100644 (file)
@@ -69,6 +69,43 @@ struct hpte_cache {
        int pagesize;
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+       int n_runnable;
+       int num_threads;
+       int entry_exit_map;
+       int napping_threads;
+       int first_vcpuid;
+       u16 pcpu;
+       u16 last_cpu;
+       u8 vcore_state;
+       u8 in_guest;
+       struct kvmppc_vcore *master_vcore;
+       struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
+       struct list_head preempt_list;
+       spinlock_t lock;
+       struct swait_queue_head wq;
+       spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
+       u64 stolen_tb;
+       u64 preempt_tb;
+       struct kvm_vcpu *runner;
+       struct kvm *kvm;
+       u64 tb_offset;          /* guest timebase - host timebase */
+       ulong lpcr;
+       u32 arch_compat;
+       ulong pcr;
+       ulong dpdes;            /* doorbell state (POWER8) */
+       ulong vtb;              /* virtual timebase */
+       ulong conferring_threads;
+       unsigned int halt_poll_ns;
+};
+
 struct kvmppc_vcpu_book3s {
        struct kvmppc_sid_map sid_map[SID_MAP_NUM];
        struct {
@@ -83,6 +120,7 @@ struct kvmppc_vcpu_book3s {
        u64 sdr1;
        u64 hior;
        u64 msr_mask;
+       u64 vtb;
 #ifdef CONFIG_PPC_BOOK3S_32
        u32 vsid_pool[VSID_POOL_SIZE];
        u32 vsid_next;
@@ -191,6 +229,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
                                 struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
                                   struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
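
The packed entry/exit map in the relocated kvmppc_vcore exists so a thread can claim entry to the vcore with a single cmpxchg: read the word, refuse if any exit bit (the upper byte) is already set, otherwise set its own entry bit. A hedged sketch of that claim loop, with a made-up helper name:

#include <linux/atomic.h>

/* Sketch: atomically set our entry bit iff no thread has started exiting. */
static bool example_try_enter_vcore(int *entry_exit_map, int thr)
{
        int old, new;

        do {
                old = READ_ONCE(*entry_exit_map);
                if (old >> 8)           /* some thread already exited: too late */
                        return false;
                new = old | (1 << thr);
        } while (cmpxchg(entry_exit_map, old, new) != old);

        return true;
}
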
index 88d17b4..8482921 100644 (file)
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <asm/book3s/64/mmu-hash.h>
+
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
        hpte[0] = cpu_to_be64(hpte_v);
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                                             unsigned long pte_index)
 {
-       int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+       int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
        unsigned int penc;
        unsigned long rb = 0, va_low, sllp;
        unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
        if (v & HPTE_V_LARGE) {
-               for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[b_psize].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, b_psize);
-                       if (a_psize != -1)
-                               break;
-               }
+               i = hpte_page_sizes[lp];
+               b_psize = i & 0xf;
+               a_psize = i >> 4;
        }
+
        /*
         * Ignore the top 14 bits of va
         * v have top two bits covering segment size, hence move
@@ -159,7 +125,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
        /* This covers 14..54 bits of va*/
        rb = (v & ~0x7fUL) << 16;               /* AVA field */
 
-       rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;   /*  B field */
        /*
         * AVA in v had cleared lower 23 bits. We need to derive
         * that from pteg index
@@ -211,49 +176,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                break;
        }
        }
-       rb |= (v >> 54) & 0x300;                /* B field */
+       rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;   /* B field */
        return rb;
 }
 
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
-                                            bool is_base_size)
-{
-
-       int size, a_psize;
-       /* Look at the 8 bit LP value */
-       unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-       /* only handle 4k, 64k and 16M pages for now */
-       if (!(h & HPTE_V_LARGE))
-               return 1ul << 12;
-       else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, size);
-                       if (a_psize != -1) {
-                               if (is_base_size)
-                                       return 1ul << mmu_psize_defs[size].shift;
-                               return 1ul << mmu_psize_defs[a_psize].shift;
-                       }
-               }
-
-       }
-       return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 1);
-}
-
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
 {
        return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
index ec35af3..28350a2 100644 (file)
@@ -43,6 +43,8 @@
 #include <asm/cputhreads.h>
 #define KVM_MAX_VCPU_ID                (threads_per_subcore * KVM_MAX_VCORES)
 
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
@@ -95,42 +97,49 @@ struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-       u32 sum_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 light_exits;
+       u64 sum_exits;
+       u64 mmio_exits;
+       u64 signal_exits;
+       u64 light_exits;
        /* Account for special types of light exits: */
-       u32 itlb_real_miss_exits;
-       u32 itlb_virt_miss_exits;
-       u32 dtlb_real_miss_exits;
-       u32 dtlb_virt_miss_exits;
-       u32 syscall_exits;
-       u32 isi_exits;
-       u32 dsi_exits;
-       u32 emulated_inst_exits;
-       u32 dec_exits;
-       u32 ext_intr_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 dbell_exits;
-       u32 gdbell_exits;
-       u32 ld;
-       u32 st;
+       u64 itlb_real_miss_exits;
+       u64 itlb_virt_miss_exits;
+       u64 dtlb_real_miss_exits;
+       u64 dtlb_virt_miss_exits;
+       u64 syscall_exits;
+       u64 isi_exits;
+       u64 dsi_exits;
+       u64 emulated_inst_exits;
+       u64 dec_exits;
+       u64 ext_intr_exits;
+       u64 halt_poll_success_ns;
+       u64 halt_poll_fail_ns;
+       u64 halt_wait_ns;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_successful_wait;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 dbell_exits;
+       u64 gdbell_exits;
+       u64 ld;
+       u64 st;
 #ifdef CONFIG_PPC_BOOK3S
-       u32 pf_storage;
-       u32 pf_instruc;
-       u32 sp_storage;
-       u32 sp_instruc;
-       u32 queue_intr;
-       u32 ld_slow;
-       u32 st_slow;
+       u64 pf_storage;
+       u64 pf_instruc;
+       u64 sp_storage;
+       u64 sp_instruc;
+       u64 queue_intr;
+       u64 ld_slow;
+       u64 st_slow;
 #endif
+       u64 pthru_all;
+       u64 pthru_host;
+       u64 pthru_bad_aff;
 };
 
 enum kvm_exit_types {
@@ -197,6 +206,8 @@ struct kvmppc_spapr_tce_table {
 struct kvmppc_xics;
 struct kvmppc_icp;
 
+struct kvmppc_passthru_irqmap;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -267,6 +278,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
        struct kvmppc_xics *xics;
+       struct kvmppc_passthru_irqmap *pimap;
 #endif
        struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -275,41 +287,6 @@ struct kvm_arch {
 #endif
 };
 
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits.  This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
-       int n_runnable;
-       int num_threads;
-       int entry_exit_map;
-       int napping_threads;
-       int first_vcpuid;
-       u16 pcpu;
-       u16 last_cpu;
-       u8 vcore_state;
-       u8 in_guest;
-       struct kvmppc_vcore *master_vcore;
-       struct list_head runnable_threads;
-       struct list_head preempt_list;
-       spinlock_t lock;
-       struct swait_queue_head wq;
-       spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
-       u64 stolen_tb;
-       u64 preempt_tb;
-       struct kvm_vcpu *runner;
-       struct kvm *kvm;
-       u64 tb_offset;          /* guest timebase - host timebase */
-       ulong lpcr;
-       u32 arch_compat;
-       ulong pcr;
-       ulong dpdes;            /* doorbell state (POWER8) */
-       ulong conferring_threads;
-};
-
 #define VCORE_ENTRY_MAP(vc)    ((vc)->entry_exit_map & 0xff)
 #define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
@@ -329,6 +306,7 @@ struct kvmppc_vcore {
 #define VCORE_SLEEPING 3
 #define VCORE_RUNNING  4
 #define VCORE_EXITING  5
+#define VCORE_POLLING  6
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -397,6 +375,20 @@ struct kvmhv_tb_accumulator {
        u64     tb_max;         /* max time */
 };
 
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+       u32     r_hwirq;
+       u32     v_hwirq;
+       struct irq_desc *desc;
+};
+
+#define        KVMPPC_PIRQ_MAPPED      1024
+struct kvmppc_passthru_irqmap {
+       int n_mapped;
+       struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM   2
 #define KVMPPC_BOOKE_DAC_NUM   2
@@ -483,7 +475,6 @@ struct kvm_vcpu_arch {
        ulong purr;
        ulong spurr;
        ulong ic;
-       ulong vtb;
        ulong dscr;
        ulong amr;
        ulong uamor;
@@ -668,7 +659,6 @@ struct kvm_vcpu_arch {
        long pgfault_index;
        unsigned long pgfault_hpte[2];
 
-       struct list_head run_list;
        struct task_struct *run_task;
        struct kvm_run *kvm_run;
 
index 2544eda..f6e4964 100644 (file)
@@ -287,6 +287,10 @@ struct kvmppc_ops {
        long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
                              unsigned long arg);
        int (*hcall_implemented)(unsigned long hcall);
+       int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+                                      struct irq_bypass_producer *);
+       void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+                                       struct irq_bypass_producer *);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -453,8 +457,19 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+                               struct kvm *kvm)
+{
+       if (kvm && kvm_irq_bypass)
+               return kvm->arch.pimap;
+       return NULL;
+}
+
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -464,10 +479,23 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
                        struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+                                  unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+                                  unsigned long host_irq);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+                                struct kvmppc_irq_map *irq_map,
+                                struct kvmppc_passthru_irqmap *pimap);
 extern int h_ipi_redirect;
 #else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+                               struct kvm *kvm)
+       { return NULL; }
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+       { return 0; }
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
        { return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
index e2fb408..b78e8d3 100644 (file)
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G   13
 #define MMU_PAGE_64G   14
 
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
 #define MMU_PAGE_COUNT 15
 
 #ifdef CONFIG_PPC_BOOK3S_64
index ee05bd2..e958b70 100644 (file)
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
 int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
                                   uint64_t offset, uint32_t data);
 int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
 int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
 int64_t opal_register_exception_handler(uint64_t opal_exception,
                                        uint64_t handler_address,
index 0cbd813..1b46b52 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
+#include <linux/irq.h>
 #include <misc/cxl-base.h>
 #include <asm/opal-api.h>
 
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
index f69f40f..978dada 100644 (file)
 #define   MMCR0_FCHV   0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1     798
 #define SPRN_MMCR2     785
+#define SPRN_UMMCR2    769
 #define SPRN_MMCRA     0x312
 #define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SDAR_DCACHE_MISS 0x40000000UL
index b89d14c..a51ae9b 100644 (file)
@@ -506,7 +506,6 @@ int main(void)
        DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
        DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
        DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
-       DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
        DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
        DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
        DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
@@ -557,6 +556,7 @@ int main(void)
        DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
        DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
        DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
+       DEFINE(VCORE_VTB, offsetof(struct kvmppc_vcore, vtb));
        DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
        DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
        DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
index c2024ac..029be26 100644 (file)
@@ -22,6 +22,9 @@ config KVM
        select ANON_INODES
        select HAVE_KVM_EVENTFD
        select SRCU
+       select KVM_VFIO
+       select IRQ_BYPASS_MANAGER
+       select HAVE_KVM_IRQ_BYPASS
 
 config KVM_BOOK3S_HANDLER
        bool
index 855d4b9..7dd89b7 100644 (file)
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-               $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
 CFLAGS_emulate.o  := -I.
 CFLAGS_emulate_loadstore.o  := -I.
 
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
 
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
 
 kvm-e500-objs := \
        $(common-objs-y) \
+       emulate.o \
        booke.o \
        booke_emulate.o \
        booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
 
 kvm-e500mc-objs := \
        $(common-objs-y) \
+       emulate.o \
        booke.o \
        booke_emulate.o \
        bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
        book3s_32_mmu.o
 
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
-       $(KVM)/coalesced_mmio.o
-
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
        book3s_rmhandlers.o
 endif
@@ -89,11 +88,8 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
        book3s_xics.o
 
-kvm-book3s_64-module-objs += \
-       $(KVM)/kvm_main.o \
-       $(KVM)/eventfd.o \
-       powerpc.o \
-       emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+       $(common-objs-y) \
        book3s.o \
        book3s_64_vio.o \
        book3s_rtas.o \
@@ -103,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
        $(common-objs-y) \
+       emulate.o \
        fpu.o \
        book3s_paired_singles.o \
        book3s.o \
index 47018fc..b6952dd 100644 (file)
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "dec",         VCPU_STAT(dec_exits) },
        { "ext_intr",    VCPU_STAT(ext_intr_exits) },
        { "queue_intr",  VCPU_STAT(queue_intr) },
+       { "halt_poll_success_ns",       VCPU_STAT(halt_poll_success_ns) },
+       { "halt_poll_fail_ns",          VCPU_STAT(halt_poll_fail_ns) },
+       { "halt_wait_ns",               VCPU_STAT(halt_wait_ns) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+       { "halt_successful_wait",       VCPU_STAT(halt_successful_wait) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "pf_storage",  VCPU_STAT(pf_storage) },
@@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "ld_slow",     VCPU_STAT(ld_slow) },
        { "st",          VCPU_STAT(st) },
        { "st_slow",     VCPU_STAT(st_slow) },
+       { "pthru_all",       VCPU_STAT(pthru_all) },
+       { "pthru_host",      VCPU_STAT(pthru_host) },
+       { "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
        { NULL }
 };
 
@@ -592,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
                case KVM_REG_PPC_BESCR:
                        *val = get_reg_val(id, vcpu->arch.bescr);
                        break;
-               case KVM_REG_PPC_VTB:
-                       *val = get_reg_val(id, vcpu->arch.vtb);
-                       break;
                case KVM_REG_PPC_IC:
                        *val = get_reg_val(id, vcpu->arch.ic);
                        break;
@@ -666,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
                case KVM_REG_PPC_BESCR:
                        vcpu->arch.bescr = set_reg_val(id, *val);
                        break;
-               case KVM_REG_PPC_VTB:
-                       vcpu->arch.vtb = set_reg_val(id, *val);
-                       break;
                case KVM_REG_PPC_IC:
                        vcpu->arch.ic = set_reg_val(id, *val);
                        break;
index 2afdb9c..8359752 100644 (file)
@@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
        case SPRN_MMCR0:
        case SPRN_MMCR1:
        case SPRN_MMCR2:
+       case SPRN_UMMCR2:
 #endif
                break;
 unprivileged:
@@ -579,7 +580,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
                *spr_val = vcpu->arch.spurr;
                break;
        case SPRN_VTB:
-               *spr_val = vcpu->arch.vtb;
+               *spr_val = to_book3s(vcpu)->vtb;
                break;
        case SPRN_IC:
                *spr_val = vcpu->arch.ic;
@@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
        case SPRN_MMCR0:
        case SPRN_MMCR1:
        case SPRN_MMCR2:
+       case SPRN_UMMCR2:
        case SPRN_TIR:
 #endif
                *spr_val = 0;
index 2fd5580..3686471 100644 (file)
 #include <asm/smp.h>
 #include <asm/dbell.h>
 #include <asm/hmi.h>
+#include <asm/pnv-pci.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
 #include <linux/module.h>
+#include <linux/compiler.h>
 
 #include "book3s.h"
 
@@ -70,6 +74,8 @@
 
 /* Used to indicate that a guest page fault needs to be handled */
 #define RESUME_PAGE_FAULT      (RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH     (RESUME_GUEST | RESUME_FLAG_ARCH2)
 
 /* Used as a "null" value for timebase values */
 #define TB_NIL (~(u64)0)
@@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = {
        .get = param_get_int,
 };
 
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+                                                       S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
                                                        S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+               int *ip)
+{
+       int i = *ip;
+       struct kvm_vcpu *vcpu;
+
+       while (++i < MAX_SMT_THREADS) {
+               vcpu = READ_ONCE(vc->runnable_threads[i]);
+               if (vcpu) {
+                       *ip = i;
+                       return vcpu;
+               }
+       }
+       return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+       for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
 static bool kvmppc_ipi_thread(int cpu)
 {
        /* On POWER8 for IPIs to threads in the same core, use msgsnd */
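The hunk above replaces the old run_list linked list with a fixed runnable_threads[] array indexed by thread id, so the set of runnable vcpus can be scanned without holding the vcore lock (entries are published with WRITE_ONCE and read with READ_ONCE). A minimal sketch of how a caller walks the array with the new helper; the loop body simply counts ceded vcpus, mirroring the callers converted later in this patch:

	int i, n_ceded = 0;
	struct kvm_vcpu *v;

	/* i starts at -1; next_runnable_thread() skips empty (NULL) slots */
	for_each_runnable_thread(i, v, vc) {
		if (!v->arch.pending_exceptions)
			n_ceded += v->arch.ceded;
	}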
@@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                r = RESUME_GUEST;
                break;
+       case BOOK3S_INTERRUPT_HV_RM_HARD:
+               r = RESUME_PASSTHROUGH;
+               break;
        default:
                kvmppc_dump_regs(vcpu);
                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -1149,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_DPDES:
                *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
                break;
+       case KVM_REG_PPC_VTB:
+               *val = get_reg_val(id, vcpu->arch.vcore->vtb);
+               break;
        case KVM_REG_PPC_DAWR:
                *val = get_reg_val(id, vcpu->arch.dawr);
                break;
@@ -1341,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_DPDES:
                vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
                break;
+       case KVM_REG_PPC_VTB:
+               vcpu->arch.vcore->vtb = set_reg_val(id, *val);
+               break;
        case KVM_REG_PPC_DAWR:
                vcpu->arch.dawr = set_reg_val(id, *val);
                break;
@@ -1493,7 +1549,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        if (vcore == NULL)
                return NULL;
 
-       INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
        init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1857,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        spin_unlock_irq(&vcpu->arch.tbacct_lock);
        --vc->n_runnable;
-       list_del(&vcpu->arch.run_list);
+       WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
 }
 
 static int kvmppc_grab_hwthread(int cpu)
@@ -2048,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc)
        vc->conferring_threads = 0;
 }
 
-/*
- * See if the existing subcores can be split into 3 (or fewer) subcores
- * of at most two threads each, so we can fit in another vcore.  This
- * assumes there are at most two subcores and at most 6 threads in total.
- */
-static bool can_split_piggybacked_subcores(struct core_info *cip)
-{
-       int sub, new_sub;
-       int large_sub = -1;
-       int thr;
-       int n_subcores = cip->n_subcores;
-       struct kvmppc_vcore *vc, *vcnext;
-       struct kvmppc_vcore *master_vc = NULL;
-
-       for (sub = 0; sub < cip->n_subcores; ++sub) {
-               if (cip->subcore_threads[sub] <= 2)
-                       continue;
-               if (large_sub >= 0)
-                       return false;
-               large_sub = sub;
-               vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-                                     preempt_list);
-               if (vc->num_threads > 2)
-                       return false;
-               n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
-       }
-       if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2))
-               return false;
-
-       /*
-        * Seems feasible, so go through and move vcores to new subcores.
-        * Note that when we have two or more vcores in one subcore,
-        * all those vcores must have only one thread each.
-        */
-       new_sub = cip->n_subcores;
-       thr = 0;
-       sub = large_sub;
-       list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
-               if (thr >= 2) {
-                       list_del(&vc->preempt_list);
-                       list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
-                       /* vc->num_threads must be 1 */
-                       if (++cip->subcore_threads[new_sub] == 1) {
-                               cip->subcore_vm[new_sub] = vc->kvm;
-                               init_master_vcore(vc);
-                               master_vc = vc;
-                               ++cip->n_subcores;
-                       } else {
-                               vc->master_vcore = master_vc;
-                               ++new_sub;
-                       }
-               }
-               thr += vc->num_threads;
-       }
-       cip->subcore_threads[large_sub] = 2;
-       cip->max_subcore_threads = 2;
-
-       return true;
-}
-
 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 {
        int n_threads = vc->num_threads;
@@ -2118,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 
        if (n_threads < cip->max_subcore_threads)
                n_threads = cip->max_subcore_threads;
-       if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
-               cip->max_subcore_threads = n_threads;
-       } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
-                  vc->num_threads <= 2) {
-               /*
-                * We may be able to fit another subcore in by
-                * splitting an existing subcore with 3 or 4
-                * threads into two 2-thread subcores, or one
-                * with 5 or 6 threads into three subcores.
-                * We can only do this if those subcores have
-                * piggybacked virtual cores.
-                */
-               if (!can_split_piggybacked_subcores(cip))
-                       return false;
-       } else {
+       if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
                return false;
-       }
+       cip->max_subcore_threads = n_threads;
 
        sub = cip->n_subcores;
        ++cip->n_subcores;
@@ -2148,43 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        return true;
 }
 
-static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
-                                 struct core_info *cip, int sub)
-{
-       struct kvmppc_vcore *vc;
-       int n_thr;
-
-       vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-                             preempt_list);
-
-       /* require same VM and same per-core reg values */
-       if (pvc->kvm != vc->kvm ||
-           pvc->tb_offset != vc->tb_offset ||
-           pvc->pcr != vc->pcr ||
-           pvc->lpcr != vc->lpcr)
-               return false;
-
-       /* P8 guest with > 1 thread per core would see wrong TIR value */
-       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
-           (vc->num_threads > 1 || pvc->num_threads > 1))
-               return false;
-
-       n_thr = cip->subcore_threads[sub] + pvc->num_threads;
-       if (n_thr > cip->max_subcore_threads) {
-               if (!subcore_config_ok(cip->n_subcores, n_thr))
-                       return false;
-               cip->max_subcore_threads = n_thr;
-       }
-
-       cip->total_threads += pvc->num_threads;
-       cip->subcore_threads[sub] = n_thr;
-       pvc->master_vcore = vc;
-       list_del(&pvc->preempt_list);
-       list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
-
-       return true;
-}
-
 /*
  * Work out whether it is possible to piggyback the execution of
  * vcore *pvc onto the execution of the other vcores described in *cip.
@@ -2192,27 +2136,18 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
                          int target_threads)
 {
-       int sub;
-
        if (cip->total_threads + pvc->num_threads > target_threads)
                return false;
-       for (sub = 0; sub < cip->n_subcores; ++sub)
-               if (cip->subcore_threads[sub] &&
-                   can_piggyback_subcore(pvc, cip, sub))
-                       return true;
-
-       if (can_dynamic_split(pvc, cip))
-               return true;
 
-       return false;
+       return can_dynamic_split(pvc, cip);
 }
 
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       int i;
+       struct kvm_vcpu *vcpu;
 
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
+       for_each_runnable_thread(i, vcpu, vc) {
                if (signal_pending(vcpu->arch.run_task))
                        vcpu->arch.ret = -EINTR;
                else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2194,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
 
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
-       int still_running = 0;
+       int still_running = 0, i;
        u64 now;
        long ret;
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu;
 
        spin_lock(&vc->lock);
        now = get_tb();
-       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                arch.run_list) {
+       for_each_runnable_thread(i, vcpu, vc) {
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2241,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
                }
                if (vc->n_runnable > 0 && vc->runner == NULL) {
                        /* make sure there's a candidate runner awake */
-                       vcpu = list_first_entry(&vc->runnable_threads,
-                                               struct kvm_vcpu, arch.run_list);
+                       i = -1;
+                       vcpu = next_runnable_thread(vc, &i);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
@@ -2361,7 +2295,7 @@ static inline void kvmppc_set_host_core(int cpu)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu;
        int i;
        int srcu_idx;
        struct core_info core_info;
@@ -2397,8 +2331,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         */
        if ((threads_per_core > 1) &&
            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-               list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-                                        arch.run_list) {
+               for_each_runnable_thread(i, vcpu, vc) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2410,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                active |= 1 << thr;
                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
                        pvc->pcpu = pcpu + thr;
-                       list_for_each_entry(vcpu, &pvc->runnable_threads,
-                                           arch.run_list) {
+                       for_each_runnable_thread(i, vcpu, pvc) {
                                kvmppc_start_thread(vcpu, pvc);
                                kvmppc_create_dtl_entry(vcpu, pvc);
                                trace_kvm_guest_enter(vcpu);
@@ -2604,34 +2536,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
        finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+       /* 10us base */
+       if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+               vc->halt_poll_ns = 10000;
+       else
+               vc->halt_poll_ns *= halt_poll_ns_grow;
+
+       if (vc->halt_poll_ns > halt_poll_max_ns)
+               vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+       if (halt_poll_ns_shrink == 0)
+               vc->halt_poll_ns = 0;
+       else
+               vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       for_each_runnable_thread(i, vcpu, vc) {
+               if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+                       return 1;
+       }
+
+       return 0;
+}
+
 /*
  * All the vcpus in this vcore are idle, so wait for a decrementer
  * or external interrupt to one of the vcpus.  vc->lock is held.
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu;
+       ktime_t cur, start_poll, start_wait;
        int do_sleep = 1;
+       u64 block_ns;
        DECLARE_SWAITQUEUE(wait);
 
-       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       /* Poll for pending exceptions and ceded state */
+       cur = start_poll = ktime_get();
+       if (vc->halt_poll_ns) {
+               ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+               ++vc->runner->stat.halt_attempted_poll;
 
-       /*
-        * Check one last time for pending exceptions and ceded state after
-        * we put ourselves on the wait queue
-        */
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
-                       do_sleep = 0;
-                       break;
+               vc->vcore_state = VCORE_POLLING;
+               spin_unlock(&vc->lock);
+
+               do {
+                       if (kvmppc_vcore_check_block(vc)) {
+                               do_sleep = 0;
+                               break;
+                       }
+                       cur = ktime_get();
+               } while (single_task_running() && ktime_before(cur, stop));
+
+               spin_lock(&vc->lock);
+               vc->vcore_state = VCORE_INACTIVE;
+
+               if (!do_sleep) {
+                       ++vc->runner->stat.halt_successful_poll;
+                       goto out;
                }
        }
 
-       if (!do_sleep) {
+       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+       if (kvmppc_vcore_check_block(vc)) {
                finish_swait(&vc->wq, &wait);
-               return;
+               do_sleep = 0;
+               /* If we polled, count this as a successful poll */
+               if (vc->halt_poll_ns)
+                       ++vc->runner->stat.halt_successful_poll;
+               goto out;
        }
 
+       start_wait = ktime_get();
+
        vc->vcore_state = VCORE_SLEEPING;
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
@@ -2640,13 +2630,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
+       ++vc->runner->stat.halt_successful_wait;
+
+       cur = ktime_get();
+
+out:
+       block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+       /* Attribute wait time */
+       if (do_sleep) {
+               vc->runner->stat.halt_wait_ns +=
+                       ktime_to_ns(cur) - ktime_to_ns(start_wait);
+               /* Attribute failed poll time */
+               if (vc->halt_poll_ns)
+                       vc->runner->stat.halt_poll_fail_ns +=
+                               ktime_to_ns(start_wait) -
+                               ktime_to_ns(start_poll);
+       } else {
+               /* Attribute successful poll time */
+               if (vc->halt_poll_ns)
+                       vc->runner->stat.halt_poll_success_ns +=
+                               ktime_to_ns(cur) -
+                               ktime_to_ns(start_poll);
+       }
+
+       /* Adjust poll time */
+       if (halt_poll_max_ns) {
+               if (block_ns <= vc->halt_poll_ns)
+                       ;
+               /* We slept and blocked for longer than the max halt time */
+               else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+                       shrink_halt_poll_ns(vc);
+               /* We slept and our poll time is too small */
+               else if (vc->halt_poll_ns < halt_poll_max_ns &&
+                               block_ns < halt_poll_max_ns)
+                       grow_halt_poll_ns(vc);
+       } else
+               vc->halt_poll_ns = 0;
+
+       trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
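As a worked example of the adjustment logic at the end of kvmppc_vcore_blocked(), with the module parameters introduced above left at their defaults (halt_poll_ns_grow = 2, halt_poll_ns_shrink = 0, halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT): a vcore that blocks for longer than its current polling window but for less than halt_poll_max_ns grows the window 0 -> 10,000 ns -> 20,000 ns -> 40,000 ns and so on, capped at halt_poll_max_ns; a single block that lasts longer than halt_poll_max_ns resets the window to 0, because a shrink factor of 0 means "reset" rather than "divide".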
 
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-       int n_ceded;
+       int n_ceded, i;
        struct kvmppc_vcore *vc;
-       struct kvm_vcpu *v, *vn;
+       struct kvm_vcpu *v;
 
        trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -2666,7 +2695,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
        vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        vcpu->arch.busy_preempt = TB_NIL;
-       list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+       WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
        ++vc->n_runnable;
 
        /*
@@ -2706,8 +2735,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
                        continue;
                }
-               list_for_each_entry_safe(v, vn, &vc->runnable_threads,
-                                        arch.run_list) {
+               for_each_runnable_thread(i, v, vc) {
                        kvmppc_core_prepare_to_enter(v);
                        if (signal_pending(v->arch.run_task)) {
                                kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2748,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                        break;
                n_ceded = 0;
-               list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+               for_each_runnable_thread(i, v, vc) {
                        if (!v->arch.pending_exceptions)
                                n_ceded += v->arch.ceded;
                        else
@@ -2759,8 +2787,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
                /* Wake up some vcpu to run the core */
-               v = list_first_entry(&vc->runnable_threads,
-                                    struct kvm_vcpu, arch.run_list);
+               i = -1;
+               v = next_runnable_thread(vc, &i);
                wake_up(&v->arch.cpu_run);
        }
 
@@ -2818,7 +2846,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        r = kvmppc_book3s_hv_page_fault(run, vcpu,
                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
                        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-               }
+               } else if (r == RESUME_PASSTHROUGH)
+                       r = kvmppc_xics_rm_complete(vcpu, 0);
        } while (is_kvmppc_resume_guest(r));
 
  out:
@@ -3247,6 +3276,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
        kvmppc_free_vcores(kvm);
 
        kvmppc_free_hpt(kvm);
+
+       kvmppc_free_pimap(kvm);
 }
 
 /* We don't need to emulate any privileged instructions or dcbz */
@@ -3282,6 +3313,184 @@ static int kvmppc_core_check_processor_compat_hv(void)
        return 0;
 }
 
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+       kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+       return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+       struct irq_desc *desc;
+       struct kvmppc_irq_map *irq_map;
+       struct kvmppc_passthru_irqmap *pimap;
+       struct irq_chip *chip;
+       int i;
+
+       if (!kvm_irq_bypass)
+               return 1;
+
+       desc = irq_to_desc(host_irq);
+       if (!desc)
+               return -EIO;
+
+       mutex_lock(&kvm->lock);
+
+       pimap = kvm->arch.pimap;
+       if (pimap == NULL) {
+               /* First call, allocate structure to hold IRQ map */
+               pimap = kvmppc_alloc_pimap();
+               if (pimap == NULL) {
+                       mutex_unlock(&kvm->lock);
+                       return -ENOMEM;
+               }
+               kvm->arch.pimap = pimap;
+       }
+
+       /*
+        * For now, we only support interrupts for which the EOI operation
+        * is an OPAL call followed by a write to XIRR, since that's
+        * what our real-mode EOI code does.
+        */
+       chip = irq_data_get_irq_chip(&desc->irq_data);
+       if (!chip || !is_pnv_opal_msi(chip)) {
+               pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+                       host_irq, guest_gsi);
+               mutex_unlock(&kvm->lock);
+               return -ENOENT;
+       }
+
+       /*
+        * See if we already have an entry for this guest IRQ number.
+        * If it's mapped to a hardware IRQ number, that's an error,
+        * otherwise re-use this entry.
+        */
+       for (i = 0; i < pimap->n_mapped; i++) {
+               if (guest_gsi == pimap->mapped[i].v_hwirq) {
+                       if (pimap->mapped[i].r_hwirq) {
+                               mutex_unlock(&kvm->lock);
+                               return -EINVAL;
+                       }
+                       break;
+               }
+       }
+
+       if (i == KVMPPC_PIRQ_MAPPED) {
+               mutex_unlock(&kvm->lock);
+               return -EAGAIN;         /* table is full */
+       }
+
+       irq_map = &pimap->mapped[i];
+
+       irq_map->v_hwirq = guest_gsi;
+       irq_map->desc = desc;
+
+       /*
+        * Order the above two stores before the next to serialize with
+        * the KVM real mode handler.
+        */
+       smp_wmb();
+       irq_map->r_hwirq = desc->irq_data.hwirq;
+
+       if (i == pimap->n_mapped)
+               pimap->n_mapped++;
+
+       kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
+       mutex_unlock(&kvm->lock);
+
+       return 0;
+}
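kvmppc_set_passthru_irq() publishes each mapping lock-free for the real-mode consumer: v_hwirq and desc are stored first, smp_wmb() orders them, and only then is r_hwirq written, so a reader that finds a matching r_hwirq is guaranteed to see valid v_hwirq and desc values. The matching read side is get_irqmap() in book3s_hv_builtin.c, later in this diff; the pairing, reduced to its essentials:

	/* writer (host side, under kvm->lock) */
	irq_map->v_hwirq = guest_gsi;
	irq_map->desc = desc;
	smp_wmb();                              /* publish the fields before the key */
	irq_map->r_hwirq = desc->irq_data.hwirq;

	/* reader (real mode, no lock) */
	if (xisr == pimap->mapped[i].r_hwirq) {
		smp_rmb();                      /* pairs with the smp_wmb() above */
		return &pimap->mapped[i];
	}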
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+       struct irq_desc *desc;
+       struct kvmppc_passthru_irqmap *pimap;
+       int i;
+
+       if (!kvm_irq_bypass)
+               return 0;
+
+       desc = irq_to_desc(host_irq);
+       if (!desc)
+               return -EIO;
+
+       mutex_lock(&kvm->lock);
+
+       if (kvm->arch.pimap == NULL) {
+               mutex_unlock(&kvm->lock);
+               return 0;
+       }
+       pimap = kvm->arch.pimap;
+
+       for (i = 0; i < pimap->n_mapped; i++) {
+               if (guest_gsi == pimap->mapped[i].v_hwirq)
+                       break;
+       }
+
+       if (i == pimap->n_mapped) {
+               mutex_unlock(&kvm->lock);
+               return -ENODEV;
+       }
+
+       kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+       /* invalidate the entry */
+       pimap->mapped[i].r_hwirq = 0;
+
+       /*
+        * We don't free this structure even when the count goes to
+        * zero. The structure is freed when we destroy the VM.
+        */
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+                                            struct irq_bypass_producer *prod)
+{
+       int ret = 0;
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       irqfd->producer = prod;
+
+       ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+       if (ret)
+               pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+                       prod->irq, irqfd->gsi, ret);
+
+       return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+                                             struct irq_bypass_producer *prod)
+{
+       int ret;
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+       irqfd->producer = NULL;
+
+       /*
+        * When the producer of a consumer is unregistered, we change back
+        * to the default external interrupt handling mode - KVM real mode
+        * will switch back to the host.
+        */
+       ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+       if (ret)
+               pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+                       prod->irq, irqfd->gsi, ret);
+}
+#endif
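These two callbacks are reached through the irq_bypass_add_producer/irq_bypass_del_producer members added to struct kvmppc_ops earlier in this diff and registered in kvm_ops_hv below. A sketch of the forwarding that the arch-level hook in powerpc.c is assumed to perform (that file is not part of the excerpt shown here, so treat the function body as illustrative only):

	int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
					     struct irq_bypass_producer *prod)
	{
		struct kvm_kernel_irqfd *irqfd =
			container_of(cons, struct kvm_kernel_irqfd, consumer);
		struct kvm *kvm = irqfd->kvm;

		/* forward to the per-VM ops (HV KVM fills these in below) */
		if (kvm->arch.kvm_ops->irq_bypass_add_producer)
			return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);

		return 0;
	}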
+
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
                                 unsigned int ioctl, unsigned long arg)
 {
@@ -3400,6 +3609,10 @@ static struct kvmppc_ops kvm_ops_hv = {
        .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
        .arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
        .hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+       .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+       .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
 };
 
 static int kvm_init_subcore_bitmap(void)
index 5f0380d..0c84d6b 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/xics.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
+#include <asm/io.h>
 
 #define KVM_CMA_CHUNK_ORDER    18
 
@@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap)
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+                                        u32 xisr)
+{
+       int i;
+
+       /*
+        * We access the mapped array here without a lock.  That
+        * is safe because we never reduce the number of entries
+        * in the array and we never change the v_hwirq field of
+        * an entry once it is set.
+        *
+        * We have also carefully ordered the stores in the writer
+        * and the loads here in the reader, so that if we find a matching
+        * hwirq here, the associated GSI and irq_desc fields are valid.
+        */
+       for (i = 0; i < pimap->n_mapped; i++)  {
+               if (xisr == pimap->mapped[i].r_hwirq) {
+                       /*
+                        * Order subsequent reads in the caller to serialize
+                        * with the writer.
+                        */
+                       smp_rmb();
+                       return &pimap->mapped[i];
+               }
+       }
+       return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+       struct kvmppc_passthru_irqmap *pimap;
+       struct kvmppc_irq_map *irq_map;
+       struct kvm_vcpu *vcpu;
+
+       vcpu = local_paca->kvm_hstate.kvm_vcpu;
+       if (!vcpu)
+               return 1;
+       pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+       if (!pimap)
+               return 1;
+       irq_map = get_irqmap(pimap, xisr);
+       if (!irq_map)
+               return 1;
+
+       /* We're handling this interrupt, generic code doesn't need to */
+       local_paca->kvm_hstate.saved_xirr = 0;
+
+       return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+       return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *     0 if no interrupt is pending
+ *     1 if an interrupt is pending that needs to be handled by the host
+ *     2 if a passthrough interrupt is pending that needs completion in the host
+ *     -1 if there was a guest wakeup IPI (which has now been cleared)
+ *     -2 if a PCI passthrough external interrupt was handled
+ */
+
+long kvmppc_read_intr(void)
+{
+       unsigned long xics_phys;
+       u32 h_xirr;
+       __be32 xirr;
+       u32 xisr;
+       u8 host_ipi;
+
+       /* see if a host IPI is pending */
+       host_ipi = local_paca->kvm_hstate.host_ipi;
+       if (host_ipi)
+               return 1;
+
+       /* Now read the interrupt from the ICP */
+       xics_phys = local_paca->kvm_hstate.xics_phys;
+       if (unlikely(!xics_phys))
+               return 1;
+
+       /*
+        * Save XIRR for later. Since we get control in reverse endian
+        * on LE systems, save it byte reversed and fetch it back in
+        * host endian. Note that xirr is the value read from the
+        * XIRR register, while h_xirr is the host endian version.
+        */
+       xirr = _lwzcix(xics_phys + XICS_XIRR);
+       h_xirr = be32_to_cpu(xirr);
+       local_paca->kvm_hstate.saved_xirr = h_xirr;
+       xisr = h_xirr & 0xffffff;
+       /*
+        * Ensure that the store/load complete to guarantee all side
+        * effects of loading from XIRR have completed
+        */
+       smp_mb();
+
+       /* if nothing pending in the ICP */
+       if (!xisr)
+               return 0;
+
+       /* We found something in the ICP...
+        *
+        * If it is an IPI, clear the MFRR and EOI it.
+        */
+       if (xisr == XICS_IPI) {
+               _stbcix(xics_phys + XICS_MFRR, 0xff);
+               _stwcix(xics_phys + XICS_XIRR, xirr);
+               /*
+                * Need to ensure side effects of above stores
+                * complete before proceeding.
+                */
+               smp_mb();
+
+               /*
+                * We need to re-check host IPI now in case it got set in the
+                * meantime. If it's clear, we bounce the interrupt to the
+                * guest
+                */
+               host_ipi = local_paca->kvm_hstate.host_ipi;
+               if (unlikely(host_ipi != 0)) {
+                       /* We raced with the host,
+                        * we need to resend that IPI, bummer
+                        */
+                       _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+                       /* Let side effects complete */
+                       smp_mb();
+                       return 1;
+               }
+
+               /* OK, it's an IPI for us */
+               local_paca->kvm_hstate.saved_xirr = 0;
+               return -1;
+       }
+
+       return kvmppc_check_passthru(xisr, xirr);
+}
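The return codes above are consumed by the real-mode assembly in book3s_hv_rmhandlers.S (see the long comment following the bl kvmppc_read_intr call later in this diff). Expressed as C-like pseudocode, with exit_to_host()/return_to_guest() as hypothetical placeholders for the actual branch targets and trap standing for the value stored in VCPU_TRAP (r12):

	long rc = kvmppc_read_intr();

	if (rc == 2) {
		/* passthrough interrupt needs completion by the host */
		trap = BOOK3S_INTERRUPT_HV_RM_HARD;
		exit_to_host();
	} else if (rc > 0) {
		/* host must handle this external interrupt */
		trap = BOOK3S_INTERRUPT_EXTERNAL;
		exit_to_host();
	} else {
		/* 0, -1 or -2: nothing for the host, re-enter the guest */
		return_to_guest();
	}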
index 980d8a6..82ff5de 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/kvm_host.h>
 #include <linux/err.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_ppc.h>
 #include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
+#include <asm/pgtable.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
 
 #include "book3s_xics.h"
 
 
 int h_ipi_redirect = 1;
 EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                            u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
 static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
                icp->rm_action |= XICS_RM_NOTIFY_EOI;
                icp->rm_eoied_irq = irq;
        }
+
+       if (state->host_irq) {
+               ++vcpu->stat.pthru_all;
+               if (state->intr_cpu != -1) {
+                       int pcpu = raw_smp_processor_id();
+
+                       pcpu = cpu_first_thread_sibling(pcpu);
+                       ++vcpu->stat.pthru_host;
+                       if (state->intr_cpu != pcpu) {
+                               ++vcpu->stat.pthru_bad_aff;
+                               xics_opal_rm_set_server(state->host_irq, pcpu);
+                       }
+                       state->intr_cpu = -1;
+               }
+       }
  bail:
        return check_too_hard(xics, icp);
 }
 
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+       unsigned long xics_phys;
+       int64_t rc;
+
+       rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+       if (rc)
+               eoi_rc = rc;
+
+       iosync();
+
+       /* EOI it */
+       xics_phys = local_paca->kvm_hstate.xics_phys;
+       _stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+       unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+       return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real mode. Handles vmalloc'ed addresses.
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+       unsigned long l;
+       unsigned int *raddr;
+       int cpu = smp_processor_id();
+
+       raddr = per_cpu_ptr(addr, cpu);
+       l = (unsigned long)raddr;
+
+       if (REGION_ID(l) == VMALLOC_REGION_ID) {
+               l = vmalloc_to_phys(raddr);
+               raddr = (unsigned int *)l;
+       }
+       ++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *    updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *    IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *    and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+       this_cpu_inc_rm(desc->kstat_irqs);
+       __this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+                                u32 xirr,
+                                struct kvmppc_irq_map *irq_map,
+                                struct kvmppc_passthru_irqmap *pimap)
+{
+       struct kvmppc_xics *xics;
+       struct kvmppc_icp *icp;
+       u32 irq;
+
+       irq = irq_map->v_hwirq;
+       xics = vcpu->kvm->arch.xics;
+       icp = vcpu->arch.icp;
+
+       kvmppc_rm_handle_irq_desc(irq_map->desc);
+       icp_rm_deliver_irq(xics, icp, irq);
+
+       /* EOI the interrupt */
+       icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+       if (check_too_hard(xics, icp) == H_TOO_HARD)
+               return 2;
+       else
+               return -2;
+}
+
 /*  --- Non-real mode XICS-related built-in routines ---  */
 
 /**
index 9756555..c3c1d1b 100644 (file)
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
        li      r3, 0           /* Don't wake on privileged (OS) doorbell */
        b       kvm_do_nap
 
+/*
+ * kvm_novcpu_wakeup
+ *     Entered from kvm_start_guest if kvm_hstate.napping is set
+ *     to NAPPING_NOVCPU
+ *             r2 = kernel TOC
+ *             r13 = paca
+ */
 kvm_novcpu_wakeup:
        ld      r1, HSTATE_HOST_R1(r13)
        ld      r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
 
+       /*
+        * Restore volatile registers since we could have called
+        * a C routine in kvmppc_check_wake_reason.
+        *      r5 = VCORE
+        */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+
        /* see if any other thread is already exiting */
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
 
        /* Check the wake reason in SRR1 to see why we got here */
        bl      kvmppc_check_wake_reason
+       /*
+        * kvmppc_check_wake_reason could invoke a C routine, but we
+        * have no volatile registers to restore when we return.
+        */
+
        cmpdi   r3, 0
        bge     kvm_no_guest
 
@@ -625,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 38:
 
 BEGIN_FTR_SECTION
-       /* DPDES is shared between threads */
+       /* DPDES and VTB are shared between threads */
        ld      r8, VCORE_DPDES(r5)
+       ld      r7, VCORE_VTB(r5)
        mtspr   SPRN_DPDES, r8
+       mtspr   SPRN_VTB, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        /* Mark the subcore state as inside guest */
@@ -787,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtspr   SPRN_CIABR, r7
        mtspr   SPRN_TAR, r8
        ld      r5, VCPU_IC(r4)
-       ld      r6, VCPU_VTB(r4)
-       mtspr   SPRN_IC, r5
-       mtspr   SPRN_VTB, r6
        ld      r8, VCPU_EBBHR(r4)
+       mtspr   SPRN_IC, r5
        mtspr   SPRN_EBBHR, r8
        ld      r5, VCPU_EBBRR(r4)
        ld      r6, VCPU_BESCR(r4)
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        cmpwi   r3, 512         /* 1 microsecond */
        blt     hdec_soon
 
+deliver_guest_interrupt:
        ld      r6, VCPU_CTR(r4)
        ld      r7, VCPU_XER(r4)
 
@@ -895,7 +915,6 @@ kvmppc_cede_reentry:                /* r4 = vcpu, r13 = paca */
        mtspr   SPRN_SRR0, r6
        mtspr   SPRN_SRR1, r7
 
-deliver_guest_interrupt:
        /* r11 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
         * set, we know the host wants us out so let's do it now
         */
        bl      kvmppc_read_intr
+
+       /*
+        * Restore the active volatile registers after returning from
+        * a C function.
+        */
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+
+       /*
+        * kvmppc_read_intr return codes:
+        *
+        * Exit to host (r3 > 0)
+        *   1 An interrupt is pending that needs to be handled by the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *
+        *   2 Passthrough that needs completion in the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+        *     to indicate to the host to complete handling the interrupt
+        *
+        * Before returning to guest, we check if any CPU is heading out
+        * to the host and, if so, we head out also. If no CPUs are heading
+        * out, we fall through to handle the return values <= 0 below.
+        *
+        * Return to guest (r3 <= 0)
+        *  0 No external interrupt is pending
+        * -1 A guest wakeup IPI (which has now been cleared)
+        *    In either case, we return to guest to deliver any pending
+        *    guest interrupts.
+        *
+        * -2 A PCI passthrough external interrupt was handled
+        *    (interrupt was delivered directly to guest)
+        *    Return to guest to deliver any pending guest interrupts.
+        */
+
+       cmpdi   r3, 1
+       ble     1f
+
+       /* Return code = 2 */
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+       stw     r12, VCPU_TRAP(r9)
+       b       guest_exit_cont
+
+1:     /* Return code <= 1 */
        cmpdi   r3, 0
        bgt     guest_exit_cont
 
-       /* Check if any CPU is heading out to the host, if so head out too */
+       /* Return code <= 0 */
 4:     ld      r5, HSTATE_KVM_VCORE(r13)
        lwz     r0, VCORE_ENTRY_EXIT(r5)
        cmpwi   r0, 0x100
@@ -1271,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        stw     r6, VCPU_PSPB(r9)
        std     r7, VCPU_FSCR(r9)
        mfspr   r5, SPRN_IC
-       mfspr   r6, SPRN_VTB
        mfspr   r7, SPRN_TAR
        std     r5, VCPU_IC(r9)
-       std     r6, VCPU_VTB(r9)
        std     r7, VCPU_TAR(r9)
        mfspr   r8, SPRN_EBBHR
        std     r8, VCPU_EBBHR(r9)
@@ -1501,9 +1562,11 @@ kvmhv_switch_to_host:
        isync
 
 BEGIN_FTR_SECTION
-       /* DPDES is shared between threads */
+       /* DPDES and VTB are shared between threads */
        mfspr   r7, SPRN_DPDES
+       mfspr   r8, SPRN_VTB
        std     r7, VCORE_DPDES(r5)
+       std     r8, VCORE_VTB(r5)
        /* clear DPDES so we don't get guest doorbells in the host */
        li      r8, 0
        mtspr   SPRN_DPDES, r8
@@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        ld      r29, VCPU_GPR(R29)(r4)
        ld      r30, VCPU_GPR(R30)(r4)
        ld      r31, VCPU_GPR(R31)(r4)
+
        /* Check the wake reason in SRR1 to see why we got here */
        bl      kvmppc_check_wake_reason
 
+       /*
+        * Restore volatile registers since we could have called a
+        * C routine in kvmppc_check_wake_reason
+        *      r4 = VCPU
+        * r3 tells us whether we need to return to host or not
+        * WARNING: r3 is checked further down;
+        * do not modify it until that check is done.
+        */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+
        /* clear our bit in vcore->napping_threads */
 34:    ld      r5,HSTATE_KVM_VCORE(r13)
        lbz     r7,HSTATE_PTID(r13)
@@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
        li      r0,0
        stb     r0,HSTATE_NAPPING(r13)
 
-       /* See if the wake reason means we need to exit */
+       /* See if the wake reason saved in r3 means we need to exit */
        stw     r12, VCPU_TRAP(r4)
        mr      r9, r4
        cmpdi   r3, 0
@@ -2297,10 +2370,14 @@ machine_check_realmode:
  *     0 if nothing needs to be done
  *     1 if something happened that needs to be handled by the host
  *     -1 if there was a guest wakeup (IPI or msgsnd)
+ *     -2 if we handled a PCI passthrough interrupt (returned by
+ *             kvmppc_read_intr only)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
  */
 kvmppc_check_wake_reason:
        mfspr   r6, SPRN_SRR1
@@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE
        rlwinm  r6, r6, 45-31, 0xe      /* P7 wake reason field is 3 bits */
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
        cmpwi   r6, 8                   /* was it an external interrupt? */
-       li      r12, BOOK3S_INTERRUPT_EXTERNAL
-       beq     kvmppc_read_intr        /* if so, see what it was */
+       beq     7f                      /* if so, see what it was */
        li      r3, 0
        li      r12, 0
        cmpwi   r6, 6                   /* was it the decrementer? */
@@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, 1
        blr
 
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- *     0 if no interrupt is pending
- *     1 if an interrupt is pending that needs to be handled by the host
- *     -1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
-       /* see if a host IPI is pending */
-       li      r3, 1
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne     1f
+       /* external interrupt - create a stack frame so we can call C */
+7:     mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
+       bl      kvmppc_read_intr
+       nop
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+       cmpdi   r3, 1
+       ble     1f
 
-       /* Now read the interrupt from the ICP */
-       ld      r6, HSTATE_XICS_PHYS(r13)
-       li      r7, XICS_XIRR
-       cmpdi   r6, 0
-       beq-    1f
-       lwzcix  r0, r6, r7
        /*
-        * Save XIRR for later. Since we get in in reverse endian on LE
-        * systems, save it byte reversed and fetch it back in host endian.
-        */
-       li      r3, HSTATE_SAVED_XIRR
-       STWX_BE r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
-       lwz     r3, HSTATE_SAVED_XIRR(r13)
-#else
-       mr      r3, r0
-#endif
-       rlwinm. r3, r3, 0, 0xffffff
-       sync
-       beq     1f                      /* if nothing pending in the ICP */
-
-       /* We found something in the ICP...
-        *
-        * If it's not an IPI, stash it in the PACA and return to
-        * the host, we don't (yet) handle directing real external
-        * interrupts directly to the guest
+        * A return code of 2 means a PCI passthrough interrupt, but
+        * we need to return to the host to complete handling the
+        * interrupt. The guest exit code expects the trap reason
+        * in r12.
         */
-       cmpwi   r3, XICS_IPI            /* if there is, is it an IPI? */
-       bne     42f
-
-       /* It's an IPI, clear the MFRR and EOI it */
-       li      r3, 0xff
-       li      r8, XICS_MFRR
-       stbcix  r3, r6, r8              /* clear the IPI */
-       stwcix  r0, r6, r7              /* EOI it */
-       sync
-
-       /* We need to re-check host IPI now in case it got set in the
-        * meantime. If it's clear, we bounce the interrupt to the
-        * guest
-        */
-       lbz     r0, HSTATE_HOST_IPI(r13)
-       cmpwi   r0, 0
-       bne-    43f
-
-       /* OK, it's an IPI for us */
-       li      r12, 0
-       li      r3, -1
-1:     blr
-
-42:    /* It's not an IPI and it's for the host. We saved a copy of XIRR in
-        * the PACA earlier, it will be picked up by the host ICP driver
-        */
-       li      r3, 1
-       b       1b
-
-43:    /* We raced with the host, we need to resend that IPI, bummer */
-       li      r0, IPI_PRIORITY
-       stbcix  r0, r6, r8              /* set the IPI */
-       sync
-       li      r3, 1
-       b       1b
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+       ld      r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+       addi    r1, r1, PPC_MIN_STKFRM
+       mtlr    r0
+       blr
 
 /*
  * Save away FP, VMX and VSX registers.
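The assembly above now funnels external-interrupt wakeups through kvmppc_read_intr, a C function, so all volatile registers must be treated as clobbered and the return value selects the trap reason: anything greater than 1 is a PCI passthrough interrupt the host must finish handling. A minimal C restatement of that caller-side contract; the constant values are illustrative assumptions, not the kernel's definitions.

#include <stdio.h>

/* Illustrative stand-ins; the real values live in asm/kvm_asm.h. */
#define BOOK3S_INTERRUPT_EXTERNAL   0x500
#define BOOK3S_INTERRUPT_HV_RM_HARD 0x5a0   /* assumed value for this sketch */

/* Mirrors the "cmpdi r3, 1; ble" selection done after bl kvmppc_read_intr. */
static int select_trap(int read_intr_ret)
{
	if (read_intr_ret <= 1)
		return BOOK3S_INTERRUPT_EXTERNAL;
	return BOOK3S_INTERRUPT_HV_RM_HARD;   /* passthrough: complete in host */
}

int main(void)
{
	printf("ret=0 -> trap %#x\n", select_trap(0));
	printf("ret=2 -> trap %#x\n", select_trap(2));
	return 0;
}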
index e76f79a..826c541 100644 (file)
@@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
         */
        vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
        vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
-       vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+       to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
        if (cpu_has_feature(CPU_FTR_ARCH_207S))
                vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
        svcpu->in_use = false;
@@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
        case PVR_POWER7:
        case PVR_POWER7p:
        case PVR_POWER8:
+       case PVR_POWER8E:
+       case PVR_POWER8NVL:
                vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
                        BOOK3S_HFLAG_NEW_TLBIE;
                break;
@@ -1361,6 +1363,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_HIOR:
                *val = get_reg_val(id, to_book3s(vcpu)->hior);
                break;
+       case KVM_REG_PPC_VTB:
+               *val = get_reg_val(id, to_book3s(vcpu)->vtb);
+               break;
        case KVM_REG_PPC_LPCR:
        case KVM_REG_PPC_LPCR_64:
                /*
@@ -1397,6 +1402,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
                to_book3s(vcpu)->hior = set_reg_val(id, *val);
                to_book3s(vcpu)->hior_explicit = true;
                break;
+       case KVM_REG_PPC_VTB:
+               to_book3s(vcpu)->vtb = set_reg_val(id, *val);
+               break;
        case KVM_REG_PPC_LPCR:
        case KVM_REG_PPC_LPCR_64:
                kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
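With KVM_REG_PPC_VTB now handled by the PR one-reg get/set paths as well, userspace can read the virtual timebase uniformly across HV and PR guests. A hedged userspace sketch: the vCPU file descriptor setup via KVM_CREATE_VCPU is assumed, and it needs powerpc uapi headers that define KVM_REG_PPC_VTB.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd is assumed to be a vCPU fd obtained through KVM_CREATE_VCPU. */
static int read_vtb(int vcpu_fd, uint64_t *vtb)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_VTB,
		.addr = (uintptr_t)vtb,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}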
index 05aa113..3bdc639 100644 (file)
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
                return 0;
        }
 
+       /* Record which CPU this arrived on for passed-through interrupts */
+       if (state->host_irq)
+               state->intr_cpu = raw_smp_processor_id();
+
        /* Attempt delivery */
        icp_deliver_irq(xics, NULL, irq);
 
@@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
        return H_SUCCESS;
 }
 
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 {
        struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
        struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 
        return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
 
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 {
@@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 /* -- Initialisation code etc. -- */
 
+static void xics_debugfs_irqmap(struct seq_file *m,
+                               struct kvmppc_passthru_irqmap *pimap)
+{
+       int i;
+
+       if (!pimap)
+               return;
+       seq_printf(m, "===========\nPIRQ mappings: %d maps\n===========\n",
+                               pimap->n_mapped);
+       for (i = 0; i < pimap->n_mapped; i++)  {
+               seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+                       pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+       }
+}
+
 static int xics_debug_show(struct seq_file *m, void *private)
 {
        struct kvmppc_xics *xics = m->private;
@@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
        t_check_resend = 0;
        t_reject = 0;
 
+       xics_debugfs_irqmap(m, kvm->arch.pimap);
+
        seq_printf(m, "=========\nICP state\n=========\n");
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 {
        struct kvmppc_xics *xics = kvm->arch.xics;
 
+       if (!xics)
+               return -ENODEV;
        return ics_deliver_irq(xics, irq, level);
 }
 
@@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
        return pin;
 }
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+                           unsigned long host_irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return;
+
+       ics->irq_state[idx].host_irq = host_irq;
+       ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+                           unsigned long host_irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return;
+
+       ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
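The new host_irq/intr_cpu bookkeeping is small enough to model in isolation: a source is marked as passed through via kvmppc_xics_set_mapped(), and from then on each delivery records the CPU it arrived on. The toy model below only illustrates that state machine and is not kernel code.

#include <stdio.h>

/* Minimal model of the per-source state this change adds. */
struct ics_irq_state_model {
	unsigned int host_irq;  /* non-zero once a host IRQ is mapped through */
	int intr_cpu;           /* CPU that last delivered it, -1 if unknown */
};

static void set_mapped(struct ics_irq_state_model *s, unsigned int host_irq)
{
	s->host_irq = host_irq;
	s->intr_cpu = -1;
}

static void deliver(struct ics_irq_state_model *s, int this_cpu)
{
	/* Mirrors ics_deliver_irq(): only passed-through sources record the CPU. */
	if (s->host_irq)
		s->intr_cpu = this_cpu;
}

int main(void)
{
	struct ics_irq_state_model s = { 0, -1 };

	set_mapped(&s, 42);
	deliver(&s, 3);
	printf("host_irq=%u intr_cpu=%d\n", s.host_irq, s.intr_cpu);
	return 0;
}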
index a46b954..2a50320 100644 (file)
@@ -42,6 +42,8 @@ struct ics_irq_state {
        u8  lsi;                /* level-sensitive interrupt */
        u8  asserted; /* Only for LSI */
        u8  exists;
+       int intr_cpu;
+       u32 host_irq;
 };
 
 /* Atomic ICP state, updated with a single compare & swap */
index 02b4672..df3f270 100644 (file)
@@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                if (type == KVMPPC_DEBUG_NONE)
                        continue;
 
-               if (type & !(KVMPPC_DEBUG_WATCH_READ |
+               if (type & ~(KVMPPC_DEBUG_WATCH_READ |
                             KVMPPC_DEBUG_WATCH_WRITE |
                             KVMPPC_DEBUG_BREAKPOINT))
                        return -EINVAL;
index 29911a0..ddbf8f0 100644 (file)
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
        char *virt;
        struct page **pages;
        struct tlbe_priv *privs[2] = {};
-       u64 *g2h_bitmap = NULL;
+       u64 *g2h_bitmap;
        size_t array_len;
        u32 sets;
        int num_pages, ret, i;
@@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
        num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
                    cfg->array / PAGE_SIZE;
-       pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+       pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;
 
        ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
        if (ret < 0)
-               goto err_pages;
+               goto free_pages;
 
        if (ret != num_pages) {
                num_pages = ret;
                ret = -EFAULT;
-               goto err_put_page;
+               goto put_pages;
        }
 
        virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
        if (!virt) {
                ret = -ENOMEM;
-               goto err_put_page;
+               goto put_pages;
        }
 
-       privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
-                          GFP_KERNEL);
-       privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
-                          GFP_KERNEL);
+       privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+       if (!privs[0]) {
+               ret = -ENOMEM;
+               goto put_pages;
+       }
 
-       if (!privs[0] || !privs[1]) {
+       privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+       if (!privs[1]) {
                ret = -ENOMEM;
-               goto err_privs;
+               goto free_privs_first;
        }
 
-       g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
-                            GFP_KERNEL);
+       g2h_bitmap = kcalloc(params.tlb_sizes[1],
+                            sizeof(*g2h_bitmap),
+                            GFP_KERNEL);
        if (!g2h_bitmap) {
                ret = -ENOMEM;
-               goto err_privs;
+               goto free_privs_second;
        }
 
        free_gtlb(vcpu_e500);
@@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
        kvmppc_recalc_tlb1map_range(vcpu_e500);
        return 0;
-
-err_privs:
-       kfree(privs[0]);
+ free_privs_second:
        kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+       kfree(privs[0]);
+ put_pages:
        for (i = 0; i < num_pages; i++)
                put_page(pages[i]);
-
-err_pages:
+ free_pages:
        kfree(pages);
        return ret;
 }
@@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
        struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
-       int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
-       int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
        if (e500_mmu_host_init(vcpu_e500))
-               goto err;
+               goto free_vcpu;
 
        vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
        vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
        vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
        vcpu_e500->gtlb_params[1].sets = 1;
 
-       vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+       vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+                                            KVM_E500_TLB1_SIZE,
+                                            sizeof(*vcpu_e500->gtlb_arch),
+                                            GFP_KERNEL);
        if (!vcpu_e500->gtlb_arch)
                return -ENOMEM;
 
        vcpu_e500->gtlb_offset[0] = 0;
        vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
 
-       vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
-                                         vcpu_e500->gtlb_params[0].entries,
+       vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+                                         sizeof(struct tlbe_ref),
                                          GFP_KERNEL);
        if (!vcpu_e500->gtlb_priv[0])
-               goto err;
+               goto free_vcpu;
 
-       vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
-                                         vcpu_e500->gtlb_params[1].entries,
+       vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+                                         sizeof(struct tlbe_ref),
                                          GFP_KERNEL);
        if (!vcpu_e500->gtlb_priv[1])
-               goto err;
+               goto free_vcpu;
 
-       vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
-                                         vcpu_e500->gtlb_params[1].entries,
+       vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+                                         sizeof(*vcpu_e500->g2h_tlb1_map),
                                          GFP_KERNEL);
        if (!vcpu_e500->g2h_tlb1_map)
-               goto err;
+               goto free_vcpu;
 
        vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
        kvmppc_recalc_tlb1map_range(vcpu_e500);
        return 0;
-
-err:
+ free_vcpu:
        free_gtlb(vcpu_e500);
        return -1;
 }
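The e500 TLB setup above switches to kcalloc()/kmalloc_array(), whose element-count multiplication is overflow-checked, and renames the error labels so each one frees exactly what was allocated before the failure. A compact userspace sketch of that unwind shape, with calloc() standing in for kcalloc():

#include <stdlib.h>

/* Each label undoes only the allocations that succeeded before the failure. */
static int setup(size_t n0, size_t n1, void **out0, void **out1, void **map)
{
	void *p0, *p1, *m;

	p0 = calloc(n0, sizeof(long));
	if (!p0)
		goto fail;
	p1 = calloc(n1, sizeof(long));
	if (!p1)
		goto free_first;
	m = calloc(n1, sizeof(unsigned long long));
	if (!m)
		goto free_second;

	*out0 = p0;
	*out1 = p1;
	*map  = m;
	return 0;

free_second:
	free(p1);
free_first:
	free(p0);
fail:
	return -1;
}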
index 6ce40dd..70963c8 100644 (file)
@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -436,6 +438,16 @@ err_out:
        return -EINVAL;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        unsigned int i;
@@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+       return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+               (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+                                    struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+       struct kvm *kvm = irqfd->kvm;
+
+       if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+               return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+       return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+                                     struct irq_bypass_producer *prod)
+{
+       struct kvm_kernel_irqfd *irqfd =
+               container_of(cons, struct kvm_kernel_irqfd, consumer);
+       struct kvm *kvm = irqfd->kvm;
+
+       if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+               kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
@@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
        return r;
 }
 
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+       if (kvm->arch.mpic)
+               return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+       if (kvm->arch.xics)
+               return true;
+#endif
+       return false;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
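The new arch hooks above only dispatch to kvm_ops; they become useful once a producer (typically a passthrough driver) registers the host IRQ with the irqbypass manager using a token that matches the irqfd consumer's. A hypothetical producer-side sketch, not the VFIO implementation:

#include <linux/irqbypass.h>

/* Illustrative wiring: advertise a host IRQ so the consumer hooks above can
 * be called back.  For KVM irqfds the matching token is the eventfd context. */
static int advertise_passthru_irq(struct irq_bypass_producer *prod,
				  void *token, int host_irq)
{
	prod->token = token;
	prod->irq   = host_irq;
	return irq_bypass_register_producer(prod);
}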
index 33d9daf..fb21990 100644 (file)
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
                   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
 );
 
+TRACE_EVENT(kvmppc_vcore_wakeup,
+       TP_PROTO(int do_sleep, __u64 ns),
+
+       TP_ARGS(do_sleep, ns),
+
+       TP_STRUCT__entry(
+               __field(__u64,  ns)
+               __field(int,    waited)
+               __field(pid_t,  tgid)
+       ),
+
+       TP_fast_assign(
+               __entry->ns     = ns;
+               __entry->waited = do_sleep;
+               __entry->tgid   = current->tgid;
+       ),
+
+       TP_printk("%s time %lld ns, tgid=%d",
+               __entry->waited ? "wait" : "poll",
+               __entry->ns, __entry->tgid)
+);
+
 TRACE_EVENT(kvmppc_run_vcpu_enter,
        TP_PROTO(struct kvm_vcpu *vcpu),
 
index 0e4e965..83ddc0e 100644 (file)
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
 }
 #endif
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                        int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                size   = MMU_PAGE_4K;
                a_size = MMU_PAGE_4K;
        } else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_size = __hpte_actual_psize(lp, size);
-                       if (a_size != -1)
-                               break;
-               }
+               size = hpte_page_sizes[lp] & 0xf;
+               a_size = hpte_page_sizes[lp] >> 4;
        }
        /* This works for all page sizes, and for 256M and 1T segments */
        if (cpu_has_feature(CPU_FTR_ARCH_300))
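hpte_decode() now resolves both page sizes with a single table lookup: the low nibble of hpte_page_sizes[lp] is the base page size index and the high nibble the actual page size index. A standalone demonstration of that packing; the index values are made up for the example.

#include <stdio.h>

static unsigned char pack(unsigned int actual, unsigned int base)
{
	return (unsigned char)((actual << 4) | base);
}

int main(void)
{
	unsigned char entry  = pack(4, 2);   /* illustrative indices only */
	unsigned int  base   = entry & 0xf;
	unsigned int  actual = entry >> 4;

	printf("base index %u, actual index %u\n", base, actual);
	return 0;
}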
index 0821556..ef3ae89 100644 (file)
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 EXPORT_SYMBOL_GPL(mmu_psize_defs);
 
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations.  Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE.  For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ *        PTE LP     actual page size
+ *    rrrr rrrz                >=8KB
+ *    rrrr rrzz                >=16KB
+ *    rrrr rzzz                >=32KB
+ *    rrrr zzzz                >=64KB
+ *    ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+       long int ap, bp;
+       long int shift, penc;
+
+       for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+               if (!mmu_psize_defs[bp].shift)
+                       continue;       /* not a supported page size */
+               for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+                       penc = mmu_psize_defs[bp].penc[ap];
+                       if (penc == -1)
+                               continue;
+                       shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+                       if (shift <= 0)
+                               continue;       /* should never happen */
+                       /*
+                        * For page sizes less than 1MB, this loop
+                        * replicates the entry for all possible values
+                        * of the rrrr bits.
+                        */
+                       while (penc < (1 << LP_BITS)) {
+                               hpte_page_sizes[penc] = (ap << 4) | bp;
+                               penc += 1 << shift;
+                       }
+               }
+       }
+}
+
 static void __init htab_init_page_sizes(void)
 {
+       init_hpte_page_sizes();
+
        if (!debug_pagealloc_enabled()) {
                /*
                 * Pick a size for the linear mapping. Currently, we only
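Because the low-order penc bits are shared with RPN bits for page sizes below 1MB, init_hpte_page_sizes() has to replicate each entry for every possible value of the rrrr bits, i.e. once per 1 << shift step through the table. A standalone illustration of that replication; LP_BITS and the indices here are assumptions for the demo, not the kernel's values.

#include <stdio.h>

#define LP_BITS 8   /* assumed for this demonstration */

int main(void)
{
	unsigned char table[1 << LP_BITS] = { 0 };
	int penc = 1, shift = 4;             /* illustrative 64K-style encoding */
	int filled = 0;

	for (int p = penc; p < (1 << LP_BITS); p += 1 << shift) {
		table[p] = (4 << 4) | 0;     /* actual index 4, base index 0 */
		filled++;
	}

	printf("entries filled: %d, table[penc]=%#x\n", filled, table[penc]);
	return 0;
}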
index 3d29d40..44d2d84 100644 (file)
@@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte,               OPAL_PCI_CONFIG_WRITE_BYTE);
 OPAL_CALL(opal_pci_config_write_half_word,     OPAL_PCI_CONFIG_WRITE_HALF_WORD);
 OPAL_CALL(opal_pci_config_write_word,          OPAL_PCI_CONFIG_WRITE_WORD);
 OPAL_CALL(opal_set_xive,                       OPAL_SET_XIVE);
+OPAL_CALL_REAL(opal_rm_set_xive,               OPAL_SET_XIVE);
 OPAL_CALL(opal_get_xive,                       OPAL_GET_XIVE);
 OPAL_CALL(opal_register_exception_handler,     OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
 OPAL_CALL(opal_pci_eeh_freeze_status,          OPAL_PCI_EEH_FREEZE_STATUS);
index 38a5c65..d314ecc 100644 (file)
@@ -2718,15 +2718,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-       struct irq_chip *chip = irq_data_get_irq_chip(d);
        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
                                           ioda.irq_chip);
+
+       return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
        int64_t rc;
+       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+       struct irq_chip *chip = irq_data_get_irq_chip(d);
 
-       rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+       rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
        WARN_ON_ONCE(rc);
 
        icp_native_eoi(d);
@@ -2756,6 +2762,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
        irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+       return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
                                  unsigned int hwirq, unsigned int virq,
                                  unsigned int is_64, struct msi_msg *msg)
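pnv_opal_pci_msi_eoi() and is_pnv_opal_msi() are exported so that code outside pci-ioda.c can complete an MSI EOI only when the irq_chip really is the IODA2 one. A hedged caller sketch; the function below is hypothetical and assumes the usual irq headers and the pnv declarations are visible.

#include <linux/irq.h>

static int64_t passthru_msi_eoi(struct irq_data *d)
{
	struct irq_chip *chip = irq_data_get_irq_chip(d);

	if (!is_pnv_opal_msi(chip))
		return -1;   /* not an IODA2 MSI chip, leave it alone */

	return pnv_opal_pci_msi_eoi(chip, irqd_to_hwirq(d));
}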
index 8e5daf7..a41faf3 100644 (file)
@@ -28,7 +28,7 @@
 
 #define KVM_S390_BSCA_CPU_SLOTS 64
 #define KVM_S390_ESCA_CPU_SLOTS 248
-#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS
+#define KVM_MAX_VCPUS 255
 #define KVM_USER_MEM_SLOTS 32
 
 /*
@@ -245,72 +245,72 @@ struct sie_page {
 } __packed;
 
 struct kvm_vcpu_stat {
-       u32 exit_userspace;
-       u32 exit_null;
-       u32 exit_external_request;
-       u32 exit_external_interrupt;
-       u32 exit_stop_request;
-       u32 exit_validity;
-       u32 exit_instruction;
-       u32 exit_pei;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 instruction_lctl;
-       u32 instruction_lctlg;
-       u32 instruction_stctl;
-       u32 instruction_stctg;
-       u32 exit_program_interruption;
-       u32 exit_instr_and_program;
-       u32 exit_operation_exception;
-       u32 deliver_external_call;
-       u32 deliver_emergency_signal;
-       u32 deliver_service_signal;
-       u32 deliver_virtio_interrupt;
-       u32 deliver_stop_signal;
-       u32 deliver_prefix_signal;
-       u32 deliver_restart_signal;
-       u32 deliver_program_int;
-       u32 deliver_io_int;
-       u32 exit_wait_state;
-       u32 instruction_pfmf;
-       u32 instruction_stidp;
-       u32 instruction_spx;
-       u32 instruction_stpx;
-       u32 instruction_stap;
-       u32 instruction_storage_key;
-       u32 instruction_ipte_interlock;
-       u32 instruction_stsch;
-       u32 instruction_chsc;
-       u32 instruction_stsi;
-       u32 instruction_stfl;
-       u32 instruction_tprot;
-       u32 instruction_sie;
-       u32 instruction_essa;
-       u32 instruction_sthyi;
-       u32 instruction_sigp_sense;
-       u32 instruction_sigp_sense_running;
-       u32 instruction_sigp_external_call;
-       u32 instruction_sigp_emergency;
-       u32 instruction_sigp_cond_emergency;
-       u32 instruction_sigp_start;
-       u32 instruction_sigp_stop;
-       u32 instruction_sigp_stop_store_status;
-       u32 instruction_sigp_store_status;
-       u32 instruction_sigp_store_adtl_status;
-       u32 instruction_sigp_arch;
-       u32 instruction_sigp_prefix;
-       u32 instruction_sigp_restart;
-       u32 instruction_sigp_init_cpu_reset;
-       u32 instruction_sigp_cpu_reset;
-       u32 instruction_sigp_unknown;
-       u32 diagnose_10;
-       u32 diagnose_44;
-       u32 diagnose_9c;
-       u32 diagnose_258;
-       u32 diagnose_308;
-       u32 diagnose_500;
+       u64 exit_userspace;
+       u64 exit_null;
+       u64 exit_external_request;
+       u64 exit_external_interrupt;
+       u64 exit_stop_request;
+       u64 exit_validity;
+       u64 exit_instruction;
+       u64 exit_pei;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 instruction_lctl;
+       u64 instruction_lctlg;
+       u64 instruction_stctl;
+       u64 instruction_stctg;
+       u64 exit_program_interruption;
+       u64 exit_instr_and_program;
+       u64 exit_operation_exception;
+       u64 deliver_external_call;
+       u64 deliver_emergency_signal;
+       u64 deliver_service_signal;
+       u64 deliver_virtio_interrupt;
+       u64 deliver_stop_signal;
+       u64 deliver_prefix_signal;
+       u64 deliver_restart_signal;
+       u64 deliver_program_int;
+       u64 deliver_io_int;
+       u64 exit_wait_state;
+       u64 instruction_pfmf;
+       u64 instruction_stidp;
+       u64 instruction_spx;
+       u64 instruction_stpx;
+       u64 instruction_stap;
+       u64 instruction_storage_key;
+       u64 instruction_ipte_interlock;
+       u64 instruction_stsch;
+       u64 instruction_chsc;
+       u64 instruction_stsi;
+       u64 instruction_stfl;
+       u64 instruction_tprot;
+       u64 instruction_sie;
+       u64 instruction_essa;
+       u64 instruction_sthyi;
+       u64 instruction_sigp_sense;
+       u64 instruction_sigp_sense_running;
+       u64 instruction_sigp_external_call;
+       u64 instruction_sigp_emergency;
+       u64 instruction_sigp_cond_emergency;
+       u64 instruction_sigp_start;
+       u64 instruction_sigp_stop;
+       u64 instruction_sigp_stop_store_status;
+       u64 instruction_sigp_store_status;
+       u64 instruction_sigp_store_adtl_status;
+       u64 instruction_sigp_arch;
+       u64 instruction_sigp_prefix;
+       u64 instruction_sigp_restart;
+       u64 instruction_sigp_init_cpu_reset;
+       u64 instruction_sigp_cpu_reset;
+       u64 instruction_sigp_unknown;
+       u64 diagnose_10;
+       u64 diagnose_44;
+       u64 diagnose_9c;
+       u64 diagnose_258;
+       u64 diagnose_308;
+       u64 diagnose_500;
 };
 
 #define PGM_OPERATION                  0x01
@@ -577,7 +577,7 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-       u32 remote_tlb_flush;
+       ulong remote_tlb_flush;
 };
 
 struct kvm_arch_memory_slot {
index 1f95cc1..f3df9e0 100644 (file)
@@ -125,6 +125,7 @@ int main(void)
        OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
        OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
        OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
+       OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
        OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
        OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
        OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
index 5420020..4aa8a7e 100644 (file)
@@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
        tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
 
        switch (code) {
+       case PGM_PROTECTION:
+               switch (prot) {
+               case PROT_TYPE_ALC:
+                       tec->b60 = 1;
+                       /* FALL THROUGH */
+               case PROT_TYPE_DAT:
+                       tec->b61 = 1;
+                       break;
+               default: /* LA and KEYC set b61 to 0, other params undefined */
+                       return code;
+               }
+               /* FALL THROUGH */
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
        case PGM_REGION_FIRST_TRANS:
@@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
                /*
                 * op_access_id only applies to MOVE_PAGE -> set bit 61
                 * exc_access_id has to be set to 0 for some instructions. Both
-                * cases have to be handled by the caller. We can always store
-                * exc_access_id, as it is undefined for non-ar cases.
+                * cases have to be handled by the caller.
                 */
                tec->addr = gva >> PAGE_SHIFT;
                tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
@@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
        case PGM_ASTE_VALIDITY:
        case PGM_ASTE_SEQUENCE:
        case PGM_EXTENDED_AUTHORITY:
+               /*
+                * We can always store exc_access_id, as it is
+                * undefined for non-ar cases. It is undefined for
+                * most DAT protection exceptions.
+                */
                pgm->exc_access_id = ar;
                break;
-       case PGM_PROTECTION:
-               switch (prot) {
-               case PROT_TYPE_ALC:
-                       tec->b60 = 1;
-                       /* FALL THROUGH */
-               case PROT_TYPE_DAT:
-                       tec->b61 = 1;
-                       tec->addr = gva >> PAGE_SHIFT;
-                       tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
-                       tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
-                       /* exc_access_id is undefined for most cases */
-                       pgm->exc_access_id = ar;
-                       break;
-               default: /* LA and KEYC set b61 to 0, other params undefined */
-                       break;
-               }
-               break;
        }
        return code;
 }
index 31a0533..d7c6a7f 100644 (file)
@@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
 int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                            struct kvm_guest_debug *dbg)
 {
-       int ret = 0, nr_wp = 0, nr_bp = 0, i, size;
+       int ret = 0, nr_wp = 0, nr_bp = 0, i;
        struct kvm_hw_breakpoint *bp_data = NULL;
        struct kvm_hw_wp_info_arch *wp_info = NULL;
        struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
        else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
                return -EINVAL;
 
-       size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint);
-       bp_data = kmalloc(size, GFP_KERNEL);
-       if (!bp_data) {
-               ret = -ENOMEM;
-               goto error;
-       }
-
-       if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
-               ret = -EFAULT;
-               goto error;
-       }
+       bp_data = memdup_user(dbg->arch.hw_bp,
+                             sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+       if (IS_ERR(bp_data))
+               return PTR_ERR(bp_data);
 
        for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
                switch (bp_data[i].type) {
@@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                }
        }
 
-       size = nr_wp * sizeof(struct kvm_hw_wp_info_arch);
-       if (size > 0) {
-               wp_info = kmalloc(size, GFP_KERNEL);
+       if (nr_wp > 0) {
+               wp_info = kmalloc_array(nr_wp,
+                                       sizeof(*wp_info),
+                                       GFP_KERNEL);
                if (!wp_info) {
                        ret = -ENOMEM;
                        goto error;
                }
        }
-       size = nr_bp * sizeof(struct kvm_hw_bp_info_arch);
-       if (size > 0) {
-               bp_info = kmalloc(size, GFP_KERNEL);
+       if (nr_bp > 0) {
+               bp_info = kmalloc_array(nr_bp,
+                                       sizeof(*bp_info),
+                                       GFP_KERNEL);
                if (!bp_info) {
                        ret = -ENOMEM;
                        goto error;
@@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
        vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
 }
 
+#define PER_CODE_MASK          (PER_EVENT_MASK >> 24)
+#define PER_CODE_BRANCH                (PER_EVENT_BRANCH >> 24)
+#define PER_CODE_IFETCH                (PER_EVENT_IFETCH >> 24)
+#define PER_CODE_STORE         (PER_EVENT_STORE >> 24)
+#define PER_CODE_STORE_REAL    (PER_EVENT_STORE_REAL >> 24)
+
 #define per_bp_event(code) \
-                       (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH))
+                       (code & (PER_CODE_IFETCH | PER_CODE_BRANCH))
 #define per_write_wp_event(code) \
-                       (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL))
+                       (code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
 
 static int debug_exit_required(struct kvm_vcpu *vcpu)
 {
-       u32 perc = (vcpu->arch.sie_block->perc << 24);
+       u8 perc = vcpu->arch.sie_block->perc;
        struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
        struct kvm_hw_wp_info_arch *wp_info = NULL;
        struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -444,7 +445,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
        const u8 ilen = kvm_s390_get_ilen(vcpu);
        struct kvm_s390_pgm_info pgm_info = {
                .code = PGM_PER,
-               .per_code = PER_EVENT_IFETCH >> 24,
+               .per_code = PER_CODE_IFETCH,
                .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
        };
 
@@ -458,33 +459,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
 
 static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
-       u32 perc = vcpu->arch.sie_block->perc << 24;
+       const u8 perc = vcpu->arch.sie_block->perc;
        u64 peraddr = vcpu->arch.sie_block->peraddr;
        u64 addr = vcpu->arch.sie_block->gpsw.addr;
        u64 cr9 = vcpu->arch.sie_block->gcr[9];
        u64 cr10 = vcpu->arch.sie_block->gcr[10];
        u64 cr11 = vcpu->arch.sie_block->gcr[11];
        /* filter all events, demanded by the guest */
-       u32 guest_perc = perc & cr9 & PER_EVENT_MASK;
+       u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
 
        if (!guest_per_enabled(vcpu))
                guest_perc = 0;
 
        /* filter "successful-branching" events */
-       if (guest_perc & PER_EVENT_BRANCH &&
+       if (guest_perc & PER_CODE_BRANCH &&
            cr9 & PER_CONTROL_BRANCH_ADDRESS &&
            !in_addr_range(addr, cr10, cr11))
-               guest_perc &= ~PER_EVENT_BRANCH;
+               guest_perc &= ~PER_CODE_BRANCH;
 
        /* filter "instruction-fetching" events */
-       if (guest_perc & PER_EVENT_IFETCH &&
+       if (guest_perc & PER_CODE_IFETCH &&
            !in_addr_range(peraddr, cr10, cr11))
-               guest_perc &= ~PER_EVENT_IFETCH;
+               guest_perc &= ~PER_CODE_IFETCH;
 
        /* All other PER events will be given to the guest */
        /* TODO: Check altered address/address space */
 
-       vcpu->arch.sie_block->perc = guest_perc >> 24;
+       vcpu->arch.sie_block->perc = guest_perc;
 
        if (!guest_perc)
                vcpu->arch.sie_block->iprcc &= ~PGM_PER;
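kvm_s390_import_bp_data() now uses memdup_user(), which folds the allocate-and-copy pair into one call and reports failure through ERR_PTR. The wrapper below just restates that shape; the struct name comes from the diff, while the helper itself is illustrative and assumes the usual uaccess headers.

#include <linux/err.h>
#include <linux/string.h>

static int import_bp_array(struct kvm_hw_breakpoint __user *uptr,
			   unsigned int n, struct kvm_hw_breakpoint **out)
{
	struct kvm_hw_breakpoint *bp;

	bp = memdup_user(uptr, n * sizeof(*bp));
	if (IS_ERR(bp))
		return PTR_ERR(bp);

	*out = bp;
	return 0;
}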
index dfd0ca2..1cab8a1 100644 (file)
@@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = {
        [0x01] = kvm_s390_handle_01,
        [0x82] = kvm_s390_handle_lpsw,
        [0x83] = kvm_s390_handle_diag,
+       [0xaa] = kvm_s390_handle_aa,
        [0xae] = kvm_s390_handle_sigp,
        [0xb2] = kvm_s390_handle_b2,
        [0xb6] = kvm_s390_handle_stctl,
index 24524c0..be4db07 100644 (file)
@@ -24,6 +24,8 @@
 #include <asm/sclp.h>
 #include <asm/isc.h>
 #include <asm/gmap.h>
+#include <asm/switch_to.h>
+#include <asm/nmi.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace-s390.h"
@@ -40,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
        if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
                return 0;
 
+       BUG_ON(!kvm_s390_use_sca_entries());
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -68,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
 {
        int expect, rc;
 
+       BUG_ON(!kvm_s390_use_sca_entries());
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -109,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        int rc, expect;
 
+       if (!kvm_s390_use_sca_entries())
+               return;
        atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
@@ -400,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
+static int __write_machine_check(struct kvm_vcpu *vcpu,
+                                struct kvm_s390_mchk_info *mchk)
+{
+       unsigned long ext_sa_addr;
+       freg_t fprs[NUM_FPRS];
+       union mci mci;
+       int rc;
+
+       mci.val = mchk->mcic;
+       /* take care of lazy register loading via vcpu load/put */
+       save_fpu_regs();
+       save_access_regs(vcpu->run->s.regs.acrs);
+
+       /* Extended save area */
+       rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
+                           sizeof(unsigned long));
+       /* Only bits 0-53 are used for address formation */
+       ext_sa_addr &= ~0x3ffUL;
+       if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
+               if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs,
+                                   512))
+                       mci.vr = 0;
+       } else {
+               mci.vr = 0;
+       }
+
+       /* General interruption information */
+       rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID);
+       rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
+                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
+
+       /* Register-save areas */
+       if (MACHINE_HAS_VX) {
+               convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
+               rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
+       } else {
+               rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA,
+                                    vcpu->run->s.regs.fprs, 128);
+       }
+       rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
+                            vcpu->run->s.regs.gprs, 128);
+       rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+                          (u32 __user *) __LC_FP_CREG_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
+                          (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu),
+                          (u64 __user *) __LC_CPU_TIMER_SAVE_AREA);
+       rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8,
+                          (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA);
+       rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA,
+                            &vcpu->run->s.regs.acrs, 64);
+       rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA,
+                            &vcpu->arch.sie_block->gcr, 128);
+
+       /* Extended interruption information */
+       rc |= put_guest_lc(vcpu, mchk->ext_damage_code,
+                          (u32 __user *) __LC_EXT_DAMAGE_CODE);
+       rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
+                          (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
+       rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout,
+                            sizeof(mchk->fixed_logout));
+       return rc ? -EFAULT : 0;
+}
+
 static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_mchk_info mchk = {};
-       unsigned long adtl_status_addr;
        int deliver = 0;
        int rc = 0;
 
@@ -446,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
                                                 KVM_S390_MCHK,
                                                 mchk.cr14, mchk.mcic);
-
-               rc  = kvm_s390_vcpu_store_status(vcpu,
-                                                KVM_S390_STORE_STATUS_PREFIXED);
-               rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR,
-                                   &adtl_status_addr,
-                                   sizeof(unsigned long));
-               rc |= kvm_s390_vcpu_store_adtl_status(vcpu,
-                                                     adtl_status_addr);
-               rc |= put_guest_lc(vcpu, mchk.mcic,
-                                  (u64 __user *) __LC_MCCK_CODE);
-               rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
-                                  (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
-               rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
-                                    &mchk.fixed_logout,
-                                    sizeof(mchk.fixed_logout));
-               rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
-                                    &vcpu->arch.sie_block->gpsw,
-                                    sizeof(psw_t));
-               rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
-                                   &vcpu->arch.sie_block->gpsw,
-                                   sizeof(psw_t));
+               rc = __write_machine_check(vcpu, &mchk);
        }
-       return rc ? -EFAULT : 0;
+       return rc;
 }
 
 static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
index 7e8cb6a..9c7a1ec 100644 (file)
@@ -384,7 +384,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_S390_BSCA_CPU_SLOTS;
-               if (sclp.has_esca && sclp.has_64bscao)
+               if (!kvm_s390_use_sca_entries())
+                       r = KVM_MAX_VCPUS;
+               else if (sclp.has_esca && sclp.has_64bscao)
                        r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
@@ -1498,6 +1500,16 @@ out_err:
        return rc;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
@@ -1561,6 +1573,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 
 static void sca_del_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!kvm_s390_use_sca_entries())
+               return;
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1578,6 +1592,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
 
 static void sca_add_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!kvm_s390_use_sca_entries()) {
+               struct bsca_block *sca = vcpu->kvm->arch.sca;
+
+               /* we still need the basic sca for the ipte control */
+               vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+               vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+       }
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1658,6 +1679,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
 {
        int rc;
 
+       if (!kvm_s390_use_sca_entries()) {
+               if (id < KVM_MAX_VCPUS)
+                       return true;
+               return false;
+       }
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
        if (!sclp.has_esca || !sclp.has_64bscao)
@@ -1946,8 +1972,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
                vcpu->arch.sie_block->eca |= 0x10000000U;
-       if (test_kvm_facility(vcpu->kvm, 64))
-               vcpu->arch.sie_block->ecb3 |= 0x01;
        if (test_kvm_facility(vcpu->kvm, 129)) {
                vcpu->arch.sie_block->eca |= 0x00020000;
                vcpu->arch.sie_block->ecd |= 0x20000000;
@@ -2704,6 +2728,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
                        kvm_clear_async_pf_completion_queue(vcpu);
        }
+       /*
+        * If userspace sets the riccb (e.g. after migration) to a valid state,
+        * we should enable RI here instead of doing the lazy enablement.
+        */
+       if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+           test_kvm_facility(vcpu->kvm, 64)) {
+               struct runtime_instr_cb *riccb =
+                       (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+
+               if (riccb->valid)
+                       vcpu->arch.sie_block->ecb3 |= 0x01;
+       }
+
        kvm_run->kvm_dirty_regs = 0;
 }
 
@@ -2847,38 +2884,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
        return kvm_s390_store_status_unloaded(vcpu, addr);
 }
 
-/*
- * store additional status at address
- */
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long gpa)
-{
-       /* Only bits 0-53 are used for address formation */
-       if (!(gpa & ~0x3ff))
-               return 0;
-
-       return write_guest_abs(vcpu, gpa & ~0x3ff,
-                              (void *)&vcpu->run->s.regs.vrs, 512);
-}
-
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-       if (!test_kvm_facility(vcpu->kvm, 129))
-               return 0;
-
-       /*
-        * The guest VXRS are in the host VXRs due to the lazy
-        * copying in vcpu load/put. We can simply call save_fpu_regs()
-        * to save the current register state because we are in the
-        * middle of a load/put cycle.
-        *
-        * Let's update our copies before we save it into the save area.
-        */
-       save_fpu_regs();
-
-       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
-}
-
 static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
        kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
index b843286..3a4e97f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kvm_host.h>
 #include <asm/facility.h>
 #include <asm/processor.h>
+#include <asm/sclp.h>
 
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
@@ -245,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 
 /* implemented in priv.c */
 int is_valid_psw(psw_t *psw);
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
@@ -273,10 +275,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu);
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
@@ -389,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
 
        return &sca->ipte_control;
 }
+static inline int kvm_s390_use_sca_entries(void)
+{
+       /*
+        * Without SIGP interpretation, only SRS interpretation (if available)
+        * might use the entries. By not setting the entries and keeping them
+        * invalid, hardware will not access them but will intercept instead.
+        */
+       return sclp.has_sigpif;
+}
 #endif
index 4616038..e184353 100644 (file)
 #include "kvm-s390.h"
 #include "trace.h"
 
+static int handle_ri(struct kvm_vcpu *vcpu)
+{
+       if (test_kvm_facility(vcpu->kvm, 64)) {
+               vcpu->arch.sie_block->ecb3 |= 0x01;
+               kvm_s390_retry_instr(vcpu);
+               return 0;
+       } else
+               return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
+{
+       if ((vcpu->arch.sie_block->ipa & 0xf) <= 4)
+               return handle_ri(vcpu);
+       else
+               return -EOPNOTSUPP;
+}
+
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
@@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
 static const intercept_handler_t eb_handlers[256] = {
        [0x2f] = handle_lctlg,
        [0x25] = handle_stctg,
+       [0x60] = handle_ri,
+       [0x61] = handle_ri,
+       [0x62] = handle_ri,
 };
 
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
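handle_ri() implements lazy enablement: the first intercepted runtime-instrumentation instruction sets the ECB3 bit and is retried, so later RI instructions run without intercepting, while guests without the facility get a PGM_OPERATION. A standalone toy model of that pattern:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
	bool facility_available;
	bool ri_enabled;         /* stands in for the ECB3 bit */
};

static int handle_ri_model(struct vcpu_model *v)
{
	if (!v->facility_available)
		return -1;       /* would inject PGM_OPERATION */
	v->ri_enabled = true;    /* enable in the control block ... */
	return 0;                /* ... and retry the same instruction */
}

int main(void)
{
	struct vcpu_model v = { .facility_available = true };

	printf("rc=%d ri_enabled=%d\n", handle_ri_model(&v), v.ri_enabled);
	return 0;
}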
index 95eccd4..5378397 100644 (file)
@@ -139,7 +139,7 @@ static void ftrace_mod_code(void)
                clear_mod_flag();
 }
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
        if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
                smp_rmb();
@@ -150,7 +150,7 @@ void ftrace_nmi_enter(void)
        smp_mb();
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
        /* Finish all executions before clearing nmi_running */
        smp_mb();
index f5b6537..666d5ba 100644 (file)
@@ -1744,6 +1744,7 @@ unsigned long sun4v_vintr_set_target(unsigned long dev_handle,
 
 #define HV_PCI_MAP_ATTR_READ           0x01
 #define HV_PCI_MAP_ATTR_WRITE          0x02
+#define HV_PCI_MAP_ATTR_RELAXED_ORDER  0x04
 
 #define HV_PCI_DEVICE_BUILD(b,d,f)     \
        ((((b) & 0xff) << 16) | \
index cd83be5..b0377db 100644 (file)
@@ -5,7 +5,7 @@
 
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/slab.h>
 #include <linux/context_tracking.h>
index 61c6f93..db57d8a 100644 (file)
 #define DRIVER_NAME    "pci_sun4v"
 #define PFX            DRIVER_NAME ": "
 
-static unsigned long vpci_major = 1;
-static unsigned long vpci_minor = 1;
+static unsigned long vpci_major;
+static unsigned long vpci_minor;
+
+struct vpci_version {
+       unsigned long major;
+       unsigned long minor;
+};
+
+/* Ordered from largest major to lowest */
+static struct vpci_version vpci_versions[] = {
+       { .major = 2, .minor = 0 },
+       { .major = 1, .minor = 1 },
+};
 
 #define PGLIST_NENTS   (PAGE_SIZE / sizeof(u64))
 
@@ -67,6 +78,10 @@ static long iommu_batch_flush(struct iommu_batch *p)
        u64 *pglist = p->pglist;
        unsigned long npages = p->npages;
 
+       /* VPCI maj=1, min=[0,1] only supports read and write */
+       if (vpci_major < 2)
+               prot &= (HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE);
+
        while (npages != 0) {
                long num;
 
@@ -133,6 +148,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
                                   unsigned long attrs)
 {
        unsigned long flags, order, first_page, npages, n;
+       unsigned long prot = 0;
        struct iommu *iommu;
        struct page *page;
        void *ret;
@@ -146,6 +162,9 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 
        npages = size >> IO_PAGE_SHIFT;
 
+       if (attrs & DMA_ATTR_WEAK_ORDERING)
+               prot = HV_PCI_MAP_ATTR_RELAXED_ORDER;
+
        nid = dev->archdata.numa_node;
        page = alloc_pages_node(nid, gfp, order);
        if (unlikely(!page))
@@ -169,7 +188,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
        local_irq_save(flags);
 
        iommu_batch_start(dev,
-                         (HV_PCI_MAP_ATTR_READ |
+                         (HV_PCI_MAP_ATTR_READ | prot |
                           HV_PCI_MAP_ATTR_WRITE),
                          entry);
 
@@ -266,6 +285,9 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
        if (direction != DMA_TO_DEVICE)
                prot |= HV_PCI_MAP_ATTR_WRITE;
 
+       if (attrs & DMA_ATTR_WEAK_ORDERING)
+               prot |= HV_PCI_MAP_ATTR_RELAXED_ORDER;
+
        local_irq_save(flags);
 
        iommu_batch_start(dev, prot, entry);
@@ -344,6 +366,9 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
        if (direction != DMA_TO_DEVICE)
                prot |= HV_PCI_MAP_ATTR_WRITE;
 
+       if (attrs & DMA_ATTR_WEAK_ORDERING)
+               prot |= HV_PCI_MAP_ATTR_RELAXED_ORDER;
+
        outs = s = segstart = &sglist[0];
        outcount = 1;
        incount = nelems;
@@ -907,22 +932,27 @@ static int pci_sun4v_probe(struct platform_device *op)
        struct device_node *dp;
        struct iommu *iommu;
        u32 devhandle;
-       int i, err;
+       int i, err = -ENODEV;
 
        dp = op->dev.of_node;
 
        if (!hvapi_negotiated++) {
-               err = sun4v_hvapi_register(HV_GRP_PCI,
-                                          vpci_major,
-                                          &vpci_minor);
+               for (i = 0; i < ARRAY_SIZE(vpci_versions); i++) {
+                       vpci_major = vpci_versions[i].major;
+                       vpci_minor = vpci_versions[i].minor;
+
+                       err = sun4v_hvapi_register(HV_GRP_PCI, vpci_major,
+                                                  &vpci_minor);
+                       if (!err)
+                               break;
+               }
 
                if (err) {
-                       printk(KERN_ERR PFX "Could not register hvapi, "
-                              "err=%d\n", err);
+                       pr_err(PFX "Could not register hvapi, err=%d\n", err);
                        return err;
                }
-               printk(KERN_INFO PFX "Registered hvapi major[%lu] minor[%lu]\n",
-                      vpci_major, vpci_minor);
+               pr_info(PFX "Registered hvapi major[%lu] minor[%lu]\n",
+                       vpci_major, vpci_minor);
 
                dma_ops = &sun4v_dma_ops;
        }
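As a rough illustration of the probe loop above (not part of the patch): the driver now walks a version table from newest to oldest and keeps the first major/minor pair the hypervisor accepts. The stand-alone C sketch below mirrors that pattern; register_api() and the local version table are made-up stand-ins for sun4v_hvapi_register() and vpci_versions[].

/* Minimal user-space sketch of descending-version negotiation. */
#include <stdio.h>
#include <stddef.h>

struct api_version { unsigned long major, minor; };

/* Ordered from largest major version to smallest, like vpci_versions[]. */
static const struct api_version versions[] = {
        { 2, 0 },
        { 1, 1 },
};

/* Pretend the hypervisor only accepts major 1. */
static int register_api(unsigned long major, unsigned long *minor)
{
        return (major == 1) ? 0 : -1;
}

int main(void)
{
        unsigned long major = 0, minor = 0;
        int err = -1;
        size_t i;

        for (i = 0; i < sizeof(versions) / sizeof(versions[0]); i++) {
                major = versions[i].major;
                minor = versions[i].minor;
                err = register_api(major, &minor);
                if (!err)
                        break;          /* first accepted version wins */
        }
        if (err)
                printf("no supported version\n");
        else
                printf("negotiated %lu.%lu\n", major, minor);
        return 0;
}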
index d21cd62..4094a51 100644 (file)
@@ -8,7 +8,7 @@
  * I like traps on v9, :))))
  */
 
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/sched.h>
 #include <linux/linkage.h>
 #include <linux/kernel.h>
index 9aacb91..52c00d9 100644 (file)
@@ -11,7 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <asm/asi.h>
 #include <asm/ptrace.h>
 #include <asm/pstate.h>
index 3f291d8..643c149 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/mman.h>
 #include <linux/signal.h>
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/init.h>
 #include <linux/perf_event.h>
 #include <linux/interrupt.h>
index 7ac6b62..439784b 100644 (file)
@@ -5,7 +5,7 @@
  *  Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
  */
  
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/string.h>
index ad143c1..6d8dc2a 100644 (file)
@@ -16,9 +16,8 @@ static struct linux_prom_ranges promlib_obio_ranges[PROMREG_MAX];
 static int num_obio_ranges;
 
 /* Adjust register values based upon the ranges parameters. */
-static void
-prom_adjust_regs(struct linux_prom_registers *regp, int nregs,
-                struct linux_prom_ranges *rangep, int nranges)
+static void prom_adjust_regs(struct linux_prom_registers *regp, int nregs,
+                            struct linux_prom_ranges *rangep, int nranges)
 {
        int regc, rngc;
 
@@ -34,33 +33,30 @@ prom_adjust_regs(struct linux_prom_registers *regp, int nregs,
        }
 }
 
-static void
-prom_adjust_ranges(struct linux_prom_ranges *ranges1, int nranges1,
-                  struct linux_prom_ranges *ranges2, int nranges2)
+static void prom_adjust_ranges(struct linux_prom_ranges *ranges1, int nranges1,
+                              struct linux_prom_ranges *ranges2, int nranges2)
 {
        int rng1c, rng2c;
 
-       for(rng1c=0; rng1c < nranges1; rng1c++) {
-               for(rng2c=0; rng2c < nranges2; rng2c++)
-                       if(ranges1[rng1c].ot_parent_space == ranges2[rng2c].ot_child_space &&
+       for (rng1c = 0; rng1c < nranges1; rng1c++) {
+               for (rng2c = 0; rng2c < nranges2; rng2c++)
+                       if (ranges1[rng1c].ot_parent_space == ranges2[rng2c].ot_child_space &&
                           ranges1[rng1c].ot_parent_base >= ranges2[rng2c].ot_child_base &&
                           ranges2[rng2c].ot_child_base + ranges2[rng2c].or_size - ranges1[rng1c].ot_parent_base > 0U)
                        break;
-               if(rng2c == nranges2) /* oops */
+               if (rng2c == nranges2) /* oops */
                        prom_printf("adjust_ranges: Could not find matching bus type...\n");
                else if (ranges1[rng1c].ot_parent_base + ranges1[rng1c].or_size > ranges2[rng2c].ot_child_base + ranges2[rng2c].or_size)
-                       ranges1[rng1c].or_size =
-                               ranges2[rng2c].ot_child_base + ranges2[rng2c].or_size - ranges1[rng1c].ot_parent_base;
+                       ranges1[rng1c].or_size = ranges2[rng2c].ot_child_base + ranges2[rng2c].or_size - ranges1[rng1c].ot_parent_base;
                ranges1[rng1c].ot_parent_space = ranges2[rng2c].ot_parent_space;
                ranges1[rng1c].ot_parent_base += ranges2[rng2c].ot_parent_base;
        }
 }
 
 /* Apply probed obio ranges to registers passed, if no ranges return. */
-void
-prom_apply_obio_ranges(struct linux_prom_registers *regs, int nregs)
+void prom_apply_obio_ranges(struct linux_prom_registers *regs, int nregs)
 {
-       if(num_obio_ranges)
+       if (num_obio_ranges)
                prom_adjust_regs(regs, nregs, promlib_obio_ranges, num_obio_ranges);
 }
 EXPORT_SYMBOL(prom_apply_obio_ranges);
@@ -76,40 +72,40 @@ void __init prom_ranges_init(void)
        node = prom_getchild(prom_root_node);
        obio_node = prom_searchsiblings(node, "obio");
 
-       if(obio_node) {
+       if (obio_node) {
                success = prom_getproperty(obio_node, "ranges",
                                           (char *) promlib_obio_ranges,
                                           sizeof(promlib_obio_ranges));
-               if(success != -1)
-                       num_obio_ranges = (success/sizeof(struct linux_prom_ranges));
+               if (success != -1)
+                       num_obio_ranges = (success / sizeof(struct linux_prom_ranges));
        }
 
-       if(num_obio_ranges)
+       if (num_obio_ranges)
                prom_printf("PROMLIB: obio_ranges %d\n", num_obio_ranges);
 }
 
 void prom_apply_generic_ranges(phandle node, phandle parent,
-               struct linux_prom_registers *regs, int nregs)
+                              struct linux_prom_registers *regs, int nregs)
 {
        int success;
        int num_ranges;
        struct linux_prom_ranges ranges[PROMREG_MAX];
-       
+
        success = prom_getproperty(node, "ranges",
                                   (char *) ranges,
-                                  sizeof (ranges));
+                                  sizeof(ranges));
        if (success != -1) {
-               num_ranges = (success/sizeof(struct linux_prom_ranges));
+               num_ranges = (success / sizeof(struct linux_prom_ranges));
                if (parent) {
                        struct linux_prom_ranges parent_ranges[PROMREG_MAX];
                        int num_parent_ranges;
-               
+
                        success = prom_getproperty(parent, "ranges",
-                                                  (char *) parent_ranges,
-                                                  sizeof (parent_ranges));
+                                                  (char *) parent_ranges,
+                                                  sizeof(parent_ranges));
                        if (success != -1) {
-                               num_parent_ranges = (success/sizeof(struct linux_prom_ranges));
-                               prom_adjust_ranges (ranges, num_ranges, parent_ranges, num_parent_ranges);
+                               num_parent_ranges = (success / sizeof(struct linux_prom_ranges));
+                               prom_adjust_ranges(ranges, num_ranges, parent_ranges, num_parent_ranges);
                        }
                }
                prom_adjust_regs(regs, nregs, ranges, num_ranges);
diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config
deleted file mode 100644 (file)
index 9906505..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-CONFIG_NET=y
-CONFIG_NET_CORE=y
-CONFIG_NETDEVICES=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV=y
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_INET=y
-CONFIG_TTY=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_BINFMT_ELF=y
-CONFIG_PCI=y
-CONFIG_PCI_MSI=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_VIRTUALIZATION=y
-CONFIG_HYPERVISOR_GUEST=y
-CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
-CONFIG_VIRTIO=y
-CONFIG_VIRTIO_PCI=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_VIRTIO_CONSOLE=y
-CONFIG_VIRTIO_NET=y
-CONFIG_9P_FS=y
-CONFIG_NET_9P=y
-CONFIG_NET_9P_VIRTIO=y
-CONFIG_SCSI_LOWLEVEL=y
-CONFIG_SCSI_VIRTIO=y
-CONFIG_VIRTIO_INPUT=y
index 94d54d0..02223cb 100644 (file)
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
                        return 0;
                }
 
-               ret = __pvclock_read_cycles(pvti);
+               ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
        } while (pvclock_read_retry(pvti, version));
 
        /* refer to vread_tsc() comment for rationale */
index 33ae3a4..4b20f73 100644 (file)
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
                struct kvm_steal_time steal;
        } st;
 
+       u64 tsc_offset;
        u64 last_guest_tsc;
        u64 last_host_tsc;
        u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
        /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
        u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
        u64 hv_crash_ctl;
+
+       HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
        bool disabled_lapic_found;
 
        /* Struct members for AVIC */
+       u32 avic_vm_id;
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+       struct hlist_node hnode;
 
        bool x2apic_format;
        bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
-       u32 mmu_shadow_zapped;
-       u32 mmu_pte_write;
-       u32 mmu_pte_updated;
-       u32 mmu_pde_zapped;
-       u32 mmu_flooded;
-       u32 mmu_recycled;
-       u32 mmu_cache_miss;
-       u32 mmu_unsync;
-       u32 remote_tlb_flush;
-       u32 lpages;
+       ulong mmu_shadow_zapped;
+       ulong mmu_pte_write;
+       ulong mmu_pte_updated;
+       ulong mmu_pde_zapped;
+       ulong mmu_flooded;
+       ulong mmu_recycled;
+       ulong mmu_cache_miss;
+       ulong mmu_unsync;
+       ulong remote_tlb_flush;
+       ulong lpages;
 };
 
 struct kvm_vcpu_stat {
-       u32 pf_fixed;
-       u32 pf_guest;
-       u32 tlb_flush;
-       u32 invlpg;
-
-       u32 exits;
-       u32 io_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 irq_window_exits;
-       u32 nmi_window_exits;
-       u32 halt_exits;
-       u32 halt_successful_poll;
-       u32 halt_attempted_poll;
-       u32 halt_poll_invalid;
-       u32 halt_wakeup;
-       u32 request_irq_exits;
-       u32 irq_exits;
-       u32 host_state_reload;
-       u32 efer_reload;
-       u32 fpu_reload;
-       u32 insn_emulation;
-       u32 insn_emulation_fail;
-       u32 hypercalls;
-       u32 irq_injections;
-       u32 nmi_injections;
+       u64 pf_fixed;
+       u64 pf_guest;
+       u64 tlb_flush;
+       u64 invlpg;
+
+       u64 exits;
+       u64 io_exits;
+       u64 mmio_exits;
+       u64 signal_exits;
+       u64 irq_window_exits;
+       u64 nmi_window_exits;
+       u64 halt_exits;
+       u64 halt_successful_poll;
+       u64 halt_attempted_poll;
+       u64 halt_poll_invalid;
+       u64 halt_wakeup;
+       u64 request_irq_exits;
+       u64 irq_exits;
+       u64 host_state_reload;
+       u64 efer_reload;
+       u64 fpu_reload;
+       u64 insn_emulation;
+       u64 insn_emulation_fail;
+       u64 hypercalls;
+       u64 irq_injections;
+       u64 nmi_injections;
 };
 
 struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
 
        bool (*has_wbinvd_exit)(void);
 
-       u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
        u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
index d019f0c..3ad741b 100644 (file)
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+                             u64 tsc)
 {
-       u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+       u64 delta = tsc - src->tsc_timestamp;
        cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
                                             src->tsc_shift);
        return src->system_time + offset;
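For reference, the conversion that __pvclock_read_cycles() performs on the TSC value it is now handed explicitly is roughly the following. This is an illustrative user-space sketch with made-up numbers (and a GCC/Clang __int128 helper), not kernel code; it just reproduces nsec = system_time + delta * tsc_to_system_mul * 2^(tsc_shift - 32).

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as pvclock_scale_delta(): delta * mul_frac * 2^(shift-32). */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
        unsigned __int128 prod;

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        prod = (unsigned __int128)delta * mul_frac;
        return (uint64_t)(prod >> 32);
}

int main(void)
{
        /* Sample pvclock_vcpu_time_info-style parameters (made up). */
        uint64_t tsc_timestamp = 1000000, system_time = 500;
        uint32_t tsc_to_system_mul = 0x80000000u;   /* 0.5 ns per cycle */
        int8_t tsc_shift = 0;
        uint64_t tsc = 3000000;                     /* the "rdtsc_ordered()" value */

        uint64_t delta = tsc - tsc_timestamp;
        uint64_t ns = system_time + scale_delta(delta, tsc_to_system_mul, tsc_shift);

        printf("guest ns = %llu\n", (unsigned long long)ns);
        return 0;
}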
index e6911ca..608a79d 100644 (file)
@@ -20,15 +20,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
-extern int xen_have_vector_callback;
-
-/*
- * Events delivered via platform PCI interrupts are always
- * routed to vcpu 0 and hence cannot be rebound.
- */
-static inline bool xen_support_evtchn_rebind(void)
-{
-       return (!xen_hvm_domain() || xen_have_vector_callback);
-}
-
 #endif /* _ASM_X86_XEN_EVENTS_H */
index 3599404..5b2cc88 100644 (file)
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 
        do {
                version = pvclock_read_begin(src);
-               ret = __pvclock_read_cycles(src);
+               ret = __pvclock_read_cycles(src, rdtsc_ordered());
                flags = src->flags;
        } while (pvclock_read_retry(src, version));
 
index 464fa47..3bff207 100644 (file)
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF)    += $(KVM)/async_pf.o
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-                          hyperv.o page_track.o
+                          hyperv.o page_track.o debugfs.o
 
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += assigned-dev.o iommu.o
 
index 3235e0f..afa7bbb 100644 (file)
@@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-               F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+               F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+               F(AVX512BW) | F(AVX512VL);
 
        /* cpuid 0xD.1.eax */
        const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644 (file)
index 0000000..c19c7ed
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+       return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+       *val = vcpu->arch.tsc_offset;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+       *val = vcpu->arch.tsc_scaling_ratio;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+       *val = kvm_tsc_scaling_ratio_frac_bits;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       struct dentry *ret;
+
+       ret = debugfs_create_file("tsc-offset", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_offset_fops);
+       if (!ret)
+               return -ENOMEM;
+
+       if (kvm_has_tsc_control) {
+               ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_scaling_fops);
+               if (!ret)
+                       return -ENOMEM;
+               ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+                                                       vcpu->debugfs_dentry,
+                                                       vcpu, &vcpu_tsc_scaling_frac_fops);
+               if (!ret)
+                       return -ENOMEM;
+
+       }
+
+       return 0;
+}
index 01bd7b7..42b1c83 100644 (file)
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-       return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_vcpu *vcpu;
+       u64 tsc;
+
+       /*
+        * If the guest has not set up the TSC page or the clock isn't
+        * stable, fall back to get_kvmclock_ns.
+        */
+       if (!hv->tsc_ref.tsc_sequence)
+               return div_u64(get_kvmclock_ns(kvm), 100);
+
+       vcpu = kvm_get_vcpu(kvm, 0);
+       tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+               + hv->tsc_ref.tsc_offset;
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+                                       HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+       u64 max_mul;
+
+       if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+               return false;
+
+       /*
+        * Check if scale would overflow; if so, we use the time ref counter:
+        *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+        *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+        *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+        */
+       max_mul = 100ull << (32 - hv_clock->tsc_shift);
+       if (hv_clock->tsc_to_system_mul >= max_mul)
+               return false;
+
+       /*
+        * Otherwise compute the scale and offset according to the formulas
+        * derived above.
+        */
+       tsc_ref->tsc_scale =
+               mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+                               hv_clock->tsc_to_system_mul,
+                               100);
+
+       tsc_ref->tsc_offset = hv_clock->system_time;
+       do_div(tsc_ref->tsc_offset, 100);
+       tsc_ref->tsc_offset -=
+               mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+       return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock)
+{
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       u32 tsc_seq;
+       u64 gfn;
+
+       BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+       BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+       if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+               return;
+
+       gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+       /*
+        * Because the TSC parameters only vary when there is a
+        * change in the master clock, do not bother with caching.
+        */
+       if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+                                   &tsc_seq, sizeof(tsc_seq))))
+               return;
+
+       /*
+        * While we're computing and writing the parameters, force the
+        * guest to use the time reference count MSR.
+        */
+       hv->tsc_ref.tsc_sequence = 0;
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                           &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+               return;
+
+       if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+               return;
+
+       /* Ensure sequence is zero before writing the rest of the struct.  */
+       smp_wmb();
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+               return;
+
+       /*
+        * Now switch to the TSC page mechanism by writing the sequence.
+        */
+       tsc_seq++;
+       if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+               tsc_seq = 1;
+
+       /* Write the struct entirely before the non-zero sequence.  */
+       smp_wmb();
+
+       hv->tsc_ref.tsc_sequence = tsc_seq;
+       kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                       &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                             bool host)
 {
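A quick way to sanity-check the algebra in the comment block above is to plug sample kvmclock parameters into both formulas and confirm they give the same 100ns count. The stand-alone check below uses arbitrary values with tsc_shift = 0; it is only an illustration of the derivation, not kernel code, and the two results agree up to rounding.

#include <stdio.h>
#include <stdint.h>

static uint64_t mul_shr64(uint64_t a, uint64_t b)       /* (a * b) >> 64 */
{
        return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

int main(void)
{
        /* Sample kvmclock parameters (tsc_shift = 0 for simplicity). */
        uint32_t tsc_to_system_mul = 0x80000000u;   /* 0.5 ns per tick */
        uint64_t tsc_timestamp = 1000000, system_time = 12345600;
        uint64_t ticks = 5000000;

        /* Hyper-V parameters, per the derivation above (tsc_shift = 0). */
        uint64_t scale = (uint64_t)(((unsigned __int128)tsc_to_system_mul << 32) / 100);
        uint64_t offset = system_time / 100 - mul_shr64(tsc_timestamp, scale);

        /* kvmclock path, divided by 100. */
        uint64_t kvmclock_100ns =
                (system_time +
                 (uint64_t)(((unsigned __int128)(ticks - tsc_timestamp) *
                             tsc_to_system_mul) >> 32)) / 100;

        /* Hyper-V path: ticks * scale / 2^64 + offset. */
        uint64_t hyperv_100ns = mul_shr64(ticks, scale) + offset;

        printf("kvmclock: %llu  hyper-v: %llu\n",
               (unsigned long long)kvmclock_100ns,
               (unsigned long long)hyperv_100ns);
        return 0;
}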
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                mark_page_dirty(kvm, gfn);
                break;
        }
-       case HV_X64_MSR_REFERENCE_TSC: {
-               u64 gfn;
-               HV_REFERENCE_TSC_PAGE tsc_ref;
-
-               memset(&tsc_ref, 0, sizeof(tsc_ref));
+       case HV_X64_MSR_REFERENCE_TSC:
                hv->hv_tsc_page = data;
-               if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-                       break;
-               gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-               if (kvm_write_guest(
-                               kvm,
-                               gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-                               &tsc_ref, sizeof(tsc_ref)))
-                       return 1;
-               mark_page_dirty(kvm, gfn);
+               if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+                       kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
                break;
-       }
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
                return kvm_hv_msr_set_crash_data(vcpu,
                                                 msr - HV_X64_MSR_CRASH_P0,
index 60eccd4..cd11195 100644 (file)
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
index b62c852..23b99f3 100644 (file)
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                if (value & MSR_IA32_APICBASE_ENABLE) {
                        kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               } else
+               } else {
                        static_key_slow_inc(&apic_hw_disabled.key);
-               recalculate_apic_map(vcpu->kvm);
+                       recalculate_apic_map(vcpu->kvm);
+               }
        }
 
        if ((old_value ^ value) & X2APIC_ENABLE) {
index 3d4cc8c..d9c7e98 100644 (file)
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * Return true if tlb need be flushed.
  */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
        u64 spte = *sptep;
 
@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_write_protect(kvm, sptep, pt_protect);
+               flush |= spte_write_protect(sptep, pt_protect);
 
        return flush;
 }
 
-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_clear_dirty(kvm, sptep);
+               flush |= spte_clear_dirty(sptep);
 
        return flush;
 }
 
-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        bool flush = false;
 
        for_each_rmap_spte(rmap_head, &iter, sptep)
-               flush |= spte_set_dirty(kvm, sptep);
+               flush |= spte_set_dirty(sptep);
 
        return flush;
 }
index 1e6b84b..f8157a3 100644 (file)
@@ -34,6 +34,8 @@
 #include <linux/sched.h>
 #include <linux/trace_events.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -41,6 +43,7 @@
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK                0xFF0
 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK                0xFFFFFFFF
 
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS              8
+#define AVIC_VCPU_ID_MASK              ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS                        24
+#define AVIC_VM_ID_NR                  (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK                        ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y)               (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+                                               (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x)          ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x)                (x & AVIC_VCPU_ID_MASK)
+
 static bool erratum_383_found __read_mostly;
 
 static const u32 host_save_user_msrs[] = {
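The GA tag used here is simply a packed (vm_id, vcpu_id) pair: the vm_id occupies the 24 bits above the 8-bit vcpu_id. The following stand-alone snippet copies the macros from the hunk above and round-trips a sample value; the sample numbers are arbitrary.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define AVIC_VCPU_ID_BITS       8
#define AVIC_VCPU_ID_MASK       ((1 << AVIC_VCPU_ID_BITS) - 1)
#define AVIC_VM_ID_BITS         24
#define AVIC_VM_ID_MASK         ((1 << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(x, y)        (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
                                 (y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)   ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)

int main(void)
{
        uint32_t tag = AVIC_GATAG(0x1234u, 7u);

        assert(AVIC_GATAG_TO_VMID(tag) == 0x1234u);
        assert(AVIC_GATAG_TO_VCPUID(tag) == 7u);
        printf("ga_tag=%#x vm_id=%#x vcpu_id=%u\n",
               tag, AVIC_GATAG_TO_VMID(tag), AVIC_GATAG_TO_VCPUID(tag));
        return 0;
}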
@@ -185,6 +201,23 @@ struct vcpu_svm {
        struct page *avic_backing_page;
        u64 *avic_physical_id_cache;
        bool avic_is_running;
+
+       /*
+        * Per-vcpu list of struct amd_svm_iommu_ir:
+        * This is used mainly to store interrupt remapping information used
+        * when updating the vcpu affinity. This avoids the need to scan for
+        * IRTE and try to match ga_tag in the IOMMU driver.
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
+       void *data;             /* Storing pointer to struct amd_ir_data */
 };
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
@@ -242,6 +275,10 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif
 
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
+/* Note:
+ * This hash table is used to map a VM_ID to a struct kvm_arch
+ * when handling an AMD IOMMU GALOG notification, so that we can
+ * schedule in a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS  8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from the IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+       unsigned long flags;
+       struct kvm_arch *ka = NULL;
+       struct kvm_vcpu *vcpu = NULL;
+       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+               struct kvm *kvm = container_of(ka, struct kvm, arch);
+               struct kvm_arch *vm_data = &kvm->arch;
+
+               if (vm_data->avic_vm_id != vm_id)
+                       continue;
+               vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               break;
+       }
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+       if (!vcpu)
+               return 0;
+
+       /* Note:
+        * At this point, the IOMMU should have already set the pending
+        * bit in the vAPIC backing page. So, we just need to schedule
+        * in the vcpu.
+        */
+       if (vcpu->mode == OUTSIDE_GUEST_MODE)
+               kvm_vcpu_wake_up(vcpu);
+
+       return 0;
+}
+
 static __init int svm_hardware_setup(void)
 {
        int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
        if (avic) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_AVIC) ||
-                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
                        avic = false;
-               else
+               } else {
                        pr_info("AVIC enabled\n");
+
+                       hash_init(svm_vm_data_hash);
+                       spin_lock_init(&svm_vm_data_hash_lock);
+                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+               }
        }
 
        return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return svm->vmcb->control.tsc_offset;
-}
-
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static inline int avic_get_next_vm_id(void)
+{
+       int id;
+
+       spin_lock(&avic_vm_id_lock);
+
+       /* AVIC VM ID is one-based. */
+       id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+       if (id <= AVIC_VM_ID_MASK)
+               __set_bit(id, avic_vm_id_bitmap);
+       else
+               id = -EAGAIN;
+
+       spin_unlock(&avic_vm_id_lock);
+       return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+       if (id <= 0 || id > AVIC_VM_ID_MASK)
+               return -EINVAL;
+
+       spin_lock(&avic_vm_id_lock);
+       __clear_bit(id, avic_vm_id_bitmap);
+       spin_unlock(&avic_vm_id_lock);
+       return 0;
+}
+
 static void avic_vm_destroy(struct kvm *kvm)
 {
+       unsigned long flags;
        struct kvm_arch *vm_data = &kvm->arch;
 
+       avic_free_vm_id(vm_data->avic_vm_id);
+
        if (vm_data->avic_logical_id_table_page)
                __free_page(vm_data->avic_logical_id_table_page);
        if (vm_data->avic_physical_id_table_page)
                __free_page(vm_data->avic_physical_id_table_page);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_del(&vm_data->hnode);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
 static int avic_vm_init(struct kvm *kvm)
 {
-       int err = -ENOMEM;
+       unsigned long flags;
+       int vm_id, err = -ENOMEM;
        struct kvm_arch *vm_data = &kvm->arch;
        struct page *p_page;
        struct page *l_page;
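avic_get_next_vm_id()/avic_free_vm_id() above implement a simple one-based bitmap ID allocator under a spinlock. The sketch below mirrors that pattern on a much smaller, lock-free toy bitmap; names and sizes are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define ID_NR   64
static uint64_t id_bitmap;              /* bit i set => id i is in use */

static int get_next_id(void)
{
        int id;

        for (id = 1; id < ID_NR; id++)  /* IDs are one-based; 0 is never used */
                if (!(id_bitmap & (1ull << id))) {
                        id_bitmap |= 1ull << id;
                        return id;
                }
        return -1;                      /* -EAGAIN in the kernel version */
}

static void free_id(int id)
{
        if (id > 0 && id < ID_NR)
                id_bitmap &= ~(1ull << id);
}

int main(void)
{
        int a = get_next_id(), b = get_next_id();

        printf("a=%d b=%d\n", a, b);            /* 1 and 2 */
        free_id(a);
        printf("reused=%d\n", get_next_id());   /* 1 again */
        return 0;
}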
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
        if (!avic)
                return 0;
 
+       vm_id = avic_get_next_vm_id();
+       if (vm_id < 0)
+               return vm_id;
+       vm_data->avic_vm_id = (u32)vm_id;
+
        /* Allocating physical APIC ID table (4KB) */
        p_page = alloc_page(GFP_KERNEL);
        if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
        vm_data->avic_logical_id_table_page = l_page;
        clear_page(page_address(l_page));
 
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
        return 0;
 
 free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
        return err;
 }
 
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
-       u64 entry;
-       int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       svm->avic_is_running = is_run;
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
 
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-               return;
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entries targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       if (list_empty(&svm->ir_list))
+               goto out;
 
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (is_run)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, r, ir->data);
+               if (ret)
+                       break;
+       }
+out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
 }
 
 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+                                       svm->avic_is_running);
 }
 
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
                return;
 
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
                err = avic_init_backing_page(&svm->vcpu);
                if (err)
                        goto free_page4;
+
+               INIT_LIST_HEAD(&svm->ir_list);
+               spin_lock_init(&svm->ir_list_lock);
        }
 
        /* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
                kvm_vcpu_wake_up(vcpu);
 }
 
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       unsigned long flags;
+       struct amd_svm_iommu_ir *cur;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur->data != pi->ir_data)
+                       continue;
+               list_del(&cur->node);
+               kfree(cur);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+
+       /**
+        * In some cases, the existing IRTE is updated and re-set,
+        * so we need to check here if it has already been added
+        * to the ir_list.
+        */
+       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+               struct kvm *kvm = svm->vcpu.kvm;
+               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               struct vcpu_svm *prev_svm;
+
+               if (!prev_vcpu) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               prev_svm = to_svm(prev_vcpu);
+               svm_ir_list_del(prev_svm, pi);
+       }
+
+       /**
+        * Allocate a new amd_svm_iommu_ir entry, which will be
+        * added to the per-vcpu ir_list.
+        */
+       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+       if (!ir) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ir->data = pi->ir_data;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+       return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kinds of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with a single CPU as the destination, e.g. the user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu = NULL;
+
+       kvm_set_msi_irq(kvm, e, &irq);
+
+       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+                        __func__, irq.vector);
+               return -1;
+       }
+
+       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                irq.vector);
+       *svm = to_svm(vcpu);
+       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->vector = irq.vector;
+
+       return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct vcpu_data vcpu_info;
+               struct vcpu_svm *svm = NULL;
+
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+
+               /**
+                * Here, we set up legacy mode in the following cases:
+                * 1. When the interrupt cannot be targeted to a specific vcpu.
+                * 2. Unsetting the posted interrupt.
+                * 3. APIC virtualization is disabled for the vcpu.
+                */
+               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+                   kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+
+                       /* Try to enable guest_mode in IRTE */
+                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+                                                    svm->vcpu.vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * Here, we have successfully set up vcpu affinity in
+                        * IOMMU guest mode. Now, we need to store the posted
+                        * interrupt information in a per-vcpu ir_list so that
+                        * we can reference it directly when we update vcpu
+                        * scheduling information in the IOMMU IRTE.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, &pi);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+
+                       /**
+                        * Here, pi is used to:
+                        * - Tell the IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve the ga_tag of the prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * Check if the posted interrupt was previously
+                        * set up in guest mode by checking whether the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.prev_ga_tag) {
+                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+                               struct kvm_vcpu *vcpu;
+
+                               vcpu = kvm_get_vcpu_by_id(kvm, id);
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), &pi);
+                       }
+               }
+
+               if (!ret && svm) {
+                       trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+                                                host_irq, e->gsi,
+                                                vcpu_info.vector,
+                                                vcpu_info.pi_desc_addr, set);
+               }
+
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
-       .read_tsc_offset = svm_read_tsc_offset,
        .write_tsc_offset = svm_write_tsc_offset,
        .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
        .read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
        .pmu_ops = &amd_pmu_ops,
        .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .update_pi_irte = svm_update_pi_irte,
 };
 
 static int __init svm_init(void)
index 121fdf6..cf1b16d 100644 (file)
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
 static struct vmcs_config {
        int size;
        int order;
+       u32 basic_cap;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
        u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+       return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
        return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               if (is_long_mode(vcpu))
-                       msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-               else
-                       msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               } else {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+               }
        } else {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
        return host_tsc + tsc_offset;
 }
 
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-       return vmcs_read64(TSC_OFFSET);
-}
-
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
                           ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
                           (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+               if (cpu_has_vmx_basic_inout())
+                       *pdata |= VMX_BASIC_INOUT;
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                return -EIO;
 
        vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_config.size);
+       vmcs_conf->order = get_order(vmcs_conf->size);
+       vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
        vmcs_conf->revision_id = vmx_msr_low;
 
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
                                                msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
 }
 
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
 }
 
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
 {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_W);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_W);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_W);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+       }
 }
 
 static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (is_guest_mode(vcpu))
-               return;
+       if (!is_guest_mode(vcpu)) {
+               if (!cpu_has_virtual_nmis()) {
+                       /*
+                        * Tracking the NMI-blocked state in software is built upon
+                        * finding the next open IRQ window. This, in turn, depends on
+                        * well-behaving guests: They have to keep IRQs disabled at
+                        * least as long as the NMI handler runs. Otherwise we may
+                        * cause NMI nesting, maybe breaking the guest. But as this is
+                        * highly unlikely, we can live with the residual risk.
+                        */
+                       vmx->soft_vnmi_blocked = 1;
+                       vmx->vnmi_blocked_time = 0;
+               }
 
-       if (!cpu_has_virtual_nmis()) {
-               /*
-                * Tracking the NMI-blocked state in software is built upon
-                * finding the next open IRQ window. This, in turn, depends on
-                * well-behaving guests: They have to keep IRQs disabled at
-                * least as long as the NMI handler runs. Otherwise we may
-                * cause NMI nesting, maybe breaking the guest. But as this is
-                * highly unlikely, we can live with the residual risk.
-                */
-               vmx->soft_vnmi_blocked = 1;
-               vmx->vnmi_blocked_time = 0;
+               ++vcpu->stat.nmi_injections;
+               vmx->nmi_known_unmasked = false;
        }
 
-       ++vcpu->stat.nmi_injections;
-       vmx->nmi_known_unmasked = false;
        if (vmx->rmode.vm86_active) {
                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                return;
        }
+
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+       if (gla_validity == 0x2) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
        if (!vmx_msr_bitmap_legacy_x2apic)
                goto out2;
 
+       vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+               goto out3;
+
        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode)
-               goto out3;
+               goto out4;
 
        vmx_msr_bitmap_longmode_x2apic =
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
-               goto out4;
+               goto out5;
+
+       vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+               goto out6;
 
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
-               goto out6;
+               goto out7;
 
        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
-               goto out7;
+               goto out8;
 
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)
 
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
-               goto out8;
+               goto out9;
        }
 
        if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,
                        vmx_msr_bitmap_longmode, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_legacy, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_longmode, PAGE_SIZE);
 
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
+       /*
+        * enable_apicv && kvm_vcpu_apicv_active()
+        */
        for (msr = 0x800; msr <= 0x8ff; msr++)
-               vmx_disable_intercept_msr_read_x2apic(msr);
+               vmx_disable_intercept_msr_read_x2apic(msr, true);
 
        /* TMCCT */
-       vmx_enable_intercept_msr_read_x2apic(0x839);
+       vmx_enable_intercept_msr_read_x2apic(0x839, true);
        /* TPR */
-       vmx_disable_intercept_msr_write_x2apic(0x808);
+       vmx_disable_intercept_msr_write_x2apic(0x808, true);
        /* EOI */
-       vmx_disable_intercept_msr_write_x2apic(0x80b);
+       vmx_disable_intercept_msr_write_x2apic(0x80b, true);
        /* SELF-IPI */
-       vmx_disable_intercept_msr_write_x2apic(0x83f);
+       vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+       /*
+        * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+        *      !enable_apicv
+        */
+       /* TPR */
+       vmx_disable_intercept_msr_read_x2apic(0x808, false);
+       vmx_disable_intercept_msr_write_x2apic(0x808, false);
 
        if (enable_ept) {
                kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)
 
        return alloc_kvm_area();
 
-out8:
+out9:
        free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
        free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 out6:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 out3:
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
 out2:
@@ -6544,7 +6608,9 @@ out:
 static __exit void hardware_unsetup(void)
 {
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_legacy);
        free_page((unsigned long)vmx_msr_bitmap_longmode);
        free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 {
        /* TODO: not to reset guest simply here. */
        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+       pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }
 
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        vmx->nested.vmcs02_num = 0;
 
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL);
+                    HRTIMER_MODE_REL_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
        vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
                return;
        }
 
-       /*
-        * There is not point to enable virtualize x2apic without enable
-        * apicv
-        */
-       if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-                               !kvm_vcpu_apicv_active(vcpu))
+       if (!cpu_has_vmx_virtualize_x2apic_mode())
                return;
 
        if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
        maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-               pr_warn_ratelimited(
+               pr_debug_ratelimited(
                        "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
                        addr_field, maxphyaddr, count, addr);
                return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
        for (i = 0; i < count; i++) {
                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                        &e, sizeof(e))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        goto fail;
                }
                if (nested_vmx_load_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr.index = e.index;
                msr.data = e.value;
                if (kvm_set_msr(vcpu, &msr)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
                        goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                if (kvm_vcpu_read_guest(vcpu,
                                        gpa + i * sizeof(e),
                                        &e, 2 * sizeof(u32))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        return -EINVAL;
                }
                if (nested_vmx_store_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                msr_info.host_initiated = false;
                msr_info.index = e.index;
                if (kvm_get_msr(vcpu, &msr_info)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR (%u, 0x%x)\n",
                                __func__, i, e.index);
                        return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
                                         gpa + i * sizeof(e) +
                                             offsetof(struct vmx_msr_entry, value),
                                         &msr_info.data, sizeof(msr_info.data))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, msr_info.data);
                        return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
 
+       if (nested_cpu_has_ept(vmcs12))
+               vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
        if (nested_cpu_has_vid(vmcs12))
                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 
@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         * We are now running in L2, mmu_notifier will force to reload the
         * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
         */
-       kvm_vcpu_reload_apic_access_page(vcpu);
+       kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
        /*
         * Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-       .read_tsc_offset = vmx_read_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
        .read_l1_tsc = vmx_read_l1_tsc,
index 699f872..6c633de 100644 (file)
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-       u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+       u64 curr_offset = vcpu->arch.tsc_offset;
        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       vcpu->arch.tsc_offset = offset;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
        struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_compute_tsc_offset(vcpu, data);
-       ns = get_kernel_ns();
+       ns = ktime_get_boot_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
        if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
        if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
                update_ia32_tsc_adjust_msr(vcpu, offset);
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+       struct kvm_arch *ka = &kvm->arch;
+       s64 ns;
+
+       if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+               u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+               ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+       } else {
+               ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+       }
+
+       return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+       unsigned long flags;
+       s64 ns;
+
+       local_irq_save(flags);
+       ns = __get_kvmclock_ns(kvm);
+       local_irq_restore(flags);
+
+       return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+       struct kvm_vcpu_arch *vcpu = &v->arch;
+       struct pvclock_vcpu_time_info guest_hv_clock;
+
+       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+               &guest_hv_clock, sizeof(guest_hv_clock))))
+               return;
+
+       /* This VCPU is paused, but it's legal for a guest to read another
+        * VCPU's kvmclock, so we really have to follow the specification where
+        * it says that version is odd if data is being modified, and even after
+        * it is consistent.
+        *
+        * Version field updates must be kept separate.  This is because
+        * kvm_write_guest_cached might use a "rep movs" instruction, and
+        * writes within a string instruction are weakly ordered.  So there
+        * are three writes overall.
+        *
+        * As a small optimization, only write the version field in the first
+        * and third write.  The vcpu->pv_time cache is still valid, because the
+        * version field is the first in the struct.
+        */
+       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+       vcpu->hv_clock.version = guest_hv_clock.version + 1;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+
+       smp_wmb();
+
+       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+       vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+
+       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock));
+
+       smp_wmb();
+
+       vcpu->hv_clock.version++;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
        unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
        u64 tsc_timestamp, host_tsc;
-       struct pvclock_vcpu_time_info guest_hv_clock;
        u8 pvclock_flags;
        bool use_master_clock;
 
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        }
        if (!use_master_clock) {
                host_tsc = rdtsc();
-               kernel_ns = get_kernel_ns();
+               kernel_ns = ktime_get_boot_ns();
        }
 
        tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        local_irq_restore(flags);
 
-       if (!vcpu->pv_time_enabled)
-               return 0;
+       /* With all the info we got, fill in the values */
 
        if (kvm_has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
 
-       /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_guest_tsc = tsc_timestamp;
 
-       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-               &guest_hv_clock, sizeof(guest_hv_clock))))
-               return 0;
-
-       /* This VCPU is paused, but it's legal for a guest to read another
-        * VCPU's kvmclock, so we really have to follow the specification where
-        * it says that version is odd if data is being modified, and even after
-        * it is consistent.
-        *
-        * Version field updates must be kept separate.  This is because
-        * kvm_write_guest_cached might use a "rep movs" instruction, and
-        * writes within a string instruction are weakly ordered.  So there
-        * are three writes overall.
-        *
-        * As a small optimization, only write the version field in the first
-        * and third write.  The vcpu->pv_time cache is still valid, because the
-        * version field is the first in the struct.
-        */
-       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-       vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
-
-       smp_wmb();
-
-       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-       pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-       if (vcpu->pvclock_set_guest_stopped_request) {
-               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-               vcpu->pvclock_set_guest_stopped_request = false;
-       }
-
        /* If the host uses TSC clocksource, then it is stable */
+       pvclock_flags = 0;
        if (use_master_clock)
                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
        vcpu->hv_clock.flags = pvclock_flags;
 
-       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock));
-
-       smp_wmb();
-
-       vcpu->hv_clock.version++;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       if (vcpu->pv_time_enabled)
+               kvm_setup_pvclock_page(v);
+       if (v == kvm_get_vcpu(v->kvm, 0))
+               kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
 }
 
@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
-                       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+                       kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
                if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_SET_CLOCK: {
                struct kvm_clock_data user_ns;
                u64 now_ns;
-               s64 delta;
 
                r = -EFAULT;
                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
                r = 0;
                local_irq_disable();
-               now_ns = get_kernel_ns();
-               delta = user_ns.clock - now_ns;
+               now_ns = __get_kvmclock_ns(kvm);
+               kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
                local_irq_enable();
-               kvm->arch.kvmclock_offset = delta;
                kvm_gen_update_masterclock(kvm);
                break;
        }
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
                struct kvm_clock_data user_ns;
                u64 now_ns;
 
-               local_irq_disable();
-               now_ns = get_kernel_ns();
-               user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-               local_irq_enable();
+               now_ns = get_kvmclock_ns(kvm);
+               user_ns.clock = now_ns;
                user_ns.flags = 0;
                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        kvm_put_guest_xcr0(vcpu);
 
-       /* Interrupt is enabled by handle_external_intr() */
        kvm_x86_ops->handle_external_intr(vcpu);
 
        ++vcpu->stat.exits;
@@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
         * before any KVM threads can be running.  Unfortunately, we can't
         * bring the TSCs fully up to date with real time, as we aren't yet far
         * enough into CPU bringup that we know how much real time has actually
-        * elapsed; our helper function, get_kernel_ns() will be using boot
+        * elapsed; our helper function, ktime_get_boot_ns() will be using boot
         * variables that haven't been updated yet.
         *
         * So we simply find the maximum observed TSC above, then record the
@@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        mutex_init(&kvm->arch.apic_map_lock);
        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
+       kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
        pvclock_update_vm_gtod_copy(kvm);
 
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
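Note on the pvclock update sequence: kvm_setup_pvclock_page(), added above, follows a seqcount-style protocol in which the version field is odd while the host is updating the page and even once the data is consistent. Below is a minimal, hedged sketch of the matching guest-side read loop. The structure and function names are illustrative only (the real guest structure is struct pvclock_vcpu_time_info), and kernel types and barriers (u32, u64, smp_rmb) are assumed.

	/*
	 * Illustrative sketch, not part of this patch: guest-side counterpart
	 * of the version protocol documented in kvm_setup_pvclock_page().
	 * Only the fields needed for the retry loop are shown.
	 */
	struct pv_time_sample {
		u32 version;		/* odd while the host is updating the page */
		u64 tsc_timestamp;
		u64 system_time;
	};

	static u64 pv_read_system_time(const volatile struct pv_time_sample *p)
	{
		u32 version;
		u64 system_time;

		do {
			version = p->version;
			smp_rmb();		/* read the data only after the version */
			system_time = p->system_time;
			smp_rmb();		/* re-check the version after the data */
		} while ((version & 1) || version != p->version);

		return system_time;
	}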
index a82ca46..e8ff3e4 100644 (file)
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
        return kvm_register_write(vcpu, reg, val);
 }
 
-static inline u64 get_kernel_ns(void)
-{
-       return ktime_get_boot_ns();
-}
-
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 {
        return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
        gva_t addr, void *val, unsigned int bytes,
index 3a483cb..bedfab9 100644 (file)
@@ -456,7 +456,7 @@ void __init xen_msi_init(void)
 
 int __init pci_xen_hvm_init(void)
 {
-       if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
+       if (!xen_feature(XENFEAT_hvm_pirqs))
                return 0;
 
 #ifdef CONFIG_ACPI
index f1d2182..c0fdd57 100644 (file)
@@ -137,8 +137,10 @@ struct shared_info xen_dummy_shared_info;
 void *xen_initial_gdt;
 
 RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
+static int xen_cpu_up_prepare(unsigned int cpu);
+static int xen_cpu_up_online(unsigned int cpu);
+static int xen_cpu_dead(unsigned int cpu);
 
 /*
  * Point at some empty memory to start with. We map the real shared_info
@@ -1519,10 +1521,7 @@ static void __init xen_pvh_early_guest_init(void)
        if (!xen_feature(XENFEAT_auto_translated_physmap))
                return;
 
-       if (!xen_feature(XENFEAT_hvm_callback_vector))
-               return;
-
-       xen_have_vector_callback = 1;
+       BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
 
        xen_pvh_early_cpu_init(0, false);
        xen_pvh_set_cr_flags(0);
@@ -1538,6 +1537,24 @@ static void __init xen_dom0_set_legacy_features(void)
        x86_platform.legacy.rtc = 1;
 }
 
+static int xen_cpuhp_setup(void)
+{
+       int rc;
+
+       rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
+                                      "XEN_HVM_GUEST_PREPARE",
+                                      xen_cpu_up_prepare, xen_cpu_dead);
+       if (rc >= 0) {
+               rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                              "XEN_HVM_GUEST_ONLINE",
+                                              xen_cpu_up_online, NULL);
+               if (rc < 0)
+                       cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
+       }
+
+       return rc >= 0 ? 0 : rc;
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
@@ -1639,6 +1656,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
           possible map and a non-dummy shared_info. */
        per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
+       WARN_ON(xen_cpuhp_setup());
+
        local_irq_disable();
        early_boot_irqs_disabled = true;
 
@@ -1819,31 +1838,54 @@ static void __init init_hvm_pv_info(void)
        xen_domain_type = XEN_HVM_DOMAIN;
 }
 
-static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
-                             void *hcpu)
+static int xen_cpu_up_prepare(unsigned int cpu)
 {
-       int cpu = (long)hcpu;
-       switch (action) {
-       case CPU_UP_PREPARE:
+       int rc;
+
+       if (xen_hvm_domain()) {
+               /*
+                * This can happen if CPU was offlined earlier and
+                * offlining timed out in common_cpu_die().
+                */
+               if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+                       xen_smp_intr_free(cpu);
+                       xen_uninit_lock_cpu(cpu);
+               }
+
                if (cpu_acpi_id(cpu) != U32_MAX)
                        per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
                else
                        per_cpu(xen_vcpu_id, cpu) = cpu;
                xen_vcpu_setup(cpu);
-               if (xen_have_vector_callback) {
-                       if (xen_feature(XENFEAT_hvm_safe_pvclock))
-                               xen_setup_timer(cpu);
-               }
-               break;
-       default:
-               break;
        }
-       return NOTIFY_OK;
+
+       if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+               xen_setup_timer(cpu);
+
+       rc = xen_smp_intr_init(cpu);
+       if (rc) {
+               WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+                    cpu, rc);
+               return rc;
+       }
+       return 0;
 }
 
-static struct notifier_block xen_hvm_cpu_notifier = {
-       .notifier_call  = xen_hvm_cpu_notify,
-};
+static int xen_cpu_dead(unsigned int cpu)
+{
+       xen_smp_intr_free(cpu);
+
+       if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+               xen_teardown_timer(cpu);
+
+       return 0;
+}
+
+static int xen_cpu_up_online(unsigned int cpu)
+{
+       xen_init_lock_cpu(cpu);
+       return 0;
+}
 
 #ifdef CONFIG_KEXEC_CORE
 static void xen_hvm_shutdown(void)
@@ -1871,10 +1913,10 @@ static void __init xen_hvm_guest_init(void)
 
        xen_panic_handler_init();
 
-       if (xen_feature(XENFEAT_hvm_callback_vector))
-               xen_have_vector_callback = 1;
+       BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
+
        xen_hvm_smp_init();
-       register_cpu_notifier(&xen_hvm_cpu_notifier);
+       WARN_ON(xen_cpuhp_setup());
        xen_unplug_emulated_devices();
        x86_init.irqs.intr_init = xen_init_IRQ;
        xen_hvm_init_time_ops();
@@ -1910,7 +1952,7 @@ bool xen_hvm_need_lapic(void)
                return false;
        if (!xen_hvm_domain())
                return false;
-       if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+       if (xen_feature(XENFEAT_hvm_pirqs))
                return false;
        return true;
 }
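The enlighten.c changes above replace the old CPU notifier (register_cpu_notifier()/xen_hvm_cpu_notify()) with the CPU hotplug state machine: a fixed CPUHP_XEN_PREPARE state carries the prepare/dead pair and a dynamically allocated state carries the online callback. For reference, a minimal sketch of the same registration pattern for a hypothetical subsystem, using only a dynamic AP-online state, could look as follows; the "foo" names are illustrative and not part of this patch.

	static int foo_cpu_online(unsigned int cpu)
	{
		/* called as @cpu comes online */
		return 0;
	}

	static int foo_cpu_offline(unsigned int cpu)
	{
		/* called as @cpu goes offline */
		return 0;
	}

	static int __init foo_cpuhp_init(void)
	{
		int ret;

		ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "foo:online",
						foo_cpu_online, foo_cpu_offline);
		/* for dynamic states a non-negative return is the allocated state */
		return ret >= 0 ? 0 : ret;
	}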
index de4144c..809b6c8 100644 (file)
@@ -89,7 +89,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 
 static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
 {
-       area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+       area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
        if (area->ptes == NULL)
                return -ENOMEM;
 
index d37a0c7..90d1b83 100644 (file)
@@ -61,7 +61,7 @@ static int check_platform_magic(void)
                }
                break;
        default:
-               printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+               printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
                return XEN_PLATFORM_ERR_PROTOCOL;
        }
 
index 32bdc2c..b9fc525 100644 (file)
@@ -547,8 +547,11 @@ void xen_pmu_init(int cpu)
        return;
 
 fail:
-       pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
-               cpu, err);
+       if (err == -EOPNOTSUPP || err == -ENOSYS)
+               pr_info_once("VPMU disabled by hypervisor.\n");
+       else
+               pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
+                       cpu, err);
        free_pages((unsigned long)xenpmu_data, 0);
 }
 
index 0b4d04c..9fa27ce 100644 (file)
@@ -87,6 +87,12 @@ static void cpu_bringup(void)
        cpu_data(cpu).x86_max_cores = 1;
        set_cpu_sibling_map(cpu);
 
+       /*
+        * identify_cpu() may have set logical_pkg_id to -1 due
+        * to incorrect phys_proc_id. Let's re-compute it.
+        */
+       topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu);
+
        xen_setup_cpu_clockevents();
 
        notify_cpu_starting(cpu);
@@ -115,7 +121,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
-static void xen_smp_intr_free(unsigned int cpu)
+void xen_smp_intr_free(unsigned int cpu)
 {
        if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
                unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
@@ -159,7 +165,7 @@ static void xen_smp_intr_free(unsigned int cpu)
                per_cpu(xen_pmu_irq, cpu).name = NULL;
        }
 };
-static int xen_smp_intr_init(unsigned int cpu)
+int xen_smp_intr_init(unsigned int cpu)
 {
        int rc;
        char *resched_name, *callfunc_name, *debug_name, *pmu_name;
@@ -475,8 +481,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
        common_cpu_up(cpu, idle);
 
        xen_setup_runstate_info(cpu);
-       xen_setup_timer(cpu);
-       xen_init_lock_cpu(cpu);
 
        /*
         * PV VCPUs are always successfully taken down (see 'while' loop
@@ -495,10 +499,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 
        xen_pmu_init(cpu);
 
-       rc = xen_smp_intr_init(cpu);
-       if (rc)
-               return rc;
-
        rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
        BUG_ON(rc);
 
@@ -769,47 +769,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
        xen_init_lock_cpu(0);
 }
 
-static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
-       int rc;
-
-       /*
-        * This can happen if CPU was offlined earlier and
-        * offlining timed out in common_cpu_die().
-        */
-       if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
-               xen_smp_intr_free(cpu);
-               xen_uninit_lock_cpu(cpu);
-       }
-
-       /*
-        * xen_smp_intr_init() needs to run before native_cpu_up()
-        * so that IPI vectors are set up on the booting CPU before
-        * it is marked online in native_cpu_up().
-       */
-       rc = xen_smp_intr_init(cpu);
-       WARN_ON(rc);
-       if (!rc)
-               rc =  native_cpu_up(cpu, tidle);
-
-       /*
-        * We must initialize the slowpath CPU kicker _after_ the native
-        * path has executed. If we initialized it before none of the
-        * unlocker IPI kicks would reach the booting CPU as the booting
-        * CPU had not set itself 'online' in cpu_online_mask. That mask
-        * is checked when IPIs are sent (on HVM at least).
-        */
-       xen_init_lock_cpu(cpu);
-       return rc;
-}
-
 void __init xen_hvm_smp_init(void)
 {
-       if (!xen_have_vector_callback)
-               return;
        smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
        smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
-       smp_ops.cpu_up = xen_hvm_cpu_up;
        smp_ops.cpu_die = xen_cpu_die;
        smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
        smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
index 963d62a..c5c16dc 100644 (file)
@@ -1,5 +1,6 @@
 #ifndef _XEN_SMP_H
 
+#ifdef CONFIG_SMP
 extern void xen_send_IPI_mask(const struct cpumask *mask,
                              int vector);
 extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
@@ -8,6 +9,18 @@ extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
 
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+       return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_XEN_PVH
 extern void xen_pvh_early_cpu_init(int cpu, bool entry);
 #else
index 67356d2..33d8f6a 100644 (file)
@@ -432,11 +432,6 @@ static void xen_hvm_setup_cpu_clockevents(void)
 
 void __init xen_hvm_init_time_ops(void)
 {
-       /* vector callback is needed otherwise we cannot receive interrupts
-        * on cpu > 0 and at this point we don't know how many cpus are
-        * available */
-       if (!xen_have_vector_callback)
-               return;
        if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
                printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
                                "disable pv timer\n");
index 64336f6..f610586 100644 (file)
@@ -13,16 +13,19 @@ config XTENSA
        select GENERIC_IRQ_SHOW
        select GENERIC_PCI_IOMAP
        select GENERIC_SCHED_CLOCK
+       select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_API_DEBUG
        select HAVE_EXIT_THREAD
        select HAVE_FUNCTION_TRACER
        select HAVE_FUTEX_CMPXCHG if !MMU
        select HAVE_HW_BREAKPOINT if PERF_EVENTS
        select HAVE_IRQ_TIME_ACCOUNTING
+       select HAVE_MEMBLOCK
        select HAVE_OPROFILE
        select HAVE_PERF_EVENTS
        select IRQ_DOMAIN
        select MODULES_USE_ELF_RELA
+       select NO_BOOTMEM
        select PERF_USE_VMALLOC
        select VIRT_TO_BUS
        help
@@ -209,7 +212,8 @@ config HOTPLUG_CPU
 
 config INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
        bool "Initialize Xtensa MMU inside the Linux kernel code"
-       default y
+       depends on !XTENSA_VARIANT_FSF && !XTENSA_VARIANT_DC232B
+       default y if XTENSA_VARIANT_DC233C || XTENSA_VARIANT_CUSTOM
        help
          Earlier version initialized the MMU in the exception vector
          before jumping to _startup in head.S and had an advantage that
@@ -236,6 +240,71 @@ config INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
 
          If in doubt, say Y.
 
+config KSEG_PADDR
+       hex "Physical address of the KSEG mapping"
+       depends on INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX && MMU
+       default 0x00000000
+       help
+         This is the physical address where KSEG is mapped. Please refer to
+         the chosen KSEG layout help for the required address alignment.
+         The unpacked kernel image (including vectors) must be located
+         completely within KSEG.
+         Physical memory below this address is not available to Linux.
+
+         If unsure, leave the default value here.
+
+config KERNEL_LOAD_ADDRESS
+       hex "Kernel load address"
+       default 0x60003000 if !MMU
+       default 0x00003000 if MMU && INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
+       default 0xd0003000 if MMU && !INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
+       help
+         This is the address where the kernel is loaded.
+         It is a virtual address for MMUv2 configurations and a physical
+         address for all other configurations.
+
+         If unsure, leave the default value here.
+
+config VECTORS_OFFSET
+       hex "Kernel vectors offset"
+       default 0x00003000
+       help
+         This is the offset of the kernel image from the relocatable vectors
+         base.
+
+         If unsure, leave the default value here.
+
+choice
+       prompt "KSEG layout"
+       depends on MMU
+       default XTENSA_KSEG_MMU_V2
+
+config XTENSA_KSEG_MMU_V2
+       bool "MMUv2: 128MB cached + 128MB uncached"
+       help
+         MMUv2 compatible kernel memory map: TLB way 5 maps 128MB starting
+         at KSEG_PADDR to 0xd0000000 with cache and to 0xd8000000
+         without cache.
+         KSEG_PADDR must be aligned to 128MB.
+
+config XTENSA_KSEG_256M
+       bool "256MB cached + 256MB uncached"
+       depends on INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
+       help
+         TLB way 6 maps 256MB starting at KSEG_PADDR to 0xb0000000
+         with cache and to 0xc0000000 without cache.
+         KSEG_PADDR must be aligned to 256MB.
+
+config XTENSA_KSEG_512M
+       bool "512MB cached + 512MB uncached"
+       depends on INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
+       help
+         TLB way 6 maps 512MB starting at KSEG_PADDR to 0xa0000000
+         with cache and to 0xc0000000 without cache.
+         KSEG_PADDR must be aligned to 256MB.
+
+endchoice
+
 config HIGHMEM
        bool "High Memory Support"
        depends on MMU
@@ -331,7 +400,7 @@ config XTENSA_PLATFORM_XT2000
 config XTENSA_PLATFORM_XTFPGA
        bool "XTFPGA"
        select ETHOC if ETHERNET
-       select PLATFORM_WANT_DEFAULT_MEM
+       select PLATFORM_WANT_DEFAULT_MEM if !MMU
        select SERIAL_CONSOLE
        select XTENSA_CALIBRATE_CCOUNT
        help
@@ -369,6 +438,7 @@ config USE_OF
        bool "Flattened Device Tree support"
        select OF
        select OF_EARLY_FLATTREE
+       select OF_RESERVED_MEM
        help
          Include support for flattened device tree machine descriptions.
 
@@ -439,16 +509,9 @@ config DEFAULT_MEM_START
        default 0x00000000 if MMU
        default 0x60000000 if !MMU
        help
-         This is a fallback start address of the default memory area, it is
-         used when no physical memory size is passed through DTB or through
-         boot parameter from bootloader.
-
-         In noMMU configuration the following parameters are derived from it:
-         - kernel load address;
-         - kernel entry point address;
-         - relocatable vectors base address;
-         - uBoot load address;
-         - TASK_SIZE.
+         This is the base address of the default memory area.
+         The default memory area has a platform-specific meaning; it may be
+         used, for example, for early cache initialization.
 
          If unsure, leave the default value here.
 
@@ -457,11 +520,9 @@ config DEFAULT_MEM_SIZE
        depends on PLATFORM_WANT_DEFAULT_MEM
        default 0x04000000
        help
-         This is a fallback size of the default memory area, it is used when
-         no physical memory size is passed through DTB or through boot
-         parameter from bootloader.
-
-         It's also used for TASK_SIZE calculation in noMMU configuration.
+         This is the size of the default memory area.
+         The default memory area has a platform-specific meaning; it may be
+         used, for example, for early cache initialization.
 
          If unsure, leave the default value here.
 
index e54f2c9..a309930 100644 (file)
@@ -23,7 +23,7 @@ SECTIONS
                *(.ResetVector.text)
        }
 
-       .image KERNELOFFSET: AT (LOAD_MEMORY_ADDRESS)
+       .image KERNELOFFSET: AT (CONFIG_KERNEL_LOAD_ADDRESS)
        {
                _image_start = .;
                *(image)
index e6bf313..b6aa853 100644 (file)
@@ -35,7 +35,12 @@ _ResetVector:
 
        .align 4
 RomInitAddr:
-       .word   LOAD_MEMORY_ADDRESS
+#if defined(CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX) && \
+       XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY
+       .word   CONFIG_KERNEL_LOAD_ADDRESS
+#else
+       .word   KERNELOFFSET
+#endif
 RomBootParam:
        .word _bootparam
 _bootparam:
index 403fcf2..0f4c417 100644 (file)
@@ -4,15 +4,7 @@
 # for more details.
 #
 
-ifdef CONFIG_MMU
-ifdef CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX
-UIMAGE_LOADADDR = 0x00003000
-else
-UIMAGE_LOADADDR = 0xd0003000
-endif
-else
-UIMAGE_LOADADDR = $(shell printf "0x%x" $$(( ${CONFIG_DEFAULT_MEM_START} + 0x3000 )) )
-endif
+UIMAGE_LOADADDR = $(CONFIG_KERNEL_LOAD_ADDRESS)
 UIMAGE_COMPRESSION = gzip
 
 $(obj)/../uImage: vmlinux.bin.gz FORCE
diff --git a/arch/xtensa/boot/dts/csp.dts b/arch/xtensa/boot/dts/csp.dts
new file mode 100644 (file)
index 0000000..4082f26
--- /dev/null
@@ -0,0 +1,54 @@
+/dts-v1/;
+
+/ {
+       compatible = "cdns,xtensa-xtfpga";
+       #address-cells = <1>;
+       #size-cells = <1>;
+       interrupt-parent = <&pic>;
+
+       chosen {
+               bootargs = "earlycon=cdns,0xfd000000,115200 console=tty0 console=ttyPS0,115200 root=/dev/ram0 rw earlyprintk xilinx_uartps.rx_trigger_level=32 loglevel=8 nohz=off ignore_loglevel";
+       };
+
+       memory@0 {
+               device_type = "memory";
+               reg = <0x00000000 0x40000000>;
+       };
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+               cpu@0 {
+                       compatible = "cdns,xtensa-cpu";
+                       reg = <0>;
+               };
+       };
+
+       pic: pic {
+               compatible = "cdns,xtensa-pic";
+               #interrupt-cells = <2>;
+               interrupt-controller;
+       };
+
+       clocks {
+               osc: main-oscillator {
+                       #clock-cells = <0>;
+                       compatible = "fixed-clock";
+               };
+       };
+
+       soc {
+               #address-cells = <1>;
+               #size-cells = <1>;
+               compatible = "simple-bus";
+               ranges = <0x00000000 0xf0000000 0x10000000>;
+
+               uart0: serial@0d000000 {
+                       compatible = "xlnx,xuartps", "cdns,uart-r1p8";
+                       clocks = <&osc>, <&osc>;
+                       clock-names = "uart_clk", "pclk";
+                       reg = <0x0d000000 0x1000>;
+                       interrupts = <0 1>;
+               };
+       };
+};
index cd45f9c..91616a9 100644 (file)
@@ -19,9 +19,7 @@
                cpu@0 {
                        compatible = "cdns,xtensa-cpu";
                        reg = <0>;
-                       /* Filled in by platform_setup from FPGA register
-                        * clock-frequency = <100000000>;
-                        */
+                       clocks = <&osc>;
                };
        };
 
        };
 
        clocks {
-               osc: main-oscillator {
-                       #clock-cells = <0>;
-                       compatible = "fixed-clock";
-               };
-
                clk54: clk54 {
                        #clock-cells = <0>;
                        compatible = "fixed-clock";
                compatible = "simple-bus";
                ranges = <0x00000000 0xf0000000 0x10000000>;
 
+               osc: main-oscillator {
+                       #clock-cells = <0>;
+                       compatible = "cdns,xtfpga-clock";
+                       reg = <0x0d020004 0x4>;
+               };
+
                serial0: serial@0d050020 {
                        device_type = "serial";
                        compatible = "ns16550a";
index c4904db..8d16925 100644 (file)
@@ -33,7 +33,7 @@ CONFIG_HIGHMEM=y
 # CONFIG_PCI is not set
 CONFIG_XTENSA_PLATFORM_XTFPGA=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="earlycon=uart8250,mmio32,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug"
+CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x38000000@0"
 CONFIG_USE_OF=y
 CONFIG_BUILTIN_DTB="kc705"
 # CONFIG_COMPACTION is not set
diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig
new file mode 100644 (file)
index 0000000..f2d3094
--- /dev/null
@@ -0,0 +1,122 @@
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_USELIB=y
+CONFIG_IRQ_DOMAIN_DEBUG=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IRQ_TIME_ACCOUNTING=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_NAMESPACES=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE="$$KERNEL_INITRAMFS_SOURCE"
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_XZ is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_EMBEDDED=y
+CONFIG_PROFILING=y
+CONFIG_MODULES=y
+CONFIG_MODULE_FORCE_LOAD=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+CONFIG_XTENSA_VARIANT_CUSTOM=y
+CONFIG_XTENSA_VARIANT_CUSTOM_NAME="csp"
+CONFIG_XTENSA_UNALIGNED_USER=y
+CONFIG_PREEMPT=y
+CONFIG_HIGHMEM=y
+# CONFIG_PCI is not set
+CONFIG_XTENSA_PLATFORM_XTFPGA=y
+CONFIG_USE_OF=y
+CONFIG_BUILTIN_DTB="csp"
+# CONFIG_COMPACTION is not set
+CONFIG_XTFPGA_LCD=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_DIAG is not set
+# CONFIG_IPV6 is not set
+# CONFIG_WIRELESS is not set
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+# CONFIG_STANDALONE is not set
+# CONFIG_FW_LOADER is not set
+CONFIG_MTD=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_JEDECPROBE=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_MTD_CFI_STAA=y
+CONFIG_MTD_PHYSMAP_OF=y
+CONFIG_MTD_UBI=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+# CONFIG_INPUT_MOUSEDEV is not set
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_LEGACY_PTY_COUNT=16
+CONFIG_SERIAL_XILINX_PS_UART=y
+CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y
+CONFIG_HW_RANDOM=y
+# CONFIG_HWMON is not set
+CONFIG_WATCHDOG=y
+CONFIG_WATCHDOG_NOWAYOUT=y
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_EXT3_FS=y
+CONFIG_FANOTIFY=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_NFS_FS=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_SWAP=y
+CONFIG_ROOT_NFS=y
+CONFIG_SUNRPC_DEBUG=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOCKUP_DETECTOR=y
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_RCU_TRACE=y
+CONFIG_FUNCTION_TRACER=y
+# CONFIG_S32C1I_SELFTEST is not set
+# CONFIG_CRYPTO_ECHAINIV is not set
+CONFIG_CRYPTO_DEFLATE=y
+CONFIG_CRYPTO_LZO=y
+# CONFIG_CRYPTO_HW is not set
index 721df12..4bcc76b 100644 (file)
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.11-rc2
-# Tue Mar  1 16:36:53 2005
-#
-# CONFIG_FRAME_POINTER is not set
-CONFIG_XTENSA=y
-# CONFIG_UID16 is not set
-CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-CONFIG_HAVE_DEC_LOCK=y
-
-#
-# Code maturity level options
-#
-CONFIG_EXPERIMENTAL=y
-CONFIG_CLEAN_COMPILE=y
-CONFIG_BROKEN_ON_SMP=y
-
-#
-# General setup
-#
-CONFIG_LOCALVERSION=""
-CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
-# CONFIG_POSIX_MQUEUE is not set
 CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
-CONFIG_SYSCTL=y
-# CONFIG_AUDIT is not set
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_HOTPLUG is not set
-CONFIG_KOBJECT_UEVENT=y
-# CONFIG_IKCONFIG is not set
-# CONFIG_EXPERT is not set
-CONFIG_KALLSYMS=y
-# CONFIG_KALLSYMS_ALL is not set
-# CONFIG_KALLSYMS_EXTRA_PASS is not set
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-CONFIG_SHMEM=y
-CONFIG_CC_ALIGN_FUNCTIONS=0
-CONFIG_CC_ALIGN_LABELS=0
-CONFIG_CC_ALIGN_LOOPS=0
-CONFIG_CC_ALIGN_JUMPS=0
-# CONFIG_TINY_SHMEM is not set
-
-#
-# Loadable module support
-#
 CONFIG_MODULES=y
-# CONFIG_MODULE_UNLOAD is not set
-CONFIG_OBSOLETE_MODPARM=y
 CONFIG_MODVERSIONS=y
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_KMOD=y
-
-#
-# Processor type and features
-#
-CONFIG_XTENSA_ARCH_LINUX_BE=y
-# CONFIG_XTENSA_ARCH_LINUX_LE is not set
-# CONFIG_XTENSA_ARCH_LINUX_TEST is not set
-# CONFIG_XTENSA_ARCH_S5 is not set
-# CONFIG_XTENSA_CUSTOM is not set
-CONFIG_MMU=y
-# CONFIG_XTENSA_UNALIGNED_USER is not set
-# CONFIG_PREEMPT is not set
-# CONFIG_HIGHMEM is not set
-
-#
-# Platform options
-#
-# CONFIG_XTENSA_PLATFORM_ISS is not set
 CONFIG_XTENSA_PLATFORM_XT2000=y
-CONFIG_XTENSA_CALIBRATE_CCOUNT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttyS0,38400 ip=bootp root=nfs nfsroot=/opt/montavista/pro/devkit/xtensa/linux_be/target"
-
-#
-# Bus options
-#
-CONFIG_PCI=y
-# CONFIG_PCI_LEGACY_PROC is not set
-# CONFIG_PCI_NAMES is not set
-
-#
-# PCCARD (PCMCIA/CardBus) support
-#
-# CONFIG_PCCARD is not set
-
-#
-# PC-card bridges
-#
-
-#
-# PCI Hotplug Support
-#
-# CONFIG_HOTPLUG_PCI is not set
-
-#
-# Exectuable file formats
-#
-CONFIG_KCORE_ELF=y
-CONFIG_BINFMT_ELF=y
+CONFIG_CMDLINE="console=ttyS0,38400 ip=bootp root=nfs nfsroot=/opt/montavista/pro/devkit/xtensa/linux_be/target memmap=128M@0"
 CONFIG_BINFMT_MISC=y
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-# CONFIG_FW_LOADER is not set
-# CONFIG_DEBUG_DRIVER is not set
-
-#
-# Memory Technology Devices (MTD)
-#
-# CONFIG_MTD is not set
-
-#
-# Parallel port support
-#
-# CONFIG_PARPORT is not set
-
-#
-# Plug and Play support
-#
-
-#
-# Block devices
-#
-# CONFIG_BLK_DEV_FD is not set
-# CONFIG_BLK_CPQ_DA is not set
-# CONFIG_BLK_CPQ_CISS_DA is not set
-# CONFIG_BLK_DEV_DAC960 is not set
-# CONFIG_BLK_DEV_UMEM is not set
-# CONFIG_BLK_DEV_COW_COMMON is not set
-# CONFIG_BLK_DEV_LOOP is not set
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_SX8 is not set
-# CONFIG_BLK_DEV_RAM is not set
-CONFIG_BLK_DEV_RAM_COUNT=16
-CONFIG_INITRAMFS_SOURCE=""
-# CONFIG_CDROM_PKTCDVD is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-# CONFIG_ATA_OVER_ETH is not set
-
-#
-# ATA/ATAPI/MFM/RLL support
-#
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-# CONFIG_SCSI is not set
-
-#
-# Multi-device support (RAID and LVM)
-#
-# CONFIG_MD is not set
-
-#
-# Fusion MPT device support
-#
-
-#
-# IEEE 1394 (FireWire) support
-#
-# CONFIG_IEEE1394 is not set
-
-#
-# I2O device support
-#
-# CONFIG_I2O is not set
-
-#
-# Networking support
-#
 CONFIG_NET=y
-
-#
-# Networking options
-#
-# CONFIG_PACKET is not set
-# CONFIG_NETLINK_DEV is not set
 CONFIG_UNIX=y
-# CONFIG_NET_KEY is not set
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_ADVANCED_ROUTER=y
@@ -209,47 +20,10 @@ CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
-# CONFIG_IP_MROUTE is not set
-# CONFIG_ARPD is not set
-# CONFIG_SYN_COOKIES is not set
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_TUNNEL is not set
-# CONFIG_IP_TCPDIAG is not set
-# CONFIG_IP_TCPDIAG_IPV6 is not set
 # CONFIG_IPV6 is not set
-# CONFIG_NETFILTER is not set
-
-#
-# SCTP Configuration (EXPERIMENTAL)
-#
-# CONFIG_IP_SCTP is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_NET_DIVERT is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-
-#
-# QoS and/or fair queueing
-#
 CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_CLK_JIFFIES=y
-# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set
-# CONFIG_NET_SCH_CLK_CPU is not set
 CONFIG_NET_SCH_CBQ=m
 CONFIG_NET_SCH_HTB=m
-# CONFIG_NET_SCH_HFSC is not set
 CONFIG_NET_SCH_PRIO=m
 CONFIG_NET_SCH_RED=m
 CONFIG_NET_SCH_SFQ=m
@@ -257,399 +31,24 @@ CONFIG_NET_SCH_TEQL=m
 CONFIG_NET_SCH_TBF=m
 CONFIG_NET_SCH_GRED=m
 CONFIG_NET_SCH_DSMARK=m
-# CONFIG_NET_SCH_NETEM is not set
-CONFIG_NET_SCH_INGRESS=m
-CONFIG_NET_QOS=y
-CONFIG_NET_ESTIMATOR=y
-CONFIG_NET_CLS=y
 CONFIG_NET_CLS_TCINDEX=m
 CONFIG_NET_CLS_ROUTE4=m
-CONFIG_NET_CLS_ROUTE=y
 CONFIG_NET_CLS_FW=m
 CONFIG_NET_CLS_U32=m
-# CONFIG_CLS_U32_PERF is not set
-# CONFIG_NET_CLS_IND is not set
 CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
-# CONFIG_NET_CLS_ACT is not set
-CONFIG_NET_CLS_POLICE=y
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_NETPOLL is not set
-# CONFIG_NET_POLL_CONTROLLER is not set
-# CONFIG_HAMRADIO is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
 CONFIG_NETDEVICES=y
 CONFIG_DUMMY=y
-# CONFIG_BONDING is not set
-# CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
-
-#
-# ARCnet devices
-#
-# CONFIG_ARCNET is not set
-
-#
-# Ethernet (10 or 100Mbit)
-#
-CONFIG_NET_ETHERNET=y
-# CONFIG_MII is not set
-CONFIG_XT2000_SONIC=y
-# CONFIG_HAPPYMEAL is not set
-# CONFIG_SUNGEM is not set
 # CONFIG_NET_VENDOR_3COM is not set
-
-#
-# Tulip family network device support
-#
-# CONFIG_NET_TULIP is not set
-# CONFIG_HP100 is not set
-# CONFIG_NET_PCI is not set
-
-#
-# Ethernet (1000 Mbit)
-#
-# CONFIG_ACENIC is not set
-# CONFIG_DL2K is not set
-# CONFIG_E1000 is not set
-# CONFIG_NS83820 is not set
-# CONFIG_HAMACHI is not set
-# CONFIG_YELLOWFIN is not set
-# CONFIG_R8169 is not set
-# CONFIG_SK98LIN is not set
-# CONFIG_TIGON3 is not set
-
-#
-# Ethernet (10000 Mbit)
-#
-# CONFIG_IXGB is not set
-# CONFIG_S2IO is not set
-
-#
-# Wireless LAN (non-hamradio)
-#
-CONFIG_NET_RADIO=y
-
-#
-# Obsolete Wireless cards support (pre-802.11)
-#
-CONFIG_STRIP=m
-
-#
-# Wireless 802.11b ISA/PCI cards support
-#
-CONFIG_HERMES=m
-# CONFIG_PLX_HERMES is not set
-# CONFIG_TMD_HERMES is not set
-# CONFIG_PCI_HERMES is not set
-# CONFIG_ATMEL is not set
-
-#
-# Prism GT/Duette 802.11(a/b/g) PCI/Cardbus support
-#
-# CONFIG_PRISM54 is not set
-CONFIG_NET_WIRELESS=y
-
-#
-# Wan interfaces
-#
-# CONFIG_WAN is not set
-# CONFIG_FDDI is not set
-# CONFIG_HIPPI is not set
-# CONFIG_PPP is not set
-# CONFIG_SLIP is not set
-# CONFIG_SHAPER is not set
-# CONFIG_NETCONSOLE is not set
-
-#
-# ISDN subsystem
-#
-# CONFIG_ISDN is not set
-
-#
-# Telephony Support
-#
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
-# CONFIG_INPUT_TSDEV is not set
-# CONFIG_INPUT_EVDEV is not set
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input I/O drivers
-#
-# CONFIG_GAMEPORT is not set
-CONFIG_SOUND_GAMEPORT=y
-CONFIG_SERIO=y
-# CONFIG_SERIO_I8042 is not set
-# CONFIG_SERIO_SERPORT is not set
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-# CONFIG_SERIO_RAW is not set
-
-#
-# Input Device Drivers
-#
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
-# CONFIG_INPUT_JOYSTICK is not set
-# CONFIG_INPUT_TOUCHSCREEN is not set
-# CONFIG_INPUT_MISC is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
-# CONFIG_SERIAL_NONSTANDARD is not set
-
-#
-# Serial drivers
-#
+# CONFIG_SERIO_SERPORT is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=4
-# CONFIG_SERIAL_8250_EXTENDED is not set
-
-#
-# Non-8250 serial port support
-#
-CONFIG_SERIAL_CORE=y
-CONFIG_SERIAL_CORE_CONSOLE=y
-CONFIG_UNIX98_PTYS=y
-CONFIG_LEGACY_PTYS=y
-CONFIG_LEGACY_PTY_COUNT=256
-
-#
-# IPMI
-#
-# CONFIG_IPMI_HANDLER is not set
-
-#
-# Watchdog Cards
-#
-# CONFIG_WATCHDOG is not set
-# CONFIG_RTC is not set
-# CONFIG_GEN_RTC is not set
-# CONFIG_DTLK is not set
-# CONFIG_R3964 is not set
-# CONFIG_APPLICOM is not set
-
-#
-# Ftape, the floppy tape device driver
-#
-# CONFIG_DRM is not set
-# CONFIG_RAW_DRIVER is not set
-
-#
-# I2C support
-#
-# CONFIG_I2C is not set
-
-#
-# Dallas's 1-wire bus
-#
-# CONFIG_W1 is not set
-
-#
-# Misc devices
-#
-
-#
-# Multimedia devices
-#
-# CONFIG_VIDEO_DEV is not set
-
-#
-# Digital Video Broadcasting Devices
-#
-# CONFIG_DVB is not set
-
-#
-# Graphics support
-#
-# CONFIG_FB is not set
-
-#
-# Console display driver support
-#
 # CONFIG_VGA_CONSOLE is not set
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
-
-#
-# Sound
-#
-# CONFIG_SOUND is not set
-
-#
-# USB support
-#
-# CONFIG_USB is not set
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
-
-#
-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
-#
-
-#
-# USB Gadget Support
-#
-# CONFIG_USB_GADGET is not set
-
-#
-# MMC/SD Card support
-#
-# CONFIG_MMC is not set
-
-#
-# InfiniBand support
-#
-# CONFIG_INFINIBAND is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
-# CONFIG_EXT3_FS is not set
-# CONFIG_JBD is not set
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-# CONFIG_XFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_QUOTA is not set
-CONFIG_DNOTIFY=y
-# CONFIG_AUTOFS_FS is not set
-# CONFIG_AUTOFS4_FS is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
-# CONFIG_ISO9660_FS is not set
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-# CONFIG_MSDOS_FS is not set
-# CONFIG_VFAT_FS is not set
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
-# CONFIG_PROC_KCORE is not set
-CONFIG_SYSFS=y
-CONFIG_DEVFS_FS=y
-# CONFIG_DEVFS_MOUNT is not set
-# CONFIG_DEVFS_DEBUG is not set
-# CONFIG_DEVPTS_FS_XATTR is not set
-# CONFIG_TMPFS is not set
-# CONFIG_HUGETLB_PAGE is not set
-CONFIG_RAMFS=y
-
-#
-# Miscellaneous filesystems
-#
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-
-#
-# Network File Systems
-#
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
-# CONFIG_NFS_V4 is not set
-# CONFIG_NFS_DIRECTIO is not set
-# CONFIG_NFSD is not set
 CONFIG_ROOT_NFS=y
-CONFIG_LOCKD=y
-CONFIG_LOCKD_V4=y
-# CONFIG_EXPORTFS is not set
-CONFIG_SUNRPC=y
-# CONFIG_RPCSEC_GSS_KRB5 is not set
-# CONFIG_RPCSEC_GSS_SPKM3 is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
-# CONFIG_PARTITION_ADVANCED is not set
-CONFIG_MSDOS_PARTITION=y
-
-#
-# Native Language Support
-#
-# CONFIG_NLS is not set
-
-#
-# Kernel hacking
-#
-CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_STACKOVERFLOW is not set
-# CONFIG_DEBUG_SLAB is not set
+# CONFIG_FRAME_POINTER is not set
 CONFIG_MAGIC_SYSRQ=y
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_PAGEALLOC is not set
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_KGDB is not set
-
-#
-# Security options
-#
-# CONFIG_KEYS is not set
-# CONFIG_SECURITY is not set
-
-#
-# Cryptographic options
-#
-# CONFIG_CRYPTO is not set
-
-#
-# Hardware crypto devices
-#
-
-#
-# Library routines
-#
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC32 is not set
-# CONFIG_LIBCRC32C is not set
+CONFIG_DEBUG_KERNEL=y
index d9444f0..744adea 100644 (file)
@@ -32,7 +32,7 @@ CONFIG_HIGHMEM=y
 # CONFIG_PCI is not set
 CONFIG_XTENSA_PLATFORM_XTFPGA=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="earlycon=uart8250,mmio32,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug"
+CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x38000000@0"
 CONFIG_USE_OF=y
 CONFIG_BUILTIN_DTB="kc705"
 # CONFIG_COMPACTION is not set
index 44c6764..4bb5b76 100644 (file)
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.34-rc6
-# Tue Aug  3 00:10:54 2010
-#
-# CONFIG_FRAME_POINTER is not set
-CONFIG_ZONE_DMA=y
-CONFIG_XTENSA=y
-CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-CONFIG_GENERIC_FIND_NEXT_BIT=y
-CONFIG_GENERIC_HWEIGHT=y
-# CONFIG_ARCH_HAS_ILOG2_U32 is not set
-# CONFIG_ARCH_HAS_ILOG2_U64 is not set
-CONFIG_NO_IOPORT_MAP=y
-CONFIG_HZ=100
-CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
-CONFIG_CONSTRUCTORS=y
-
-#
-# General setup
-#
-CONFIG_EXPERIMENTAL=y
-CONFIG_BROKEN_ON_SMP=y
-CONFIG_INIT_ENV_ARG_LIMIT=32
-CONFIG_LOCALVERSION=""
-CONFIG_LOCALVERSION_AUTO=y
-CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
-# CONFIG_BSD_PROCESS_ACCT is not set
-# CONFIG_TASKSTATS is not set
-# CONFIG_AUDIT is not set
-
-#
-# RCU Subsystem
-#
-CONFIG_TREE_RCU=y
-# CONFIG_TREE_PREEMPT_RCU is not set
-# CONFIG_TINY_RCU is not set
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_FANOUT=32
-# CONFIG_RCU_FANOUT_EXACT is not set
-# CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_IKCONFIG is not set
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_CGROUPS is not set
-# CONFIG_SYSFS_DEPRECATED_V2 is not set
-# CONFIG_RELAY is not set
-# CONFIG_NAMESPACES is not set
-# CONFIG_BLK_DEV_INITRD is not set
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-CONFIG_SYSCTL=y
-CONFIG_ANON_INODES=y
 CONFIG_EXPERT=y
 CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
-# CONFIG_KALLSYMS_ALL is not set
-# CONFIG_KALLSYMS_EXTRA_PASS is not set
-# CONFIG_HOTPLUG is not set
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_TIMERFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_AIO=y
-
-#
-# Kernel Performance Events And Counters
-#
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_SLUB_DEBUG=y
-CONFIG_COMPAT_BRK=y
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
-# CONFIG_SLOB is not set
-# CONFIG_PROFILING is not set
-
-#
-# GCOV-based kernel profiling
-#
-# CONFIG_SLOW_WORK is not set
-# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
-CONFIG_SLABINFO=y
-CONFIG_RT_MUTEXES=y
-CONFIG_BASE_SMALL=0
-# CONFIG_MODULES is not set
-CONFIG_BLOCK=y
-CONFIG_LBDAF=y
-CONFIG_BLK_DEV_BSG=y
-# CONFIG_BLK_DEV_INTEGRITY is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-# CONFIG_DEFAULT_DEADLINE is not set
-# CONFIG_DEFAULT_CFQ is not set
-CONFIG_DEFAULT_NOOP=y
-CONFIG_DEFAULT_IOSCHED="noop"
-# CONFIG_INLINE_SPIN_TRYLOCK is not set
-# CONFIG_INLINE_SPIN_TRYLOCK_BH is not set
-# CONFIG_INLINE_SPIN_LOCK is not set
-# CONFIG_INLINE_SPIN_LOCK_BH is not set
-# CONFIG_INLINE_SPIN_LOCK_IRQ is not set
-# CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set
-# CONFIG_UNINLINE_SPIN_UNLOCK is not set
-# CONFIG_INLINE_SPIN_UNLOCK_BH is not set
-CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
-# CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set
-# CONFIG_INLINE_READ_TRYLOCK is not set
-# CONFIG_INLINE_READ_LOCK is not set
-# CONFIG_INLINE_READ_LOCK_BH is not set
-# CONFIG_INLINE_READ_LOCK_IRQ is not set
-# CONFIG_INLINE_READ_LOCK_IRQSAVE is not set
-CONFIG_INLINE_READ_UNLOCK=y
-# CONFIG_INLINE_READ_UNLOCK_BH is not set
-CONFIG_INLINE_READ_UNLOCK_IRQ=y
-# CONFIG_INLINE_READ_UNLOCK_IRQRESTORE is not set
-# CONFIG_INLINE_WRITE_TRYLOCK is not set
-# CONFIG_INLINE_WRITE_LOCK is not set
-# CONFIG_INLINE_WRITE_LOCK_BH is not set
-# CONFIG_INLINE_WRITE_LOCK_IRQ is not set
-# CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not set
-CONFIG_INLINE_WRITE_UNLOCK=y
-# CONFIG_INLINE_WRITE_UNLOCK_BH is not set
-CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
-# CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not set
-# CONFIG_MUTEX_SPIN_ON_OWNER is not set
-# CONFIG_FREEZER is not set
-CONFIG_MMU=y
-# CONFIG_VARIANT_IRQ_SWITCH is not set
-
-#
-# Processor type and features
-#
-CONFIG_XTENSA_VARIANT_FSF=y
-# CONFIG_XTENSA_VARIANT_DC232B is not set
-# CONFIG_XTENSA_UNALIGNED_USER is not set
-# CONFIG_PREEMPT is not set
-CONFIG_XTENSA_CALIBRATE_CCOUNT=y
-CONFIG_SERIAL_CONSOLE=y
-CONFIG_XTENSA_ISS_NETWORK=y
-
-#
-# Bus options
-#
 # CONFIG_PCI is not set
-# CONFIG_ARCH_SUPPORTS_MSI is not set
-
-#
-# Platform options
-#
-CONFIG_XTENSA_PLATFORM_ISS=y
-# CONFIG_XTENSA_PLATFORM_XT2000 is not set
-# CONFIG_GENERIC_CALIBRATE_DELAY is not set
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttyS0,38400 eth0=tuntap,,tap0 ip=192.168.168.5:192.168.168.1 root=nfs nfsroot=192.168.168.1:/opt/montavista/pro/devkit/xtensa/linux_be/target"
-CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
-# CONFIG_PHYS_ADDR_T_64BIT is not set
-CONFIG_ZONE_DMA_FLAG=1
-CONFIG_BOUNCE=y
-CONFIG_VIRT_TO_BUS=y
-# CONFIG_KSM is not set
-CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
-
-#
-# Executable file formats
-#
-CONFIG_KCORE_ELF=y
-CONFIG_BINFMT_ELF=y
+CONFIG_CMDLINE="console=ttyS0,38400 eth0=tuntap,,tap0 ip=192.168.168.5:192.168.168.1 root=nfs nfsroot=192.168.168.1:/opt/montavista/pro/devkit/xtensa/linux_be/target memmap=128M@0"
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_HAVE_AOUT is not set
-# CONFIG_BINFMT_MISC is not set
 CONFIG_NET=y
-
-#
-# Networking options
-#
 CONFIG_PACKET=y
 CONFIG_UNIX=y
-CONFIG_XFRM=y
-# CONFIG_XFRM_USER is not set
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_XFRM_STATISTICS is not set
-# CONFIG_NET_KEY is not set
 CONFIG_INET=y
-# CONFIG_IP_MULTICAST is not set
-# CONFIG_IP_ADVANCED_ROUTER is not set
-CONFIG_IP_FIB_HASH=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
-# CONFIG_ARPD is not set
-# CONFIG_SYN_COOKIES is not set
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-# CONFIG_INET_TUNNEL is not set
-CONFIG_INET_XFRM_MODE_TRANSPORT=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_INET_XFRM_MODE_BEET=y
-CONFIG_INET_LRO=y
-CONFIG_INET_DIAG=y
-CONFIG_INET_TCP_DIAG=y
-# CONFIG_TCP_CONG_ADVANCED is not set
-CONFIG_TCP_CONG_CUBIC=y
-CONFIG_DEFAULT_TCP_CONG="cubic"
-# CONFIG_TCP_MD5SIG is not set
 # CONFIG_IPV6 is not set
-# CONFIG_NETWORK_SECMARK is not set
-# CONFIG_NETFILTER is not set
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_RDS is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-# CONFIG_PHONET is not set
-# CONFIG_IEEE802154 is not set
-# CONFIG_NET_SCHED is not set
-# CONFIG_DCB is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_HAMRADIO is not set
-# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-CONFIG_WIRELESS=y
-# CONFIG_CFG80211 is not set
-# CONFIG_LIB80211 is not set
-
-#
-# CFG80211 needs to be enabled for MAC80211
-#
-# CONFIG_WIMAX is not set
-# CONFIG_RFKILL is not set
-# CONFIG_NET_9P is not set
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
 # CONFIG_STANDALONE is not set
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-# CONFIG_DEBUG_DRIVER is not set
-# CONFIG_DEBUG_DEVRES is not set
-# CONFIG_SYS_HYPERVISOR is not set
-# CONFIG_CONNECTOR is not set
-# CONFIG_MTD is not set
-# CONFIG_PARPORT is not set
-CONFIG_BLK_DEV=y
-# CONFIG_BLK_DEV_COW_COMMON is not set
-# CONFIG_BLK_DEV_LOOP is not set
-
-#
-# DRBD disabled because PROC_FS, INET or CONNECTOR not selected
-#
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_RAM is not set
-# CONFIG_CDROM_PKTCDVD is not set
-# CONFIG_ATA_OVER_ETH is not set
-# CONFIG_BLK_DEV_HD is not set
-CONFIG_MISC_DEVICES=y
-# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_C2PORT is not set
-
-#
-# EEPROM support
-#
-# CONFIG_EEPROM_93CX6 is not set
-# CONFIG_HAVE_IDE is not set
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-CONFIG_SCSI_MOD=y
-# CONFIG_RAID_ATTRS is not set
-# CONFIG_SCSI is not set
-# CONFIG_SCSI_DMA is not set
-# CONFIG_SCSI_NETLINK is not set
-# CONFIG_ATA is not set
-# CONFIG_MD is not set
-# CONFIG_NETDEVICES is not set
-# CONFIG_ISDN is not set
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-# CONFIG_INPUT_FF_MEMLESS is not set
-# CONFIG_INPUT_POLLDEV is not set
-# CONFIG_INPUT_SPARSEKMAP is not set
-
-#
-# Userland interfaces
-#
 # CONFIG_INPUT_MOUSEDEV is not set
-# CONFIG_INPUT_JOYDEV is not set
-# CONFIG_INPUT_EVDEV is not set
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
-# CONFIG_INPUT_JOYSTICK is not set
-# CONFIG_INPUT_TABLET is not set
-# CONFIG_INPUT_TOUCHSCREEN is not set
-# CONFIG_INPUT_MISC is not set
-
-#
-# Hardware I/O ports
-#
 # CONFIG_SERIO is not set
-# CONFIG_GAMEPORT is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_CONSOLE_TRANSLATIONS=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
-# CONFIG_VT_HW_CONSOLE_BINDING is not set
-CONFIG_DEVKMEM=y
-# CONFIG_SERIAL_NONSTANDARD is not set
-
-#
-# Serial drivers
-#
-# CONFIG_SERIAL_8250 is not set
-
-#
-# Non-8250 serial port support
-#
-# CONFIG_SERIAL_TIMBERDALE is not set
-CONFIG_UNIX98_PTYS=y
-# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
-CONFIG_LEGACY_PTYS=y
-CONFIG_LEGACY_PTY_COUNT=256
-# CONFIG_IPMI_HANDLER is not set
-CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_TIMERIOMEM is not set
-# CONFIG_RTC is not set
-# CONFIG_GEN_RTC is not set
-# CONFIG_R3964 is not set
-# CONFIG_RAW_DRIVER is not set
-# CONFIG_TCG_TPM is not set
-# CONFIG_I2C is not set
-# CONFIG_SPI is not set
-
-#
-# PPS support
-#
-# CONFIG_PPS is not set
-# CONFIG_W1 is not set
-# CONFIG_POWER_SUPPLY is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
-
-#
-# Native drivers
-#
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_F71882FG is not set
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_SHT15 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47B397 is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_THERMAL is not set
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
-
-#
-# Watchdog Device Drivers
-#
 CONFIG_SOFT_WATCHDOG=y
-CONFIG_SSB_POSSIBLE=y
-
-#
-# Sonics Silicon Backplane
-#
-# CONFIG_SSB is not set
-
-#
-# Multifunction device drivers
-#
-# CONFIG_MFD_CORE is not set
-# CONFIG_MFD_SM501 is not set
-# CONFIG_HTC_PASIC3 is not set
-# CONFIG_MFD_TMIO is not set
-# CONFIG_REGULATOR is not set
-# CONFIG_MEDIA_SUPPORT is not set
-
-#
-# Graphics support
-#
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-# CONFIG_FB is not set
-# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
-
-#
-# Display device support
-#
-# CONFIG_DISPLAY_SUPPORT is not set
-
-#
-# Console display driver support
-#
 # CONFIG_VGA_CONSOLE is not set
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_SOUND is not set
-CONFIG_HID_SUPPORT=y
-CONFIG_HID=y
-# CONFIG_HIDRAW is not set
-# CONFIG_HID_PID is not set
-
-#
-# Special HID drivers
-#
-CONFIG_USB_SUPPORT=y
-# CONFIG_USB_ARCH_HAS_HCD is not set
-# CONFIG_USB_ARCH_HAS_OHCI is not set
-# CONFIG_USB_ARCH_HAS_EHCI is not set
-# CONFIG_USB_OTG_WHITELIST is not set
-# CONFIG_USB_OTG_BLACKLIST_HUB is not set
-
-#
-# Enable Host or Gadget support to see Inventra options
-#
-
-#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
-#
-# CONFIG_USB_GADGET is not set
-
-#
-# OTG and related infrastructure
-#
-# CONFIG_MMC is not set
-# CONFIG_MEMSTICK is not set
-# CONFIG_NEW_LEDS is not set
-# CONFIG_ACCESSIBILITY is not set
-# CONFIG_RTC_CLASS is not set
-# CONFIG_DMADEVICES is not set
-# CONFIG_AUXDISPLAY is not set
-# CONFIG_UIO is not set
-
-#
-# TI VLYNQ
-#
-# CONFIG_STAGING is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
-# CONFIG_EXT3_FS is not set
-# CONFIG_EXT4_FS is not set
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-# CONFIG_FS_POSIX_ACL is not set
-# CONFIG_XFS_FS is not set
-# CONFIG_GFS2_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_BTRFS_FS is not set
-# CONFIG_NILFS2_FS is not set
-CONFIG_FILE_LOCKING=y
-CONFIG_FSNOTIFY=y
 # CONFIG_DNOTIFY is not set
-# CONFIG_INOTIFY is not set
-CONFIG_INOTIFY_USER=y
-# CONFIG_QUOTA is not set
-# CONFIG_AUTOFS_FS is not set
-# CONFIG_AUTOFS4_FS is not set
-# CONFIG_FUSE_FS is not set
-
-#
-# Caches
-#
-# CONFIG_FSCACHE is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
-# CONFIG_ISO9660_FS is not set
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-# CONFIG_MSDOS_FS is not set
-# CONFIG_VFAT_FS is not set
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_PROC_PAGE_MONITOR=y
-CONFIG_SYSFS=y
 CONFIG_TMPFS=y
-# CONFIG_TMPFS_POSIX_ACL is not set
-# CONFIG_HUGETLB_PAGE is not set
-# CONFIG_CONFIGFS_FS is not set
-CONFIG_MISC_FILESYSTEMS=y
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_LOGFS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_SQUASHFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_OMFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-CONFIG_NETWORK_FILESYSTEMS=y
-# CONFIG_NFS_FS is not set
-# CONFIG_NFSD is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CEPH_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
-# CONFIG_PARTITION_ADVANCED is not set
-CONFIG_MSDOS_PARTITION=y
-# CONFIG_NLS is not set
-# CONFIG_DLM is not set
-
-#
-# Kernel hacking
-#
-# CONFIG_PRINTK_TIME is not set
-CONFIG_ENABLE_WARN_DEPRECATED=y
-CONFIG_ENABLE_MUST_CHECK=y
-CONFIG_FRAME_WARN=1024
-# CONFIG_MAGIC_SYSRQ is not set
-# CONFIG_STRIP_ASM_SYMS is not set
-# CONFIG_UNUSED_SYMBOLS is not set
-# CONFIG_DEBUG_FS is not set
-# CONFIG_HEADERS_CHECK is not set
-CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-CONFIG_DETECT_SOFTLOCKUP=y
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+# CONFIG_FRAME_POINTER is not set
 CONFIG_DETECT_HUNG_TASK=y
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
-CONFIG_SCHED_DEBUG=y
-# CONFIG_SCHEDSTATS is not set
-# CONFIG_TIMER_STATS is not set
-# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-# CONFIG_DEBUG_KOBJECT is not set
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_MEMORY_INIT is not set
-# CONFIG_DEBUG_LIST is not set
-# CONFIG_DEBUG_SG is not set
-# CONFIG_DEBUG_NOTIFIERS is not set
-# CONFIG_DEBUG_CREDENTIALS is not set
-# CONFIG_RCU_TORTURE_TEST is not set
-CONFIG_RCU_CPU_STALL_DETECTOR=y
-# CONFIG_BACKTRACE_SELF_TEST is not set
-# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
-# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_SYSCTL_SYSCALL_CHECK is not set
-# CONFIG_PAGE_POISONING is not set
-# CONFIG_SAMPLES is not set
-
-#
-# Security options
-#
-# CONFIG_KEYS is not set
-# CONFIG_SECURITY is not set
-# CONFIG_SECURITYFS is not set
-# CONFIG_DEFAULT_SECURITY_SELINUX is not set
-# CONFIG_DEFAULT_SECURITY_SMACK is not set
-# CONFIG_DEFAULT_SECURITY_TOMOYO is not set
-CONFIG_DEFAULT_SECURITY_DAC=y
-CONFIG_DEFAULT_SECURITY=""
-CONFIG_CRYPTO=y
-
-#
-# Crypto core or helper
-#
-# CONFIG_CRYPTO_FIPS is not set
-CONFIG_CRYPTO_ALGAPI=y
-CONFIG_CRYPTO_ALGAPI2=y
-CONFIG_CRYPTO_RNG=y
-CONFIG_CRYPTO_RNG2=y
-# CONFIG_CRYPTO_MANAGER is not set
-# CONFIG_CRYPTO_MANAGER2 is not set
-# CONFIG_CRYPTO_GF128MUL is not set
-# CONFIG_CRYPTO_NULL is not set
-# CONFIG_CRYPTO_CRYPTD is not set
-# CONFIG_CRYPTO_AUTHENC is not set
-
-#
-# Authenticated Encryption with Associated Data
-#
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_SEQIV is not set
-
-#
-# Block modes
-#
-# CONFIG_CRYPTO_CBC is not set
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_CTS is not set
-# CONFIG_CRYPTO_ECB is not set
-# CONFIG_CRYPTO_LRW is not set
-# CONFIG_CRYPTO_PCBC is not set
-# CONFIG_CRYPTO_XTS is not set
-
-#
-# Hash modes
-#
-# CONFIG_CRYPTO_HMAC is not set
-# CONFIG_CRYPTO_XCBC is not set
-# CONFIG_CRYPTO_VMAC is not set
-
-#
-# Digest
-#
-# CONFIG_CRYPTO_CRC32C is not set
-# CONFIG_CRYPTO_GHASH is not set
-# CONFIG_CRYPTO_MD4 is not set
-# CONFIG_CRYPTO_MD5 is not set
-# CONFIG_CRYPTO_MICHAEL_MIC is not set
-# CONFIG_CRYPTO_RMD128 is not set
-# CONFIG_CRYPTO_RMD160 is not set
-# CONFIG_CRYPTO_RMD256 is not set
-# CONFIG_CRYPTO_RMD320 is not set
-# CONFIG_CRYPTO_SHA1 is not set
-# CONFIG_CRYPTO_SHA256 is not set
-# CONFIG_CRYPTO_SHA512 is not set
-# CONFIG_CRYPTO_TGR192 is not set
-# CONFIG_CRYPTO_WP512 is not set
-
-#
-# Ciphers
-#
-CONFIG_CRYPTO_AES=y
-# CONFIG_CRYPTO_ANUBIS is not set
-# CONFIG_CRYPTO_ARC4 is not set
-# CONFIG_CRYPTO_BLOWFISH is not set
-# CONFIG_CRYPTO_CAMELLIA is not set
-# CONFIG_CRYPTO_CAST5 is not set
-# CONFIG_CRYPTO_CAST6 is not set
-# CONFIG_CRYPTO_DES is not set
-# CONFIG_CRYPTO_FCRYPT is not set
-# CONFIG_CRYPTO_KHAZAD is not set
-# CONFIG_CRYPTO_SALSA20 is not set
-# CONFIG_CRYPTO_SEED is not set
-# CONFIG_CRYPTO_SERPENT is not set
-# CONFIG_CRYPTO_TEA is not set
-# CONFIG_CRYPTO_TWOFISH is not set
-
-#
-# Compression
-#
-# CONFIG_CRYPTO_DEFLATE is not set
-# CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
-
-#
-# Random Number Generation
-#
 CONFIG_CRYPTO_ANSI_CPRNG=y
-CONFIG_CRYPTO_HW=y
-# CONFIG_BINARY_PRINTF is not set
-
-#
-# Library routines
-#
-CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
-# CONFIG_CRC_T10DIF is not set
-# CONFIG_CRC_ITU_T is not set
-# CONFIG_CRC32 is not set
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_HAS_IOMEM=y
-CONFIG_HAS_DMA=y
-CONFIG_NLATTR=y
-CONFIG_LD_NO_RELAX=y
index 337d5ba..78c2529 100644 (file)
@@ -37,7 +37,7 @@ CONFIG_PREEMPT=y
 # CONFIG_PCI is not set
 CONFIG_XTENSA_PLATFORM_XTFPGA=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="earlycon=uart8250,mmio32,0x9d050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug"
+CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0x9d050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=256M@0x60000000"
 CONFIG_USE_OF=y
 CONFIG_BUILTIN_DTB="kc705_nommu"
 CONFIG_DEFAULT_MEM_SIZE=0x10000000
index 61f943c..14e3ca3 100644 (file)
@@ -36,7 +36,7 @@ CONFIG_HOTPLUG_CPU=y
 # CONFIG_PCI is not set
 CONFIG_XTENSA_PLATFORM_XTFPGA=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="earlycon=uart8250,mmio32,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug"
+CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=96M@0"
 CONFIG_USE_OF=y
 CONFIG_BUILTIN_DTB="lx200mx"
 # CONFIG_COMPACTION is not set
index 3f44fa2..d349018 100644 (file)
@@ -48,7 +48,7 @@ static inline int ffz(unsigned long x)
  * __ffs: Find first bit set in word. Return 0 for bit 0
  */
 
-static inline int __ffs(unsigned long x)
+static inline unsigned long __ffs(unsigned long x)
 {
        return 31 - __cntlz(x & -x);
 }
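
The patched __ffs() now returns unsigned long, bringing it in line with the generic prototype, and still relies on the classic lowest-set-bit trick: x & -x clears every bit except the least significant one that is set, so 31 minus its leading-zero count is the bit index. A minimal user-space sketch of the same identity, with a compiler builtin standing in for __cntlz() (illustrative only, not part of the patch):

    #include <assert.h>

    /* Stand-in for the Xtensa __cntlz() helper: leading zeros of a 32-bit word. */
    static inline int cntlz32(unsigned long x)
    {
            return x ? __builtin_clz((unsigned int)x) : 32;
    }

    /* Same formula as the patched __ffs(): index of the lowest set bit (x != 0). */
    static inline unsigned long lowest_bit_index(unsigned long x)
    {
            return 31 - cntlz32(x & -x);    /* x & -x isolates the lowest set bit */
    }

    int main(void)
    {
            assert(lowest_bit_index(0x01) == 0);
            assert(lowest_bit_index(0x08) == 3);
            assert(lowest_bit_index(0xC0) == 6);
            return 0;
    }
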
index e0f9e11..2041abb 100644 (file)
        .endm
 
 
-#if XCHAL_DCACHE_LINE_LOCKABLE
-
        .macro  ___unlock_dcache_all ar at
 
-#if XCHAL_DCACHE_SIZE
+#if XCHAL_DCACHE_LINE_LOCKABLE && XCHAL_DCACHE_SIZE
        __loop_cache_all \ar \at diu XCHAL_DCACHE_SIZE XCHAL_DCACHE_LINEWIDTH
 #endif
 
        .endm
 
-#endif
-
-#if XCHAL_ICACHE_LINE_LOCKABLE
 
        .macro  ___unlock_icache_all ar at
 
+#if XCHAL_ICACHE_LINE_LOCKABLE && XCHAL_ICACHE_SIZE
        __loop_cache_all \ar \at iiu XCHAL_ICACHE_SIZE XCHAL_ICACHE_LINEWIDTH
+#endif
 
        .endm
-#endif
+
 
        .macro  ___flush_invalidate_dcache_all ar at
 
index 62b507d..0d30403 100644 (file)
@@ -59,6 +59,11 @@ enum fixed_addresses {
  */
 static __always_inline unsigned long fix_to_virt(const unsigned int idx)
 {
+       /* Check if this memory layout is broken because fixmap overlaps page
+        * table.
+        */
+       BUILD_BUG_ON(FIXADDR_START <
+                    XCHAL_PAGE_TABLE_VADDR + XCHAL_PAGE_TABLE_SIZE);
        BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
        return __fix_to_virt(idx);
 }
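
BUILD_BUG_ON() turns the layout constraint into a compile-time failure: if the fixmap region were ever configured to overlap the statically mapped page table, the kernel would fail to build instead of corrupting memory at run time. A stripped-down sketch of the same idea using C11 _Static_assert and made-up example addresses (the real constants come from kmem_layout.h and fixmap.h):

    /* Example values only -- not the real Xtensa layout constants. */
    #define PAGE_TABLE_VADDR 0x80000000UL
    #define PAGE_TABLE_SIZE  0x00400000UL
    #define FIXADDR_START    0x9ffff000UL   /* assumed example */

    /* Build fails if the fixmap window starts inside the page-table window. */
    _Static_assert(FIXADDR_START >= PAGE_TABLE_VADDR + PAGE_TABLE_SIZE,
                   "fixmap overlaps the page table");

    int main(void) { return 0; }
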
index 01cef6b..6e070db 100644 (file)
@@ -68,6 +68,11 @@ void kunmap_high(struct page *page);
 
 static inline void *kmap(struct page *page)
 {
+       /* Check if this memory layout is broken because PKMAP overlaps
+        * page table.
+        */
+       BUILD_BUG_ON(PKMAP_BASE <
+                    XCHAL_PAGE_TABLE_VADDR + XCHAL_PAGE_TABLE_SIZE);
        BUG_ON(in_interrupt());
        if (!PageHighMem(page))
                return page_address(page);
index 7a1e075..42410f2 100644 (file)
 
        .align  4
 1:     movi    a2, 0x10000000
-       movi    a3, 0x18000000
-       add     a2, a2, a0
-9:     bgeu    a2, a3, 9b      /* PC is out of the expected range */
+
+#if CONFIG_KERNEL_LOAD_ADDRESS < 0x40000000ul
+#define TEMP_MAPPING_VADDR 0x40000000
+#else
+#define TEMP_MAPPING_VADDR 0x00000000
+#endif
 
        /* Step 1: invalidate mapping at 0x40000000..0x5FFFFFFF. */
 
-       movi    a2, 0x40000000 | XCHAL_SPANNING_WAY
+       movi    a2, TEMP_MAPPING_VADDR | XCHAL_SPANNING_WAY
        idtlb   a2
        iitlb   a2
        isync
        srli    a3, a0, 27
        slli    a3, a3, 27
        addi    a3, a3, CA_BYPASS
-       addi    a7, a2, -1
+       addi    a7, a2, 5 - XCHAL_SPANNING_WAY
        wdtlb   a3, a7
        witlb   a3, a7
        isync
 
        slli    a4, a0, 5
        srli    a4, a4, 5
-       addi    a5, a2, -6
+       addi    a5, a2, -XCHAL_SPANNING_WAY
        add     a4, a4, a5
        jx      a4
 
        add     a5, a5, a4
        bne     a5, a2, 3b
 
-       /* Step 4: Setup MMU with the old V2 mappings. */
+       /* Step 4: Setup MMU with the requested static mappings. */
+
        movi    a6, 0x01000000
        wsr     a6, ITLBCFG
        wsr     a6, DTLBCFG
        isync
 
-       movi    a5, 0xd0000005
-       movi    a4, CA_WRITEBACK
+       movi    a5, XCHAL_KSEG_CACHED_VADDR + XCHAL_KSEG_TLB_WAY
+       movi    a4, XCHAL_KSEG_PADDR + CA_WRITEBACK
        wdtlb   a4, a5
        witlb   a4, a5
 
-       movi    a5, 0xd8000005
-       movi    a4, CA_BYPASS
+       movi    a5, XCHAL_KSEG_BYPASS_VADDR + XCHAL_KSEG_TLB_WAY
+       movi    a4, XCHAL_KSEG_PADDR + CA_BYPASS
        wdtlb   a4, a5
        witlb   a4, a5
 
-       movi    a5, XCHAL_KIO_CACHED_VADDR + 6
+#ifdef CONFIG_XTENSA_KSEG_512M
+       movi    a5, XCHAL_KSEG_CACHED_VADDR + 0x10000000 + XCHAL_KSEG_TLB_WAY
+       movi    a4, XCHAL_KSEG_PADDR + 0x10000000 + CA_WRITEBACK
+       wdtlb   a4, a5
+       witlb   a4, a5
+
+       movi    a5, XCHAL_KSEG_BYPASS_VADDR + 0x10000000 + XCHAL_KSEG_TLB_WAY
+       movi    a4, XCHAL_KSEG_PADDR + 0x10000000 + CA_BYPASS
+       wdtlb   a4, a5
+       witlb   a4, a5
+#endif
+
+       movi    a5, XCHAL_KIO_CACHED_VADDR + XCHAL_KIO_TLB_WAY
        movi    a4, XCHAL_KIO_DEFAULT_PADDR + CA_WRITEBACK
        wdtlb   a4, a5
        witlb   a4, a5
 
-       movi    a5, XCHAL_KIO_BYPASS_VADDR + 6
+       movi    a5, XCHAL_KIO_BYPASS_VADDR + XCHAL_KIO_TLB_WAY
        movi    a4, XCHAL_KIO_DEFAULT_PADDR + CA_BYPASS
        wdtlb   a4, a5
        witlb   a4, a5
 
        isync
 
-       /* Jump to self, using MMU v2 mappings. */
+       /* Jump to self, using final mappings. */
        movi    a4, 1f
        jx      a4
 
diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h
new file mode 100644 (file)
index 0000000..561f872
--- /dev/null
+++ b/arch/xtensa/include/asm/kmem_layout.h
@@ -0,0 +1,74 @@
+/*
+ * Kernel virtual memory layout definitions.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * Copyright (C) 2016 Cadence Design Systems Inc.
+ */
+
+#ifndef _XTENSA_KMEM_LAYOUT_H
+#define _XTENSA_KMEM_LAYOUT_H
+
+#include <asm/types.h>
+
+#ifdef CONFIG_MMU
+
+/*
+ * Fixed TLB translations in the processor.
+ */
+
+#define XCHAL_PAGE_TABLE_VADDR __XTENSA_UL_CONST(0x80000000)
+#define XCHAL_PAGE_TABLE_SIZE  __XTENSA_UL_CONST(0x00400000)
+
+#if defined(CONFIG_XTENSA_KSEG_MMU_V2)
+
+#define XCHAL_KSEG_CACHED_VADDR        __XTENSA_UL_CONST(0xd0000000)
+#define XCHAL_KSEG_BYPASS_VADDR        __XTENSA_UL_CONST(0xd8000000)
+#define XCHAL_KSEG_SIZE                __XTENSA_UL_CONST(0x08000000)
+#define XCHAL_KSEG_ALIGNMENT   __XTENSA_UL_CONST(0x08000000)
+#define XCHAL_KSEG_TLB_WAY     5
+#define XCHAL_KIO_TLB_WAY      6
+
+#elif defined(CONFIG_XTENSA_KSEG_256M)
+
+#define XCHAL_KSEG_CACHED_VADDR        __XTENSA_UL_CONST(0xb0000000)
+#define XCHAL_KSEG_BYPASS_VADDR        __XTENSA_UL_CONST(0xc0000000)
+#define XCHAL_KSEG_SIZE                __XTENSA_UL_CONST(0x10000000)
+#define XCHAL_KSEG_ALIGNMENT   __XTENSA_UL_CONST(0x10000000)
+#define XCHAL_KSEG_TLB_WAY     6
+#define XCHAL_KIO_TLB_WAY      6
+
+#elif defined(CONFIG_XTENSA_KSEG_512M)
+
+#define XCHAL_KSEG_CACHED_VADDR        __XTENSA_UL_CONST(0xa0000000)
+#define XCHAL_KSEG_BYPASS_VADDR        __XTENSA_UL_CONST(0xc0000000)
+#define XCHAL_KSEG_SIZE                __XTENSA_UL_CONST(0x20000000)
+#define XCHAL_KSEG_ALIGNMENT   __XTENSA_UL_CONST(0x10000000)
+#define XCHAL_KSEG_TLB_WAY     6
+#define XCHAL_KIO_TLB_WAY      6
+
+#else
+#error Unsupported KSEG configuration
+#endif
+
+#ifdef CONFIG_KSEG_PADDR
+#define XCHAL_KSEG_PADDR        __XTENSA_UL_CONST(CONFIG_KSEG_PADDR)
+#else
+#define XCHAL_KSEG_PADDR       __XTENSA_UL_CONST(0x00000000)
+#endif
+
+#if XCHAL_KSEG_PADDR & (XCHAL_KSEG_ALIGNMENT - 1)
+#error XCHAL_KSEG_PADDR is not properly aligned to XCHAL_KSEG_ALIGNMENT
+#endif
+
+#else
+
+#define XCHAL_KSEG_CACHED_VADDR        __XTENSA_UL_CONST(0xd0000000)
+#define XCHAL_KSEG_BYPASS_VADDR        __XTENSA_UL_CONST(0xd8000000)
+#define XCHAL_KSEG_SIZE                __XTENSA_UL_CONST(0x08000000)
+
+#endif
+
+#endif
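
The trailing preprocessor check rejects a KSEG physical base that is not aligned to XCHAL_KSEG_ALIGNMENT. For a power-of-two alignment A, addr & (A - 1) is non-zero exactly when addr is misaligned, which is the same test the #if performs at build time. A small run-time illustration with example values (not taken from any particular core):

    #include <stdio.h>

    /* Power-of-two alignment test, same expression as the #if check above. */
    static int is_aligned(unsigned long addr, unsigned long align)
    {
            return (addr & (align - 1)) == 0;
    }

    int main(void)
    {
            printf("%d\n", is_aligned(0x10000000UL, 0x10000000UL)); /* 1: aligned to 256MB */
            printf("%d\n", is_aligned(0x18000000UL, 0x10000000UL)); /* 0: misaligned       */
            return 0;
    }
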
index ad38500..976b1d7 100644 (file)
 #include <asm/types.h>
 #include <asm/cache.h>
 #include <platform/hardware.h>
-
-/*
- * Fixed TLB translations in the processor.
- */
-
-#define XCHAL_KSEG_CACHED_VADDR __XTENSA_UL_CONST(0xd0000000)
-#define XCHAL_KSEG_BYPASS_VADDR __XTENSA_UL_CONST(0xd8000000)
-#define XCHAL_KSEG_PADDR        __XTENSA_UL_CONST(0x00000000)
-#define XCHAL_KSEG_SIZE         __XTENSA_UL_CONST(0x08000000)
+#include <asm/kmem_layout.h>
 
 /*
  * PAGE_SHIFT determines the page size
 
 #ifdef CONFIG_MMU
 #define PAGE_OFFSET    XCHAL_KSEG_CACHED_VADDR
-#define MAX_MEM_PFN    XCHAL_KSEG_SIZE
+#define PHYS_OFFSET    XCHAL_KSEG_PADDR
+#define MAX_LOW_PFN    (PHYS_PFN(XCHAL_KSEG_PADDR) + \
+                        PHYS_PFN(XCHAL_KSEG_SIZE))
 #else
-#define PAGE_OFFSET    __XTENSA_UL_CONST(0)
-#define MAX_MEM_PFN    (PLATFORM_DEFAULT_MEM_START + PLATFORM_DEFAULT_MEM_SIZE)
+#define PAGE_OFFSET    PLATFORM_DEFAULT_MEM_START
+#define PHYS_OFFSET    PLATFORM_DEFAULT_MEM_START
+#define MAX_LOW_PFN    PHYS_PFN(0xfffffffful)
 #endif
 
 #define PGTABLE_START  0x80000000
@@ -167,10 +162,12 @@ void copy_user_highpage(struct page *to, struct page *from,
  * addresses.
  */
 
-#define ARCH_PFN_OFFSET                (PLATFORM_DEFAULT_MEM_START >> PAGE_SHIFT)
+#define ARCH_PFN_OFFSET                (PHYS_OFFSET >> PAGE_SHIFT)
 
-#define __pa(x)                        ((unsigned long) (x) - PAGE_OFFSET)
-#define __va(x)                        ((void *)((unsigned long) (x) + PAGE_OFFSET))
+#define __pa(x)        \
+       ((unsigned long) (x) - PAGE_OFFSET + PHYS_OFFSET)
+#define __va(x)        \
+       ((void *)((unsigned long) (x) - PHYS_OFFSET + PAGE_OFFSET))
 #define pfn_valid(pfn) \
        ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr)
 
index fb02fdc..8aa0e0d 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <asm-generic/pgtable-nopmd.h>
 #include <asm/page.h>
+#include <asm/kmem_layout.h>
 
 /*
  * We only use two ring levels, user and kernel space.
@@ -68,9 +69,9 @@
  * Virtual memory area. We keep a distance to other memory regions to be
  * on the safe side. We also use this area for cache aliasing.
  */
-#define VMALLOC_START          0xC0000000
-#define VMALLOC_END            0xC7FEFFFF
-#define TLBTEMP_BASE_1         0xC7FF0000
+#define VMALLOC_START          (XCHAL_KSEG_CACHED_VADDR - 0x10000000)
+#define VMALLOC_END            (VMALLOC_START + 0x07FEFFFF)
+#define TLBTEMP_BASE_1         (VMALLOC_END + 1)
 #define TLBTEMP_BASE_2         (TLBTEMP_BASE_1 + DCACHE_WAY_SIZE)
 #if 2 * DCACHE_WAY_SIZE > ICACHE_WAY_SIZE
 #define TLBTEMP_SIZE           (2 * DCACHE_WAY_SIZE)
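
Deriving VMALLOC_START from XCHAL_KSEG_CACHED_VADDR keeps the vmalloc window immediately below KSEG for whichever layout is selected; with the traditional MMUv2 base of 0xd0000000 the new expressions reproduce the previously hard-coded constants. A quick self-check, treating 0xd0000000 as an assumed example base:

    #include <assert.h>

    #define KSEG_CACHED_VADDR 0xd0000000UL           /* MMUv2 example value */

    #define VMALLOC_START  (KSEG_CACHED_VADDR - 0x10000000UL)
    #define VMALLOC_END    (VMALLOC_START + 0x07FEFFFFUL)
    #define TLBTEMP_BASE_1 (VMALLOC_END + 1)

    int main(void)
    {
            assert(VMALLOC_START  == 0xC0000000UL);  /* old hard-coded value */
            assert(VMALLOC_END    == 0xC7FEFFFFUL);  /* old hard-coded value */
            assert(TLBTEMP_BASE_1 == 0xC7FF0000UL);  /* old hard-coded value */
            return 0;
    }
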
index 32e98f2..f8fbef6 100644 (file)
@@ -69,4 +69,10 @@ extern int platform_pcibios_fixup (void);
  */
 extern void platform_calibrate_ccount (void);
 
+/*
+ * Flush and reset the mmu, simulate a processor reset, and
+ * jump to the reset vector.
+ */
+void cpu_reset(void) __attribute__((noreturn));
+
 #endif /* _XTENSA_PLATFORM_H */
index d2e40d3..b42d68b 100644 (file)
@@ -37,7 +37,7 @@
 #ifdef CONFIG_MMU
 #define TASK_SIZE      __XTENSA_UL_CONST(0x40000000)
 #else
-#define TASK_SIZE      (PLATFORM_DEFAULT_MEM_START + PLATFORM_DEFAULT_MEM_SIZE)
+#define TASK_SIZE      __XTENSA_UL_CONST(0xffffffff)
 #endif
 
 #define STACK_TOP      TASK_SIZE
index c015c5c..552cdfd 100644 (file)
 #ifndef _XTENSA_SYSMEM_H
 #define _XTENSA_SYSMEM_H
 
-#define SYSMEM_BANKS_MAX 31
+#include <linux/memblock.h>
 
-struct meminfo {
-       unsigned long start;
-       unsigned long end;
-};
-
-/*
- * Bank array is sorted by .start.
- * Banks don't overlap and there's at least one page gap
- * between adjacent bank entries.
- */
-struct sysmem_info {
-       int nr_banks;
-       struct meminfo bank[SYSMEM_BANKS_MAX];
-};
-
-extern struct sysmem_info sysmem;
-
-int add_sysmem_bank(unsigned long start, unsigned long end);
-int mem_reserve(unsigned long, unsigned long, int);
 void bootmem_init(void);
 void zones_init(void);
 
index 288c776..77d41cc 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <variant/core.h>
 #include <platform/hardware.h>
+#include <asm/kmem_layout.h>
 
 #if XCHAL_HAVE_PTP_MMU
 #define XCHAL_KIO_CACHED_VADDR         0xe0000000
@@ -47,61 +48,42 @@ static inline unsigned long xtensa_get_kio_paddr(void)
 
 #if defined(CONFIG_MMU)
 
-/* Will Become VECBASE */
-#define VIRTUAL_MEMORY_ADDRESS         0xD0000000
-
+#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY
 /* Image Virtual Start Address */
-#define KERNELOFFSET                   0xD0003000
-
-#if defined(XCHAL_HAVE_PTP_MMU) && XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY
-  /* MMU v3  - XCHAL_HAVE_PTP_MMU  == 1 */
-  #define LOAD_MEMORY_ADDRESS          0x00003000
+#define KERNELOFFSET                   (XCHAL_KSEG_CACHED_VADDR + \
+                                        CONFIG_KERNEL_LOAD_ADDRESS - \
+                                        XCHAL_KSEG_PADDR)
 #else
-  /* MMU V2 -  XCHAL_HAVE_PTP_MMU  == 0 */
-  #define LOAD_MEMORY_ADDRESS          0xD0003000
+#define KERNELOFFSET                   CONFIG_KERNEL_LOAD_ADDRESS
 #endif
 
-#define RESET_VECTOR1_VADDR            (VIRTUAL_MEMORY_ADDRESS + \
-                                        XCHAL_RESET_VECTOR1_PADDR)
-
 #else /* !defined(CONFIG_MMU) */
   /* MMU Not being used - Virtual == Physical */
 
-  /* VECBASE */
-  #define VIRTUAL_MEMORY_ADDRESS       (PLATFORM_DEFAULT_MEM_START + 0x2000)
+/* Location of the start of the kernel text, _start */
+#define KERNELOFFSET                   CONFIG_KERNEL_LOAD_ADDRESS
 
-  /* Location of the start of the kernel text, _start */
-  #define KERNELOFFSET                 (PLATFORM_DEFAULT_MEM_START + 0x3000)
-
-  /* Loaded just above possibly live vectors */
-  #define LOAD_MEMORY_ADDRESS          (PLATFORM_DEFAULT_MEM_START + 0x3000)
-
-#define RESET_VECTOR1_VADDR            (XCHAL_RESET_VECTOR1_VADDR)
 
 #endif /* CONFIG_MMU */
 
-#define XC_VADDR(offset)               (VIRTUAL_MEMORY_ADDRESS  + offset)
-
-/* Used to set VECBASE register */
-#define VECBASE_RESET_VADDR            VIRTUAL_MEMORY_ADDRESS
+#define RESET_VECTOR1_VADDR            (XCHAL_RESET_VECTOR1_VADDR)
+#define VECBASE_VADDR                  (KERNELOFFSET - CONFIG_VECTORS_OFFSET)
 
 #if defined(XCHAL_HAVE_VECBASE) && XCHAL_HAVE_VECBASE
 
-#define USER_VECTOR_VADDR              XC_VADDR(XCHAL_USER_VECOFS)
-#define KERNEL_VECTOR_VADDR            XC_VADDR(XCHAL_KERNEL_VECOFS)
-#define DOUBLEEXC_VECTOR_VADDR         XC_VADDR(XCHAL_DOUBLEEXC_VECOFS)
-#define WINDOW_VECTORS_VADDR           XC_VADDR(XCHAL_WINDOW_OF4_VECOFS)
-#define INTLEVEL2_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL2_VECOFS)
-#define INTLEVEL3_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL3_VECOFS)
-#define INTLEVEL4_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL4_VECOFS)
-#define INTLEVEL5_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL5_VECOFS)
-#define INTLEVEL6_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL6_VECOFS)
-
-#define DEBUG_VECTOR_VADDR             XC_VADDR(XCHAL_DEBUG_VECOFS)
+#define VECTOR_VADDR(offset)           (VECBASE_VADDR + offset)
 
-#define NMI_VECTOR_VADDR               XC_VADDR(XCHAL_NMI_VECOFS)
-
-#define INTLEVEL7_VECTOR_VADDR         XC_VADDR(XCHAL_INTLEVEL7_VECOFS)
+#define USER_VECTOR_VADDR              VECTOR_VADDR(XCHAL_USER_VECOFS)
+#define KERNEL_VECTOR_VADDR            VECTOR_VADDR(XCHAL_KERNEL_VECOFS)
+#define DOUBLEEXC_VECTOR_VADDR         VECTOR_VADDR(XCHAL_DOUBLEEXC_VECOFS)
+#define WINDOW_VECTORS_VADDR           VECTOR_VADDR(XCHAL_WINDOW_OF4_VECOFS)
+#define INTLEVEL2_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL2_VECOFS)
+#define INTLEVEL3_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL3_VECOFS)
+#define INTLEVEL4_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL4_VECOFS)
+#define INTLEVEL5_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL5_VECOFS)
+#define INTLEVEL6_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL6_VECOFS)
+#define INTLEVEL7_VECTOR_VADDR         VECTOR_VADDR(XCHAL_INTLEVEL7_VECOFS)
+#define DEBUG_VECTOR_VADDR             VECTOR_VADDR(XCHAL_DEBUG_VECOFS)
 
 /*
  * These XCHAL_* #defines from variant/core.h
@@ -109,7 +91,6 @@ static inline unsigned long xtensa_get_kio_paddr(void)
  * constants are defined above and should be used.
  */
 #undef  XCHAL_VECBASE_RESET_VADDR
-#undef  XCHAL_RESET_VECTOR0_VADDR
 #undef  XCHAL_USER_VECTOR_VADDR
 #undef  XCHAL_KERNEL_VECTOR_VADDR
 #undef  XCHAL_DOUBLEEXC_VECTOR_VADDR
@@ -119,9 +100,8 @@ static inline unsigned long xtensa_get_kio_paddr(void)
 #undef  XCHAL_INTLEVEL4_VECTOR_VADDR
 #undef  XCHAL_INTLEVEL5_VECTOR_VADDR
 #undef  XCHAL_INTLEVEL6_VECTOR_VADDR
-#undef  XCHAL_DEBUG_VECTOR_VADDR
-#undef  XCHAL_NMI_VECTOR_VADDR
 #undef  XCHAL_INTLEVEL7_VECTOR_VADDR
+#undef  XCHAL_DEBUG_VECTOR_VADDR
 
 #else
 
@@ -134,6 +114,7 @@ static inline unsigned long xtensa_get_kio_paddr(void)
 #define INTLEVEL4_VECTOR_VADDR         XCHAL_INTLEVEL4_VECTOR_VADDR
 #define INTLEVEL5_VECTOR_VADDR         XCHAL_INTLEVEL5_VECTOR_VADDR
 #define INTLEVEL6_VECTOR_VADDR         XCHAL_INTLEVEL6_VECTOR_VADDR
+#define INTLEVEL7_VECTOR_VADDR         XCHAL_INTLEVEL6_VECTOR_VADDR
 #define DEBUG_VECTOR_VADDR             XCHAL_DEBUG_VECTOR_VADDR
 
 #endif
index 87ec7ae..2efc921 100644 (file)
@@ -18,7 +18,8 @@
 # define __XTENSA_UL_CONST(x)  x
 #else
 # define __XTENSA_UL(x)                ((unsigned long)(x))
-# define __XTENSA_UL_CONST(x)  x##UL
+# define ___XTENSA_UL_CONST(x) x##UL
+# define __XTENSA_UL_CONST(x)  ___XTENSA_UL_CONST(x)
 #endif
 
 #ifndef __ASSEMBLY__
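
The extra ___XTENSA_UL_CONST level is the standard double-expansion idiom: it lets a macro argument such as CONFIG_KERNEL_LOAD_ADDRESS be expanded to its numeric value before the UL suffix is pasted on, whereas pasting directly onto an unexpanded macro name would produce a bogus identifier instead of an integer constant. A minimal demonstration with generic names (not the kernel's):

    #include <assert.h>

    #define BASE 0x1000                     /* stands in for a CONFIG_* value */

    /* Single-level paste: UL_BAD(BASE) would expand to BASEUL -- not a number. */
    #define UL_BAD(x)   x##UL

    /* Two-level paste: x is macro-expanded first, so UL(BASE) becomes 0x1000UL. */
    #define UL_PASTE(x) x##UL
    #define UL(x)       UL_PASTE(x)

    int main(void)
    {
            assert(UL(BASE) == 0x1000UL);
            /* UL_BAD(BASE) would fail to compile: 'BASEUL' undeclared. */
            return 0;
    }
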
index b95c305..de9b14b 100644 (file)
@@ -754,7 +754,20 @@ __SYSCALL(340, sys_bpf, 3)
 #define __NR_execveat                          341
 __SYSCALL(341, sys_execveat, 5)
 
-#define __NR_syscall_count                     342
+#define __NR_userfaultfd                       342
+__SYSCALL(342, sys_userfaultfd, 1)
+#define __NR_membarrier                                343
+__SYSCALL(343, sys_membarrier, 2)
+#define __NR_mlock2                            344
+__SYSCALL(344, sys_mlock2, 3)
+#define __NR_copy_file_range                   345
+__SYSCALL(345, sys_copy_file_range, 6)
+#define __NR_preadv2                           346
+__SYSCALL(346, sys_preadv2, 6)
+#define __NR_pwritev2                          347
+__SYSCALL(347, sys_pwritev2, 6)
+
+#define __NR_syscall_count                     348
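
The block above wires up six newer system calls for xtensa (userfaultfd, membarrier, mlock2, copy_file_range, preadv2, pwritev2) and grows the table to 348 entries. A hedged user-space sketch of exercising one of them through the generic syscall(2) wrapper; SYS_membarrier comes from the libc headers where available, and the literal 343 fallback is only meaningful on xtensa, per the table above:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef SYS_membarrier
    #define SYS_membarrier 343              /* xtensa number from the table above */
    #endif

    int main(void)
    {
            /* MEMBARRIER_CMD_QUERY (0): ask which membarrier commands are supported. */
            long mask = syscall(SYS_membarrier, 0, 0);

            if (mask < 0)
                    perror("membarrier");
            else
                    printf("supported membarrier commands: 0x%lx\n", mask);
            return 0;
    }
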
 
 /*
  * sysxtensa syscall handler
index fe8f7e7..fa04d9d 100644 (file)
@@ -1632,10 +1632,11 @@ ENTRY(fast_second_level_miss)
         * The messy computation for 'pteval' above really simplifies
         * into the following:
         *
-        * pteval = ((pmdval - PAGE_OFFSET) & PAGE_MASK) | PAGE_DIRECTORY
+        * pteval = ((pmdval - PAGE_OFFSET + PHYS_OFFSET) & PAGE_MASK)
+        *                 | PAGE_DIRECTORY
         */
 
-       movi    a1, (-PAGE_OFFSET) & 0xffffffff
+       movi    a1, (PHYS_OFFSET - PAGE_OFFSET) & 0xffffffff
        add     a0, a0, a1              # pmdval - PAGE_OFFSET
        extui   a1, a0, 0, PAGE_SHIFT   # ... & PAGE_MASK
        xor     a0, a0, a1
index bc4f4bf..23ce62e 100644 (file)
@@ -113,7 +113,7 @@ ENTRY(_startup)
        movi    a0, 0
 
 #if XCHAL_HAVE_VECBASE
-       movi    a2, VECBASE_RESET_VADDR
+       movi    a2, VECBASE_VADDR
        wsr     a2, vecbase
 #endif
 
index 143251e..88a044a 100644 (file)
@@ -7,6 +7,7 @@
  *
  * Copyright (C) 1995  Linus Torvalds
  * Copyright (C) 2001 - 2005  Tensilica Inc.
+ * Copyright (C) 2014 - 2016  Cadence Design Systems Inc.
  *
  * Chris Zankel        <chris@zankel.net>
  * Joe Taylor  <joe@tensilica.com, joetylr@yahoo.com>
@@ -22,7 +23,6 @@
 #include <linux/bootmem.h>
 #include <linux/kernel.h>
 #include <linux/percpu.h>
-#include <linux/clk-provider.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
@@ -114,7 +114,7 @@ static int __init parse_tag_mem(const bp_tag_t *tag)
        if (mi->type != MEMORY_TYPE_CONVENTIONAL)
                return -1;
 
-       return add_sysmem_bank(mi->start, mi->end);
+       return memblock_add(mi->start, mi->end - mi->start);
 }
 
 __tagtable(BP_TAG_MEMORY, parse_tag_mem);
@@ -188,7 +188,6 @@ static int __init parse_bootparam(const bp_tag_t* tag)
 }
 
 #ifdef CONFIG_OF
-bool __initdata dt_memory_scan = false;
 
 #if !XCHAL_HAVE_PTP_MMU || XCHAL_HAVE_SPANNING_WAY
 unsigned long xtensa_kio_paddr = XCHAL_KIO_DEFAULT_PADDR;
@@ -228,11 +227,8 @@ static int __init xtensa_dt_io_area(unsigned long node, const char *uname,
 
 void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 {
-       if (!dt_memory_scan)
-               return;
-
        size &= PAGE_MASK;
-       add_sysmem_bank(base, base + size);
+       memblock_add(base, size);
 }
 
 void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
@@ -242,9 +238,6 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 
 void __init early_init_devtree(void *params)
 {
-       if (sysmem.nr_banks == 0)
-               dt_memory_scan = true;
-
        early_init_dt_scan(params);
        of_scan_flat_dt(xtensa_dt_io_area, NULL);
 
@@ -252,14 +245,6 @@ void __init early_init_devtree(void *params)
                strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 }
 
-static int __init xtensa_device_probe(void)
-{
-       of_clk_init(NULL);
-       return 0;
-}
-
-device_initcall(xtensa_device_probe);
-
 #endif /* CONFIG_OF */
 
 /*
@@ -277,12 +262,6 @@ void __init init_arch(bp_tag_t *bp_start)
        early_init_devtree(dtb_start);
 #endif
 
-       if (sysmem.nr_banks == 0) {
-               add_sysmem_bank(PLATFORM_DEFAULT_MEM_START,
-                               PLATFORM_DEFAULT_MEM_START +
-                               PLATFORM_DEFAULT_MEM_SIZE);
-       }
-
 #ifdef CONFIG_CMDLINE_BOOL
        if (!command_line[0])
                strlcpy(command_line, default_command_line, COMMAND_LINE_SIZE);
@@ -452,6 +431,10 @@ static int __init check_s32c1i(void)
 early_initcall(check_s32c1i);
 #endif /* CONFIG_S32C1I_SELFTEST */
 
+static inline int mem_reserve(unsigned long start, unsigned long end)
+{
+       return memblock_reserve(start, end - start);
+}
 
 void __init setup_arch(char **cmdline_p)
 {
@@ -463,54 +446,54 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_BLK_DEV_INITRD
        if (initrd_start < initrd_end) {
                initrd_is_mapped = mem_reserve(__pa(initrd_start),
-                                              __pa(initrd_end), 0) == 0;
+                                              __pa(initrd_end)) == 0;
                initrd_below_start_ok = 1;
        } else {
                initrd_start = 0;
        }
 #endif
 
-       mem_reserve(__pa(&_stext),__pa(&_end), 1);
+       mem_reserve(__pa(&_stext), __pa(&_end));
 
        mem_reserve(__pa(&_WindowVectors_text_start),
-                   __pa(&_WindowVectors_text_end), 0);
+                   __pa(&_WindowVectors_text_end));
 
        mem_reserve(__pa(&_DebugInterruptVector_literal_start),
-                   __pa(&_DebugInterruptVector_text_end), 0);
+                   __pa(&_DebugInterruptVector_text_end));
 
        mem_reserve(__pa(&_KernelExceptionVector_literal_start),
-                   __pa(&_KernelExceptionVector_text_end), 0);
+                   __pa(&_KernelExceptionVector_text_end));
 
        mem_reserve(__pa(&_UserExceptionVector_literal_start),
-                   __pa(&_UserExceptionVector_text_end), 0);
+                   __pa(&_UserExceptionVector_text_end));
 
        mem_reserve(__pa(&_DoubleExceptionVector_literal_start),
-                   __pa(&_DoubleExceptionVector_text_end), 0);
+                   __pa(&_DoubleExceptionVector_text_end));
 
 #if XCHAL_EXCM_LEVEL >= 2
        mem_reserve(__pa(&_Level2InterruptVector_text_start),
-                   __pa(&_Level2InterruptVector_text_end), 0);
+                   __pa(&_Level2InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 3
        mem_reserve(__pa(&_Level3InterruptVector_text_start),
-                   __pa(&_Level3InterruptVector_text_end), 0);
+                   __pa(&_Level3InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 4
        mem_reserve(__pa(&_Level4InterruptVector_text_start),
-                   __pa(&_Level4InterruptVector_text_end), 0);
+                   __pa(&_Level4InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 5
        mem_reserve(__pa(&_Level5InterruptVector_text_start),
-                   __pa(&_Level5InterruptVector_text_end), 0);
+                   __pa(&_Level5InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 6
        mem_reserve(__pa(&_Level6InterruptVector_text_start),
-                   __pa(&_Level6InterruptVector_text_end), 0);
+                   __pa(&_Level6InterruptVector_text_end));
 #endif
 
 #ifdef CONFIG_SMP
        mem_reserve(__pa(&_SecondaryResetVector_text_start),
-                   __pa(&_SecondaryResetVector_text_end), 0);
+                   __pa(&_SecondaryResetVector_text_end));
 #endif
        parse_early_param();
        bootmem_init();
@@ -555,6 +538,137 @@ static int __init topology_init(void)
 }
 subsys_initcall(topology_init);
 
+void cpu_reset(void)
+{
+#if XCHAL_HAVE_PTP_MMU
+       local_irq_disable();
+       /*
+        * We have full MMU: all autoload ways, ways 7, 8 and 9 of DTLB must
+        * be flushed.
+        * Way 4 is not currently used by linux.
+        * Ways 5 and 6 shall not be touched on MMUv2 as they are hardwired.
+        * Way 5 shall be flushed and way 6 shall be set to identity mapping
+        * on MMUv3.
+        */
+       local_flush_tlb_all();
+       invalidate_page_directory();
+#if XCHAL_HAVE_SPANNING_WAY
+       /* MMU v3 */
+       {
+               unsigned long vaddr = (unsigned long)cpu_reset;
+               unsigned long paddr = __pa(vaddr);
+               unsigned long tmpaddr = vaddr + SZ_512M;
+               unsigned long tmp0, tmp1, tmp2, tmp3;
+
+               /*
+                * Find a place for the temporary mapping. It must not be
+                * in the same 512MB region with vaddr or paddr, otherwise
+                * there may be multihit exception either on entry to the
+                * temporary mapping, or on entry to the identity mapping.
+                * (512MB is the biggest page size supported by TLB.)
+                */
+               while (((tmpaddr ^ paddr) & -SZ_512M) == 0)
+                       tmpaddr += SZ_512M;
+
+               /* Invalidate mapping in the selected temporary area */
+               if (itlb_probe(tmpaddr) & 0x8)
+                       invalidate_itlb_entry(itlb_probe(tmpaddr));
+               if (itlb_probe(tmpaddr + PAGE_SIZE) & 0x8)
+                       invalidate_itlb_entry(itlb_probe(tmpaddr + PAGE_SIZE));
+
+               /*
+                * Map two consecutive pages starting at the physical address
+                * of this function to the temporary mapping area.
+                */
+               write_itlb_entry(__pte((paddr & PAGE_MASK) |
+                                      _PAGE_HW_VALID |
+                                      _PAGE_HW_EXEC |
+                                      _PAGE_CA_BYPASS),
+                                tmpaddr & PAGE_MASK);
+               write_itlb_entry(__pte(((paddr & PAGE_MASK) + PAGE_SIZE) |
+                                      _PAGE_HW_VALID |
+                                      _PAGE_HW_EXEC |
+                                      _PAGE_CA_BYPASS),
+                                (tmpaddr & PAGE_MASK) + PAGE_SIZE);
+
+               /* Reinitialize TLB */
+               __asm__ __volatile__ ("movi     %0, 1f\n\t"
+                                     "movi     %3, 2f\n\t"
+                                     "add      %0, %0, %4\n\t"
+                                     "add      %3, %3, %5\n\t"
+                                     "jx       %0\n"
+                                     /*
+                                      * No literal, data or stack access
+                                      * below this point
+                                      */
+                                     "1:\n\t"
+                                     /* Initialize *tlbcfg */
+                                     "movi     %0, 0\n\t"
+                                     "wsr      %0, itlbcfg\n\t"
+                                     "wsr      %0, dtlbcfg\n\t"
+                                     /* Invalidate TLB way 5 */
+                                     "movi     %0, 4\n\t"
+                                     "movi     %1, 5\n"
+                                     "1:\n\t"
+                                     "iitlb    %1\n\t"
+                                     "idtlb    %1\n\t"
+                                     "add      %1, %1, %6\n\t"
+                                     "addi     %0, %0, -1\n\t"
+                                     "bnez     %0, 1b\n\t"
+                                     /* Initialize TLB way 6 */
+                                     "movi     %0, 7\n\t"
+                                     "addi     %1, %9, 3\n\t"
+                                     "addi     %2, %9, 6\n"
+                                     "1:\n\t"
+                                     "witlb    %1, %2\n\t"
+                                     "wdtlb    %1, %2\n\t"
+                                     "add      %1, %1, %7\n\t"
+                                     "add      %2, %2, %7\n\t"
+                                     "addi     %0, %0, -1\n\t"
+                                     "bnez     %0, 1b\n\t"
+                                     /* Jump to identity mapping */
+                                     "jx       %3\n"
+                                     "2:\n\t"
+                                     /* Complete way 6 initialization */
+                                     "witlb    %1, %2\n\t"
+                                     "wdtlb    %1, %2\n\t"
+                                     /* Invalidate temporary mapping */
+                                     "sub      %0, %9, %7\n\t"
+                                     "iitlb    %0\n\t"
+                                     "add      %0, %0, %8\n\t"
+                                     "iitlb    %0"
+                                     : "=&a"(tmp0), "=&a"(tmp1), "=&a"(tmp2),
+                                       "=&a"(tmp3)
+                                     : "a"(tmpaddr - vaddr),
+                                       "a"(paddr - vaddr),
+                                       "a"(SZ_128M), "a"(SZ_512M),
+                                       "a"(PAGE_SIZE),
+                                       "a"((tmpaddr + SZ_512M) & PAGE_MASK)
+                                     : "memory");
+       }
+#endif
+#endif
+       __asm__ __volatile__ ("movi     a2, 0\n\t"
+                             "wsr      a2, icountlevel\n\t"
+                             "movi     a2, 0\n\t"
+                             "wsr      a2, icount\n\t"
+#if XCHAL_NUM_IBREAK > 0
+                             "wsr      a2, ibreakenable\n\t"
+#endif
+#if XCHAL_HAVE_LOOPS
+                             "wsr      a2, lcount\n\t"
+#endif
+                             "movi     a2, 0x1f\n\t"
+                             "wsr      a2, ps\n\t"
+                             "isync\n\t"
+                             "jx       %0\n\t"
+                             :
+                             : "a" (XCHAL_RESET_VECTOR_VADDR)
+                             : "a2");
+       for (;;)
+               ;
+}
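
The temporary-mapping search in cpu_reset() uses ((tmpaddr ^ paddr) & -SZ_512M) to decide whether two addresses fall inside the same naturally aligned 512MB region, the largest page size the TLB can hold: the XOR clears the bits the addresses share, and the mask keeps only the region-selecting high bits. A standalone illustration of the test (addresses are arbitrary examples):

    #include <assert.h>

    #define SZ_512M 0x20000000UL

    /* True when both addresses lie in the same naturally aligned 512MB region. */
    static int same_512m_region(unsigned long a, unsigned long b)
    {
            return ((a ^ b) & ~(SZ_512M - 1)) == 0;
    }

    int main(void)
    {
            assert(same_512m_region(0xd0003000UL, 0xd7ff0000UL) == 1);
            assert(same_512m_region(0xd0003000UL, 0x00003000UL) == 0);
            /* For a power of two, -SZ_512M and ~(SZ_512M - 1) are the same mask. */
            assert(-SZ_512M == ~(SZ_512M - 1));
            return 0;
    }
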
+
 void machine_restart(char * cmd)
 {
        platform_restart();
index b9ad9fe..9a5bcd0 100644 (file)
@@ -12,6 +12,8 @@
  * Chris Zankel <chris@zankel.net>
  */
 
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/time.h>
@@ -134,16 +136,52 @@ void local_timer_setup(unsigned cpu)
                                        0xf, 0xffffffff);
 }
 
+#ifdef CONFIG_XTENSA_CALIBRATE_CCOUNT
+#ifdef CONFIG_OF
+static void __init calibrate_ccount(void)
+{
+       struct device_node *cpu;
+       struct clk *clk;
+
+       cpu = of_find_compatible_node(NULL, NULL, "cdns,xtensa-cpu");
+       if (cpu) {
+               clk = of_clk_get(cpu, 0);
+               if (!IS_ERR(clk)) {
+                       ccount_freq = clk_get_rate(clk);
+                       return;
+               } else {
+                       pr_warn("%s: CPU input clock not found\n",
+                               __func__);
+               }
+       } else {
+               pr_warn("%s: CPU node not found in the device tree\n",
+                       __func__);
+       }
+
+       platform_calibrate_ccount();
+}
+#else
+static inline void calibrate_ccount(void)
+{
+       platform_calibrate_ccount();
+}
+#endif
+#endif
+
 void __init time_init(void)
 {
+       of_clk_init(NULL);
 #ifdef CONFIG_XTENSA_CALIBRATE_CCOUNT
        printk("Calibrating CPU frequency ");
-       platform_calibrate_ccount();
+       calibrate_ccount();
        printk("%d.%02d MHz\n", (int)ccount_freq/1000000,
                        (int)(ccount_freq/10000)%100);
 #else
        ccount_freq = CONFIG_XTENSA_CPU_CLOCK*1000000UL;
 #endif
+       WARN(!ccount_freq,
+            "%s: CPU clock frequency is not set up correctly\n",
+            __func__);
        clocksource_register_hz(&ccount_clocksource, ccount_freq);
        local_timer_setup(0);
        setup_irq(this_cpu_ptr(&ccount_timer)->evt.irq, &timer_irqaction);
index c417cbe..72cfe35 100644 (file)
@@ -30,10 +30,6 @@ jiffies = jiffies_64 + 4;
 jiffies = jiffies_64;
 #endif
 
-#ifndef KERNELOFFSET
-#define KERNELOFFSET 0xd0003000
-#endif
-
 /* Note: In the following macros, it would be nice to specify only the
    vector name and section kind and construct "sym" and "section" using
    CPP concatenation, but that does not work reliably.  Concatenating a
index 9a9a593..80e4cfb 100644 (file)
@@ -8,7 +8,7 @@
  * for more details.
  *
  * Copyright (C) 2001 - 2005 Tensilica Inc.
- * Copyright (C) 2014 Cadence Design Systems Inc.
+ * Copyright (C) 2014 - 2016 Cadence Design Systems Inc.
  *
  * Chris Zankel        <chris@zankel.net>
  * Joe Taylor  <joe@tensilica.com, joetylr@yahoo.com>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
+#include <linux/of_fdt.h>
 
 #include <asm/bootparam.h>
 #include <asm/page.h>
 #include <asm/sections.h>
 #include <asm/sysmem.h>
 
-struct sysmem_info sysmem __initdata;
-
-static void __init sysmem_dump(void)
-{
-       unsigned i;
-
-       pr_debug("Sysmem:\n");
-       for (i = 0; i < sysmem.nr_banks; ++i)
-               pr_debug("  0x%08lx - 0x%08lx (%ldK)\n",
-                        sysmem.bank[i].start, sysmem.bank[i].end,
-                        (sysmem.bank[i].end - sysmem.bank[i].start) >> 10);
-}
-
-/*
- * Find bank with maximal .start such that bank.start <= start
- */
-static inline struct meminfo * __init find_bank(unsigned long start)
-{
-       unsigned i;
-       struct meminfo *it = NULL;
-
-       for (i = 0; i < sysmem.nr_banks; ++i)
-               if (sysmem.bank[i].start <= start)
-                       it = sysmem.bank + i;
-               else
-                       break;
-       return it;
-}
-
-/*
- * Move all memory banks starting at 'from' to a new place at 'to',
- * adjust nr_banks accordingly.
- * Both 'from' and 'to' must be inside the sysmem.bank.
- *
- * Returns: 0 (success), -ENOMEM (not enough space in the sysmem.bank).
- */
-static int __init move_banks(struct meminfo *to, struct meminfo *from)
-{
-       unsigned n = sysmem.nr_banks - (from - sysmem.bank);
-
-       if (to > from && to - from + sysmem.nr_banks > SYSMEM_BANKS_MAX)
-               return -ENOMEM;
-       if (to != from)
-               memmove(to, from, n * sizeof(struct meminfo));
-       sysmem.nr_banks += to - from;
-       return 0;
-}
-
-/*
- * Add new bank to sysmem. Resulting sysmem is the union of bytes of the
- * original sysmem and the new bank.
- *
- * Returns: 0 (success), < 0 (error)
- */
-int __init add_sysmem_bank(unsigned long start, unsigned long end)
-{
-       unsigned i;
-       struct meminfo *it = NULL;
-       unsigned long sz;
-       unsigned long bank_sz = 0;
-
-       if (start == end ||
-           (start < end) != (PAGE_ALIGN(start) < (end & PAGE_MASK))) {
-               pr_warn("Ignoring small memory bank 0x%08lx size: %ld bytes\n",
-                       start, end - start);
-               return -EINVAL;
-       }
-
-       start = PAGE_ALIGN(start);
-       end &= PAGE_MASK;
-       sz = end - start;
-
-       it = find_bank(start);
-
-       if (it)
-               bank_sz = it->end - it->start;
-
-       if (it && bank_sz >= start - it->start) {
-               if (end - it->start > bank_sz)
-                       it->end = end;
-               else
-                       return 0;
-       } else {
-               if (!it)
-                       it = sysmem.bank;
-               else
-                       ++it;
-
-               if (it - sysmem.bank < sysmem.nr_banks &&
-                   it->start - start <= sz) {
-                       it->start = start;
-                       if (it->end - it->start < sz)
-                               it->end = end;
-                       else
-                               return 0;
-               } else {
-                       if (move_banks(it + 1, it) < 0) {
-                               pr_warn("Ignoring memory bank 0x%08lx size %ld bytes\n",
-                                       start, end - start);
-                               return -EINVAL;
-                       }
-                       it->start = start;
-                       it->end = end;
-                       return 0;
-               }
-       }
-       sz = it->end - it->start;
-       for (i = it + 1 - sysmem.bank; i < sysmem.nr_banks; ++i)
-               if (sysmem.bank[i].start - it->start <= sz) {
-                       if (sz < sysmem.bank[i].end - it->start)
-                               it->end = sysmem.bank[i].end;
-               } else {
-                       break;
-               }
-
-       move_banks(it + 1, sysmem.bank + i);
-       return 0;
-}
-
-/*
- * mem_reserve(start, end, must_exist)
- *
- * Reserve some memory from the memory pool.
- * If must_exist is set and a part of the region being reserved does not exist
- * memory map is not altered.
- *
- * Parameters:
- *  start      Start of region,
- *  end                End of region,
- *  must_exist Must exist in memory pool.
- *
- * Returns:
- *  0 (success)
- *  < 0 (error)
- */
-
-int __init mem_reserve(unsigned long start, unsigned long end, int must_exist)
-{
-       struct meminfo *it;
-       struct meminfo *rm = NULL;
-       unsigned long sz;
-       unsigned long bank_sz = 0;
-
-       start = start & PAGE_MASK;
-       end = PAGE_ALIGN(end);
-       sz = end - start;
-       if (!sz)
-               return -EINVAL;
-
-       it = find_bank(start);
-
-       if (it)
-               bank_sz = it->end - it->start;
-
-       if ((!it || end - it->start > bank_sz) && must_exist) {
-               pr_warn("mem_reserve: [0x%0lx, 0x%0lx) not in any region!\n",
-                       start, end);
-               return -EINVAL;
-       }
-
-       if (it && start - it->start <= bank_sz) {
-               if (start == it->start) {
-                       if (end - it->start < bank_sz) {
-                               it->start = end;
-                               return 0;
-                       } else {
-                               rm = it;
-                       }
-               } else {
-                       it->end = start;
-                       if (end - it->start < bank_sz)
-                               return add_sysmem_bank(end,
-                                                      it->start + bank_sz);
-                       ++it;
-               }
-       }
-
-       if (!it)
-               it = sysmem.bank;
-
-       for (; it < sysmem.bank + sysmem.nr_banks; ++it) {
-               if (it->end - start <= sz) {
-                       if (!rm)
-                               rm = it;
-               } else {
-                       if (it->start - start < sz)
-                               it->start = end;
-                       break;
-               }
-       }
-
-       if (rm)
-               move_banks(rm, it);
-
-       return 0;
-}
-
-
 /*
  * Initialize the bootmem system and give it all low memory we have available.
  */
 
 void __init bootmem_init(void)
 {
-       unsigned long pfn;
-       unsigned long bootmap_start, bootmap_size;
-       int i;
-
-       /* Reserve all memory below PLATFORM_DEFAULT_MEM_START, as memory
+       /* Reserve all memory below PHYS_OFFSET, as memory
         * accounting doesn't work for pages below that address.
         *
-        * If PLATFORM_DEFAULT_MEM_START is zero reserve page at address 0:
+        * If PHYS_OFFSET is zero reserve page at address 0:
         * successful allocations should never return NULL.
         */
-       if (PLATFORM_DEFAULT_MEM_START)
-               mem_reserve(0, PLATFORM_DEFAULT_MEM_START, 0);
+       if (PHYS_OFFSET)
+               memblock_reserve(0, PHYS_OFFSET);
        else
-               mem_reserve(0, 1, 0);
+               memblock_reserve(0, 1);
 
-       sysmem_dump();
-       max_low_pfn = max_pfn = 0;
-       min_low_pfn = ~0;
-
-       for (i=0; i < sysmem.nr_banks; i++) {
-               pfn = PAGE_ALIGN(sysmem.bank[i].start) >> PAGE_SHIFT;
-               if (pfn < min_low_pfn)
-                       min_low_pfn = pfn;
-               pfn = PAGE_ALIGN(sysmem.bank[i].end - 1) >> PAGE_SHIFT;
-               if (pfn > max_pfn)
-                       max_pfn = pfn;
-       }
+       early_init_fdt_scan_reserved_mem();
 
-       if (min_low_pfn > max_pfn)
+       if (!memblock_phys_mem_size())
                panic("No memory found!\n");
 
-       max_low_pfn = max_pfn < MAX_MEM_PFN >> PAGE_SHIFT ?
-               max_pfn : MAX_MEM_PFN >> PAGE_SHIFT;
+       min_low_pfn = PFN_UP(memblock_start_of_DRAM());
+       min_low_pfn = max(min_low_pfn, PFN_UP(PHYS_OFFSET));
+       max_pfn = PFN_DOWN(memblock_end_of_DRAM());
+       max_low_pfn = min(max_pfn, MAX_LOW_PFN);
 
-       /* Find an area to use for the bootmem bitmap. */
-
-       bootmap_size = bootmem_bootmap_pages(max_low_pfn - min_low_pfn);
-       bootmap_size <<= PAGE_SHIFT;
-       bootmap_start = ~0;
-
-       for (i=0; i<sysmem.nr_banks; i++)
-               if (sysmem.bank[i].end - sysmem.bank[i].start >= bootmap_size) {
-                       bootmap_start = sysmem.bank[i].start;
-                       break;
-               }
-
-       if (bootmap_start == ~0UL)
-               panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
-
-       /* Reserve the bootmem bitmap area */
-
-       mem_reserve(bootmap_start, bootmap_start + bootmap_size, 1);
-       bootmap_size = init_bootmem_node(NODE_DATA(0),
-                                        bootmap_start >> PAGE_SHIFT,
-                                        min_low_pfn,
-                                        max_low_pfn);
-
-       /* Add all remaining memory pieces into the bootmem map */
-
-       for (i = 0; i < sysmem.nr_banks; i++) {
-               if (sysmem.bank[i].start >> PAGE_SHIFT < max_low_pfn) {
-                       unsigned long end = min(max_low_pfn << PAGE_SHIFT,
-                                               sysmem.bank[i].end);
-                       free_bootmem(sysmem.bank[i].start,
-                                    end - sysmem.bank[i].start);
-               }
-       }
+       memblock_set_current_limit(PFN_PHYS(max_low_pfn));
 
+       memblock_dump_all();
 }
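
The pfn bounds computed above rely on the kernel's PFN_UP/PFN_DOWN/PFN_PHYS helpers; a minimal userspace restatement, assuming 4 KiB pages and a hypothetical DRAM range, shows the rounding involved (illustrative only, not part of the patch).

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)	/* round up to a page frame */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)			/* round down to a page frame */
#define PFN_PHYS(x)	((unsigned long)(x) << PAGE_SHIFT)	/* frame number back to address */

int main(void)
{
	unsigned long dram_start = 0x00001000;	/* assumed memblock_start_of_DRAM() */
	unsigned long dram_end   = 0x08000000;	/* assumed memblock_end_of_DRAM() */

	printf("min_low_pfn = %lu\n", PFN_UP(dram_start));		/* 1 */
	printf("max_pfn     = %lu\n", PFN_DOWN(dram_end));		/* 32768 */
	printf("limit       = 0x%lx\n", PFN_PHYS(PFN_DOWN(dram_end)));	/* 0x8000000 */
	return 0;
}
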
 
 
@@ -344,7 +103,7 @@ void __init mem_init(void)
                "    fixmap  : 0x%08lx - 0x%08lx  (%5lu kB)\n"
 #endif
 #ifdef CONFIG_MMU
-               "    vmalloc : 0x%08x - 0x%08x  (%5u MB)\n"
+               "    vmalloc : 0x%08lx - 0x%08lx  (%5lu MB)\n"
 #endif
                "    lowmem  : 0x%08lx - 0x%08lx  (%5lu MB)\n",
 #ifdef CONFIG_HIGHMEM
@@ -395,16 +154,16 @@ static void __init parse_memmap_one(char *p)
        switch (*p) {
        case '@':
                start_at = memparse(p + 1, &p);
-               add_sysmem_bank(start_at, start_at + mem_size);
+               memblock_add(start_at, mem_size);
                break;
 
        case '$':
                start_at = memparse(p + 1, &p);
-               mem_reserve(start_at, start_at + mem_size, 0);
+               memblock_reserve(start_at, mem_size);
                break;
 
        case 0:
-               mem_reserve(mem_size, 0, 0);
+               memblock_reserve(mem_size, -mem_size);
                break;
 
        default:
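
A minimal userspace mock of the memmap= handling above, with a simplified memparse() and hypothetical example values, showing which memblock call each form would produce (illustrative sketch, not part of the patch).

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the kernel's memparse(): handles K/M/G suffixes only. */
static unsigned long memparse(const char *p, char **endp)
{
	unsigned long v = strtoul(p, endp, 0);

	switch (**endp) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; (*endp)++; break;
	}
	return v;
}

static void parse_memmap_one(const char *arg)
{
	char *p;
	unsigned long mem_size = memparse(arg, &p);
	unsigned long start_at;

	switch (*p) {
	case '@':	/* memmap=size@start: add a memory bank */
		start_at = memparse(p + 1, &p);
		printf("memblock_add(0x%lx, 0x%lx)\n", start_at, mem_size);
		break;
	case '$':	/* memmap=size$start: reserve a hole */
		start_at = memparse(p + 1, &p);
		printf("memblock_reserve(0x%lx, 0x%lx)\n", start_at, mem_size);
		break;
	case 0:		/* memmap=limit: reserve everything above the limit */
		printf("memblock_reserve(0x%lx, -0x%lx)\n", mem_size, mem_size);
		break;
	default:
		printf("unrecognised: %s\n", arg);
	}
}

int main(void)
{
	parse_memmap_one("64M@0x10000000");	/* add a bank */
	parse_memmap_one("16M$0x20000000");	/* reserve a hole */
	parse_memmap_one("128M");		/* cap usable memory */
	return 0;
}
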
index 12b15ad..27d7a52 100644 (file)
@@ -76,6 +76,11 @@ static inline int __simc(int a, int b, int c, int d)
        return ret;
 }
 
+static inline int simc_exit(int exit_code)
+{
+       return __simc(SYS_exit, exit_code, 0, 0);
+}
+
 static inline int simc_open(const char *file, int flags, int mode)
 {
        return __simc(SYS_open, (int) file, flags, mode);
index 3918205..379aedd 100644 (file)
@@ -32,6 +32,8 @@
 #include <asm/platform.h>
 #include <asm/bootparam.h>
 
+#include <platform/simcall.h>
+
 
 void __init platform_init(bp_tag_t* bootparam)
 {
@@ -41,37 +43,19 @@ void __init platform_init(bp_tag_t* bootparam)
 void platform_halt(void)
 {
        pr_info(" ** Called platform_halt() **\n");
-       __asm__ __volatile__("movi a2, 1\nsimcall\n");
+       simc_exit(0);
 }
 
 void platform_power_off(void)
 {
        pr_info(" ** Called platform_power_off() **\n");
-       __asm__ __volatile__("movi a2, 1\nsimcall\n");
+       simc_exit(0);
 }
 void platform_restart(void)
 {
        /* Flush and reset the mmu, simulate a processor reset, and
         * jump to the reset vector. */
-
-       __asm__ __volatile__("movi      a2, 15\n\t"
-                            "wsr       a2, icountlevel\n\t"
-                            "movi      a2, 0\n\t"
-                            "wsr       a2, icount\n\t"
-#if XCHAL_NUM_IBREAK > 0
-                            "wsr       a2, ibreakenable\n\t"
-#endif
-#if XCHAL_HAVE_LOOPS
-                            "wsr       a2, lcount\n\t"
-#endif
-                            "movi      a2, 0x1f\n\t"
-                            "wsr       a2, ps\n\t"
-                            "isync\n\t"
-                            "jx        %0\n\t"
-                            :
-                            : "a" (XCHAL_RESET_VECTOR_VADDR)
-                            : "a2");
-
+       cpu_reset();
        /* control never gets here */
 }
 
@@ -98,7 +82,7 @@ void platform_heartbeat(void)
 static int
 iss_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
-       __asm__ __volatile__("movi a2, -1; simcall\n");
+       simc_exit(1);
        return NOTIFY_DONE;
 }
 
index f58a4e6..ede04cc 100644 (file)
@@ -86,6 +86,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
                unsigned long io;
 
                simc_lseek(dev->fd, offset, SEEK_SET);
+               READ_ONCE(*buffer);
                if (write)
                        io = simc_write(dev->fd, buffer, nbytes);
                else
index 4904c5c..9c2f1fb 100644 (file)
@@ -64,26 +64,7 @@ void platform_restart(void)
 {
        /* Flush and reset the mmu, simulate a processor reset, and
         * jump to the reset vector. */
-
-       __asm__ __volatile__ ("movi     a2, 15\n\t"
-                             "wsr      a2, icountlevel\n\t"
-                             "movi     a2, 0\n\t"
-                             "wsr      a2, icount\n\t"
-#if XCHAL_NUM_IBREAK > 0
-                             "wsr      a2, ibreakenable\n\t"
-#endif
-#if XCHAL_HAVE_LOOPS
-                             "wsr      a2, lcount\n\t"
-#endif
-                             "movi     a2, 0x1f\n\t"
-                             "wsr      a2, ps\n\t"
-                             "isync\n\t"
-                             "jx       %0\n\t"
-                             :
-                             : "a" (XCHAL_RESET_VECTOR_VADDR)
-                             : "a2"
-                             );
-
+       cpu_reset();
        /* control never gets here */
 }
 
index b509d1f..779be72 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/console.h>
 #include <linux/delay.h>
 #include <linux/of.h>
+#include <linux/clk-provider.h>
+#include <linux/of_address.h>
 
 #include <asm/timex.h>
 #include <asm/processor.h>
@@ -54,58 +56,63 @@ void platform_restart(void)
 {
        /* Flush and reset the mmu, simulate a processor reset, and
         * jump to the reset vector. */
+       cpu_reset();
+       /* control never gets here */
+}
 
+void __init platform_setup(char **cmdline)
+{
+}
 
-       __asm__ __volatile__ ("movi     a2, 15\n\t"
-                             "wsr      a2, icountlevel\n\t"
-                             "movi     a2, 0\n\t"
-                             "wsr      a2, icount\n\t"
-#if XCHAL_NUM_IBREAK > 0
-                             "wsr      a2, ibreakenable\n\t"
-#endif
-#if XCHAL_HAVE_LOOPS
-                             "wsr      a2, lcount\n\t"
-#endif
-                             "movi     a2, 0x1f\n\t"
-                             "wsr      a2, ps\n\t"
-                             "isync\n\t"
-                             "jx       %0\n\t"
-                             :
-                             : "a" (XCHAL_RESET_VECTOR_VADDR)
-                             : "a2"
-                             );
+/* early initialization */
 
-       /* control never gets here */
+void __init platform_init(bp_tag_t *first)
+{
 }
 
-void __init platform_setup(char **cmdline)
+/* Heartbeat. */
+
+void platform_heartbeat(void)
+{
+}
+
+#ifdef CONFIG_XTENSA_CALIBRATE_CCOUNT
+
+void __init platform_calibrate_ccount(void)
 {
+       ccount_freq = *(long *)XTFPGA_CLKFRQ_VADDR;
 }
 
+#endif
+
 #ifdef CONFIG_OF
 
-static void __init update_clock_frequency(struct device_node *node)
+static void __init xtfpga_clk_setup(struct device_node *np)
 {
-       struct property *newfreq;
+       void __iomem *base = of_iomap(np, 0);
+       struct clk *clk;
        u32 freq;
 
-       if (!of_property_read_u32(node, "clock-frequency", &freq) && freq != 0)
+       if (!base) {
+               pr_err("%s: invalid address\n", np->name);
                return;
+       }
 
-       newfreq = kzalloc(sizeof(*newfreq) + sizeof(u32), GFP_KERNEL);
-       if (!newfreq)
-               return;
-       newfreq->value = newfreq + 1;
-       newfreq->length = sizeof(freq);
-       newfreq->name = kstrdup("clock-frequency", GFP_KERNEL);
-       if (!newfreq->name) {
-               kfree(newfreq);
+       freq = __raw_readl(base);
+       iounmap(base);
+       clk = clk_register_fixed_rate(NULL, np->name, NULL, 0, freq);
+
+       if (IS_ERR(clk)) {
+               pr_err("%s: clk registration failed\n", np->name);
                return;
        }
 
-       *(u32 *)newfreq->value = cpu_to_be32(*(u32 *)XTFPGA_CLKFRQ_VADDR);
-       of_update_property(node, newfreq);
+       if (of_clk_add_provider(np, of_clk_src_simple_get, clk)) {
+               pr_err("%s: clk provider registration failed\n", np->name);
+               return;
+       }
 }
+CLK_OF_DECLARE(xtfpga_clk, "cdns,xtfpga-clock", xtfpga_clk_setup);
 
 #define MAC_LEN 6
 static void __init update_local_mac(struct device_node *node)
@@ -137,56 +144,15 @@ static void __init update_local_mac(struct device_node *node)
 
 static int __init machine_setup(void)
 {
-       struct device_node *clock;
        struct device_node *eth = NULL;
 
-       for_each_node_by_name(clock, "main-oscillator")
-               update_clock_frequency(clock);
-
        if ((eth = of_find_compatible_node(eth, NULL, "opencores,ethoc")))
                update_local_mac(eth);
        return 0;
 }
 arch_initcall(machine_setup);
 
-#endif
-
-/* early initialization */
-
-void __init platform_init(bp_tag_t *first)
-{
-}
-
-/* Heartbeat. */
-
-void platform_heartbeat(void)
-{
-}
-
-#ifdef CONFIG_XTENSA_CALIBRATE_CCOUNT
-
-void __init platform_calibrate_ccount(void)
-{
-       long clk_freq = 0;
-#ifdef CONFIG_OF
-       struct device_node *cpu =
-               of_find_compatible_node(NULL, NULL, "cdns,xtensa-cpu");
-       if (cpu) {
-               u32 freq;
-               update_clock_frequency(cpu);
-               if (!of_property_read_u32(cpu, "clock-frequency", &freq))
-                       clk_freq = freq;
-       }
-#endif
-       if (!clk_freq)
-               clk_freq = *(long *)XTFPGA_CLKFRQ_VADDR;
-
-       ccount_freq = clk_freq;
-}
-
-#endif
-
-#ifndef CONFIG_OF
+#else
 
 #include <linux/serial_8250.h>
 #include <linux/if.h>
diff --git a/arch/xtensa/variants/csp/include/variant/core.h b/arch/xtensa/variants/csp/include/variant/core.h
new file mode 100644 (file)
index 0000000..ccd81f0
--- /dev/null
@@ -0,0 +1,575 @@
+/* 
+ * xtensa/config/core-isa.h -- HAL definitions that are dependent on Xtensa
+ *                             processor CORE configuration
+ *
+ *  See <xtensa/config/core.h>, which includes this file, for more details.
+ */
+
+/* Xtensa processor core configuration information.
+
+   Copyright (c) 1999-2015 Tensilica Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#ifndef _XTENSA_CORE_CONFIGURATION_H
+#define _XTENSA_CORE_CONFIGURATION_H
+
+
+/****************************************************************************
+           Parameters Useful for Any Code, USER or PRIVILEGED
+ ****************************************************************************/
+
+/*
+ *  Note:  Macros of the form XCHAL_HAVE_*** have a value of 1 if the option is
+ *  configured, and a value of 0 otherwise.  These macros are always defined.
+ */
+
+
+/*----------------------------------------------------------------------
+                               ISA
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_HAVE_BE                  0       /* big-endian byte ordering */
+#define XCHAL_HAVE_WINDOWED            1       /* windowed registers option */
+#define XCHAL_NUM_AREGS                        32      /* num of physical addr regs */
+#define XCHAL_NUM_AREGS_LOG2           5       /* log2(XCHAL_NUM_AREGS) */
+#define XCHAL_MAX_INSTRUCTION_SIZE     8       /* max instr bytes (3..8) */
+#define XCHAL_HAVE_DEBUG               1       /* debug option */
+#define XCHAL_HAVE_DENSITY             1       /* 16-bit instructions */
+#define XCHAL_HAVE_LOOPS               1       /* zero-overhead loops */
+#define XCHAL_LOOP_BUFFER_SIZE         0       /* zero-ov. loop instr buffer size */
+#define XCHAL_HAVE_NSA                 1       /* NSA/NSAU instructions */
+#define XCHAL_HAVE_MINMAX              1       /* MIN/MAX instructions */
+#define XCHAL_HAVE_SEXT                        1       /* SEXT instruction */
+#define XCHAL_HAVE_DEPBITS             0       /* DEPBITS instruction */
+#define XCHAL_HAVE_CLAMPS              1       /* CLAMPS instruction */
+#define XCHAL_HAVE_MUL16               1       /* MUL16S/MUL16U instructions */
+#define XCHAL_HAVE_MUL32               1       /* MULL instruction */
+#define XCHAL_HAVE_MUL32_HIGH          1       /* MULUH/MULSH instructions */
+#define XCHAL_HAVE_DIV32               1       /* QUOS/QUOU/REMS/REMU instructions */
+#define XCHAL_HAVE_L32R                        1       /* L32R instruction */
+#define XCHAL_HAVE_ABSOLUTE_LITERALS   0       /* non-PC-rel (extended) L32R */
+#define XCHAL_HAVE_CONST16             0       /* CONST16 instruction */
+#define XCHAL_HAVE_ADDX                        1       /* ADDX#/SUBX# instructions */
+#define XCHAL_HAVE_WIDE_BRANCHES       0       /* B*.W18 or B*.W15 instr's */
+#define XCHAL_HAVE_PREDICTED_BRANCHES  0       /* B[EQ/EQZ/NE/NEZ]T instr's */
+#define XCHAL_HAVE_CALL4AND12          1       /* (obsolete option) */
+#define XCHAL_HAVE_ABS                 1       /* ABS instruction */
+/*#define XCHAL_HAVE_POPC              0*/     /* POPC instruction */
+/*#define XCHAL_HAVE_CRC               0*/     /* CRC instruction */
+#define XCHAL_HAVE_RELEASE_SYNC                1       /* L32AI/S32RI instructions */
+#define XCHAL_HAVE_S32C1I              1       /* S32C1I instruction */
+#define XCHAL_HAVE_SPECULATION         0       /* speculation */
+#define XCHAL_HAVE_FULL_RESET          1       /* all regs/state reset */
+#define XCHAL_NUM_CONTEXTS             1       /* */
+#define XCHAL_NUM_MISC_REGS            4       /* num of scratch regs (0..4) */
+#define XCHAL_HAVE_TAP_MASTER          0       /* JTAG TAP control instr's */
+#define XCHAL_HAVE_PRID                        1       /* processor ID register */
+#define XCHAL_HAVE_EXTERN_REGS         1       /* WER/RER instructions */
+#define XCHAL_HAVE_MX                  0       /* MX core (Tensilica internal) */
+#define XCHAL_HAVE_MP_INTERRUPTS       0       /* interrupt distributor port */
+#define XCHAL_HAVE_MP_RUNSTALL         0       /* core RunStall control port */
+#define XCHAL_HAVE_PSO                 0       /* Power Shut-Off */
+#define XCHAL_HAVE_PSO_CDM             0       /* core/debug/mem pwr domains */
+#define XCHAL_HAVE_PSO_FULL_RETENTION  0       /* all regs preserved on PSO */
+#define XCHAL_HAVE_THREADPTR           1       /* THREADPTR register */
+#define XCHAL_HAVE_BOOLEANS            1       /* boolean registers */
+#define XCHAL_HAVE_CP                  1       /* CPENABLE reg (coprocessor) */
+#define XCHAL_CP_MAXCFG                        8       /* max allowed cp id plus one */
+#define XCHAL_HAVE_MAC16               1       /* MAC16 package */
+
+#define XCHAL_HAVE_FUSION               0      /* Fusion*/
+#define XCHAL_HAVE_FUSION_FP    0              /* Fusion FP option */
+#define XCHAL_HAVE_FUSION_LOW_POWER 0  /* Fusion Low Power option */
+#define XCHAL_HAVE_FUSION_AES   0              /* Fusion BLE/Wifi AES-128 CCM option */
+#define XCHAL_HAVE_FUSION_CONVENC       0       /* Fusion Conv Encode option */
+#define XCHAL_HAVE_FUSION_LFSR_CRC      0      /* Fusion LFSR-CRC option */
+#define XCHAL_HAVE_FUSION_BITOPS        0      /* Fusion Bit Operations Support option */
+#define XCHAL_HAVE_FUSION_AVS   0      /* Fusion AVS option */
+#define XCHAL_HAVE_FUSION_16BIT_BASEBAND        0      /* Fusion 16-bit Baseband option */
+#define XCHAL_HAVE_HIFIPRO             0       /* HiFiPro Audio Engine pkg */
+#define XCHAL_HAVE_HIFI4               0       /* HiFi4 Audio Engine pkg */
+#define XCHAL_HAVE_HIFI4_VFPU          0       /* HiFi4 Audio Engine VFPU option */
+#define XCHAL_HAVE_HIFI3               0       /* HiFi3 Audio Engine pkg */
+#define XCHAL_HAVE_HIFI3_VFPU          0       /* HiFi3 Audio Engine VFPU option */
+#define XCHAL_HAVE_HIFI2               0       /* HiFi2 Audio Engine pkg */
+#define XCHAL_HAVE_HIFI2EP             0       /* HiFi2EP */
+#define XCHAL_HAVE_HIFI_MINI           0       
+
+
+#define XCHAL_HAVE_VECTORFPU2005       0       /* vector or user floating-point pkg */
+#define XCHAL_HAVE_USER_DPFPU         0       /* user DP floating-point pkg */
+#define XCHAL_HAVE_USER_SPFPU         0       /* user SP floating-point pkg */
+#define XCHAL_HAVE_FP                 0      /* single prec floating point */
+#define XCHAL_HAVE_FP_DIV             0  /* FP with DIV instructions */
+#define XCHAL_HAVE_FP_RECIP           0        /* FP with RECIP instructions */
+#define XCHAL_HAVE_FP_SQRT            0 /* FP with SQRT instructions */
+#define XCHAL_HAVE_FP_RSQRT           0        /* FP with RSQRT instructions */
+#define XCHAL_HAVE_DFP                        0     /* double precision FP pkg */
+#define XCHAL_HAVE_DFP_DIV            0 /* DFP with DIV instructions */
+#define XCHAL_HAVE_DFP_RECIP          0       /* DFP with RECIP instructions*/
+#define XCHAL_HAVE_DFP_SQRT           0        /* DFP with SQRT instructions */
+#define XCHAL_HAVE_DFP_RSQRT          0       /* DFP with RSQRT instructions*/
+#define XCHAL_HAVE_DFP_ACCEL           0       /* double precision FP acceleration pkg */
+#define XCHAL_HAVE_DFP_accel           XCHAL_HAVE_DFP_ACCEL                            /* for backward compatibility */
+
+#define XCHAL_HAVE_DFPU_SINGLE_ONLY    0                       /* DFPU Coprocessor, single precision only */
+#define XCHAL_HAVE_DFPU_SINGLE_DOUBLE  0                       /* DFPU Coprocessor, single and double precision */
+#define XCHAL_HAVE_VECTRA1             0       /* Vectra I  pkg */
+#define XCHAL_HAVE_VECTRALX            0       /* Vectra LX pkg */
+#define XCHAL_HAVE_PDX4                        0       /* PDX4 */
+#define XCHAL_HAVE_CONNXD2             0       /* ConnX D2 pkg */
+#define XCHAL_HAVE_CONNXD2_DUALLSFLIX   0      /* ConnX D2 & Dual LoadStore Flix */
+#define XCHAL_HAVE_BBE16               0       /* ConnX BBE16 pkg */
+#define XCHAL_HAVE_BBE16_RSQRT         0       /* BBE16 & vector recip sqrt */
+#define XCHAL_HAVE_BBE16_VECDIV                0       /* BBE16 & vector divide */
+#define XCHAL_HAVE_BBE16_DESPREAD      0       /* BBE16 & despread */
+#define XCHAL_HAVE_BBENEP              0       /* ConnX BBENEP pkgs */
+#define XCHAL_HAVE_BSP3                        0       /* ConnX BSP3 pkg */
+#define XCHAL_HAVE_BSP3_TRANSPOSE      0       /* BSP3 & transpose32x32 */
+#define XCHAL_HAVE_SSP16               0       /* ConnX SSP16 pkg */
+#define XCHAL_HAVE_SSP16_VITERBI       0       /* SSP16 & viterbi */
+#define XCHAL_HAVE_TURBO16             0       /* ConnX Turbo16 pkg */
+#define XCHAL_HAVE_BBP16               0       /* ConnX BBP16 pkg */
+#define XCHAL_HAVE_FLIX3               0       /* basic 3-way FLIX option */
+#define XCHAL_HAVE_GRIVPEP              0   /*  GRIVPEP is General Release of IVPEP */
+#define XCHAL_HAVE_GRIVPEP_HISTOGRAM    0   /* Histogram option on GRIVPEP */
+
+
+/*----------------------------------------------------------------------
+                               MISC
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_NUM_LOADSTORE_UNITS      1       /* load/store units */
+#define XCHAL_NUM_WRITEBUFFER_ENTRIES  8       /* size of write buffer */
+#define XCHAL_INST_FETCH_WIDTH         8       /* instr-fetch width in bytes */
+#define XCHAL_DATA_WIDTH               16      /* data width in bytes */
+#define XCHAL_DATA_PIPE_DELAY          1       /* d-side pipeline delay
+                                                  (1 = 5-stage, 2 = 7-stage) */
+#define XCHAL_CLOCK_GATING_GLOBAL      0       /* global clock gating */
+#define XCHAL_CLOCK_GATING_FUNCUNIT    0       /* funct. unit clock gating */
+/*  In T1050, applies to selected core load and store instructions (see ISA): */
+#define XCHAL_UNALIGNED_LOAD_EXCEPTION 1       /* unaligned loads cause exc. */
+#define XCHAL_UNALIGNED_STORE_EXCEPTION        1       /* unaligned stores cause exc.*/
+#define XCHAL_UNALIGNED_LOAD_HW                0       /* unaligned loads work in hw */
+#define XCHAL_UNALIGNED_STORE_HW       0       /* unaligned stores work in hw*/
+
+#define XCHAL_SW_VERSION               1100002 /* sw version of this header */
+
+#define XCHAL_CORE_ID                  "xt_lnx"        /* alphanum core name
+                                                  (CoreID) set in the Xtensa
+                                                  Processor Generator */
+
+#define XCHAL_BUILD_UNIQUE_ID          0x00057D54      /* 22-bit sw build ID */
+
+/*
+ *  These definitions describe the hardware targeted by this software.
+ */
+#define XCHAL_HW_CONFIGID0             0xC1B3FFFE      /* ConfigID hi 32 bits*/
+#define XCHAL_HW_CONFIGID1             0x1C857D54      /* ConfigID lo 32 bits*/
+#define XCHAL_HW_VERSION_NAME          "LX6.0.2"       /* full version name */
+#define XCHAL_HW_VERSION_MAJOR         2600    /* major ver# of targeted hw */
+#define XCHAL_HW_VERSION_MINOR         2       /* minor ver# of targeted hw */
+#define XCHAL_HW_VERSION               260002  /* major*100+minor */
+#define XCHAL_HW_REL_LX6               1
+#define XCHAL_HW_REL_LX6_0             1
+#define XCHAL_HW_REL_LX6_0_2           1
+#define XCHAL_HW_CONFIGID_RELIABLE     1
+/*  If software targets a *range* of hardware versions, these are the bounds: */
+#define XCHAL_HW_MIN_VERSION_MAJOR     2600    /* major v of earliest tgt hw */
+#define XCHAL_HW_MIN_VERSION_MINOR     2       /* minor v of earliest tgt hw */
+#define XCHAL_HW_MIN_VERSION           260002  /* earliest targeted hw */
+#define XCHAL_HW_MAX_VERSION_MAJOR     2600    /* major v of latest tgt hw */
+#define XCHAL_HW_MAX_VERSION_MINOR     2       /* minor v of latest tgt hw */
+#define XCHAL_HW_MAX_VERSION           260002  /* latest targeted hw */
+
+
+/*----------------------------------------------------------------------
+                               CACHE
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_ICACHE_LINESIZE          64      /* I-cache line size in bytes */
+#define XCHAL_DCACHE_LINESIZE          64      /* D-cache line size in bytes */
+#define XCHAL_ICACHE_LINEWIDTH         6       /* log2(I line size in bytes) */
+#define XCHAL_DCACHE_LINEWIDTH         6       /* log2(D line size in bytes) */
+
+#define XCHAL_ICACHE_SIZE              65536   /* I-cache size in bytes or 0 */
+#define XCHAL_DCACHE_SIZE              16384   /* D-cache size in bytes or 0 */
+
+#define XCHAL_DCACHE_IS_WRITEBACK      1       /* writeback feature */
+#define XCHAL_DCACHE_IS_COHERENT       0       /* MP coherence feature */
+
+#define XCHAL_HAVE_PREFETCH            0       /* PREFCTL register */
+#define XCHAL_HAVE_PREFETCH_L1         0       /* prefetch to L1 dcache */
+#define XCHAL_PREFETCH_CASTOUT_LINES   0       /* dcache pref. castout bufsz */
+#define XCHAL_PREFETCH_ENTRIES         0       /* cache prefetch entries */
+#define XCHAL_PREFETCH_BLOCK_ENTRIES   0       /* prefetch block streams */
+#define XCHAL_HAVE_CACHE_BLOCKOPS      0       /* block prefetch for caches */
+#define XCHAL_HAVE_ICACHE_TEST         1       /* Icache test instructions */
+#define XCHAL_HAVE_DCACHE_TEST         1       /* Dcache test instructions */
+#define XCHAL_HAVE_ICACHE_DYN_WAYS     0       /* Icache dynamic way support */
+#define XCHAL_HAVE_DCACHE_DYN_WAYS     0       /* Dcache dynamic way support */
+
+
+
+
+/****************************************************************************
+    Parameters Useful for PRIVILEGED (Supervisory or Non-Virtualized) Code
+ ****************************************************************************/
+
+
+#ifndef XTENSA_HAL_NON_PRIVILEGED_ONLY
+
+/*----------------------------------------------------------------------
+                               CACHE
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_HAVE_PIF                 1       /* any outbound PIF present */
+
+/*  If present, cache size in bytes == (ways * 2^(linewidth + setwidth)).  */
+
+/*  Number of cache sets in log2(lines per way):  */
+#define XCHAL_ICACHE_SETWIDTH          8
+#define XCHAL_DCACHE_SETWIDTH          6
+
+/*  Cache set associativity (number of ways):  */
+#define XCHAL_ICACHE_WAYS              4
+#define XCHAL_DCACHE_WAYS              4
+
+/*  Cache features:  */
+#define XCHAL_ICACHE_LINE_LOCKABLE     1
+#define XCHAL_DCACHE_LINE_LOCKABLE     1
+#define XCHAL_ICACHE_ECC_PARITY                0
+#define XCHAL_DCACHE_ECC_PARITY                0
+
+/*  Cache access size in bytes (affects operation of SICW instruction):  */
+#define XCHAL_ICACHE_ACCESS_SIZE       16
+#define XCHAL_DCACHE_ACCESS_SIZE       16
+
+#define XCHAL_DCACHE_BANKS             1       /* number of banks */
+
+/*  Number of encoded cache attr bits (see <xtensa/hal.h> for decoded bits):  */
+#define XCHAL_CA_BITS                  4
+
+/*  Whether MEMCTL register has anything useful  */
+#define XCHAL_USE_MEMCTL               (((XCHAL_LOOP_BUFFER_SIZE > 0)  ||      \
+                                          XCHAL_DCACHE_IS_COHERENT     ||      \
+                                          XCHAL_HAVE_ICACHE_DYN_WAYS   ||      \
+                                          XCHAL_HAVE_DCACHE_DYN_WAYS)  &&      \
+                                          (XCHAL_HW_MIN_VERSION >= XTENSA_HWVERSION_RE_2012_0))
+
+
+/*----------------------------------------------------------------------
+                       INTERNAL I/D RAM/ROMs and XLMI
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_NUM_INSTROM              0       /* number of core instr. ROMs */
+#define XCHAL_NUM_INSTRAM              0       /* number of core instr. RAMs */
+#define XCHAL_NUM_DATAROM              0       /* number of core data ROMs */
+#define XCHAL_NUM_DATARAM              0       /* number of core data RAMs */
+#define XCHAL_NUM_URAM                 0       /* number of core unified RAMs*/
+#define XCHAL_NUM_XLMI                 0       /* number of core XLMI ports */
+
+#define XCHAL_HAVE_IMEM_LOADSTORE      1       /* can load/store to IROM/IRAM*/
+
+
+/*----------------------------------------------------------------------
+                       INTERRUPTS and TIMERS
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_HAVE_INTERRUPTS          1       /* interrupt option */
+#define XCHAL_HAVE_HIGHPRI_INTERRUPTS  1       /* med/high-pri. interrupts */
+#define XCHAL_HAVE_NMI                 1       /* non-maskable interrupt */
+#define XCHAL_HAVE_CCOUNT              1       /* CCOUNT reg. (timer option) */
+#define XCHAL_NUM_TIMERS               3       /* number of CCOMPAREn regs */
+#define XCHAL_NUM_INTERRUPTS           22      /* number of interrupts */
+#define XCHAL_NUM_INTERRUPTS_LOG2      5       /* ceil(log2(NUM_INTERRUPTS)) */
+#define XCHAL_NUM_EXTINTERRUPTS                16      /* num of external interrupts */
+#define XCHAL_NUM_INTLEVELS            6       /* number of interrupt levels
+                                                  (not including level zero) */
+#define XCHAL_EXCM_LEVEL               3       /* level masked by PS.EXCM */
+       /* (always 1 in XEA1; levels 2 .. EXCM_LEVEL are "medium priority") */
+
+/*  Masks of interrupts at each interrupt level:  */
+#define XCHAL_INTLEVEL1_MASK           0x001F00BF
+#define XCHAL_INTLEVEL2_MASK           0x00001140
+#define XCHAL_INTLEVEL3_MASK           0x00200E00
+#define XCHAL_INTLEVEL4_MASK           0x00008000
+#define XCHAL_INTLEVEL5_MASK           0x00002000
+#define XCHAL_INTLEVEL6_MASK           0x00000000
+#define XCHAL_INTLEVEL7_MASK           0x00004000
+
+/*  Masks of interrupts at each range 1..n of interrupt levels:  */
+#define XCHAL_INTLEVEL1_ANDBELOW_MASK  0x001F00BF
+#define XCHAL_INTLEVEL2_ANDBELOW_MASK  0x001F11FF
+#define XCHAL_INTLEVEL3_ANDBELOW_MASK  0x003F1FFF
+#define XCHAL_INTLEVEL4_ANDBELOW_MASK  0x003F9FFF
+#define XCHAL_INTLEVEL5_ANDBELOW_MASK  0x003FBFFF
+#define XCHAL_INTLEVEL6_ANDBELOW_MASK  0x003FBFFF
+#define XCHAL_INTLEVEL7_ANDBELOW_MASK  0x003FFFFF
+
+/*  Level of each interrupt:  */
+#define XCHAL_INT0_LEVEL               1
+#define XCHAL_INT1_LEVEL               1
+#define XCHAL_INT2_LEVEL               1
+#define XCHAL_INT3_LEVEL               1
+#define XCHAL_INT4_LEVEL               1
+#define XCHAL_INT5_LEVEL               1
+#define XCHAL_INT6_LEVEL               2
+#define XCHAL_INT7_LEVEL               1
+#define XCHAL_INT8_LEVEL               2
+#define XCHAL_INT9_LEVEL               3
+#define XCHAL_INT10_LEVEL              3
+#define XCHAL_INT11_LEVEL              3
+#define XCHAL_INT12_LEVEL              2
+#define XCHAL_INT13_LEVEL              5
+#define XCHAL_INT14_LEVEL              7
+#define XCHAL_INT15_LEVEL              4
+#define XCHAL_INT16_LEVEL              1
+#define XCHAL_INT17_LEVEL              1
+#define XCHAL_INT18_LEVEL              1
+#define XCHAL_INT19_LEVEL              1
+#define XCHAL_INT20_LEVEL              1
+#define XCHAL_INT21_LEVEL              3
+#define XCHAL_DEBUGLEVEL               6       /* debug interrupt level */
+#define XCHAL_HAVE_DEBUG_EXTERN_INT    1       /* OCD external db interrupt */
+#define XCHAL_NMILEVEL                 7       /* NMI "level" (for use with
+                                                  EXCSAVE/EPS/EPC_n, RFI n) */
+
+/*  Type of each interrupt:  */
+#define XCHAL_INT0_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT1_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT2_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT3_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT4_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT5_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT6_TYPE        XTHAL_INTTYPE_TIMER
+#define XCHAL_INT7_TYPE        XTHAL_INTTYPE_SOFTWARE
+#define XCHAL_INT8_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT9_TYPE        XTHAL_INTTYPE_EXTERN_LEVEL
+#define XCHAL_INT10_TYPE       XTHAL_INTTYPE_TIMER
+#define XCHAL_INT11_TYPE       XTHAL_INTTYPE_SOFTWARE
+#define XCHAL_INT12_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT13_TYPE       XTHAL_INTTYPE_TIMER
+#define XCHAL_INT14_TYPE       XTHAL_INTTYPE_NMI
+#define XCHAL_INT15_TYPE       XTHAL_INTTYPE_PROFILING
+#define XCHAL_INT16_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT17_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT18_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT19_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT20_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+#define XCHAL_INT21_TYPE       XTHAL_INTTYPE_EXTERN_EDGE
+
+/*  Masks of interrupts for each type of interrupt:  */
+#define XCHAL_INTTYPE_MASK_UNCONFIGURED        0xFFC00000
+#define XCHAL_INTTYPE_MASK_SOFTWARE    0x00000880
+#define XCHAL_INTTYPE_MASK_EXTERN_EDGE 0x003F1000
+#define XCHAL_INTTYPE_MASK_EXTERN_LEVEL        0x0000033F
+#define XCHAL_INTTYPE_MASK_TIMER       0x00002440
+#define XCHAL_INTTYPE_MASK_NMI         0x00004000
+#define XCHAL_INTTYPE_MASK_WRITE_ERROR 0x00000000
+#define XCHAL_INTTYPE_MASK_PROFILING   0x00008000
+
+/*  Interrupt numbers assigned to specific interrupt sources:  */
+#define XCHAL_TIMER0_INTERRUPT         6       /* CCOMPARE0 */
+#define XCHAL_TIMER1_INTERRUPT         10      /* CCOMPARE1 */
+#define XCHAL_TIMER2_INTERRUPT         13      /* CCOMPARE2 */
+#define XCHAL_TIMER3_INTERRUPT         XTHAL_TIMER_UNCONFIGURED
+#define XCHAL_NMI_INTERRUPT            14      /* non-maskable interrupt */
+#define XCHAL_PROFILING_INTERRUPT      15      /* profiling interrupt */
+
+/*  Interrupt numbers for levels at which only one interrupt is configured:  */
+#define XCHAL_INTLEVEL4_NUM            15
+#define XCHAL_INTLEVEL5_NUM            13
+#define XCHAL_INTLEVEL7_NUM            14
+/*  (There are many interrupts each at level(s) 1, 2, 3.)  */
+
+
+/*
+ *  External interrupt mapping.
+ *  These macros describe how Xtensa processor interrupt numbers
+ *  (as numbered internally, eg. in INTERRUPT and INTENABLE registers)
+ *  map to external BInterrupt<n> pins, for those interrupts
+ *  configured as external (level-triggered, edge-triggered, or NMI).
+ *  See the Xtensa processor databook for more details.
+ */
+
+/*  Core interrupt numbers mapped to each EXTERNAL BInterrupt pin number:  */
+#define XCHAL_EXTINT0_NUM              0       /* (intlevel 1) */
+#define XCHAL_EXTINT1_NUM              1       /* (intlevel 1) */
+#define XCHAL_EXTINT2_NUM              2       /* (intlevel 1) */
+#define XCHAL_EXTINT3_NUM              3       /* (intlevel 1) */
+#define XCHAL_EXTINT4_NUM              4       /* (intlevel 1) */
+#define XCHAL_EXTINT5_NUM              5       /* (intlevel 1) */
+#define XCHAL_EXTINT6_NUM              8       /* (intlevel 2) */
+#define XCHAL_EXTINT7_NUM              9       /* (intlevel 3) */
+#define XCHAL_EXTINT8_NUM              12      /* (intlevel 2) */
+#define XCHAL_EXTINT9_NUM              14      /* (intlevel 7) */
+#define XCHAL_EXTINT10_NUM             16      /* (intlevel 1) */
+#define XCHAL_EXTINT11_NUM             17      /* (intlevel 1) */
+#define XCHAL_EXTINT12_NUM             18      /* (intlevel 1) */
+#define XCHAL_EXTINT13_NUM             19      /* (intlevel 1) */
+#define XCHAL_EXTINT14_NUM             20      /* (intlevel 1) */
+#define XCHAL_EXTINT15_NUM             21      /* (intlevel 3) */
+/*  EXTERNAL BInterrupt pin numbers mapped to each core interrupt number:  */
+#define XCHAL_INT0_EXTNUM              0       /* (intlevel 1) */
+#define XCHAL_INT1_EXTNUM              1       /* (intlevel 1) */
+#define XCHAL_INT2_EXTNUM              2       /* (intlevel 1) */
+#define XCHAL_INT3_EXTNUM              3       /* (intlevel 1) */
+#define XCHAL_INT4_EXTNUM              4       /* (intlevel 1) */
+#define XCHAL_INT5_EXTNUM              5       /* (intlevel 1) */
+#define XCHAL_INT8_EXTNUM              6       /* (intlevel 2) */
+#define XCHAL_INT9_EXTNUM              7       /* (intlevel 3) */
+#define XCHAL_INT12_EXTNUM             8       /* (intlevel 2) */
+#define XCHAL_INT14_EXTNUM             9       /* (intlevel 7) */
+#define XCHAL_INT16_EXTNUM             10      /* (intlevel 1) */
+#define XCHAL_INT17_EXTNUM             11      /* (intlevel 1) */
+#define XCHAL_INT18_EXTNUM             12      /* (intlevel 1) */
+#define XCHAL_INT19_EXTNUM             13      /* (intlevel 1) */
+#define XCHAL_INT20_EXTNUM             14      /* (intlevel 1) */
+#define XCHAL_INT21_EXTNUM             15      /* (intlevel 3) */
+
+
+/*----------------------------------------------------------------------
+                       EXCEPTIONS and VECTORS
+  ----------------------------------------------------------------------*/
+
+#define XCHAL_XEA_VERSION              2       /* Xtensa Exception Architecture
+                                                  number: 1 == XEA1 (old)
+                                                          2 == XEA2 (new)
+                                                          0 == XEAX (extern) or TX */
+#define XCHAL_HAVE_XEA1                        0       /* Exception Architecture 1 */
+#define XCHAL_HAVE_XEA2                        1       /* Exception Architecture 2 */
+#define XCHAL_HAVE_XEAX                        0       /* External Exception Arch. */
+#define XCHAL_HAVE_EXCEPTIONS          1       /* exception option */
+#define XCHAL_HAVE_HALT                        0       /* halt architecture option */
+#define XCHAL_HAVE_BOOTLOADER          0       /* boot loader (for TX) */
+#define XCHAL_HAVE_MEM_ECC_PARITY      0       /* local memory ECC/parity */
+#define XCHAL_HAVE_VECTOR_SELECT       1       /* relocatable vectors */
+#define XCHAL_HAVE_VECBASE             1       /* relocatable vectors */
+#define XCHAL_VECBASE_RESET_VADDR      0x00002000  /* VECBASE reset value */
+#define XCHAL_VECBASE_RESET_PADDR      0x00002000
+#define XCHAL_RESET_VECBASE_OVERLAP    0
+
+#define XCHAL_RESET_VECTOR0_VADDR      0xFE000000
+#define XCHAL_RESET_VECTOR0_PADDR      0xFE000000
+#define XCHAL_RESET_VECTOR1_VADDR      0x00001000
+#define XCHAL_RESET_VECTOR1_PADDR      0x00001000
+#define XCHAL_RESET_VECTOR_VADDR       0xFE000000
+#define XCHAL_RESET_VECTOR_PADDR       0xFE000000
+#define XCHAL_USER_VECOFS              0x00000340
+#define XCHAL_USER_VECTOR_VADDR                0x00002340
+#define XCHAL_USER_VECTOR_PADDR                0x00002340
+#define XCHAL_KERNEL_VECOFS            0x00000300
+#define XCHAL_KERNEL_VECTOR_VADDR      0x00002300
+#define XCHAL_KERNEL_VECTOR_PADDR      0x00002300
+#define XCHAL_DOUBLEEXC_VECOFS         0x000003C0
+#define XCHAL_DOUBLEEXC_VECTOR_VADDR   0x000023C0
+#define XCHAL_DOUBLEEXC_VECTOR_PADDR   0x000023C0
+#define XCHAL_WINDOW_OF4_VECOFS                0x00000000
+#define XCHAL_WINDOW_UF4_VECOFS                0x00000040
+#define XCHAL_WINDOW_OF8_VECOFS                0x00000080
+#define XCHAL_WINDOW_UF8_VECOFS                0x000000C0
+#define XCHAL_WINDOW_OF12_VECOFS       0x00000100
+#define XCHAL_WINDOW_UF12_VECOFS       0x00000140
+#define XCHAL_WINDOW_VECTORS_VADDR     0x00002000
+#define XCHAL_WINDOW_VECTORS_PADDR     0x00002000
+#define XCHAL_INTLEVEL2_VECOFS         0x00000180
+#define XCHAL_INTLEVEL2_VECTOR_VADDR   0x00002180
+#define XCHAL_INTLEVEL2_VECTOR_PADDR   0x00002180
+#define XCHAL_INTLEVEL3_VECOFS         0x000001C0
+#define XCHAL_INTLEVEL3_VECTOR_VADDR   0x000021C0
+#define XCHAL_INTLEVEL3_VECTOR_PADDR   0x000021C0
+#define XCHAL_INTLEVEL4_VECOFS         0x00000200
+#define XCHAL_INTLEVEL4_VECTOR_VADDR   0x00002200
+#define XCHAL_INTLEVEL4_VECTOR_PADDR   0x00002200
+#define XCHAL_INTLEVEL5_VECOFS         0x00000240
+#define XCHAL_INTLEVEL5_VECTOR_VADDR   0x00002240
+#define XCHAL_INTLEVEL5_VECTOR_PADDR   0x00002240
+#define XCHAL_INTLEVEL6_VECOFS         0x00000280
+#define XCHAL_INTLEVEL6_VECTOR_VADDR   0x00002280
+#define XCHAL_INTLEVEL6_VECTOR_PADDR   0x00002280
+#define XCHAL_DEBUG_VECOFS             XCHAL_INTLEVEL6_VECOFS
+#define XCHAL_DEBUG_VECTOR_VADDR       XCHAL_INTLEVEL6_VECTOR_VADDR
+#define XCHAL_DEBUG_VECTOR_PADDR       XCHAL_INTLEVEL6_VECTOR_PADDR
+#define XCHAL_NMI_VECOFS               0x000002C0
+#define XCHAL_NMI_VECTOR_VADDR         0x000022C0
+#define XCHAL_NMI_VECTOR_PADDR         0x000022C0
+#define XCHAL_INTLEVEL7_VECOFS         XCHAL_NMI_VECOFS
+#define XCHAL_INTLEVEL7_VECTOR_VADDR   XCHAL_NMI_VECTOR_VADDR
+#define XCHAL_INTLEVEL7_VECTOR_PADDR   XCHAL_NMI_VECTOR_PADDR
+
+
+/*----------------------------------------------------------------------
+                               DEBUG MODULE
+  ----------------------------------------------------------------------*/
+
+/*  Misc  */
+#define XCHAL_HAVE_DEBUG_ERI           1       /* ERI to debug module */
+#define XCHAL_HAVE_DEBUG_APB           1       /* APB to debug module */
+#define XCHAL_HAVE_DEBUG_JTAG          1       /* JTAG to debug module */
+
+/*  On-Chip Debug (OCD)  */
+#define XCHAL_HAVE_OCD                 1       /* OnChipDebug option */
+#define XCHAL_NUM_IBREAK               2       /* number of IBREAKn regs */
+#define XCHAL_NUM_DBREAK               2       /* number of DBREAKn regs */
+#define XCHAL_HAVE_OCD_DIR_ARRAY       0       /* faster OCD option (to LX4) */
+#define XCHAL_HAVE_OCD_LS32DDR         1       /* L32DDR/S32DDR (faster OCD) */
+
+/*  TRAX (in core)  */
+#define XCHAL_HAVE_TRAX                        1       /* TRAX in debug module */
+#define XCHAL_TRAX_MEM_SIZE            262144  /* TRAX memory size in bytes */
+#define XCHAL_TRAX_MEM_SHAREABLE       1       /* start/end regs; ready sig. */
+#define XCHAL_TRAX_ATB_WIDTH           0       /* ATB width (bits), 0=no ATB */
+#define XCHAL_TRAX_TIME_WIDTH          0       /* timestamp bitwidth, 0=none */
+
+/*  Perf counters  */
+#define XCHAL_NUM_PERF_COUNTERS                8       /* performance counters */
+
+
+/*----------------------------------------------------------------------
+                               MMU
+  ----------------------------------------------------------------------*/
+
+/*  See core-matmap.h header file for more details.  */
+
+#define XCHAL_HAVE_TLBS                        1       /* inverse of HAVE_CACHEATTR */
+#define XCHAL_HAVE_SPANNING_WAY                1       /* one way maps I+D 4GB vaddr */
+#define XCHAL_SPANNING_WAY             6       /* TLB spanning way number */
+#define XCHAL_HAVE_IDENTITY_MAP                0       /* vaddr == paddr always */
+#define XCHAL_HAVE_CACHEATTR           0       /* CACHEATTR register present */
+#define XCHAL_HAVE_MIMIC_CACHEATTR     0       /* region protection */
+#define XCHAL_HAVE_XLT_CACHEATTR       0       /* region prot. w/translation */
+#define XCHAL_HAVE_PTP_MMU             1       /* full MMU (with page table
+                                                  [autorefill] and protection)
+                                                  usable for an MMU-based OS */
+/*  If none of the above last 4 are set, it's a custom TLB configuration.  */
+#define XCHAL_ITLB_ARF_ENTRIES_LOG2    2       /* log2(autorefill way size) */
+#define XCHAL_DTLB_ARF_ENTRIES_LOG2    2       /* log2(autorefill way size) */
+
+#define XCHAL_MMU_ASID_BITS            8       /* number of bits in ASIDs */
+#define XCHAL_MMU_RINGS                        4       /* number of rings (1..4) */
+#define XCHAL_MMU_RING_BITS            2       /* num of bits in RING field */
+
+#endif /* !XTENSA_HAL_NON_PRIVILEGED_ONLY */
+
+
+#endif /* _XTENSA_CORE_CONFIGURATION_H */
+
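A small standalone check, not part of the header, of the cache-geometry relation stated in the CACHE comment above (size == ways * 2^(linewidth + setwidth)), using the csp values just defined.

/* Values copied from the header above; the relation is verified at compile time. */
#define ICACHE_WAYS		4
#define ICACHE_LINEWIDTH	6
#define ICACHE_SETWIDTH		8
#define ICACHE_SIZE		65536

#define DCACHE_WAYS		4
#define DCACHE_LINEWIDTH	6
#define DCACHE_SETWIDTH		6
#define DCACHE_SIZE		16384

_Static_assert(ICACHE_WAYS << (ICACHE_LINEWIDTH + ICACHE_SETWIDTH) == ICACHE_SIZE,
	       "I-cache: 4 * 2^(6 + 8) = 65536");
_Static_assert(DCACHE_WAYS << (DCACHE_LINEWIDTH + DCACHE_SETWIDTH) == DCACHE_SIZE,
	       "D-cache: 4 * 2^(6 + 6) = 16384");

int main(void) { return 0; }
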
diff --git a/arch/xtensa/variants/csp/include/variant/tie-asm.h b/arch/xtensa/variants/csp/include/variant/tie-asm.h
new file mode 100644 (file)
index 0000000..ba773c4
--- /dev/null
@@ -0,0 +1,194 @@
+/* 
+ * tie-asm.h -- compile-time HAL assembler definitions dependent on CORE & TIE
+ *
+ *  NOTE:  This header file is not meant to be included directly.
+ */
+
+/* This header file contains assembly-language definitions (assembly
+   macros, etc.) for this specific Xtensa processor's TIE extensions
+   and options.  It is customized to this Xtensa processor configuration.
+
+   Copyright (c) 1999-2015 Cadence Design Systems Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#ifndef _XTENSA_CORE_TIE_ASM_H
+#define _XTENSA_CORE_TIE_ASM_H
+
+/*  Selection parameter values for save-area save/restore macros:  */
+/*  Option vs. TIE:  */
+#define XTHAL_SAS_TIE  0x0001  /* custom extension or coprocessor */
+#define XTHAL_SAS_OPT  0x0002  /* optional (and not a coprocessor) */
+#define XTHAL_SAS_ANYOT        0x0003  /* both of the above */
+/*  Whether used automatically by compiler:  */
+#define XTHAL_SAS_NOCC 0x0004  /* not used by compiler w/o special opts/code */
+#define XTHAL_SAS_CC   0x0008  /* used by compiler without special opts/code */
+#define XTHAL_SAS_ANYCC        0x000C  /* both of the above */
+/*  ABI handling across function calls:  */
+#define XTHAL_SAS_CALR 0x0010  /* caller-saved */
+#define XTHAL_SAS_CALE 0x0020  /* callee-saved */
+#define XTHAL_SAS_GLOB 0x0040  /* global across function calls (in thread) */
+#define XTHAL_SAS_ANYABI       0x0070  /* all of the above three */
+/*  Misc  */
+#define XTHAL_SAS_ALL  0xFFFF  /* include all default NCP contents */
+#define XTHAL_SAS3(optie,ccuse,abi)    ( ((optie) & XTHAL_SAS_ANYOT)  \
+                                       | ((ccuse) & XTHAL_SAS_ANYCC)  \
+                                       | ((abi)   & XTHAL_SAS_ANYABI) )
+
+
+    /*
+      *  Macro to store all non-coprocessor (extra) custom TIE and optional state
+      *  (not including zero-overhead loop registers).
+      *  Required parameters:
+      *      ptr         Save area pointer address register (clobbered)
+      *                  (register must contain a 4 byte aligned address).
+      *      at1..at4    Four temporary address registers (first XCHAL_NCP_NUM_ATMPS
+      *                  registers are clobbered, the remaining are unused).
+      *  Optional parameters:
+      *      continue    If macro invoked as part of a larger store sequence, set to 1
+      *                  if this is not the first in the sequence.  Defaults to 0.
+      *      ofs         Offset from start of larger sequence (from value of first ptr
+      *                  in sequence) at which to store.  Defaults to next available space
+      *                  (or 0 if <continue> is 0).
+      *      select      Select what category(ies) of registers to store, as a bitmask
+      *                  (see XTHAL_SAS_xxx constants).  Defaults to all registers.
+      *      alloc       Select what category(ies) of registers to allocate; if any
+      *                  category is selected here that is not in <select>, space for
+      *                  the corresponding registers is skipped without doing any store.
+      */
+    .macro xchal_ncp_store  ptr at1 at2 at3 at4  continue=0 ofs=-1 select=XTHAL_SAS_ALL alloc=0
+       xchal_sa_start  \continue, \ofs
+       // Optional global registers used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_GLOB) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1020, 4, 4
+       rur.THREADPTR   \at1            // threadptr option
+       s32i    \at1, \ptr, .Lxchal_ofs_+0
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 4
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_GLOB) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1020, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 4
+       .endif
+       // Optional caller-saved registers used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_CALR) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1016, 4, 4
+       rsr.ACCLO       \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+0
+       rsr.ACCHI       \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 8
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_CALR) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1016, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 8
+       .endif
+       // Optional caller-saved registers not used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_NOCC | XTHAL_SAS_CALR) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1000, 4, 4
+       rsr.BR  \at1            // boolean option
+       s32i    \at1, \ptr, .Lxchal_ofs_+0
+       rsr.SCOMPARE1   \at1            // conditional store option
+       s32i    \at1, \ptr, .Lxchal_ofs_+4
+       rsr.M0  \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+8
+       rsr.M1  \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+12
+       rsr.M2  \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+16
+       rsr.M3  \at1            // MAC16 option
+       s32i    \at1, \ptr, .Lxchal_ofs_+20
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 24
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_NOCC | XTHAL_SAS_CALR) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1000, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 24
+       .endif
+    .endm      // xchal_ncp_store
+
+    /*
+      *  Macro to load all non-coprocessor (extra) custom TIE and optional state
+      *  (not including zero-overhead loop registers).
+      *  Required parameters:
+      *      ptr         Save area pointer address register (clobbered)
+      *                  (register must contain a 4 byte aligned address).
+      *      at1..at4    Four temporary address registers (first XCHAL_NCP_NUM_ATMPS
+      *                  registers are clobbered, the remaining are unused).
+      *  Optional parameters:
+      *      continue    If the macro is invoked as part of a larger load sequence, set to 1
+      *                  if this is not the first in the sequence.  Defaults to 0.
+      *      ofs         Offset from start of larger sequence (from value of first ptr
+      *                  in sequence) at which to load.  Defaults to next available space
+      *                  (or 0 if <continue> is 0).
+      *      select      Select what category(ies) of registers to load, as a bitmask
+      *                  (see XTHAL_SAS_xxx constants).  Defaults to all registers.
+      *      alloc       Select what category(ies) of registers to allocate; if any
+      *                  category is selected here that is not in <select>, space for
+      *                  the corresponding registers is skipped without doing any load.
+      */
+    .macro xchal_ncp_load  ptr at1 at2 at3 at4  continue=0 ofs=-1 select=XTHAL_SAS_ALL alloc=0
+       xchal_sa_start  \continue, \ofs
+       // Optional global registers used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_GLOB) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1020, 4, 4
+       l32i    \at1, \ptr, .Lxchal_ofs_+0
+       wur.THREADPTR   \at1            // threadptr option
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 4
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_GLOB) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1020, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 4
+       .endif
+       // Optional caller-saved registers used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_CALR) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1016, 4, 4
+       l32i    \at1, \ptr, .Lxchal_ofs_+0
+       wsr.ACCLO       \at1            // MAC16 option
+       l32i    \at1, \ptr, .Lxchal_ofs_+4
+       wsr.ACCHI       \at1            // MAC16 option
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 8
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_CC | XTHAL_SAS_CALR) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1016, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 8
+       .endif
+       // Optional caller-saved registers not used by default by the compiler:
+       .ifeq (XTHAL_SAS_OPT | XTHAL_SAS_NOCC | XTHAL_SAS_CALR) & ~(\select)
+       xchal_sa_align  \ptr, 0, 1000, 4, 4
+       l32i    \at1, \ptr, .Lxchal_ofs_+0
+       wsr.BR  \at1            // boolean option
+       l32i    \at1, \ptr, .Lxchal_ofs_+4
+       wsr.SCOMPARE1   \at1            // conditional store option
+       l32i    \at1, \ptr, .Lxchal_ofs_+8
+       wsr.M0  \at1            // MAC16 option
+       l32i    \at1, \ptr, .Lxchal_ofs_+12
+       wsr.M1  \at1            // MAC16 option
+       l32i    \at1, \ptr, .Lxchal_ofs_+16
+       wsr.M2  \at1            // MAC16 option
+       l32i    \at1, \ptr, .Lxchal_ofs_+20
+       wsr.M3  \at1            // MAC16 option
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 24
+       .elseif ((XTHAL_SAS_OPT | XTHAL_SAS_NOCC | XTHAL_SAS_CALR) & ~(\alloc)) == 0
+       xchal_sa_align  \ptr, 0, 1000, 4, 4
+       .set    .Lxchal_ofs_, .Lxchal_ofs_ + 24
+       .endif
+    .endm      // xchal_ncp_load
+
+
+#define XCHAL_NCP_NUM_ATMPS    1
+
+#define XCHAL_SA_NUM_ATMPS     1
+
+#endif /*_XTENSA_CORE_TIE_ASM_H*/
+
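The ".ifeq (XTHAL_SAS_...) & ~(\select)" tests in the two macros above store or load a register group only when every attribute bit of that group is present in the select mask; the matching ".elseif ... & ~(\alloc)" branches merely reserve the space without touching the registers. A minimal C rendering of that predicate, assuming nothing beyond the XTHAL_SAS_* constants defined earlier in this header (the macro name below is invented for illustration):

    /* Nonzero when every attribute bit of a register group is contained in
     * sel, i.e. the group is actually stored/loaded; this mirrors the
     * ".ifeq (bits) & ~(\select)" tests in xchal_ncp_store/xchal_ncp_load. */
    #define SAS_GROUP_SELECTED(group_bits, sel)  (((group_bits) & ~(sel)) == 0)

    /* Example: the THREADPTR group carries OPT|CC|GLOB, so it is saved under
     * the default select == XTHAL_SAS_ALL but skipped if GLOB is cleared. */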
diff --git a/arch/xtensa/variants/csp/include/variant/tie.h b/arch/xtensa/variants/csp/include/variant/tie.h
new file mode 100644 (file)
index 0000000..3ce391c
--- /dev/null
@@ -0,0 +1,161 @@
+/* 
+ * tie.h -- compile-time HAL definitions dependent on CORE & TIE configuration
+ *
+ *  NOTE:  This header file is not meant to be included directly.
+ */
+
+/* This header file describes this specific Xtensa processor's TIE extensions
+   that extend basic Xtensa core functionality.  It is customized to this
+   Xtensa processor configuration.
+
+   Copyright (c) 1999-2015 Cadence Design Systems Inc.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#ifndef _XTENSA_CORE_TIE_H
+#define _XTENSA_CORE_TIE_H
+
+#define XCHAL_CP_NUM                   1       /* number of coprocessors */
+#define XCHAL_CP_MAX                   8       /* max CP ID + 1 (0 if none) */
+#define XCHAL_CP_MASK                  0x80    /* bitmask of all CPs by ID */
+#define XCHAL_CP_PORT_MASK             0x80    /* bitmask of only port CPs */
+
+/*  Basic parameters of each coprocessor:  */
+#define XCHAL_CP7_NAME                 "XTIOP"
+#define XCHAL_CP7_IDENT                        XTIOP
+#define XCHAL_CP7_SA_SIZE              0       /* size of state save area */
+#define XCHAL_CP7_SA_ALIGN             1       /* min alignment of save area */
+#define XCHAL_CP_ID_XTIOP              7       /* coprocessor ID (0..7) */
+
+/*  Filler info for unassigned coprocessors, to simplify arrays etc:  */
+#define XCHAL_CP0_SA_SIZE              0
+#define XCHAL_CP0_SA_ALIGN             1
+#define XCHAL_CP1_SA_SIZE              0
+#define XCHAL_CP1_SA_ALIGN             1
+#define XCHAL_CP2_SA_SIZE              0
+#define XCHAL_CP2_SA_ALIGN             1
+#define XCHAL_CP3_SA_SIZE              0
+#define XCHAL_CP3_SA_ALIGN             1
+#define XCHAL_CP4_SA_SIZE              0
+#define XCHAL_CP4_SA_ALIGN             1
+#define XCHAL_CP5_SA_SIZE              0
+#define XCHAL_CP5_SA_ALIGN             1
+#define XCHAL_CP6_SA_SIZE              0
+#define XCHAL_CP6_SA_ALIGN             1
+
+/*  Save area for non-coprocessor optional and custom (TIE) state:  */
+#define XCHAL_NCP_SA_SIZE              36
+#define XCHAL_NCP_SA_ALIGN             4
+
+/*  Total save area for optional and custom state (NCP + CPn):  */
+#define XCHAL_TOTAL_SA_SIZE            48      /* with 16-byte align padding */
+#define XCHAL_TOTAL_SA_ALIGN           4       /* actual minimum alignment */
+
+/*
+ * Detailed contents of save areas.
+ * NOTE:  caller must define the XCHAL_SA_REG macro (not defined here)
+ * before expanding the XCHAL_xxx_SA_LIST() macros.
+ *
+ * XCHAL_SA_REG(s,ccused,abikind,kind,opt,name,galign,align,asize,
+ *             dbnum,base,regnum,bitsz,gapsz,reset,x...)
+ *
+ *     s = passed from XCHAL_*_LIST(s), eg. to select how to expand
+ *     ccused = set if used by compiler without special options or code
+ *     abikind = 0 (caller-saved), 1 (callee-saved), or 2 (thread-global)
+ *     kind = 0 (special reg), 1 (TIE user reg), or 2 (TIE regfile reg)
+ *     opt = 0 (custom TIE extension or coprocessor), or 1 (optional reg)
+ *     name = lowercase reg name (no quotes)
+ *     galign = group byte alignment (power of 2) (galign >= align)
+ *     align = register byte alignment (power of 2)
+ *     asize = allocated size in bytes (asize*8 == bitsz + gapsz + padsz)
+ *       (not including any pad bytes required to galign this or next reg)
+ *     dbnum = unique target number f/debug (see <xtensa-libdb-macros.h>)
+ *     base = reg shortname w/o index (or sr=special, ur=TIE user reg)
+ *     regnum = reg index in regfile, or special/TIE-user reg number
+ *     bitsz = number of significant bits (regfile width, or ur/sr mask bits)
+ *     gapsz = intervening bits, if bitsz bits not stored contiguously
+ *     (padsz = pad bits at end [TIE regfile] or at msbits [ur,sr] of asize)
+ *     reset = register reset value (or 0 if undefined at reset)
+ *     x = reserved for future use (0 until then)
+ *
+ *  To filter out certain registers, e.g. to expand only the non-global
+ *  registers used by the compiler, you can do something like this:
+ *
+ *  #define XCHAL_SA_REG(s,ccused,p...)        SELCC##ccused(p)
+ *  #define SELCC0(p...)
+ *  #define SELCC1(abikind,p...)       SELAK##abikind(p)
+ *  #define SELAK0(p...)               REG(p)
+ *  #define SELAK1(p...)               REG(p)
+ *  #define SELAK2(p...)
+ *  #define REG(kind,tie,name,galn,aln,asz,csz,dbnum,base,rnum,bsz,rst,x...) \
+ *             ...what you want to expand...
+ */
+
+#define XCHAL_NCP_SA_NUM       9
+#define XCHAL_NCP_SA_LIST(s)   \
+ XCHAL_SA_REG(s,1,2,1,1,      threadptr, 4, 4, 4,0x03E7,  ur,231, 32,0,0,0) \
+ XCHAL_SA_REG(s,1,0,0,1,          acclo, 4, 4, 4,0x0210,  sr,16 , 32,0,0,0) \
+ XCHAL_SA_REG(s,1,0,0,1,          acchi, 4, 4, 4,0x0211,  sr,17 ,  8,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,             br, 4, 4, 4,0x0204,  sr,4  , 16,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,      scompare1, 4, 4, 4,0x020C,  sr,12 , 32,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,             m0, 4, 4, 4,0x0220,  sr,32 , 32,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,             m1, 4, 4, 4,0x0221,  sr,33 , 32,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,             m2, 4, 4, 4,0x0222,  sr,34 , 32,0,0,0) \
+ XCHAL_SA_REG(s,0,0,0,1,             m3, 4, 4, 4,0x0223,  sr,35 , 32,0,0,0)
+
+#define XCHAL_CP0_SA_NUM       0
+#define XCHAL_CP0_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP1_SA_NUM       0
+#define XCHAL_CP1_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP2_SA_NUM       0
+#define XCHAL_CP2_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP3_SA_NUM       0
+#define XCHAL_CP3_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP4_SA_NUM       0
+#define XCHAL_CP4_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP5_SA_NUM       0
+#define XCHAL_CP5_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP6_SA_NUM       0
+#define XCHAL_CP6_SA_LIST(s)   /* empty */
+
+#define XCHAL_CP7_SA_NUM       0
+#define XCHAL_CP7_SA_LIST(s)   /* empty */
+
+/* Byte length of instruction from its first nibble (op0 field), per FLIX.  */
+#define XCHAL_OP0_FORMAT_LENGTHS       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3
+/* Byte length of instruction from its first byte, per FLIX.  */
+#define XCHAL_BYTE0_FORMAT_LENGTHS     \
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3,\
+       3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3, 3,3,3,3,3,3,3,3,2,2,2,2,2,2,3,3
+
+#endif /*_XTENSA_CORE_TIE_H*/
+
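The XCHAL_SA_REG()/XCHAL_NCP_SA_LIST() protocol documented above is an X-macro: the caller defines XCHAL_SA_REG to extract whichever fields it needs and then expands the list. A small hedged sketch, assuming this variant's tie.h is reachable as <variant/tie.h> on the include path (the program is illustrative and not part of the patch); with the nine 4-byte entries listed above it prints 36, matching XCHAL_NCP_SA_SIZE:

    #include <stdio.h>
    #include <variant/tie.h>

    /* Expand each save-area entry to just its allocated size (asize, the
     * ninth XCHAL_SA_REG parameter) and sum the results. */
    #define XCHAL_SA_REG(s, ccused, abikind, kind, opt, name, galign, align, \
                         asize, ...)  + (asize)

    int main(void)
    {
            int ncp_bytes = 0 XCHAL_NCP_SA_LIST(0);

            printf("NCP save area: %d bytes (XCHAL_NCP_SA_SIZE = %d)\n",
                   ncp_bytes, XCHAL_NCP_SA_SIZE);
            return 0;
    }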
index f096ab3..ec4f507 100644 (file)
@@ -938,7 +938,7 @@ static int erst_clearer(enum pstore_type_id type, u64 id, int count,
 static struct pstore_info erst_info = {
        .owner          = THIS_MODULE,
        .name           = "erst",
-       .flags          = PSTORE_FLAGS_FRAGILE,
+       .flags          = PSTORE_FLAGS_DMESG,
        .open           = erst_open_pstore,
        .close          = erst_close_pstore,
        .read           = erst_reader,
index a5b5c87..a56fa2a 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/amba/bus.h>
 #include <linux/sizes.h>
 #include <linux/limits.h>
+#include <linux/clk/clk-conf.h>
 
 #include <asm/irq.h>
 
@@ -237,6 +238,10 @@ static int amba_probe(struct device *dev)
        int ret;
 
        do {
+               ret = of_clk_set_defaults(dev->of_node, false);
+               if (ret < 0)
+                       break;
+
                ret = dev_pm_domain_attach(dev, true);
                if (ret == -EPROBE_DEFER)
                        break;
index b5befc2..2bac9b6 100644 (file)
@@ -159,7 +159,7 @@ sdram_calculate_timing(struct sdram_info *sd, u_int cpu_khz,
         * half speed or use delayed read latching (errata 13).
         */
        if ((ns_to_cycles(sdram->tck, sd_khz) > 1) ||
-           (CPU_REVISION < CPU_SA1110_B2 && sd_khz < 62000))
+           (read_cpuid_revision() < ARM_CPU_REV_SA1110_B2 && sd_khz < 62000))
                sd_khz /= 2;
 
        sd->mdcnfg = MDCNFG & 0x007f007f;
index 1c33d74..f402ba2 100644 (file)
@@ -380,7 +380,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count,
 static struct pstore_info efi_pstore_info = {
        .owner          = THIS_MODULE,
        .name           = "efi",
-       .flags          = PSTORE_FLAGS_FRAGILE,
+       .flags          = PSTORE_FLAGS_DMESG,
        .open           = efi_pstore_open,
        .close          = efi_pstore_close,
        .read           = efi_pstore_read,
index 4025291..58fa8cc 100644 (file)
@@ -137,6 +137,7 @@ struct iommu_dev_data {
        bool pri_tlp;                     /* PASID TLB required for
                                             PPR completions */
        u32 errata;                       /* Bitmap for errata to apply */
+       bool use_vapic;                   /* Enable device to use vapic mode */
 };
 
 /*
@@ -707,14 +708,74 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu)
        }
 }
 
+#ifdef CONFIG_IRQ_REMAP
+static int (*iommu_ga_log_notifier)(u32);
+
+int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+       iommu_ga_log_notifier = notifier;
+
+       return 0;
+}
+EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+
+static void iommu_poll_ga_log(struct amd_iommu *iommu)
+{
+       u32 head, tail, cnt = 0;
+
+       if (iommu->ga_log == NULL)
+               return;
+
+       head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+       tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+       while (head != tail) {
+               volatile u64 *raw;
+               u64 log_entry;
+
+               raw = (u64 *)(iommu->ga_log + head);
+               cnt++;
+
+               /* Avoid memcpy function-call overhead */
+               log_entry = *raw;
+
+               /* Update head pointer of hardware ring-buffer */
+               head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+               writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+
+               /* Handle GA entry */
+               switch (GA_REQ_TYPE(log_entry)) {
+               case GA_GUEST_NR:
+                       if (!iommu_ga_log_notifier)
+                               break;
+
+                       pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+                                __func__, GA_DEVID(log_entry),
+                                GA_TAG(log_entry));
+
+                       if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+                               pr_err("AMD-Vi: GA log notifier failed.\n");
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+#define AMD_IOMMU_INT_MASK     \
+       (MMIO_STATUS_EVT_INT_MASK | \
+        MMIO_STATUS_PPR_INT_MASK | \
+        MMIO_STATUS_GALOG_INT_MASK)
+
 irqreturn_t amd_iommu_int_thread(int irq, void *data)
 {
        struct amd_iommu *iommu = (struct amd_iommu *) data;
        u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-       while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
-               /* Enable EVT and PPR interrupts again */
-               writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
+       while (status & AMD_IOMMU_INT_MASK) {
+               /* Enable EVT, PPR, and GA interrupts again */
+               writel(AMD_IOMMU_INT_MASK,
                        iommu->mmio_base + MMIO_STATUS_OFFSET);
 
                if (status & MMIO_STATUS_EVT_INT_MASK) {
@@ -727,6 +788,13 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data)
                        iommu_poll_ppr_log(iommu);
                }
 
+#ifdef CONFIG_IRQ_REMAP
+               if (status & MMIO_STATUS_GALOG_INT_MASK) {
+                       pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+                       iommu_poll_ga_log(iommu);
+               }
+#endif
+
                /*
                 * Hardware bug: ERBT1312
                 * When re-enabling interrupt (by writing 1
@@ -2967,6 +3035,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
        if (!iommu)
                return;
 
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           (dom->type == IOMMU_DOMAIN_UNMANAGED))
+               dev_data->use_vapic = 0;
+#endif
+
        iommu_completion_wait(iommu);
 }
 
@@ -2992,6 +3066,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
        ret = attach_device(dev, domain);
 
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+                       dev_data->use_vapic = 1;
+               else
+                       dev_data->use_vapic = 0;
+       }
+#endif
+
        iommu_completion_wait(iommu);
 
        return ret;
@@ -3530,34 +3613,6 @@ EXPORT_SYMBOL(amd_iommu_device_info);
  *
  *****************************************************************************/
 
-union irte {
-       u32 val;
-       struct {
-               u32 valid       : 1,
-                   no_fault    : 1,
-                   int_type    : 3,
-                   rq_eoi      : 1,
-                   dm          : 1,
-                   rsvd_1      : 1,
-                   destination : 8,
-                   vector      : 8,
-                   rsvd_2      : 8;
-       } fields;
-};
-
-struct irq_2_irte {
-       u16 devid; /* Device ID for IRTE table */
-       u16 index; /* Index into IRTE table*/
-};
-
-struct amd_ir_data {
-       struct irq_2_irte                       irq_2_irte;
-       union irte                              irte_entry;
-       union {
-               struct msi_msg                  msi_entry;
-       };
-};
-
 static struct irq_chip amd_ir_chip;
 
 #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
@@ -3579,8 +3634,6 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
        amd_iommu_dev_table[devid].data[2] = dte;
 }
 
-#define IRTE_ALLOCATED (~1U)
-
 static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
 {
        struct irq_remap_table *table = NULL;
@@ -3626,13 +3679,18 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
                goto out;
        }
 
-       memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));
+       if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+               memset(table->table, 0,
+                      MAX_IRQS_PER_TABLE * sizeof(u32));
+       else
+               memset(table->table, 0,
+                      (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
 
        if (ioapic) {
                int i;
 
                for (i = 0; i < 32; ++i)
-                       table->table[i] = IRTE_ALLOCATED;
+                       iommu->irte_ops->set_allocated(table, i);
        }
 
        irq_lookup_table[devid] = table;
@@ -3658,6 +3716,10 @@ static int alloc_irq_index(u16 devid, int count)
        struct irq_remap_table *table;
        unsigned long flags;
        int index, c;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+       if (!iommu)
+               return -ENODEV;
 
        table = get_irq_table(devid, false);
        if (!table)
@@ -3669,14 +3731,14 @@ static int alloc_irq_index(u16 devid, int count)
        for (c = 0, index = table->min_index;
             index < MAX_IRQS_PER_TABLE;
             ++index) {
-               if (table->table[index] == 0)
+               if (!iommu->irte_ops->is_allocated(table, index))
                        c += 1;
                else
                        c = 0;
 
                if (c == count) {
                        for (; c != 0; --c)
-                               table->table[index - c + 1] = IRTE_ALLOCATED;
+                               iommu->irte_ops->set_allocated(table, index - c + 1);
 
                        index -= count - 1;
                        goto out;
@@ -3691,7 +3753,42 @@ out:
        return index;
 }
 
-static int modify_irte(u16 devid, int index, union irte irte)
+static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+                         struct amd_ir_data *data)
+{
+       struct irq_remap_table *table;
+       struct amd_iommu *iommu;
+       unsigned long flags;
+       struct irte_ga *entry;
+
+       iommu = amd_iommu_rlookup_table[devid];
+       if (iommu == NULL)
+               return -EINVAL;
+
+       table = get_irq_table(devid, false);
+       if (!table)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&table->lock, flags);
+
+       entry = (struct irte_ga *)table->table;
+       entry = &entry[index];
+       entry->lo.fields_remap.valid = 0;
+       entry->hi.val = irte->hi.val;
+       entry->lo.val = irte->lo.val;
+       entry->lo.fields_remap.valid = 1;
+       if (data)
+               data->ref = entry;
+
+       spin_unlock_irqrestore(&table->lock, flags);
+
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+
+       return 0;
+}
+
+static int modify_irte(u16 devid, int index, union irte *irte)
 {
        struct irq_remap_table *table;
        struct amd_iommu *iommu;
@@ -3706,7 +3803,7 @@ static int modify_irte(u16 devid, int index, union irte irte)
                return -ENOMEM;
 
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = irte.val;
+       table->table[index] = irte->val;
        spin_unlock_irqrestore(&table->lock, flags);
 
        iommu_flush_irt(iommu, devid);
@@ -3730,13 +3827,146 @@ static void free_irte(u16 devid, int index)
                return;
 
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = 0;
+       iommu->irte_ops->clear_allocated(table, index);
        spin_unlock_irqrestore(&table->lock, flags);
 
        iommu_flush_irt(iommu, devid);
        iommu_completion_wait(iommu);
 }
 
+static void irte_prepare(void *entry,
+                        u32 delivery_mode, u32 dest_mode,
+                        u8 vector, u32 dest_apicid, int devid)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->val                = 0;
+       irte->fields.vector      = vector;
+       irte->fields.int_type    = delivery_mode;
+       irte->fields.destination = dest_apicid;
+       irte->fields.dm          = dest_mode;
+       irte->fields.valid       = 1;
+}
+
+static void irte_ga_prepare(void *entry,
+                           u32 delivery_mode, u32 dest_mode,
+                           u8 vector, u32 dest_apicid, int devid)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+       irte->lo.val                      = 0;
+       irte->hi.val                      = 0;
+       irte->lo.fields_remap.guest_mode  = dev_data ? dev_data->use_vapic : 0;
+       irte->lo.fields_remap.int_type    = delivery_mode;
+       irte->lo.fields_remap.dm          = dest_mode;
+       irte->hi.fields.vector            = vector;
+       irte->lo.fields_remap.destination = dest_apicid;
+       irte->lo.fields_remap.valid       = 1;
+}
+
+static void irte_activate(void *entry, u16 devid, u16 index)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.valid = 1;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_activate(void *entry, u16 devid, u16 index)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+
+       irte->lo.fields_remap.valid = 1;
+       modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_deactivate(void *entry, u16 devid, u16 index)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.valid = 0;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+
+       irte->lo.fields_remap.valid = 0;
+       modify_irte_ga(devid, index, irte, NULL);
+}
+
+static void irte_set_affinity(void *entry, u16 devid, u16 index,
+                             u8 vector, u32 dest_apicid)
+{
+       union irte *irte = (union irte *) entry;
+
+       irte->fields.vector = vector;
+       irte->fields.destination = dest_apicid;
+       modify_irte(devid, index, irte);
+}
+
+static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+                                u8 vector, u32 dest_apicid)
+{
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+
+       if (!dev_data || !dev_data->use_vapic) {
+               irte->hi.fields.vector = vector;
+               irte->lo.fields_remap.destination = dest_apicid;
+               irte->lo.fields_remap.guest_mode = 0;
+               modify_irte_ga(devid, index, irte, NULL);
+       }
+}
+
+#define IRTE_ALLOCATED (~1U)
+static void irte_set_allocated(struct irq_remap_table *table, int index)
+{
+       table->table[index] = IRTE_ALLOCATED;
+}
+
+static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+       irte->hi.fields.vector = 0xff;
+}
+
+static bool irte_is_allocated(struct irq_remap_table *table, int index)
+{
+       union irte *ptr = (union irte *)table->table;
+       union irte *irte = &ptr[index];
+
+       return irte->val != 0;
+}
+
+static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       return irte->hi.fields.vector != 0;
+}
+
+static void irte_clear_allocated(struct irq_remap_table *table, int index)
+{
+       table->table[index] = 0;
+}
+
+static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+{
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+}
+
 static int get_devid(struct irq_alloc_info *info)
 {
        int devid = -1;
@@ -3821,19 +4051,17 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
 {
        struct irq_2_irte *irte_info = &data->irq_2_irte;
        struct msi_msg *msg = &data->msi_entry;
-       union irte *irte = &data->irte_entry;
        struct IO_APIC_route_entry *entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+       if (!iommu)
+               return;
 
        data->irq_2_irte.devid = devid;
        data->irq_2_irte.index = index + sub_handle;
-
-       /* Setup IRTE for IOMMU */
-       irte->val = 0;
-       irte->fields.vector      = irq_cfg->vector;
-       irte->fields.int_type    = apic->irq_delivery_mode;
-       irte->fields.destination = irq_cfg->dest_apicid;
-       irte->fields.dm          = apic->irq_dest_mode;
-       irte->fields.valid       = 1;
+       iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+                                apic->irq_dest_mode, irq_cfg->vector,
+                                irq_cfg->dest_apicid, devid);
 
        switch (info->type) {
        case X86_IRQ_ALLOC_TYPE_IOAPIC:
@@ -3864,12 +4092,32 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
        }
 }
 
+struct amd_irte_ops irte_32_ops = {
+       .prepare = irte_prepare,
+       .activate = irte_activate,
+       .deactivate = irte_deactivate,
+       .set_affinity = irte_set_affinity,
+       .set_allocated = irte_set_allocated,
+       .is_allocated = irte_is_allocated,
+       .clear_allocated = irte_clear_allocated,
+};
+
+struct amd_irte_ops irte_128_ops = {
+       .prepare = irte_ga_prepare,
+       .activate = irte_ga_activate,
+       .deactivate = irte_ga_deactivate,
+       .set_affinity = irte_ga_set_affinity,
+       .set_allocated = irte_ga_set_allocated,
+       .is_allocated = irte_ga_is_allocated,
+       .clear_allocated = irte_ga_clear_allocated,
+};
+
 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                               unsigned int nr_irqs, void *arg)
 {
        struct irq_alloc_info *info = arg;
        struct irq_data *irq_data;
-       struct amd_ir_data *data;
+       struct amd_ir_data *data = NULL;
        struct irq_cfg *cfg;
        int i, ret, devid;
        int index = -1;
@@ -3921,6 +4169,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                if (!data)
                        goto out_free_data;
 
+               if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+                       data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+               else
+                       data->entry = kzalloc(sizeof(struct irte_ga),
+                                                    GFP_KERNEL);
+               if (!data->entry) {
+                       kfree(data);
+                       goto out_free_data;
+               }
+
                irq_data->hwirq = (devid << 16) + i;
                irq_data->chip_data = data;
                irq_data->chip = &amd_ir_chip;
@@ -3957,6 +4215,7 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
                        data = irq_data->chip_data;
                        irte_info = &data->irq_2_irte;
                        free_irte(irte_info->devid, irte_info->index);
+                       kfree(data->entry);
                        kfree(data);
                }
        }
@@ -3968,8 +4227,11 @@ static void irq_remapping_activate(struct irq_domain *domain,
 {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
 
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->activate(data->entry, irte_info->devid,
+                                         irte_info->index);
 }
 
 static void irq_remapping_deactivate(struct irq_domain *domain,
@@ -3977,10 +4239,11 @@ static void irq_remapping_deactivate(struct irq_domain *domain,
 {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
-       union irte entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
 
-       entry.val = 0;
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+                                           irte_info->index);
 }
 
 static struct irq_domain_ops amd_ir_domain_ops = {
@@ -3990,6 +4253,70 @@ static struct irq_domain_ops amd_ir_domain_ops = {
        .deactivate = irq_remapping_deactivate,
 };
 
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+       struct amd_iommu *iommu;
+       struct amd_iommu_pi_data *pi_data = vcpu_info;
+       struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+       struct amd_ir_data *ir_data = data->chip_data;
+       struct irte_ga *irte = (struct irte_ga *) ir_data->entry;
+       struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+       struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+
+       /* Note:
+        * This device has never been set up for guest mode,
+        * so we should not modify the IRTE.
+        */
+       if (!dev_data || !dev_data->use_vapic)
+               return 0;
+
+       pi_data->ir_data = ir_data;
+
+       /* Note:
+        * SVM tries to set up for VAPIC mode, but we are in
+        * legacy mode. So, we force legacy mode instead.
+        */
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+                        __func__);
+               pi_data->is_guest_mode = false;
+       }
+
+       iommu = amd_iommu_rlookup_table[irte_info->devid];
+       if (iommu == NULL)
+               return -EINVAL;
+
+       pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+       if (pi_data->is_guest_mode) {
+               /* Setting */
+               irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
+               irte->hi.fields.vector = vcpu_pi_info->vector;
+               irte->lo.fields_vapic.guest_mode = 1;
+               irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
+
+               ir_data->cached_ga_tag = pi_data->ga_tag;
+       } else {
+               /* Un-Setting */
+               struct irq_cfg *cfg = irqd_cfg(data);
+
+               irte->hi.val = 0;
+               irte->lo.val = 0;
+               irte->hi.fields.vector = cfg->vector;
+               irte->lo.fields_remap.guest_mode = 0;
+               irte->lo.fields_remap.destination = cfg->dest_apicid;
+               irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
+               irte->lo.fields_remap.dm = apic->irq_dest_mode;
+
+               /*
+                * This communicates the ga_tag back to the caller
+                * so that it can do all the necessary clean up.
+                */
+               ir_data->cached_ga_tag = 0;
+       }
+
+       return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
+}
+
 static int amd_ir_set_affinity(struct irq_data *data,
                               const struct cpumask *mask, bool force)
 {
@@ -3997,8 +4324,12 @@ static int amd_ir_set_affinity(struct irq_data *data,
        struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
        struct irq_cfg *cfg = irqd_cfg(data);
        struct irq_data *parent = data->parent_data;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
        int ret;
 
+       if (!iommu)
+               return -ENODEV;
+
        ret = parent->chip->irq_set_affinity(parent, mask, force);
        if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
                return ret;
@@ -4007,9 +4338,8 @@ static int amd_ir_set_affinity(struct irq_data *data,
         * Atomically updates the IRTE with the new destination, vector
         * and flushes the interrupt entry cache.
         */
-       ir_data->irte_entry.fields.vector = cfg->vector;
-       ir_data->irte_entry.fields.destination = cfg->dest_apicid;
-       modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);
+       iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+                           irte_info->index, cfg->vector, cfg->dest_apicid);
 
        /*
         * After this point, all the interrupts will start arriving
@@ -4031,6 +4361,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
 static struct irq_chip amd_ir_chip = {
        .irq_ack = ir_ack_apic_edge,
        .irq_set_affinity = amd_ir_set_affinity,
+       .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
        .irq_compose_msi_msg = ir_compose_msi_msg,
 };
 
@@ -4045,4 +4376,43 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
 
        return 0;
 }
+
+int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+       unsigned long flags;
+       struct amd_iommu *iommu;
+       struct irq_remap_table *irt;
+       struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+       int devid = ir_data->irq_2_irte.devid;
+       struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+       struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+           !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+               return 0;
+
+       iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return -ENODEV;
+
+       irt = get_irq_table(devid, false);
+       if (!irt)
+               return -ENODEV;
+
+       spin_lock_irqsave(&irt->lock, flags);
+
+       if (ref->lo.fields_vapic.guest_mode) {
+               if (cpu >= 0)
+                       ref->lo.fields_vapic.destination = cpu;
+               ref->lo.fields_vapic.is_run = is_run;
+               barrier();
+       }
+
+       spin_unlock_irqrestore(&irt->lock, flags);
+
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+       return 0;
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
 #endif
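The two exports added above are the hypervisor-facing half of the vAPIC plumbing: amd_iommu_register_ga_log_notifier() hands the ga_tag of each GA log entry to a callback, and amd_iommu_update_ga() refreshes the is_run hint and destination CPU in the cached IRTE when a vCPU is scheduled in or out. A hedged sketch of how a consumer (for instance KVM's SVM AVIC code) might wire these up; the consumer-side names and the header path are assumptions, only the two exported functions come from this patch:

    #include <linux/amd-iommu.h>    /* assumed to declare the two exports */

    /* Hypothetical consumer: map ga_tag back to a vCPU and kick it so the
     * pending posted interrupt is delivered. */
    static int my_ga_log_handler(u32 ga_tag)
    {
            /* vCPU lookup and wakeup are hypervisor-specific, omitted here */
            return 0;
    }

    static int __init my_consumer_init(void)
    {
            return amd_iommu_register_ga_log_notifier(my_ga_log_handler);
    }

    /* On vCPU load, pass back the amd_ir_data pointer that
     * amd_ir_set_vcpu_affinity() stored in amd_iommu_pi_data->ir_data so the
     * IRTE's is_run/destination fields track the physical CPU. */
    static void my_vcpu_load(void *ir_data_cookie, int cpu)
    {
            amd_iommu_update_ga(cpu, true, ir_data_cookie);
    }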
index 59741ea..cd17136 100644 (file)
@@ -84,6 +84,7 @@
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 
+#define LOOP_TIMEOUT   100000
 /*
  * ACPI table definitions
  *
@@ -145,6 +146,8 @@ struct ivmd_header {
 bool amd_iommu_dump;
 bool amd_iommu_irq_remap __read_mostly;
 
+int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+
 static bool amd_iommu_detected;
 static bool __initdata amd_iommu_disabled;
 static int amd_iommu_target_ivhd_type;
@@ -386,6 +389,10 @@ static void iommu_disable(struct amd_iommu *iommu)
        iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
        iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
 
+       /* Disable IOMMU GA_LOG */
+       iommu_feature_disable(iommu, CONTROL_GALOG_EN);
+       iommu_feature_disable(iommu, CONTROL_GAINT_EN);
+
        /* Disable IOMMU hardware itself */
        iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
 }
@@ -671,6 +678,99 @@ static void __init free_ppr_log(struct amd_iommu *iommu)
        free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE));
 }
 
+static void free_ga_log(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       if (iommu->ga_log)
+               free_pages((unsigned long)iommu->ga_log,
+                           get_order(GA_LOG_SIZE));
+       if (iommu->ga_log_tail)
+               free_pages((unsigned long)iommu->ga_log_tail,
+                           get_order(8));
+#endif
+}
+
+static int iommu_ga_log_enable(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       u32 status, i;
+
+       if (!iommu->ga_log)
+               return -EINVAL;
+
+       status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+       /* Check if already running */
+       if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+               return 0;
+
+       iommu_feature_enable(iommu, CONTROL_GAINT_EN);
+       iommu_feature_enable(iommu, CONTROL_GALOG_EN);
+
+       for (i = 0; i < LOOP_TIMEOUT; ++i) {
+               status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+               if (status & (MMIO_STATUS_GALOG_RUN_MASK))
+                       break;
+       }
+
+       if (i >= LOOP_TIMEOUT)
+               return -EINVAL;
+#endif /* CONFIG_IRQ_REMAP */
+       return 0;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static int iommu_init_ga_log(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               return 0;
+
+       iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                       get_order(GA_LOG_SIZE));
+       if (!iommu->ga_log)
+               goto err_out;
+
+       iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                       get_order(8));
+       if (!iommu->ga_log_tail)
+               goto err_out;
+
+       entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+       memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
+                   &entry, sizeof(entry));
+       entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+       memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
+                   &entry, sizeof(entry));
+       writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+       writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+
+       return 0;
+err_out:
+       free_ga_log(iommu);
+       return -EINVAL;
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+static int iommu_init_ga(struct amd_iommu *iommu)
+{
+       int ret = 0;
+
+#ifdef CONFIG_IRQ_REMAP
+       /* Note: We have already checked GASup from the IVRS table.
+        *       Now, we need to make sure that GAMSup is set.
+        */
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           !iommu_feature(iommu, FEATURE_GAM_VAPIC))
+               amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
+       ret = iommu_init_ga_log(iommu);
+#endif /* CONFIG_IRQ_REMAP */
+
+       return ret;
+}
+
 static void iommu_enable_gt(struct amd_iommu *iommu)
 {
        if (!iommu_feature(iommu, FEATURE_GT))
@@ -1144,6 +1244,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu)
        free_command_buffer(iommu);
        free_event_buffer(iommu);
        free_ppr_log(iommu);
+       free_ga_log(iommu);
        iommu_unmap_mmio_space(iommu);
 }
 
@@ -1258,6 +1359,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
                        iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
                else
                        iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+               if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
                break;
        case 0x11:
        case 0x40:
@@ -1265,6 +1368,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
                        iommu->mmio_phys_end = MMIO_REG_END_OFFSET;
                else
                        iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
+               if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0))
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
                break;
        default:
                return -EINVAL;
@@ -1432,6 +1537,7 @@ static int iommu_init_pci(struct amd_iommu *iommu)
 {
        int cap_ptr = iommu->cap_ptr;
        u32 range, misc, low, high;
+       int ret;
 
        iommu->dev = pci_get_bus_and_slot(PCI_BUS_NUM(iommu->devid),
                                          iommu->devid & 0xff);
@@ -1488,6 +1594,10 @@ static int iommu_init_pci(struct amd_iommu *iommu)
        if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
                return -ENOMEM;
 
+       ret = iommu_init_ga(iommu);
+       if (ret)
+               return ret;
+
        if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
                amd_iommu_np_cache = true;
 
@@ -1545,16 +1655,24 @@ static void print_iommu_info(void)
                        dev_name(&iommu->dev->dev), iommu->cap_ptr);
 
                if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
-                       pr_info("AMD-Vi:  Extended features: ");
+                       pr_info("AMD-Vi: Extended features (%#llx):\n",
+                               iommu->features);
                        for (i = 0; i < ARRAY_SIZE(feat_str); ++i) {
                                if (iommu_feature(iommu, (1ULL << i)))
                                        pr_cont(" %s", feat_str[i]);
                        }
+
+                       if (iommu->features & FEATURE_GAM_VAPIC)
+                               pr_cont(" GA_vAPIC");
+
                        pr_cont("\n");
                }
        }
-       if (irq_remapping_enabled)
+       if (irq_remapping_enabled) {
                pr_info("AMD-Vi: Interrupt remapping enabled\n");
+               if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+                       pr_info("AMD-Vi: virtual APIC enabled\n");
+       }
 }
 
 static int __init amd_iommu_init_pci(void)
@@ -1645,6 +1763,8 @@ enable_faults:
        if (iommu->ppr_log != NULL)
                iommu_feature_enable(iommu, CONTROL_PPFINT_EN);
 
+       iommu_ga_log_enable(iommu);
+
        return 0;
 }
 
@@ -1862,6 +1982,24 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
                               iommu->stored_addr_lo | 1);
 }
 
+static void iommu_enable_ga(struct amd_iommu *iommu)
+{
+#ifdef CONFIG_IRQ_REMAP
+       switch (amd_iommu_guest_ir) {
+       case AMD_IOMMU_GUEST_IR_VAPIC:
+               iommu_feature_enable(iommu, CONTROL_GAM_EN);
+               /* Fall through */
+       case AMD_IOMMU_GUEST_IR_LEGACY_GA:
+               iommu_feature_enable(iommu, CONTROL_GA_EN);
+               iommu->irte_ops = &irte_128_ops;
+               break;
+       default:
+               iommu->irte_ops = &irte_32_ops;
+               break;
+       }
+#endif
+}
+
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -1877,9 +2015,15 @@ static void early_enable_iommus(void)
                iommu_enable_command_buffer(iommu);
                iommu_enable_event_buffer(iommu);
                iommu_set_exclusion_range(iommu);
+               iommu_enable_ga(iommu);
                iommu_enable(iommu);
                iommu_flush_all_caches(iommu);
        }
+
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
+#endif
 }
 
 static void enable_iommus_v2(void)
@@ -1905,6 +2049,11 @@ static void disable_iommus(void)
 
        for_each_iommu(iommu)
                iommu_disable(iommu);
+
+#ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
+               amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP);
+#endif
 }
 
 /*
@@ -2059,7 +2208,7 @@ static int __init early_amd_iommu_init(void)
        struct acpi_table_header *ivrs_base;
        acpi_size ivrs_size;
        acpi_status status;
-       int i, ret = 0;
+       int i, remap_cache_sz, ret = 0;
 
        if (!amd_iommu_detected)
                return -ENODEV;
@@ -2157,10 +2306,14 @@ static int __init early_amd_iommu_init(void)
                 * remapping tables.
                 */
                ret = -ENOMEM;
+               if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+                       remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
+               else
+                       remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
                amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
-                               MAX_IRQS_PER_TABLE * sizeof(u32),
-                               IRQ_TABLE_ALIGNMENT,
-                               0, NULL);
+                                                       remap_cache_sz,
+                                                       IRQ_TABLE_ALIGNMENT,
+                                                       0, NULL);
                if (!amd_iommu_irq_cache)
                        goto out;
 
@@ -2413,6 +2566,21 @@ static int __init parse_amd_iommu_dump(char *str)
        return 1;
 }
 
+static int __init parse_amd_iommu_intr(char *str)
+{
+       for (; *str; ++str) {
+               if (strncmp(str, "legacy", 6) == 0) {
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
+                       break;
+               }
+               if (strncmp(str, "vapic", 5) == 0) {
+                       amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC;
+                       break;
+               }
+       }
+       return 1;
+}
+
 static int __init parse_amd_iommu_options(char *str)
 {
        for (; *str; ++str) {
@@ -2521,6 +2689,7 @@ static int __init parse_ivrs_acpihid(char *str)
 
 __setup("amd_iommu_dump",      parse_amd_iommu_dump);
 __setup("amd_iommu=",          parse_amd_iommu_options);
+__setup("amd_iommu_intr=",     parse_amd_iommu_intr);
 __setup("ivrs_ioapic",         parse_ivrs_ioapic);
 __setup("ivrs_hpet",           parse_ivrs_hpet);
 __setup("ivrs_acpihid",                parse_ivrs_acpihid);
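parse_amd_iommu_intr() above wires up the new early parameter with its two accepted values; on the kernel command line that looks like the following (the kernel-parameters.txt wording is not part of this hunk):

    amd_iommu_intr=legacy     (32-bit IRTEs, plain interrupt remapping)
    amd_iommu_intr=vapic      (128-bit IRTEs plus GA log; the built-in default)

Even with vapic requested, iommu_init_ga() earlier in this file still drops to AMD_IOMMU_GUEST_IR_LEGACY_GA when the IOMMU does not report FEATURE_GAM_VAPIC, so the command line expresses an upper bound rather than a guarantee.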
index 0bd9eb3..faa3b48 100644 (file)
@@ -38,6 +38,7 @@ extern int amd_iommu_enable(void);
 extern void amd_iommu_disable(void);
 extern int amd_iommu_reenable(int);
 extern int amd_iommu_enable_faulting(void);
+extern int amd_iommu_guest_ir;
 
 /* IOMMUv2 specific functions */
 struct iommu_domain;
index 9652848..0d91785 100644 (file)
@@ -22,6 +22,7 @@
 
 #include <linux/types.h>
 #include <linux/mutex.h>
+#include <linux/msi.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
@@ -69,6 +70,8 @@
 #define MMIO_EXCL_LIMIT_OFFSET  0x0028
 #define MMIO_EXT_FEATURES      0x0030
 #define MMIO_PPR_LOG_OFFSET    0x0038
+#define MMIO_GA_LOG_BASE_OFFSET        0x00e0
+#define MMIO_GA_LOG_TAIL_OFFSET        0x00e8
 #define MMIO_CMD_HEAD_OFFSET   0x2000
 #define MMIO_CMD_TAIL_OFFSET   0x2008
 #define MMIO_EVT_HEAD_OFFSET   0x2010
@@ -76,6 +79,8 @@
 #define MMIO_STATUS_OFFSET     0x2020
 #define MMIO_PPR_HEAD_OFFSET   0x2030
 #define MMIO_PPR_TAIL_OFFSET   0x2038
+#define MMIO_GA_HEAD_OFFSET    0x2040
+#define MMIO_GA_TAIL_OFFSET    0x2048
 #define MMIO_CNTR_CONF_OFFSET  0x4000
 #define MMIO_CNTR_REG_OFFSET   0x40000
 #define MMIO_REG_END_OFFSET    0x80000
@@ -92,6 +97,7 @@
 #define FEATURE_GA             (1ULL<<7)
 #define FEATURE_HE             (1ULL<<8)
 #define FEATURE_PC             (1ULL<<9)
+#define FEATURE_GAM_VAPIC      (1ULL<<21)
 
 #define FEATURE_PASID_SHIFT    32
 #define FEATURE_PASID_MASK     (0x1fULL << FEATURE_PASID_SHIFT)
 #define MMIO_STATUS_EVT_INT_MASK       (1 << 1)
 #define MMIO_STATUS_COM_WAIT_INT_MASK  (1 << 2)
 #define MMIO_STATUS_PPR_INT_MASK       (1 << 6)
+#define MMIO_STATUS_GALOG_RUN_MASK     (1 << 8)
+#define MMIO_STATUS_GALOG_OVERFLOW_MASK        (1 << 9)
+#define MMIO_STATUS_GALOG_INT_MASK     (1 << 10)
 
 /* event logging constants */
 #define EVENT_ENTRY_SIZE       0x10
 #define CONTROL_PPFINT_EN       0x0eULL
 #define CONTROL_PPR_EN          0x0fULL
 #define CONTROL_GT_EN           0x10ULL
+#define CONTROL_GA_EN           0x11ULL
+#define CONTROL_GAM_EN          0x19ULL
+#define CONTROL_GALOG_EN        0x1CULL
+#define CONTROL_GAINT_EN        0x1DULL
 
 #define CTRL_INV_TO_MASK       (7 << CONTROL_INV_TIMEOUT)
 #define CTRL_INV_TO_NONE       0
 
 #define PPR_REQ_FAULT          0x01
 
+/* Constants for GA Log handling */
+#define GA_LOG_ENTRIES         512
+#define GA_LOG_SIZE_SHIFT      56
+#define GA_LOG_SIZE_512                (0x8ULL << GA_LOG_SIZE_SHIFT)
+#define GA_ENTRY_SIZE          8
+#define GA_LOG_SIZE            (GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+
+#define GA_TAG(x)              (u32)(x & 0xffffffffULL)
+#define GA_DEVID(x)            (u16)(((x) >> 32) & 0xffffULL)
+#define GA_REQ_TYPE(x)         (((x) >> 60) & 0xfULL)
+
+#define GA_GUEST_NR            0x1
+
 #define PAGE_MODE_NONE    0x00
 #define PAGE_MODE_1_LEVEL 0x01
 #define PAGE_MODE_2_LEVEL 0x02
 #define IOMMU_CAP_NPCACHE 26
 #define IOMMU_CAP_EFR     27
 
+/* IOMMU Feature Reporting Field (for IVHD type 10h) */
+#define IOMMU_FEAT_GASUP_SHIFT 6
+
+/* IOMMU Extended Feature Register (EFR) */
+#define IOMMU_EFR_GASUP_SHIFT  7
+
 #define MAX_DOMAIN_ID 65536
 
 /* Protection domain flags */
@@ -400,6 +432,7 @@ struct amd_iommu_fault {
 
 struct iommu_domain;
 struct irq_domain;
+struct amd_irte_ops;
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -490,6 +523,12 @@ struct amd_iommu {
        /* Base of the PPR log, if present */
        u8 *ppr_log;
 
+       /* Base of the GA log, if present */
+       u8 *ga_log;
+
+       /* Tail of the GA log, if present */
+       u8 *ga_log_tail;
+
        /* true if interrupts for this IOMMU are already enabled */
        bool int_enabled;
 
@@ -523,6 +562,8 @@ struct amd_iommu {
 #ifdef CONFIG_IRQ_REMAP
        struct irq_domain *ir_domain;
        struct irq_domain *msi_domain;
+
+       struct amd_irte_ops *irte_ops;
 #endif
 
        volatile u64 __aligned(8) cmd_sem;
@@ -683,4 +724,112 @@ static inline int get_hpet_devid(int id)
        return -EINVAL;
 }
 
+enum amd_iommu_intr_mode_type {
+       AMD_IOMMU_GUEST_IR_LEGACY,
+
+       /* This mode is not visible to users. It is used when
+        * we cannot fully enable vAPIC and must fall back to supporting
+        * only legacy interrupt remapping via 128-bit IRTEs.
+        */
+       AMD_IOMMU_GUEST_IR_LEGACY_GA,
+       AMD_IOMMU_GUEST_IR_VAPIC,
+};
+
+#define AMD_IOMMU_GUEST_IR_GA(x)       (x == AMD_IOMMU_GUEST_IR_VAPIC || \
+                                        x == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+
+#define AMD_IOMMU_GUEST_IR_VAPIC(x)    (x == AMD_IOMMU_GUEST_IR_VAPIC)
+
+union irte {
+       u32 val;
+       struct {
+               u32 valid       : 1,
+                   no_fault    : 1,
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   rsvd_1      : 1,
+                   destination : 8,
+                   vector      : 8,
+                   rsvd_2      : 8;
+       } fields;
+};
+
+union irte_ga_lo {
+       u64 val;
+
+       /* For int remapping */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd        : 48;
+       } fields_remap;
+
+       /* For guest vAPIC */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   ga_log_intr : 1,
+                   rsvd1       : 3,
+                   is_run      : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd2       : 16,
+                   ga_tag      : 32;
+       } fields_vapic;
+};
+
+union irte_ga_hi {
+       u64 val;
+       struct {
+               u64 vector      : 8,
+                   rsvd_1      : 4,
+                   ga_root_ptr : 40,
+                   rsvd_2      : 12;
+       } fields;
+};
+
+struct irte_ga {
+       union irte_ga_lo lo;
+       union irte_ga_hi hi;
+};
+
+struct irq_2_irte {
+       u16 devid; /* Device ID for IRTE table */
+       u16 index; /* Index into IRTE table */
+};
+
+struct amd_ir_data {
+       u32 cached_ga_tag;
+       struct irq_2_irte irq_2_irte;
+       struct msi_msg msi_entry;
+       void *entry;    /* Pointer to union irte or struct irte_ga */
+       void *ref;      /* Pointer to the actual irte */
+};
+
+struct amd_irte_ops {
+       void (*prepare)(void *, u32, u32, u8, u32, int);
+       void (*activate)(void *, u16, u16);
+       void (*deactivate)(void *, u16, u16);
+       void (*set_affinity)(void *, u16, u16, u8, u32);
+       void *(*get)(struct irq_remap_table *, int);
+       void (*set_allocated)(struct irq_remap_table *, int);
+       bool (*is_allocated)(struct irq_remap_table *, int);
+       void (*clear_allocated)(struct irq_remap_table *, int);
+};
+
+#ifdef CONFIG_IRQ_REMAP
+extern struct amd_irte_ops irte_32_ops;
+extern struct amd_irte_ops irte_128_ops;
+#endif
+
 #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
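Note that struct irte_ga is two 64-bit words, four times the legacy 4-byte union irte, which is why get_irq_table()'s memset and the remap_cache_sz computation in the init code are now mode-dependent. A short hedged sketch using only the macros and types added above (the entry value is made up and the function is purely illustrative):

    /* Illustrative only; relies on the definitions added to this header. */
    static void __maybe_unused ga_layout_example(void)
    {
            u64 log_entry;

            /* 4 bytes per legacy IRTE vs 16 bytes per irte_ga entry, which is
             * what the remap_cache_sz and memset changes account for. */
            BUILD_BUG_ON(sizeof(union irte) != sizeof(u32));
            BUILD_BUG_ON(sizeof(struct irte_ga) != 2 * sizeof(u64));

            /* A made-up GA log entry: bits 63:60 request type, bits 47:32
             * device id, bits 31:0 guest tag. */
            log_entry = ((u64)GA_GUEST_NR << 60) | ((u64)0x0010 << 32) | 0x1234;

            if (GA_REQ_TYPE(log_entry) == GA_GUEST_NR)
                    pr_info("devid=%#x ga_tag=%#x\n",
                            GA_DEVID(log_entry), GA_TAG(log_entry));
    }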
index 6ea0e5f..856379c 100644 (file)
@@ -1046,7 +1046,7 @@ static void bgmac_enable(struct bgmac *bgmac)
 
        mode = (bgmac_read(bgmac, BGMAC_DEV_STATUS) & BGMAC_DS_MM_MASK) >>
                BGMAC_DS_MM_SHIFT;
-       if (bgmac->feature_flags & BGMAC_FEAT_CLKCTLST || mode != 0)
+       if (!(bgmac->feature_flags & BGMAC_FEAT_CLKCTLST) || mode != 0)
                bgmac_set(bgmac, BCMA_CLKCTLST, BCMA_CLKCTLST_FORCEHT);
        if (bgmac->feature_flags & BGMAC_FEAT_CLKCTLST && mode == 2)
                bgmac_cco_ctl_maskset(bgmac, 1, ~0,
index 63144bb..b32444a 100644 (file)
@@ -3117,6 +3117,7 @@ static int macb_remove(struct platform_device *pdev)
                if (dev->phydev)
                        phy_disconnect(dev->phydev);
                mdiobus_unregister(bp->mii_bus);
+               dev->phydev = NULL;
                mdiobus_free(bp->mii_bus);
 
                /* Shutdown the PHY if there is a GPIO reset */
index 51fd2e6..6049177 100644 (file)
@@ -1,7 +1,9 @@
 subdir-ccflags-y +=  -I$(srctree)/drivers/net/ethernet/freescale/fman
 
-obj-y          += fsl_fman.o fsl_fman_mac.o fsl_mac.o
+obj-$(CONFIG_FSL_FMAN) += fsl_fman.o
+obj-$(CONFIG_FSL_FMAN) += fsl_fman_port.o
+obj-$(CONFIG_FSL_FMAN) += fsl_mac.o
 
-fsl_fman-objs  := fman_muram.o fman.o fman_sp.o fman_port.o
-fsl_fman_mac-objs := fman_dtsec.o fman_memac.o fman_tgec.o
-fsl_mac-objs += mac.o
+fsl_fman-objs  := fman_muram.o fman.o fman_sp.o
+fsl_fman_port-objs := fman_port.o
+fsl_mac-objs := mac.o fman_dtsec.o fman_memac.o fman_tgec.o
index 1de2e1e..dafd9e1 100644 (file)
@@ -618,7 +618,7 @@ struct fman {
        unsigned long cam_offset;
        size_t cam_size;
        /* Fifo in MURAM */
-       int fifo_offset;
+       unsigned long fifo_offset;
        size_t fifo_size;
 
        u32 liodn_base[64];
@@ -2036,7 +2036,7 @@ static int fman_init(struct fman *fman)
        /* allocate MURAM for FIFO according to total size */
        fman->fifo_offset = fman_muram_alloc(fman->muram,
                                             fman->state->total_fifo_size);
-       if (IS_ERR_VALUE(fman->cam_offset)) {
+       if (IS_ERR_VALUE(fman->fifo_offset)) {
                free_init_resources(fman);
                dev_err(fman->dev, "%s: MURAM alloc for BMI FIFO failed\n",
                        __func__);
@@ -2115,6 +2115,7 @@ void fman_register_intr(struct fman *fman, enum fman_event_modules module,
        fman->intr_mng[event].isr_cb = isr_cb;
        fman->intr_mng[event].src_handle = src_arg;
 }
+EXPORT_SYMBOL(fman_register_intr);
 
 /**
  * fman_unregister_intr
@@ -2138,6 +2139,7 @@ void fman_unregister_intr(struct fman *fman, enum fman_event_modules module,
        fman->intr_mng[event].isr_cb = NULL;
        fman->intr_mng[event].src_handle = NULL;
 }
+EXPORT_SYMBOL(fman_unregister_intr);
 
 /**
  * fman_set_port_params
@@ -2241,6 +2243,7 @@ return_err:
        spin_unlock_irqrestore(&fman->spinlock, flags);
        return err;
 }
+EXPORT_SYMBOL(fman_set_port_params);
 
 /**
  * fman_reset_mac
@@ -2310,6 +2313,7 @@ int fman_reset_mac(struct fman *fman, u8 mac_id)
 
        return 0;
 }
+EXPORT_SYMBOL(fman_reset_mac);
 
 /**
  * fman_set_mac_max_frame
@@ -2327,8 +2331,7 @@ int fman_set_mac_max_frame(struct fman *fman, u8 mac_id, u16 mfl)
         * or equal to the port's max
         */
        if ((!fman->state->port_mfl[mac_id]) ||
-           (fman->state->port_mfl[mac_id] &&
-           (mfl <= fman->state->port_mfl[mac_id]))) {
+           (mfl <= fman->state->port_mfl[mac_id])) {
                fman->state->mac_mfl[mac_id] = mfl;
        } else {
                dev_warn(fman->dev, "%s: MAC max_frame_length is larger than Port max_frame_length\n",
@@ -2337,6 +2340,7 @@ int fman_set_mac_max_frame(struct fman *fman, u8 mac_id, u16 mfl)
        }
        return 0;
 }
+EXPORT_SYMBOL(fman_set_mac_max_frame);
 
 /**
  * fman_get_clock_freq
@@ -2363,6 +2367,7 @@ u32 fman_get_bmi_max_fifo_size(struct fman *fman)
 {
        return fman->state->bmi_max_fifo_size;
 }
+EXPORT_SYMBOL(fman_get_bmi_max_fifo_size);
 
 /**
  * fman_get_revision
@@ -2384,6 +2389,7 @@ void fman_get_revision(struct fman *fman, struct fman_rev_info *rev_info)
                                FPM_REV1_MAJOR_SHIFT);
        rev_info->minor = tmp & FPM_REV1_MINOR_MASK;
 }
+EXPORT_SYMBOL(fman_get_revision);
 
 /**
  * fman_get_qman_channel_id
@@ -2419,6 +2425,7 @@ u32 fman_get_qman_channel_id(struct fman *fman, u32 port_id)
 
        return fman->state->qman_channel_base + i;
 }
+EXPORT_SYMBOL(fman_get_qman_channel_id);
 
 /**
  * fman_get_mem_region
@@ -2432,6 +2439,7 @@ struct resource *fman_get_mem_region(struct fman *fman)
 {
        return fman->state->res;
 }
+EXPORT_SYMBOL(fman_get_mem_region);
 
 /* Bootargs defines */
 /* Extra headroom for RX buffers - Default, min and max */
@@ -2453,7 +2461,7 @@ struct resource *fman_get_mem_region(struct fman *fman)
  * particular forwarding scenarios that add extra headers to the
  * forwarded frame.
  */
-int fsl_fm_rx_extra_headroom = FSL_FM_RX_EXTRA_HEADROOM;
+static int fsl_fm_rx_extra_headroom = FSL_FM_RX_EXTRA_HEADROOM;
 module_param(fsl_fm_rx_extra_headroom, int, 0);
 MODULE_PARM_DESC(fsl_fm_rx_extra_headroom, "Extra headroom for Rx buffers");
 
@@ -2466,7 +2474,7 @@ MODULE_PARM_DESC(fsl_fm_rx_extra_headroom, "Extra headroom for Rx buffers");
  * Could be overridden once, at boot-time, via the
  * fm_set_max_frm() callback.
  */
-int fsl_fm_max_frm = FSL_FM_MAX_FRAME_SIZE;
+static int fsl_fm_max_frm = FSL_FM_MAX_FRAME_SIZE;
 module_param(fsl_fm_max_frm, int, 0);
 MODULE_PARM_DESC(fsl_fm_max_frm, "Maximum frame size, across all interfaces");
 
@@ -2538,6 +2546,7 @@ struct fman *fman_bind(struct device *fm_dev)
 {
        return (struct fman *)(dev_get_drvdata(get_device(fm_dev)));
 }
+EXPORT_SYMBOL(fman_bind);
 
 static irqreturn_t fman_err_irq(int irq, void *handle)
 {
@@ -2727,8 +2736,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
        struct fman *fman;
        struct device_node *fm_node, *muram_node;
        struct resource *res;
-       const u32 *u32_prop;
-       int lenp, err, irq;
+       u32 val, range[2];
+       int err, irq;
        struct clk *clk;
        u32 clk_rate;
        phys_addr_t phys_base_addr;
@@ -2740,16 +2749,13 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 
        fm_node = of_node_get(of_dev->dev.of_node);
 
-       u32_prop = (const u32 *)of_get_property(fm_node, "cell-index", &lenp);
-       if (!u32_prop) {
-               dev_err(&of_dev->dev, "%s: of_get_property(%s, cell-index) failed\n",
+       err = of_property_read_u32(fm_node, "cell-index", &val);
+       if (err) {
+               dev_err(&of_dev->dev, "%s: failed to read cell-index for %s\n",
                        __func__, fm_node->full_name);
                goto fman_node_put;
        }
-       if (WARN_ON(lenp != sizeof(u32)))
-               goto fman_node_put;
-
-       fman->dts_params.id = (u8)fdt32_to_cpu(u32_prop[0]);
+       fman->dts_params.id = (u8)val;
 
        /* Get the FM interrupt */
        res = platform_get_resource(of_dev, IORESOURCE_IRQ, 0);
@@ -2796,18 +2802,15 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
        /* Rounding to MHz */
        fman->dts_params.clk_freq = DIV_ROUND_UP(clk_rate, 1000000);
 
-       u32_prop = (const u32 *)of_get_property(fm_node,
-                                               "fsl,qman-channel-range",
-                                               &lenp);
-       if (!u32_prop) {
-               dev_err(&of_dev->dev, "%s: of_get_property(%s, fsl,qman-channel-range) failed\n",
+       err = of_property_read_u32_array(fm_node, "fsl,qman-channel-range",
+                                        &range[0], 2);
+       if (err) {
+               dev_err(&of_dev->dev, "%s: failed to read fsl,qman-channel-range for %s\n",
                        __func__, fm_node->full_name);
                goto fman_node_put;
        }
-       if (WARN_ON(lenp != sizeof(u32) * 2))
-               goto fman_node_put;
-       fman->dts_params.qman_channel_base = fdt32_to_cpu(u32_prop[0]);
-       fman->dts_params.num_of_qman_channels = fdt32_to_cpu(u32_prop[1]);
+       fman->dts_params.qman_channel_base = range[0];
+       fman->dts_params.num_of_qman_channels = range[1];
 
        /* Get the MURAM base address and size */
        muram_node = of_find_matching_node(fm_node, fman_muram_match);
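The conversions above replace open-coded of_get_property()/fdt32_to_cpu() parsing with the typed DT accessors, which take care of endian conversion and length checking internally. A minimal sketch of the pattern; the node, property name and error path are illustrative only:

        u32 val;
        int err;

        err = of_property_read_u32(np, "example-prop", &val);
        if (err) {
                dev_err(dev, "failed to read example-prop for %s\n",
                        np->full_name);
                return err;
        }
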
@@ -2858,7 +2861,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 
        fman->dts_params.base_addr =
                devm_ioremap(&of_dev->dev, phys_base_addr, mem_size);
-       if (fman->dts_params.base_addr == 0) {
+       if (!fman->dts_params.base_addr) {
                dev_err(&of_dev->dev, "%s: devm_ioremap() failed\n", __func__);
                goto fman_free;
        }
@@ -2930,7 +2933,7 @@ static const struct of_device_id fman_match[] = {
        {}
 };
 
-MODULE_DEVICE_TABLE(of, fm_match);
+MODULE_DEVICE_TABLE(of, fman_match);
 
 static struct platform_driver fman_driver = {
        .driver = {
@@ -2940,4 +2943,25 @@ static struct platform_driver fman_driver = {
        .probe = fman_probe,
 };
 
-builtin_platform_driver(fman_driver);
+static int __init fman_load(void)
+{
+       int err;
+
+       pr_debug("FSL DPAA FMan driver\n");
+
+       err = platform_driver_register(&fman_driver);
+       if (err < 0)
+               pr_err("Error, platform_driver_register() = %d\n", err);
+
+       return err;
+}
+module_init(fman_load);
+
+static void __exit fman_unload(void)
+{
+       platform_driver_unregister(&fman_driver);
+}
+module_exit(fman_unload);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Freescale DPAA Frame Manager driver");
index ddf0260..dd6d052 100644 (file)
@@ -191,10 +191,6 @@ struct fman_mac_params {
        u16 max_speed;
        /* A handle to the FM object this port related to */
        void *fm;
-       /* MDIO exceptions interrupt source - not valid for all
-        * MACs; MUST be set to 0 for MACs that don't have
-        * mdio-irq, or for polling
-        */
        void *dev_id; /* device cookie used by the exception cbs */
        fman_mac_exception_cb *event_cb;    /* MDIO Events Callback Routine */
        fman_mac_exception_cb *exception_cb;/* Exception Callback Routine */
index 45e98fd..53ef51e 100644 (file)
@@ -507,6 +507,9 @@ static void setup_sgmii_internal_phy(struct fman_mac *memac,
 {
        u16 tmp_reg16;
 
+       if (WARN_ON(!memac->pcsphy))
+               return;
+
        /* SGMII mode */
        tmp_reg16 = IF_MODE_SGMII_EN;
        if (!fixed_link)
@@ -1151,7 +1154,8 @@ struct fman_mac *memac_config(struct fman_mac_params *params)
        /* Save FMan revision */
        fman_get_revision(memac->fm, &memac->fm_rev_info);
 
-       if (memac->phy_if == PHY_INTERFACE_MODE_SGMII) {
+       if (memac->phy_if == PHY_INTERFACE_MODE_SGMII ||
+           memac->phy_if == PHY_INTERFACE_MODE_QSGMII) {
                if (!params->internal_phy_node) {
                        pr_err("PCS PHY node is not available\n");
                        memac_free(memac);
index 47394c4..5ec94d2 100644 (file)
@@ -150,7 +150,8 @@ unsigned long fman_muram_alloc(struct muram_info *muram, size_t size)
  *
  * Free an allocated memory from FM-MURAM partition.
  */
-void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size)
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset,
+                        size_t size)
 {
        unsigned long addr = fman_muram_offset_to_vbase(muram, offset);
 
index 889649a..453bf84 100644 (file)
@@ -46,6 +46,7 @@ unsigned long fman_muram_offset_to_vbase(struct muram_info *muram,
 
 unsigned long fman_muram_alloc(struct muram_info *muram, size_t size);
 
-void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size);
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset,
+                        size_t size);
 
 #endif /* __FM_MURAM_EXT */
index 70c198d..9f3bb50 100644 (file)
@@ -1477,7 +1477,8 @@ EXPORT_SYMBOL(fman_port_cfg_buf_prefix_content);
  */
 int fman_port_disable(struct fman_port *port)
 {
-       u32 __iomem *bmi_cfg_reg, *bmi_status_reg, tmp;
+       u32 __iomem *bmi_cfg_reg, *bmi_status_reg;
+       u32 tmp;
        bool rx_port, failure = false;
        int count;
 
@@ -1553,7 +1554,8 @@ EXPORT_SYMBOL(fman_port_disable);
  */
 int fman_port_enable(struct fman_port *port)
 {
-       u32 __iomem *bmi_cfg_reg, tmp;
+       u32 __iomem *bmi_cfg_reg;
+       u32 tmp;
        bool rx_port;
 
        if (!is_init_done(port->cfg))
@@ -1623,7 +1625,7 @@ static int fman_port_probe(struct platform_device *of_dev)
        struct device_node *fm_node, *port_node;
        struct resource res;
        struct resource *dev_res;
-       const u32 *u32_prop;
+       u32 val;
        int err = 0, lenp;
        enum fman_port_type port_type;
        u16 port_speed;
@@ -1652,28 +1654,20 @@ static int fman_port_probe(struct platform_device *of_dev)
                goto return_err;
        }
 
-       u32_prop = (const u32 *)of_get_property(port_node, "cell-index", &lenp);
-       if (!u32_prop) {
-               dev_err(port->dev, "%s: of_get_property(%s, cell-index) failed\n",
+       err = of_property_read_u32(port_node, "cell-index", &val);
+       if (err) {
+               dev_err(port->dev, "%s: reading cell-index for %s failed\n",
                        __func__, port_node->full_name);
                err = -EINVAL;
                goto return_err;
        }
-       if (WARN_ON(lenp != sizeof(u32))) {
-               err = -EINVAL;
-               goto return_err;
-       }
-       port_id = (u8)fdt32_to_cpu(u32_prop[0]);
-
+       port_id = (u8)val;
        port->dts_params.id = port_id;
 
        if (of_device_is_compatible(port_node, "fsl,fman-v3-port-tx")) {
                port_type = FMAN_PORT_TYPE_TX;
                port_speed = 1000;
-               u32_prop = (const u32 *)of_get_property(port_node,
-                                                       "fsl,fman-10g-port",
-                                                       &lenp);
-               if (u32_prop)
+               if (of_find_property(port_node, "fsl,fman-10g-port", &lenp))
                        port_speed = 10000;
 
        } else if (of_device_is_compatible(port_node, "fsl,fman-v2-port-tx")) {
@@ -1686,9 +1680,7 @@ static int fman_port_probe(struct platform_device *of_dev)
        } else if (of_device_is_compatible(port_node, "fsl,fman-v3-port-rx")) {
                port_type = FMAN_PORT_TYPE_RX;
                port_speed = 1000;
-               u32_prop = (const u32 *)of_get_property(port_node,
-                                                 "fsl,fman-10g-port", &lenp);
-               if (u32_prop)
+               if (of_find_property(port_node, "fsl,fman-10g-port", &lenp))
                        port_speed = 10000;
 
        } else if (of_device_is_compatible(port_node, "fsl,fman-v2-port-rx")) {
@@ -1743,7 +1735,7 @@ static int fman_port_probe(struct platform_device *of_dev)
 
        port->dts_params.base_addr = devm_ioremap(port->dev, res.start,
                                                  resource_size(&res));
-       if (port->dts_params.base_addr == 0)
+       if (!port->dts_params.base_addr)
                dev_err(port->dev, "%s: devm_ioremap() failed\n", __func__);
 
        dev_set_drvdata(&of_dev->dev, port);
@@ -1775,4 +1767,25 @@ static struct platform_driver fman_port_driver = {
        .probe = fman_port_probe,
 };
 
-builtin_platform_driver(fman_port_driver);
+static int __init fman_port_load(void)
+{
+       int err;
+
+       pr_debug("FSL DPAA FMan driver\n");
+
+       err = platform_driver_register(&fman_port_driver);
+       if (err < 0)
+               pr_err("Error, platform_driver_register() = %d\n", err);
+
+       return err;
+}
+module_init(fman_port_load);
+
+static void __exit fman_port_unload(void)
+{
+       platform_driver_unregister(&fman_port_driver);
+}
+module_exit(fman_port_unload);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Freescale DPAA Frame Manager Port driver");
index f9e7aa3..248f5bc 100644 (file)
@@ -80,6 +80,7 @@ void fman_sp_set_buf_pools_in_asc_order_of_buf_sizes(struct fman_ext_pools
                }
        }
 }
+EXPORT_SYMBOL(fman_sp_set_buf_pools_in_asc_order_of_buf_sizes);
 
 int fman_sp_build_buffer_struct(struct fman_sp_int_context_data_copy *
                                int_context_data_copy,
@@ -164,3 +165,5 @@ int fman_sp_build_buffer_struct(struct fman_sp_int_context_data_copy *
 
        return 0;
 }
+EXPORT_SYMBOL(fman_sp_build_buffer_struct);
+
index e33d9d2..8fe6b3e 100644 (file)
@@ -469,9 +469,9 @@ static void adjust_link_memac(struct net_device *net_dev)
 /* Initializes driver's PHY state, and attaches to the PHY.
  * Returns 0 on success.
  */
-static int init_phy(struct net_device *net_dev,
-                   struct mac_device *mac_dev,
-                   void (*adj_lnk)(struct net_device *))
+static struct phy_device *init_phy(struct net_device *net_dev,
+                                  struct mac_device *mac_dev,
+                                  void (*adj_lnk)(struct net_device *))
 {
        struct phy_device       *phy_dev;
        struct mac_priv_s       *priv = mac_dev->priv;
@@ -480,7 +480,7 @@ static int init_phy(struct net_device *net_dev,
                                 priv->phy_if);
        if (!phy_dev) {
                netdev_err(net_dev, "Could not connect to PHY\n");
-               return -ENODEV;
+               return NULL;
        }
 
        /* Remove any features not supported by the controller */
@@ -493,23 +493,23 @@ static int init_phy(struct net_device *net_dev,
 
        mac_dev->phy_dev = phy_dev;
 
-       return 0;
+       return phy_dev;
 }
 
-static int dtsec_init_phy(struct net_device *net_dev,
-                         struct mac_device *mac_dev)
+static struct phy_device *dtsec_init_phy(struct net_device *net_dev,
+                                        struct mac_device *mac_dev)
 {
        return init_phy(net_dev, mac_dev, &adjust_link_dtsec);
 }
 
-static int tgec_init_phy(struct net_device *net_dev,
-                        struct mac_device *mac_dev)
+static struct phy_device *tgec_init_phy(struct net_device *net_dev,
+                                       struct mac_device *mac_dev)
 {
        return init_phy(net_dev, mac_dev, adjust_link_void);
 }
 
-static int memac_init_phy(struct net_device *net_dev,
-                         struct mac_device *mac_dev)
+static struct phy_device *memac_init_phy(struct net_device *net_dev,
+                                        struct mac_device *mac_dev)
 {
        return init_phy(net_dev, mac_dev, &adjust_link_memac);
 }
@@ -583,31 +583,6 @@ static void setup_memac(struct mac_device *mac_dev)
 
 static DEFINE_MUTEX(eth_lock);
 
-static const char phy_str[][11] = {
-       [PHY_INTERFACE_MODE_MII]                = "mii",
-       [PHY_INTERFACE_MODE_GMII]               = "gmii",
-       [PHY_INTERFACE_MODE_SGMII]              = "sgmii",
-       [PHY_INTERFACE_MODE_TBI]                = "tbi",
-       [PHY_INTERFACE_MODE_RMII]               = "rmii",
-       [PHY_INTERFACE_MODE_RGMII]              = "rgmii",
-       [PHY_INTERFACE_MODE_RGMII_ID]           = "rgmii-id",
-       [PHY_INTERFACE_MODE_RGMII_RXID] = "rgmii-rxid",
-       [PHY_INTERFACE_MODE_RGMII_TXID] = "rgmii-txid",
-       [PHY_INTERFACE_MODE_RTBI]               = "rtbi",
-       [PHY_INTERFACE_MODE_XGMII]              = "xgmii"
-};
-
-static phy_interface_t __pure __attribute__((nonnull)) str2phy(const char *str)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(phy_str); i++)
-               if (strcmp(str, phy_str[i]) == 0)
-                       return (phy_interface_t)i;
-
-       return PHY_INTERFACE_MODE_MII;
-}
-
 static const u16 phy2speed[] = {
        [PHY_INTERFACE_MODE_MII]                = SPEED_100,
        [PHY_INTERFACE_MODE_GMII]               = SPEED_1000,
@@ -678,7 +653,7 @@ MODULE_DEVICE_TABLE(of, mac_match);
 
 static int mac_probe(struct platform_device *_of_dev)
 {
-       int                      err, i, lenp, nph;
+       int                      err, i, nph;
        struct device           *dev;
        struct device_node      *mac_node, *dev_node;
        struct mac_device       *mac_dev;
@@ -686,9 +661,9 @@ static int mac_probe(struct platform_device *_of_dev)
        struct resource          res;
        struct mac_priv_s       *priv;
        const u8                *mac_addr;
-       const char              *char_prop;
-       const u32               *u32_prop;
+       u32                      val;
        u8                      fman_id;
+       int                     phy_if;
 
        dev = &_of_dev->dev;
        mac_node = dev->of_node;
@@ -749,16 +724,15 @@ static int mac_probe(struct platform_device *_of_dev)
        }
 
        /* Get the FMan cell-index */
-       u32_prop = of_get_property(dev_node, "cell-index", &lenp);
-       if (!u32_prop) {
-               dev_err(dev, "of_get_property(%s, cell-index) failed\n",
+       err = of_property_read_u32(dev_node, "cell-index", &val);
+       if (err) {
+               dev_err(dev, "failed to read cell-index for %s\n",
                        dev_node->full_name);
                err = -EINVAL;
                goto _return_of_node_put;
        }
-       WARN_ON(lenp != sizeof(u32));
        /* cell-index 0 => FMan id 1 */
-       fman_id = (u8)(fdt32_to_cpu(u32_prop[0]) + 1);
+       fman_id = (u8)(val + 1);
 
        priv->fman = fman_bind(&of_dev->dev);
        if (!priv->fman) {
@@ -805,15 +779,14 @@ static int mac_probe(struct platform_device *_of_dev)
        }
 
        /* Get the cell-index */
-       u32_prop = of_get_property(mac_node, "cell-index", &lenp);
-       if (!u32_prop) {
-               dev_err(dev, "of_get_property(%s, cell-index) failed\n",
+       err = of_property_read_u32(mac_node, "cell-index", &val);
+       if (err) {
+               dev_err(dev, "failed to read cell-index for %s\n",
                        mac_node->full_name);
                err = -EINVAL;
                goto _return_dev_set_drvdata;
        }
-       WARN_ON(lenp != sizeof(u32));
-       priv->cell_index = (u8)fdt32_to_cpu(u32_prop[0]);
+       priv->cell_index = (u8)val;
 
        /* Get the MAC address */
        mac_addr = of_get_mac_address(mac_node);
@@ -870,16 +843,14 @@ static int mac_probe(struct platform_device *_of_dev)
        }
 
        /* Get the PHY connection type */
-       char_prop = (const char *)of_get_property(mac_node,
-                                                 "phy-connection-type", NULL);
-       if (!char_prop) {
+       phy_if = of_get_phy_mode(mac_node);
+       if (phy_if < 0) {
                dev_warn(dev,
-                        "of_get_property(%s, phy-connection-type) failed. Defaulting to MII\n",
+                        "of_get_phy_mode() for %s failed. Defaulting to SGMII\n",
                         mac_node->full_name);
-               priv->phy_if = PHY_INTERFACE_MODE_MII;
-       } else {
-               priv->phy_if = str2phy(char_prop);
+               phy_if = PHY_INTERFACE_MODE_SGMII;
        }
+       priv->phy_if = phy_if;
 
        priv->speed             = phy2speed[priv->phy_if];
        priv->max_speed         = priv->speed;
index 0211cc9..d7313f0 100644 (file)
@@ -58,7 +58,8 @@ struct mac_device {
        bool tx_pause_active;
        bool promisc;
 
-       int (*init_phy)(struct net_device *net_dev, struct mac_device *mac_dev);
+       struct phy_device *(*init_phy)(struct net_device *net_dev,
+                                      struct mac_device *mac_dev);
        int (*init)(struct mac_device *mac_dev);
        int (*start)(struct mac_device *mac_dev);
        int (*stop)(struct mac_device *mac_dev);
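With init_phy() now returning the attached struct phy_device * (NULL on failure) instead of an errno, callers receive the PHY handle directly. A hypothetical caller sketch; the real DPAA Ethernet caller is not part of this hunk:

        struct phy_device *phy_dev;

        phy_dev = mac_dev->init_phy(net_dev, mac_dev);
        if (!phy_dev) {
                netdev_err(net_dev, "init_phy() failed\n");
                return -ENODEV;
        }
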
index a834774..751c126 100644 (file)
@@ -126,7 +126,7 @@ void hns_mac_adjust_link(struct hns_mac_cb *mac_cb, int speed, int duplex)
                        (enum mac_speed)speed, duplex);
                if (ret) {
                        dev_err(mac_cb->dev,
-                               "adjust_link failed,%s mac%d ret = %#x!\n",
+                               "adjust_link failed, %s mac%d ret = %#x!\n",
                                mac_cb->dsaf_dev->ae_dev.name,
                                mac_cb->mac_id, ret);
                        return;
@@ -149,7 +149,7 @@ static int hns_mac_get_inner_port_num(struct hns_mac_cb *mac_cb,
        if (mac_cb->dsaf_dev->dsaf_mode <= DSAF_MODE_ENABLE) {
                if (mac_cb->mac_id != DSAF_MAX_PORT_NUM) {
                        dev_err(mac_cb->dev,
-                               "input invalid,%s mac%d vmid%d !\n",
+                               "input invalid, %s mac%d vmid%d !\n",
                                mac_cb->dsaf_dev->ae_dev.name,
                                mac_cb->mac_id, vmid);
                        return -EINVAL;
@@ -157,19 +157,19 @@ static int hns_mac_get_inner_port_num(struct hns_mac_cb *mac_cb,
        } else if (mac_cb->dsaf_dev->dsaf_mode < DSAF_MODE_MAX) {
                if (mac_cb->mac_id >= DSAF_MAX_PORT_NUM) {
                        dev_err(mac_cb->dev,
-                               "input invalid,%s mac%d vmid%d!\n",
+                               "input invalid, %s mac%d vmid%d!\n",
                                mac_cb->dsaf_dev->ae_dev.name,
                                mac_cb->mac_id, vmid);
                        return -EINVAL;
                }
        } else {
-               dev_err(mac_cb->dev, "dsaf mode invalid,%s mac%d!\n",
+               dev_err(mac_cb->dev, "dsaf mode invalid, %s mac%d!\n",
                        mac_cb->dsaf_dev->ae_dev.name, mac_cb->mac_id);
                return -EINVAL;
        }
 
        if (vmid >= mac_cb->dsaf_dev->rcb_common[0]->max_vfn) {
-               dev_err(mac_cb->dev, "input invalid,%s mac%d vmid%d !\n",
+               dev_err(mac_cb->dev, "input invalid, %s mac%d vmid%d !\n",
                        mac_cb->dsaf_dev->ae_dev.name, mac_cb->mac_id, vmid);
                return -EINVAL;
        }
@@ -196,7 +196,7 @@ static int hns_mac_get_inner_port_num(struct hns_mac_cb *mac_cb,
                tmp_port = vmid;
                break;
        default:
-               dev_err(mac_cb->dev, "dsaf mode invalid,%s mac%d!\n",
+               dev_err(mac_cb->dev, "dsaf mode invalid, %s mac%d!\n",
                        mac_cb->dsaf_dev->ae_dev.name, mac_cb->mac_id);
                return -EINVAL;
        }
@@ -275,7 +275,7 @@ int hns_mac_set_multi(struct hns_mac_cb *mac_cb,
                        ret = hns_dsaf_add_mac_mc_port(dsaf_dev, &mac_entry);
                if (ret) {
                        dev_err(dsaf_dev->dev,
-                               "set mac mc port failed,%s mac%d ret = %#x!\n",
+                               "set mac mc port failed, %s mac%d ret = %#x!\n",
                                mac_cb->dsaf_dev->ae_dev.name,
                                mac_cb->mac_id, ret);
                        return ret;
@@ -305,7 +305,7 @@ int hns_mac_del_mac(struct hns_mac_cb *mac_cb, u32 vfn, char *mac)
                old_mac = &mac_cb->addr_entry_idx[vfn];
        } else {
                dev_err(mac_cb->dev,
-                       "vf queue is too large,%s mac%d queue = %#x!\n",
+                       "vf queue is too large, %s mac%d queue = %#x!\n",
                        mac_cb->dsaf_dev->ae_dev.name, mac_cb->mac_id, vfn);
                return -EINVAL;
        }
@@ -547,7 +547,7 @@ int hns_mac_set_autoneg(struct hns_mac_cb *mac_cb, u8 enable)
        struct mac_driver *mac_ctrl_drv = hns_mac_get_drv(mac_cb);
 
        if (mac_cb->phy_if == PHY_INTERFACE_MODE_XGMII && enable) {
-               dev_err(mac_cb->dev, "enable autoneg is not allowed!");
+               dev_err(mac_cb->dev, "enabling autoneg is not allowed!\n");
                return -ENOTSUPP;
        }
 
@@ -571,7 +571,7 @@ int hns_mac_set_pauseparam(struct hns_mac_cb *mac_cb, u32 rx_en, u32 tx_en)
 
        if (mac_cb->mac_type == HNAE_PORT_DEBUG) {
                if (is_ver1 && (tx_en || rx_en)) {
-                       dev_err(mac_cb->dev, "macv1 cann't enable tx/rx_pause!");
+                       dev_err(mac_cb->dev, "macv1 can't enable tx/rx_pause!\n");
                        return -EINVAL;
                }
        }
@@ -941,7 +941,7 @@ int hns_mac_get_cfg(struct dsaf_device *dsaf_dev, struct hns_mac_cb *mac_cb)
        ret = hns_mac_get_mode(mac_cb->phy_if);
        if (ret < 0) {
                dev_err(dsaf_dev->dev,
-                       "hns_mac_get_mode failed,mac%d ret = %#x!\n",
+                       "hns_mac_get_mode failed, mac%d ret = %#x!\n",
                        mac_cb->mac_id, ret);
                return ret;
        }
index ad4ab97..4a62ffd 100644 (file)
@@ -2323,6 +2323,41 @@ free_netdev:
        return err;
 }
 
+static int mtk_get_chip_id(struct mtk_eth *eth, u32 *chip_id)
+{
+       u32 val[2], id[4];
+
+       regmap_read(eth->ethsys, ETHSYS_CHIPID0_3, &val[0]);
+       regmap_read(eth->ethsys, ETHSYS_CHIPID4_7, &val[1]);
+
+       id[3] = ((val[0] >> 16) & 0xff) - '0';
+       id[2] = ((val[0] >> 24) & 0xff) - '0';
+       id[1] = (val[1] & 0xff) - '0';
+       id[0] = ((val[1] >> 8) & 0xff) - '0';
+
+       *chip_id = (id[3] * 1000) + (id[2] * 100) +
+                  (id[1] * 10) + id[0];
+
+       if (!(*chip_id)) {
+               dev_err(eth->dev, "failed to get chip id\n");
+               return -ENODEV;
+       }
+
+       dev_info(eth->dev, "chip id = %d\n", *chip_id);
+
+       return 0;
+}
+
+static bool mtk_is_hwlro_supported(struct mtk_eth *eth)
+{
+       switch (eth->chip_id) {
+       case MT7623_ETH:
+               return true;
+       }
+
+       return false;
+}
+
 static int mtk_probe(struct platform_device *pdev)
 {
        struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
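mtk_get_chip_id() above recovers the numeric SoC ID from the ASCII chip name held in the two ETHSYS chip-id registers, and mtk_is_hwlro_supported() then enables hardware LRO only on MT7623, replacing the old "mediatek,hwlro" DT property. A worked example of the decode, assuming (illustratively) that the registers hold the string "MT7623":

        /* Illustrative register values only: "MT76" in CHIPID0_3, "23" in CHIPID4_7. */
        u32 val0 = 0x3637544d;                     /* 'M','T','7','6' */
        u32 val1 = 0x00003332;                     /* '2','3' */
        u32 id3 = ((val0 >> 16) & 0xff) - '0';     /* '7' -> 7 (thousands) */
        u32 id2 = ((val0 >> 24) & 0xff) - '0';     /* '6' -> 6 (hundreds) */
        u32 id1 = (val1 & 0xff) - '0';             /* '2' -> 2 (tens) */
        u32 id0 = ((val1 >> 8) & 0xff) - '0';      /* '3' -> 3 (ones) */
        u32 chip_id = id3 * 1000 + id2 * 100 + id1 * 10 + id0;   /* 7623 */
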
@@ -2362,8 +2397,6 @@ static int mtk_probe(struct platform_device *pdev)
                return PTR_ERR(eth->pctl);
        }
 
-       eth->hwlro = of_property_read_bool(pdev->dev.of_node, "mediatek,hwlro");
-
        for (i = 0; i < 3; i++) {
                eth->irq[i] = platform_get_irq(pdev, i);
                if (eth->irq[i] < 0) {
@@ -2388,6 +2421,12 @@ static int mtk_probe(struct platform_device *pdev)
        if (err)
                return err;
 
+       err = mtk_get_chip_id(eth, &eth->chip_id);
+       if (err)
+               return err;
+
+       eth->hwlro = mtk_is_hwlro_supported(eth);
+
        for_each_child_of_node(pdev->dev.of_node, mac_np) {
                if (!of_device_is_compatible(mac_np,
                                             "mediatek,eth-mac"))
index 3003195..99b1c8e 100644 (file)
 #define GPIO_BIAS_CTRL         0xed0
 #define GPIO_DRV_SEL10         0xf00
 
+/* ethernet subsystem chip id register */
+#define ETHSYS_CHIPID0_3       0x0
+#define ETHSYS_CHIPID4_7       0x4
+#define MT7623_ETH             7623
+
 /* ethernet subsystem config register */
 #define ETHSYS_SYSCFG0         0x14
 #define SYSCFG0_GE_MASK                0x3
@@ -534,6 +539,7 @@ struct mtk_eth {
        unsigned long                   sysclk;
        struct regmap                   *ethsys;
        struct regmap                   *pctl;
+       u32                             chip_id;
        bool                            hwlro;
        atomic_t                        dma_refcnt;
        struct mtk_tx_ring              tx_ring;
index 9ba568d..d7720bf 100644 (file)
@@ -26,6 +26,7 @@ config QCA7000
 
 config QCOM_EMAC
        tristate "Qualcomm Technologies, Inc. EMAC Gigabit Ethernet support"
+       depends on HAS_DMA && HAS_IOMEM
        select CRC32
        select PHYLIB
        ---help---
index c3e85ac..054a8dd 100644 (file)
@@ -30,6 +30,8 @@
 
 #define AM33XX_GMII_SEL_RMII2_IO_CLK_EN        BIT(7)
 #define AM33XX_GMII_SEL_RMII1_IO_CLK_EN        BIT(6)
+#define AM33XX_GMII_SEL_RGMII2_IDMODE  BIT(5)
+#define AM33XX_GMII_SEL_RGMII1_IDMODE  BIT(4)
 
 #define GMII_SEL_MODE_MASK             0x3
 
@@ -48,6 +50,7 @@ static void cpsw_gmii_sel_am3352(struct cpsw_phy_sel_priv *priv,
        u32 reg;
        u32 mask;
        u32 mode = 0;
+       bool rgmii_id = false;
 
        reg = readl(priv->gmii_sel);
 
@@ -57,10 +60,14 @@ static void cpsw_gmii_sel_am3352(struct cpsw_phy_sel_priv *priv,
                break;
 
        case PHY_INTERFACE_MODE_RGMII:
+               mode = AM33XX_GMII_SEL_MODE_RGMII;
+               break;
+
        case PHY_INTERFACE_MODE_RGMII_ID:
        case PHY_INTERFACE_MODE_RGMII_RXID:
        case PHY_INTERFACE_MODE_RGMII_TXID:
                mode = AM33XX_GMII_SEL_MODE_RGMII;
+               rgmii_id = true;
                break;
 
        default:
@@ -83,6 +90,13 @@ static void cpsw_gmii_sel_am3352(struct cpsw_phy_sel_priv *priv,
                        mode |= AM33XX_GMII_SEL_RMII2_IO_CLK_EN;
        }
 
+       if (rgmii_id) {
+               if (slave == 0)
+                       mode |= AM33XX_GMII_SEL_RGMII1_IDMODE;
+               else
+                       mode |= AM33XX_GMII_SEL_RGMII2_IDMODE;
+       }
+
        reg &= ~mask;
        reg |= mode;
 
index bc258d7..272f2b1 100644 (file)
@@ -1769,7 +1769,7 @@ static int ps3_gelic_driver_probe(struct ps3_system_bus_device *dev)
        gelic_ether_setup_netdev_ops(netdev, &card->napi);
        result = gelic_net_setup_netdev(netdev, card);
        if (result) {
-               dev_dbg(&dev->core, "%s: setup_netdev failed %d",
+               dev_dbg(&dev->core, "%s: setup_netdev failed %d\n",
                        __func__, result);
                goto fail_setup_netdev;
        }
index 69e2a83..35f9f97 100644 (file)
@@ -818,7 +818,7 @@ static irqreturn_t axienet_tx_irq(int irq, void *_ndev)
                goto out;
        }
        if (!(status & XAXIDMA_IRQ_ALL_MASK))
-               dev_err(&ndev->dev, "No interrupts asserted in Tx path");
+               dev_err(&ndev->dev, "No interrupts asserted in Tx path\n");
        if (status & XAXIDMA_IRQ_ERROR_MASK) {
                dev_err(&ndev->dev, "DMA Tx error 0x%x\n", status);
                dev_err(&ndev->dev, "Current BD is at: 0x%x\n",
@@ -867,7 +867,7 @@ static irqreturn_t axienet_rx_irq(int irq, void *_ndev)
                goto out;
        }
        if (!(status & XAXIDMA_IRQ_ALL_MASK))
-               dev_err(&ndev->dev, "No interrupts asserted in Rx path");
+               dev_err(&ndev->dev, "No interrupts asserted in Rx path\n");
        if (status & XAXIDMA_IRQ_ERROR_MASK) {
                dev_err(&ndev->dev, "DMA Rx error 0x%x\n", status);
                dev_err(&ndev->dev, "Current BD is at: 0x%x\n",
index 5078a0d..2651c8d 100644 (file)
@@ -142,6 +142,7 @@ config MDIO_THUNDER
 
 config MDIO_XGENE
        tristate "APM X-Gene SoC MDIO bus controller"
+       depends on ARCH_XGENE || COMPILE_TEST
        help
          This module provides a driver for the MDIO busses found in the
          APM X-Gene SoC's.
@@ -320,13 +321,6 @@ config XILINX_GMII2RGMII
          the Reduced Gigabit Media Independent Interface(RGMII) between
          Ethernet physical media devices and the Gigabit Ethernet controller.
 
-config MDIO_XGENE
-       tristate "APM X-Gene SoC MDIO bus controller"
-       depends on ARCH_XGENE || COMPILE_TEST
-       help
-         This module provides a driver for the MDIO busses found in the
-         APM X-Gene SoC's.
-
 endif # PHYLIB
 
 config MICREL_KS8995MA
index a17573e..77a6671 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/phy.h>
 #include <linux/of.h>
 #include <dt-bindings/net/mscc-phy-vsc8531.h>
+#include <linux/netdevice.h>
 
 enum rgmii_rx_clock_delay {
        RGMII_RX_CLK_DELAY_0_2_NS = 0,
@@ -37,6 +38,7 @@ enum rgmii_rx_clock_delay {
 
 #define MII_VSC85XX_INT_MASK             25
 #define MII_VSC85XX_INT_MASK_MASK        0xa000
+#define MII_VSC85XX_INT_MASK_WOL         0x0040
 #define MII_VSC85XX_INT_STATUS           26
 
 #define MSCC_PHY_WOL_MAC_CONTROL          27
@@ -52,6 +54,17 @@ enum rgmii_rx_clock_delay {
 #define RGMII_RX_CLK_DELAY_MASK                  0x0070
 #define RGMII_RX_CLK_DELAY_POS           4
 
+#define MSCC_PHY_WOL_LOWER_MAC_ADDR      21
+#define MSCC_PHY_WOL_MID_MAC_ADDR        22
+#define MSCC_PHY_WOL_UPPER_MAC_ADDR      23
+#define MSCC_PHY_WOL_LOWER_PASSWD        24
+#define MSCC_PHY_WOL_MID_PASSWD                  25
+#define MSCC_PHY_WOL_UPPER_PASSWD        26
+
+#define MSCC_PHY_WOL_MAC_CONTROL         27
+#define SECURE_ON_ENABLE                 0x8000
+#define SECURE_ON_PASSWD_LEN_4           0x4000
+
 /* Microsemi PHY ID's */
 #define PHY_ID_VSC8531                   0x00070570
 #define PHY_ID_VSC8541                   0x00070770
@@ -81,6 +94,117 @@ static int vsc85xx_phy_page_set(struct phy_device *phydev, u8 page)
        return rc;
 }
 
+static int vsc85xx_wol_set(struct phy_device *phydev,
+                          struct ethtool_wolinfo *wol)
+{
+       int rc;
+       u16 reg_val;
+       u8  i;
+       u16 pwd[3] = {0, 0, 0};
+       struct ethtool_wolinfo *wol_conf = wol;
+       u8 *mac_addr = phydev->attached_dev->dev_addr;
+
+       mutex_lock(&phydev->lock);
+       rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
+       if (rc != 0)
+               goto out_unlock;
+
+       if (wol->wolopts & WAKE_MAGIC) {
+               /* Store the device address for the magic packet */
+               for (i = 0; i < ARRAY_SIZE(pwd); i++)
+                       pwd[i] = mac_addr[5 - (i * 2 + 1)] << 8 |
+                                mac_addr[5 - i * 2];
+               phy_write(phydev, MSCC_PHY_WOL_LOWER_MAC_ADDR, pwd[0]);
+               phy_write(phydev, MSCC_PHY_WOL_MID_MAC_ADDR, pwd[1]);
+               phy_write(phydev, MSCC_PHY_WOL_UPPER_MAC_ADDR, pwd[2]);
+       } else {
+               phy_write(phydev, MSCC_PHY_WOL_LOWER_MAC_ADDR, 0);
+               phy_write(phydev, MSCC_PHY_WOL_MID_MAC_ADDR, 0);
+               phy_write(phydev, MSCC_PHY_WOL_UPPER_MAC_ADDR, 0);
+       }
+
+       if (wol_conf->wolopts & WAKE_MAGICSECURE) {
+               for (i = 0; i < ARRAY_SIZE(pwd); i++)
+                       pwd[i] = wol_conf->sopass[5 - (i * 2 + 1)] << 8 |
+                                wol_conf->sopass[5 - i * 2];
+               phy_write(phydev, MSCC_PHY_WOL_LOWER_PASSWD, pwd[0]);
+               phy_write(phydev, MSCC_PHY_WOL_MID_PASSWD, pwd[1]);
+               phy_write(phydev, MSCC_PHY_WOL_UPPER_PASSWD, pwd[2]);
+       } else {
+               phy_write(phydev, MSCC_PHY_WOL_LOWER_PASSWD, 0);
+               phy_write(phydev, MSCC_PHY_WOL_MID_PASSWD, 0);
+               phy_write(phydev, MSCC_PHY_WOL_UPPER_PASSWD, 0);
+       }
+
+       reg_val = phy_read(phydev, MSCC_PHY_WOL_MAC_CONTROL);
+       if (wol_conf->wolopts & WAKE_MAGICSECURE)
+               reg_val |= SECURE_ON_ENABLE;
+       else
+               reg_val &= ~SECURE_ON_ENABLE;
+       phy_write(phydev, MSCC_PHY_WOL_MAC_CONTROL, reg_val);
+
+       rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
+       if (rc != 0)
+               goto out_unlock;
+
+       if (wol->wolopts & WAKE_MAGIC) {
+               /* Enable the WOL interrupt */
+               reg_val = phy_read(phydev, MII_VSC85XX_INT_MASK);
+               reg_val |= MII_VSC85XX_INT_MASK_WOL;
+               rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val);
+               if (rc != 0)
+                       goto out_unlock;
+       } else {
+               /* Disable the WOL interrupt */
+               reg_val = phy_read(phydev, MII_VSC85XX_INT_MASK);
+               reg_val &= (~MII_VSC85XX_INT_MASK_WOL);
+               rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val);
+               if (rc != 0)
+                       goto out_unlock;
+       }
+       /* Clear WOL interrupt status */
+       reg_val = phy_read(phydev, MII_VSC85XX_INT_STATUS);
+
+out_unlock:
+       mutex_unlock(&phydev->lock);
+
+       return rc;
+}
+
+static void vsc85xx_wol_get(struct phy_device *phydev,
+                           struct ethtool_wolinfo *wol)
+{
+       int rc;
+       u16 reg_val;
+       u8  i;
+       u16 pwd[3] = {0, 0, 0};
+       struct ethtool_wolinfo *wol_conf = wol;
+
+       mutex_lock(&phydev->lock);
+       rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2);
+       if (rc != 0)
+               goto out_unlock;
+
+       reg_val = phy_read(phydev, MSCC_PHY_WOL_MAC_CONTROL);
+       if (reg_val & SECURE_ON_ENABLE)
+               wol_conf->wolopts |= WAKE_MAGICSECURE;
+       if (wol_conf->wolopts & WAKE_MAGICSECURE) {
+               pwd[0] = phy_read(phydev, MSCC_PHY_WOL_LOWER_PASSWD);
+               pwd[1] = phy_read(phydev, MSCC_PHY_WOL_MID_PASSWD);
+               pwd[2] = phy_read(phydev, MSCC_PHY_WOL_UPPER_PASSWD);
+               for (i = 0; i < ARRAY_SIZE(pwd); i++) {
+                       wol_conf->sopass[5 - i * 2] = pwd[i] & 0x00ff;
+                       wol_conf->sopass[5 - (i * 2 + 1)] = (pwd[i] & 0xff00)
+                                                           >> 8;
+               }
+       }
+
+       rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
+
+out_unlock:
+       mutex_unlock(&phydev->lock);
+}
+
 static u8 edge_rate_magic_get(u16 vddmac,
                              int slowdown)
 {
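vsc85xx_wol_set() above packs the six-byte station address (and, for SecureOn, the six-byte password) into three 16-bit PHY registers on the extended page. A worked example of the address packing for a hypothetical MAC address 00:11:22:33:44:55:

        u8 mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; /* hypothetical */
        u16 lower = mac[4] << 8 | mac[5];   /* 0x4455 -> MSCC_PHY_WOL_LOWER_MAC_ADDR */
        u16 mid   = mac[2] << 8 | mac[3];   /* 0x2233 -> MSCC_PHY_WOL_MID_MAC_ADDR */
        u16 upper = mac[0] << 8 | mac[1];   /* 0x0011 -> MSCC_PHY_WOL_UPPER_MAC_ADDR */
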
@@ -301,6 +425,8 @@ static struct phy_driver vsc85xx_driver[] = {
        .suspend        = &genphy_suspend,
        .resume         = &genphy_resume,
        .probe          = &vsc85xx_probe,
+       .set_wol        = &vsc85xx_wol_set,
+       .get_wol        = &vsc85xx_wol_get,
 },
 {
        .phy_id         = PHY_ID_VSC8541,
@@ -318,6 +444,8 @@ static struct phy_driver vsc85xx_driver[] = {
        .suspend        = &genphy_suspend,
        .resume         = &genphy_resume,
        .probe          = &vsc85xx_probe,
+       .set_wol        = &vsc85xx_wol_set,
+       .get_wol        = &vsc85xx_wol_get,
 }
 
 };
index 5fbf83d..6564753 100644 (file)
@@ -295,11 +295,11 @@ free_ucc_pram:
        qe_muram_free(priv->ucc_pram_offset);
 free_tx_bd:
        dma_free_coherent(priv->dev,
-                         TX_BD_RING_LEN * sizeof(struct qe_bd),
+                         TX_BD_RING_LEN * sizeof(struct qe_bd *),
                          priv->tx_bd_base, priv->dma_tx_bd);
 free_rx_bd:
        dma_free_coherent(priv->dev,
-                         RX_BD_RING_LEN * sizeof(struct qe_bd),
+                         RX_BD_RING_LEN * sizeof(struct qe_bd *),
                          priv->rx_bd_base, priv->dma_rx_bd);
 free_uccf:
        ucc_fast_free(priv->uccf);
@@ -688,7 +688,7 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv)
 
        if (priv->rx_bd_base) {
                dma_free_coherent(priv->dev,
-                                 RX_BD_RING_LEN * sizeof(struct qe_bd),
+                                 RX_BD_RING_LEN * sizeof(struct qe_bd *),
                                  priv->rx_bd_base, priv->dma_rx_bd);
 
                priv->rx_bd_base = NULL;
@@ -697,7 +697,7 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv)
 
        if (priv->tx_bd_base) {
                dma_free_coherent(priv->dev,
-                                 TX_BD_RING_LEN * sizeof(struct qe_bd),
+                                 TX_BD_RING_LEN * sizeof(struct qe_bd *),
                                  priv->tx_bd_base, priv->dma_tx_bd);
 
                priv->tx_bd_base = NULL;
index 11e02be..d49798a 100644 (file)
@@ -1,3 +1,3 @@
 obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
 
-xen-netback-y := netback.o xenbus.o interface.o hash.o
+xen-netback-y := netback.o xenbus.o interface.o hash.o rx.o
index b38fb2c..cf68149 100644 (file)
@@ -91,13 +91,6 @@ struct xenvif_rx_meta {
  */
 #define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1)
 
-/* It's possible for an skb to have a maximal number of frags
- * but still be less than MAX_BUFFER_OFFSET in size. Thus the
- * worst-case number of copy operations is MAX_XEN_SKB_FRAGS per
- * ring slot.
- */
-#define MAX_GRANT_COPY_OPS (MAX_XEN_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
-
 #define NETBACK_INVALID_HANDLE -1
 
 /* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
@@ -133,6 +126,15 @@ struct xenvif_stats {
        unsigned long tx_frag_overflow;
 };
 
+#define COPY_BATCH_SIZE 64
+
+struct xenvif_copy_state {
+       struct gnttab_copy op[COPY_BATCH_SIZE];
+       RING_IDX idx[COPY_BATCH_SIZE];
+       unsigned int num;
+       struct sk_buff_head *completed;
+};
+
 struct xenvif_queue { /* Per-queue data for xenvif */
        unsigned int id; /* Queue ID, 0-based */
        char name[QUEUE_NAME_SIZE]; /* DEVNAME-qN */
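The new xenvif_copy_state above replaces the per-queue grant_copy_op and meta arrays with a small batch of up to COPY_BATCH_SIZE grant-copy operations; the code that fills and flushes it lives in the newly added rx.c, which is not part of this hunk. A hedged sketch of what a batch flush might look like, assuming gnttab_batch_copy() is used to issue the copies:

        /* Sketch only; the real flush logic is in the new rx.c. */
        static void example_rx_copy_flush(struct xenvif_queue *queue)
        {
                if (!queue->rx_copy.num)
                        return;

                gnttab_batch_copy(queue->rx_copy.op, queue->rx_copy.num);
                queue->rx_copy.num = 0;
        }
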
@@ -189,12 +191,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */
        unsigned long last_rx_time;
        bool stalled;
 
-       struct gnttab_copy grant_copy_op[MAX_GRANT_COPY_OPS];
-
-       /* We create one meta structure per ring request we consume, so
-        * the maximum number is the same as the ring size.
-        */
-       struct xenvif_rx_meta meta[XEN_NETIF_RX_RING_SIZE];
+       struct xenvif_copy_state rx_copy;
 
        /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
        unsigned long   credit_bytes;
@@ -260,7 +257,6 @@ struct xenvif {
 
        /* Frontend feature information. */
        int gso_mask;
-       int gso_prefix_mask;
 
        u8 can_sg:1;
        u8 ip_csum:1;
@@ -359,6 +355,7 @@ int xenvif_dealloc_kthread(void *data);
 
 irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
 
+void xenvif_rx_action(struct xenvif_queue *queue);
 void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 
 void xenvif_carrier_on(struct xenvif *vif);
index fb50c6d..74dc2bf 100644 (file)
@@ -149,17 +149,8 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb,
        struct xenvif *vif = netdev_priv(dev);
        unsigned int size = vif->hash.size;
 
-       if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE) {
-               u16 index = fallback(dev, skb) % dev->real_num_tx_queues;
-
-               /* Make sure there is no hash information in the socket
-                * buffer otherwise it would be incorrectly forwarded
-                * to the frontend.
-                */
-               skb_clear_hash(skb);
-
-               return index;
-       }
+       if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
+               return fallback(dev, skb) % dev->real_num_tx_queues;
 
        xenvif_set_skb_hash(vif, skb);
 
@@ -208,6 +199,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
        cb = XENVIF_RX_CB(skb);
        cb->expires = jiffies + vif->drain_timeout;
 
+       /* If there is no hash algorithm configured then make sure there
+        * is no hash information in the socket buffer otherwise it
+        * would be incorrectly forwarded to the frontend.
+        */
+       if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
+               skb_clear_hash(skb);
+
        xenvif_rx_queue_tail(queue, skb);
        xenvif_kick_thread(queue);
 
@@ -319,9 +317,9 @@ static netdev_features_t xenvif_fix_features(struct net_device *dev,
 
        if (!vif->can_sg)
                features &= ~NETIF_F_SG;
-       if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV4))
+       if (~(vif->gso_mask) & GSO_BIT(TCPV4))
                features &= ~NETIF_F_TSO;
-       if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV6))
+       if (~(vif->gso_mask) & GSO_BIT(TCPV6))
                features &= ~NETIF_F_TSO6;
        if (!vif->ip_csum)
                features &= ~NETIF_F_IP_CSUM;
@@ -467,7 +465,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
        dev->netdev_ops = &xenvif_netdev_ops;
        dev->hw_features = NETIF_F_SG |
                NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-               NETIF_F_TSO | NETIF_F_TSO6;
+               NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_FRAGLIST;
        dev->features = dev->hw_features | NETIF_F_RXCSUM;
        dev->ethtool_ops = &xenvif_ethtool_ops;
 
index 3d0c989..47b4810 100644 (file)
@@ -106,13 +106,6 @@ static void push_tx_responses(struct xenvif_queue *queue);
 
 static inline int tx_work_todo(struct xenvif_queue *queue);
 
-static struct xen_netif_rx_response *make_rx_response(struct xenvif_queue *queue,
-                                            u16      id,
-                                            s8       st,
-                                            u16      offset,
-                                            u16      size,
-                                            u16      flags);
-
 static inline unsigned long idx_to_pfn(struct xenvif_queue *queue,
                                       u16 idx)
 {
@@ -155,571 +148,11 @@ static inline pending_ring_idx_t pending_index(unsigned i)
        return i & (MAX_PENDING_REQS-1);
 }
 
-static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
-{
-       RING_IDX prod, cons;
-       struct sk_buff *skb;
-       int needed;
-
-       skb = skb_peek(&queue->rx_queue);
-       if (!skb)
-               return false;
-
-       needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE);
-       if (skb_is_gso(skb))
-               needed++;
-       if (skb->sw_hash)
-               needed++;
-
-       do {
-               prod = queue->rx.sring->req_prod;
-               cons = queue->rx.req_cons;
-
-               if (prod - cons >= needed)
-                       return true;
-
-               queue->rx.sring->req_event = prod + 1;
-
-               /* Make sure event is visible before we check prod
-                * again.
-                */
-               mb();
-       } while (queue->rx.sring->req_prod != prod);
-
-       return false;
-}
-
-void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&queue->rx_queue.lock, flags);
-
-       __skb_queue_tail(&queue->rx_queue, skb);
-
-       queue->rx_queue_len += skb->len;
-       if (queue->rx_queue_len > queue->rx_queue_max)
-               netif_tx_stop_queue(netdev_get_tx_queue(queue->vif->dev, queue->id));
-
-       spin_unlock_irqrestore(&queue->rx_queue.lock, flags);
-}
-
-static struct sk_buff *xenvif_rx_dequeue(struct xenvif_queue *queue)
-{
-       struct sk_buff *skb;
-
-       spin_lock_irq(&queue->rx_queue.lock);
-
-       skb = __skb_dequeue(&queue->rx_queue);
-       if (skb)
-               queue->rx_queue_len -= skb->len;
-
-       spin_unlock_irq(&queue->rx_queue.lock);
-
-       return skb;
-}
-
-static void xenvif_rx_queue_maybe_wake(struct xenvif_queue *queue)
-{
-       spin_lock_irq(&queue->rx_queue.lock);
-
-       if (queue->rx_queue_len < queue->rx_queue_max)
-               netif_tx_wake_queue(netdev_get_tx_queue(queue->vif->dev, queue->id));
-
-       spin_unlock_irq(&queue->rx_queue.lock);
-}
-
-
-static void xenvif_rx_queue_purge(struct xenvif_queue *queue)
-{
-       struct sk_buff *skb;
-       while ((skb = xenvif_rx_dequeue(queue)) != NULL)
-               kfree_skb(skb);
-}
-
-static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue)
-{
-       struct sk_buff *skb;
-
-       for(;;) {
-               skb = skb_peek(&queue->rx_queue);
-               if (!skb)
-                       break;
-               if (time_before(jiffies, XENVIF_RX_CB(skb)->expires))
-                       break;
-               xenvif_rx_dequeue(queue);
-               kfree_skb(skb);
-       }
-}
-
-struct netrx_pending_operations {
-       unsigned copy_prod, copy_cons;
-       unsigned meta_prod, meta_cons;
-       struct gnttab_copy *copy;
-       struct xenvif_rx_meta *meta;
-       int copy_off;
-       grant_ref_t copy_gref;
-};
-
-static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue,
-                                                struct netrx_pending_operations *npo)
-{
-       struct xenvif_rx_meta *meta;
-       struct xen_netif_rx_request req;
-
-       RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
-
-       meta = npo->meta + npo->meta_prod++;
-       meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
-       meta->gso_size = 0;
-       meta->size = 0;
-       meta->id = req.id;
-
-       npo->copy_off = 0;
-       npo->copy_gref = req.gref;
-
-       return meta;
-}
-
-struct gop_frag_copy {
-       struct xenvif_queue *queue;
-       struct netrx_pending_operations *npo;
-       struct xenvif_rx_meta *meta;
-       int head;
-       int gso_type;
-       int protocol;
-       int hash_present;
-
-       struct page *page;
-};
-
-static void xenvif_setup_copy_gop(unsigned long gfn,
-                                 unsigned int offset,
-                                 unsigned int *len,
-                                 struct gop_frag_copy *info)
-{
-       struct gnttab_copy *copy_gop;
-       struct xen_page_foreign *foreign;
-       /* Convenient aliases */
-       struct xenvif_queue *queue = info->queue;
-       struct netrx_pending_operations *npo = info->npo;
-       struct page *page = info->page;
-
-       BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
-
-       if (npo->copy_off == MAX_BUFFER_OFFSET)
-               info->meta = get_next_rx_buffer(queue, npo);
-
-       if (npo->copy_off + *len > MAX_BUFFER_OFFSET)
-               *len = MAX_BUFFER_OFFSET - npo->copy_off;
-
-       copy_gop = npo->copy + npo->copy_prod++;
-       copy_gop->flags = GNTCOPY_dest_gref;
-       copy_gop->len = *len;
-
-       foreign = xen_page_foreign(page);
-       if (foreign) {
-               copy_gop->source.domid = foreign->domid;
-               copy_gop->source.u.ref = foreign->gref;
-               copy_gop->flags |= GNTCOPY_source_gref;
-       } else {
-               copy_gop->source.domid = DOMID_SELF;
-               copy_gop->source.u.gmfn = gfn;
-       }
-       copy_gop->source.offset = offset;
-
-       copy_gop->dest.domid = queue->vif->domid;
-       copy_gop->dest.offset = npo->copy_off;
-       copy_gop->dest.u.ref = npo->copy_gref;
-
-       npo->copy_off += *len;
-       info->meta->size += *len;
-
-       if (!info->head)
-               return;
-
-       /* Leave a gap for the GSO descriptor. */
-       if ((1 << info->gso_type) & queue->vif->gso_mask)
-               queue->rx.req_cons++;
-
-       /* Leave a gap for the hash extra segment. */
-       if (info->hash_present)
-               queue->rx.req_cons++;
-
-       info->head = 0; /* There must be something in this buffer now */
-}
-
-static void xenvif_gop_frag_copy_grant(unsigned long gfn,
-                                      unsigned offset,
-                                      unsigned int len,
-                                      void *data)
-{
-       unsigned int bytes;
-
-       while (len) {
-               bytes = len;
-               xenvif_setup_copy_gop(gfn, offset, &bytes, data);
-               offset += bytes;
-               len -= bytes;
-       }
-}
-
-/*
- * Set up the grant operations for this fragment. If it's a flipping
- * interface, we also set up the unmap request from here.
- */
-static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb,
-                                struct netrx_pending_operations *npo,
-                                struct page *page, unsigned long size,
-                                unsigned long offset, int *head)
-{
-       struct gop_frag_copy info = {
-               .queue = queue,
-               .npo = npo,
-               .head = *head,
-               .gso_type = XEN_NETIF_GSO_TYPE_NONE,
-               /* xenvif_set_skb_hash() will have either set a s/w
-                * hash or cleared the hash depending on
-                * whether the the frontend wants a hash for this skb.
-                */
-               .hash_present = skb->sw_hash,
-       };
-       unsigned long bytes;
-
-       if (skb_is_gso(skb)) {
-               if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
-                       info.gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
-               else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
-                       info.gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
-       }
-
-       /* Data must not cross a page boundary. */
-       BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
-
-       info.meta = npo->meta + npo->meta_prod - 1;
-
-       /* Skip unused frames from start of page */
-       page += offset >> PAGE_SHIFT;
-       offset &= ~PAGE_MASK;
-
-       while (size > 0) {
-               BUG_ON(offset >= PAGE_SIZE);
-
-               bytes = PAGE_SIZE - offset;
-               if (bytes > size)
-                       bytes = size;
-
-               info.page = page;
-               gnttab_foreach_grant_in_range(page, offset, bytes,
-                                             xenvif_gop_frag_copy_grant,
-                                             &info);
-               size -= bytes;
-               offset = 0;
-
-               /* Next page */
-               if (size) {
-                       BUG_ON(!PageCompound(page));
-                       page++;
-               }
-       }
-
-       *head = info.head;
-}
-
-/*
- * Prepare an SKB to be transmitted to the frontend.
- *
- * This function is responsible for allocating grant operations, meta
- * structures, etc.
- *
- * It returns the number of meta structures consumed. The number of
- * ring slots used is always equal to the number of meta slots used
- * plus the number of GSO descriptors used. Currently, we use either
- * zero GSO descriptors (for non-GSO packets) or one descriptor (for
- * frontend-side LRO).
- */
-static int xenvif_gop_skb(struct sk_buff *skb,
-                         struct netrx_pending_operations *npo,
-                         struct xenvif_queue *queue)
-{
-       struct xenvif *vif = netdev_priv(skb->dev);
-       int nr_frags = skb_shinfo(skb)->nr_frags;
-       int i;
-       struct xen_netif_rx_request req;
-       struct xenvif_rx_meta *meta;
-       unsigned char *data;
-       int head = 1;
-       int old_meta_prod;
-       int gso_type;
-
-       old_meta_prod = npo->meta_prod;
-
-       gso_type = XEN_NETIF_GSO_TYPE_NONE;
-       if (skb_is_gso(skb)) {
-               if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
-                       gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
-               else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
-                       gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
-       }
-
-       /* Set up a GSO prefix descriptor, if necessary */
-       if ((1 << gso_type) & vif->gso_prefix_mask) {
-               RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
-               meta = npo->meta + npo->meta_prod++;
-               meta->gso_type = gso_type;
-               meta->gso_size = skb_shinfo(skb)->gso_size;
-               meta->size = 0;
-               meta->id = req.id;
-       }
-
-       RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
-       meta = npo->meta + npo->meta_prod++;
-
-       if ((1 << gso_type) & vif->gso_mask) {
-               meta->gso_type = gso_type;
-               meta->gso_size = skb_shinfo(skb)->gso_size;
-       } else {
-               meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
-               meta->gso_size = 0;
-       }
-
-       meta->size = 0;
-       meta->id = req.id;
-       npo->copy_off = 0;
-       npo->copy_gref = req.gref;
-
-       data = skb->data;
-       while (data < skb_tail_pointer(skb)) {
-               unsigned int offset = offset_in_page(data);
-               unsigned int len = PAGE_SIZE - offset;
-
-               if (data + len > skb_tail_pointer(skb))
-                       len = skb_tail_pointer(skb) - data;
-
-               xenvif_gop_frag_copy(queue, skb, npo,
-                                    virt_to_page(data), len, offset, &head);
-               data += len;
-       }
-
-       for (i = 0; i < nr_frags; i++) {
-               xenvif_gop_frag_copy(queue, skb, npo,
-                                    skb_frag_page(&skb_shinfo(skb)->frags[i]),
-                                    skb_frag_size(&skb_shinfo(skb)->frags[i]),
-                                    skb_shinfo(skb)->frags[i].page_offset,
-                                    &head);
-       }
-
-       return npo->meta_prod - old_meta_prod;
-}
-
-/*
- * This is a twin to xenvif_gop_skb.  Assume that xenvif_gop_skb was
- * used to set up the operations on the top of
- * netrx_pending_operations, which have since been done.  Check that
- * they didn't give any errors and advance over them.
- */
-static int xenvif_check_gop(struct xenvif *vif, int nr_meta_slots,
-                           struct netrx_pending_operations *npo)
-{
-       struct gnttab_copy     *copy_op;
-       int status = XEN_NETIF_RSP_OKAY;
-       int i;
-
-       for (i = 0; i < nr_meta_slots; i++) {
-               copy_op = npo->copy + npo->copy_cons++;
-               if (copy_op->status != GNTST_okay) {
-                       netdev_dbg(vif->dev,
-                                  "Bad status %d from copy to DOM%d.\n",
-                                  copy_op->status, vif->domid);
-                       status = XEN_NETIF_RSP_ERROR;
-               }
-       }
-
-       return status;
-}
-
-static void xenvif_add_frag_responses(struct xenvif_queue *queue, int status,
-                                     struct xenvif_rx_meta *meta,
-                                     int nr_meta_slots)
-{
-       int i;
-       unsigned long offset;
-
-       /* No fragments used */
-       if (nr_meta_slots <= 1)
-               return;
-
-       nr_meta_slots--;
-
-       for (i = 0; i < nr_meta_slots; i++) {
-               int flags;
-               if (i == nr_meta_slots - 1)
-                       flags = 0;
-               else
-                       flags = XEN_NETRXF_more_data;
-
-               offset = 0;
-               make_rx_response(queue, meta[i].id, status, offset,
-                                meta[i].size, flags);
-       }
-}
-
 void xenvif_kick_thread(struct xenvif_queue *queue)
 {
        wake_up(&queue->wq);
 }
 
-static void xenvif_rx_action(struct xenvif_queue *queue)
-{
-       struct xenvif *vif = queue->vif;
-       s8 status;
-       u16 flags;
-       struct xen_netif_rx_response *resp;
-       struct sk_buff_head rxq;
-       struct sk_buff *skb;
-       LIST_HEAD(notify);
-       int ret;
-       unsigned long offset;
-       bool need_to_notify = false;
-
-       struct netrx_pending_operations npo = {
-               .copy  = queue->grant_copy_op,
-               .meta  = queue->meta,
-       };
-
-       skb_queue_head_init(&rxq);
-
-       while (xenvif_rx_ring_slots_available(queue)
-              && (skb = xenvif_rx_dequeue(queue)) != NULL) {
-               queue->last_rx_time = jiffies;
-
-               XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo, queue);
-
-               __skb_queue_tail(&rxq, skb);
-       }
-
-       BUG_ON(npo.meta_prod > ARRAY_SIZE(queue->meta));
-
-       if (!npo.copy_prod)
-               goto done;
-
-       BUG_ON(npo.copy_prod > MAX_GRANT_COPY_OPS);
-       gnttab_batch_copy(queue->grant_copy_op, npo.copy_prod);
-
-       while ((skb = __skb_dequeue(&rxq)) != NULL) {
-               struct xen_netif_extra_info *extra = NULL;
-
-               if ((1 << queue->meta[npo.meta_cons].gso_type) &
-                   vif->gso_prefix_mask) {
-                       resp = RING_GET_RESPONSE(&queue->rx,
-                                                queue->rx.rsp_prod_pvt++);
-
-                       resp->flags = XEN_NETRXF_gso_prefix | XEN_NETRXF_more_data;
-
-                       resp->offset = queue->meta[npo.meta_cons].gso_size;
-                       resp->id = queue->meta[npo.meta_cons].id;
-                       resp->status = XENVIF_RX_CB(skb)->meta_slots_used;
-
-                       npo.meta_cons++;
-                       XENVIF_RX_CB(skb)->meta_slots_used--;
-               }
-
-
-               queue->stats.tx_bytes += skb->len;
-               queue->stats.tx_packets++;
-
-               status = xenvif_check_gop(vif,
-                                         XENVIF_RX_CB(skb)->meta_slots_used,
-                                         &npo);
-
-               if (XENVIF_RX_CB(skb)->meta_slots_used == 1)
-                       flags = 0;
-               else
-                       flags = XEN_NETRXF_more_data;
-
-               if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
-                       flags |= XEN_NETRXF_csum_blank | XEN_NETRXF_data_validated;
-               else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
-                       /* remote but checksummed. */
-                       flags |= XEN_NETRXF_data_validated;
-
-               offset = 0;
-               resp = make_rx_response(queue, queue->meta[npo.meta_cons].id,
-                                       status, offset,
-                                       queue->meta[npo.meta_cons].size,
-                                       flags);
-
-               if ((1 << queue->meta[npo.meta_cons].gso_type) &
-                   vif->gso_mask) {
-                       extra = (struct xen_netif_extra_info *)
-                               RING_GET_RESPONSE(&queue->rx,
-                                                 queue->rx.rsp_prod_pvt++);
-
-                       resp->flags |= XEN_NETRXF_extra_info;
-
-                       extra->u.gso.type = queue->meta[npo.meta_cons].gso_type;
-                       extra->u.gso.size = queue->meta[npo.meta_cons].gso_size;
-                       extra->u.gso.pad = 0;
-                       extra->u.gso.features = 0;
-
-                       extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
-                       extra->flags = 0;
-               }
-
-               if (skb->sw_hash) {
-                       /* Since the skb got here via xenvif_select_queue()
-                        * we know that the hash has been re-calculated
-                        * according to a configuration set by the frontend
-                        * and therefore we know that it is legitimate to
-                        * pass it to the frontend.
-                        */
-                       if (resp->flags & XEN_NETRXF_extra_info)
-                               extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
-                       else
-                               resp->flags |= XEN_NETRXF_extra_info;
-
-                       extra = (struct xen_netif_extra_info *)
-                               RING_GET_RESPONSE(&queue->rx,
-                                                 queue->rx.rsp_prod_pvt++);
-
-                       extra->u.hash.algorithm =
-                               XEN_NETIF_CTRL_HASH_ALGORITHM_TOEPLITZ;
-
-                       if (skb->l4_hash)
-                               extra->u.hash.type =
-                                       skb->protocol == htons(ETH_P_IP) ?
-                                       _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP :
-                                       _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP;
-                       else
-                               extra->u.hash.type =
-                                       skb->protocol == htons(ETH_P_IP) ?
-                                       _XEN_NETIF_CTRL_HASH_TYPE_IPV4 :
-                                       _XEN_NETIF_CTRL_HASH_TYPE_IPV6;
-
-                       *(uint32_t *)extra->u.hash.value =
-                               skb_get_hash_raw(skb);
-
-                       extra->type = XEN_NETIF_EXTRA_TYPE_HASH;
-                       extra->flags = 0;
-               }
-
-               xenvif_add_frag_responses(queue, status,
-                                         queue->meta + npo.meta_cons + 1,
-                                         XENVIF_RX_CB(skb)->meta_slots_used);
-
-               RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->rx, ret);
-
-               need_to_notify |= !!ret;
-
-               npo.meta_cons += XENVIF_RX_CB(skb)->meta_slots_used;
-               dev_kfree_skb(skb);
-       }
-
-done:
-       if (need_to_notify)
-               notify_remote_via_irq(queue->rx_irq);
-}
-
 void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
 {
        int more_to_do;
@@ -1951,29 +1384,6 @@ static void push_tx_responses(struct xenvif_queue *queue)
                notify_remote_via_irq(queue->tx_irq);
 }
 
-static struct xen_netif_rx_response *make_rx_response(struct xenvif_queue *queue,
-                                            u16      id,
-                                            s8       st,
-                                            u16      offset,
-                                            u16      size,
-                                            u16      flags)
-{
-       RING_IDX i = queue->rx.rsp_prod_pvt;
-       struct xen_netif_rx_response *resp;
-
-       resp = RING_GET_RESPONSE(&queue->rx, i);
-       resp->offset     = offset;
-       resp->flags      = flags;
-       resp->id         = id;
-       resp->status     = (s16)size;
-       if (st < 0)
-               resp->status = (s16)st;
-
-       queue->rx.rsp_prod_pvt = ++i;
-
-       return resp;
-}
-
 void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx)
 {
        int ret;
@@ -2055,170 +1465,6 @@ err:
        return err;
 }
 
-static void xenvif_queue_carrier_off(struct xenvif_queue *queue)
-{
-       struct xenvif *vif = queue->vif;
-
-       queue->stalled = true;
-
-       /* At least one queue has stalled? Disable the carrier. */
-       spin_lock(&vif->lock);
-       if (vif->stalled_queues++ == 0) {
-               netdev_info(vif->dev, "Guest Rx stalled");
-               netif_carrier_off(vif->dev);
-       }
-       spin_unlock(&vif->lock);
-}
-
-static void xenvif_queue_carrier_on(struct xenvif_queue *queue)
-{
-       struct xenvif *vif = queue->vif;
-
-       queue->last_rx_time = jiffies; /* Reset Rx stall detection. */
-       queue->stalled = false;
-
-       /* All queues are ready? Enable the carrier. */
-       spin_lock(&vif->lock);
-       if (--vif->stalled_queues == 0) {
-               netdev_info(vif->dev, "Guest Rx ready");
-               netif_carrier_on(vif->dev);
-       }
-       spin_unlock(&vif->lock);
-}
-
-static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue)
-{
-       RING_IDX prod, cons;
-
-       prod = queue->rx.sring->req_prod;
-       cons = queue->rx.req_cons;
-
-       return !queue->stalled && prod - cons < 1
-               && time_after(jiffies,
-                             queue->last_rx_time + queue->vif->stall_timeout);
-}
-
-static bool xenvif_rx_queue_ready(struct xenvif_queue *queue)
-{
-       RING_IDX prod, cons;
-
-       prod = queue->rx.sring->req_prod;
-       cons = queue->rx.req_cons;
-
-       return queue->stalled && prod - cons >= 1;
-}
-
-static bool xenvif_have_rx_work(struct xenvif_queue *queue)
-{
-       return xenvif_rx_ring_slots_available(queue)
-               || (queue->vif->stall_timeout &&
-                   (xenvif_rx_queue_stalled(queue)
-                    || xenvif_rx_queue_ready(queue)))
-               || kthread_should_stop()
-               || queue->vif->disabled;
-}
-
-static long xenvif_rx_queue_timeout(struct xenvif_queue *queue)
-{
-       struct sk_buff *skb;
-       long timeout;
-
-       skb = skb_peek(&queue->rx_queue);
-       if (!skb)
-               return MAX_SCHEDULE_TIMEOUT;
-
-       timeout = XENVIF_RX_CB(skb)->expires - jiffies;
-       return timeout < 0 ? 0 : timeout;
-}
-
-/* Wait until the guest Rx thread has work.
- *
- * The timeout needs to be adjusted based on the current head of the
- * queue (and not just the head at the beginning).  In particular, if
- * the queue is initially empty an infinite timeout is used and this
- * needs to be reduced when a skb is queued.
- *
- * This cannot be done with wait_event_timeout() because it only
- * calculates the timeout once.
- */
-static void xenvif_wait_for_rx_work(struct xenvif_queue *queue)
-{
-       DEFINE_WAIT(wait);
-
-       if (xenvif_have_rx_work(queue))
-               return;
-
-       for (;;) {
-               long ret;
-
-               prepare_to_wait(&queue->wq, &wait, TASK_INTERRUPTIBLE);
-               if (xenvif_have_rx_work(queue))
-                       break;
-               ret = schedule_timeout(xenvif_rx_queue_timeout(queue));
-               if (!ret)
-                       break;
-       }
-       finish_wait(&queue->wq, &wait);
-}
-
-int xenvif_kthread_guest_rx(void *data)
-{
-       struct xenvif_queue *queue = data;
-       struct xenvif *vif = queue->vif;
-
-       if (!vif->stall_timeout)
-               xenvif_queue_carrier_on(queue);
-
-       for (;;) {
-               xenvif_wait_for_rx_work(queue);
-
-               if (kthread_should_stop())
-                       break;
-
-               /* This frontend is found to be rogue, disable it in
-                * kthread context. Currently this is only set when
-                * netback finds out frontend sends malformed packet,
-                * but we cannot disable the interface in softirq
-                * context so we defer it here, if this thread is
-                * associated with queue 0.
-                */
-               if (unlikely(vif->disabled && queue->id == 0)) {
-                       xenvif_carrier_off(vif);
-                       break;
-               }
-
-               if (!skb_queue_empty(&queue->rx_queue))
-                       xenvif_rx_action(queue);
-
-               /* If the guest hasn't provided any Rx slots for a
-                * while it's probably not responsive, drop the
-                * carrier so packets are dropped earlier.
-                */
-               if (vif->stall_timeout) {
-                       if (xenvif_rx_queue_stalled(queue))
-                               xenvif_queue_carrier_off(queue);
-                       else if (xenvif_rx_queue_ready(queue))
-                               xenvif_queue_carrier_on(queue);
-               }
-
-               /* Queued packets may have foreign pages from other
-                * domains.  These cannot be queued indefinitely as
-                * this would starve guests of grant refs and transmit
-                * slots.
-                */
-               xenvif_rx_queue_drop_expired(queue);
-
-               xenvif_rx_queue_maybe_wake(queue);
-
-               cond_resched();
-       }
-
-       /* Bin any remaining skbs */
-       xenvif_rx_queue_purge(queue);
-
-       return 0;
-}
-
 static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
 {
        /* Dealloc thread must remain running until all inflight
diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
new file mode 100644 (file)
index 0000000..8e9ade6
--- /dev/null
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2016 Citrix Systems Inc.
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "common.h"
+
+#include <linux/kthread.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+
+static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
+{
+       RING_IDX prod, cons;
+       struct sk_buff *skb;
+       int needed;
+
+       skb = skb_peek(&queue->rx_queue);
+       if (!skb)
+               return false;
+
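+       /* Worst case slot count: one slot per XEN_PAGE_SIZE of data, plus
+        * one slot each for an optional GSO and hash extra_info segment.
+        */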
+       needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE);
+       if (skb_is_gso(skb))
+               needed++;
+       if (skb->sw_hash)
+               needed++;
+
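+       /* If there are not yet enough requests, ask the frontend for an
+        * event at req_prod + 1 and re-check in case new requests were
+        * posted while the event was being armed.
+        */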
+       do {
+               prod = queue->rx.sring->req_prod;
+               cons = queue->rx.req_cons;
+
+               if (prod - cons >= needed)
+                       return true;
+
+               queue->rx.sring->req_event = prod + 1;
+
+               /* Make sure event is visible before we check prod
+                * again.
+                */
+               mb();
+       } while (queue->rx.sring->req_prod != prod);
+
+       return false;
+}
+
+void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&queue->rx_queue.lock, flags);
+
+       __skb_queue_tail(&queue->rx_queue, skb);
+
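+       /* Stop the transmit queue once the internal Rx queue exceeds its
+        * byte limit; it is woken again from xenvif_rx_dequeue().
+        */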
+       queue->rx_queue_len += skb->len;
+       if (queue->rx_queue_len > queue->rx_queue_max) {
+               struct net_device *dev = queue->vif->dev;
+
+               netif_tx_stop_queue(netdev_get_tx_queue(dev, queue->id));
+       }
+
+       spin_unlock_irqrestore(&queue->rx_queue.lock, flags);
+}
+
+static struct sk_buff *xenvif_rx_dequeue(struct xenvif_queue *queue)
+{
+       struct sk_buff *skb;
+
+       spin_lock_irq(&queue->rx_queue.lock);
+
+       skb = __skb_dequeue(&queue->rx_queue);
+       if (skb) {
+               queue->rx_queue_len -= skb->len;
+               if (queue->rx_queue_len < queue->rx_queue_max) {
+                       struct netdev_queue *txq;
+
+                       txq = netdev_get_tx_queue(queue->vif->dev, queue->id);
+                       netif_tx_wake_queue(txq);
+               }
+       }
+
+       spin_unlock_irq(&queue->rx_queue.lock);
+
+       return skb;
+}
+
+static void xenvif_rx_queue_purge(struct xenvif_queue *queue)
+{
+       struct sk_buff *skb;
+
+       while ((skb = xenvif_rx_dequeue(queue)) != NULL)
+               kfree_skb(skb);
+}
+
+static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue)
+{
+       struct sk_buff *skb;
+
+       for (;;) {
+               skb = skb_peek(&queue->rx_queue);
+               if (!skb)
+                       break;
+               if (time_before(jiffies, XENVIF_RX_CB(skb)->expires))
+                       break;
+               xenvif_rx_dequeue(queue);
+               kfree_skb(skb);
+       }
+}
+
+static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
+{
+       unsigned int i;
+       int notify;
+
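+       /* Perform all outstanding grant copy operations as a single batch. */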
+       gnttab_batch_copy(queue->rx_copy.op, queue->rx_copy.num);
+
+       for (i = 0; i < queue->rx_copy.num; i++) {
+               struct gnttab_copy *op;
+
+               op = &queue->rx_copy.op[i];
+
+               /* If the copy failed, overwrite the status field in
+                * the corresponding response.
+                */
+               if (unlikely(op->status != GNTST_okay)) {
+                       struct xen_netif_rx_response *rsp;
+
+                       rsp = RING_GET_RESPONSE(&queue->rx,
+                                               queue->rx_copy.idx[i]);
+                       rsp->status = op->status;
+               }
+       }
+
+       queue->rx_copy.num = 0;
+
+       /* Push responses for all completed packets. */
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->rx, notify);
+       if (notify)
+               notify_remote_via_irq(queue->rx_irq);
+
+       __skb_queue_purge(queue->rx_copy.completed);
+}
+
+static void xenvif_rx_copy_add(struct xenvif_queue *queue,
+                              struct xen_netif_rx_request *req,
+                              unsigned int offset, void *data, size_t len)
+{
+       struct gnttab_copy *op;
+       struct page *page;
+       struct xen_page_foreign *foreign;
+
+       if (queue->rx_copy.num == COPY_BATCH_SIZE)
+               xenvif_rx_copy_flush(queue);
+
+       op = &queue->rx_copy.op[queue->rx_copy.num];
+
+       page = virt_to_page(data);
+
+       op->flags = GNTCOPY_dest_gref;
+
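+       /* Pages granted by another domain must be copied by grant
+        * reference; local pages are addressed by GMFN.
+        */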
+       foreign = xen_page_foreign(page);
+       if (foreign) {
+               op->source.domid = foreign->domid;
+               op->source.u.ref = foreign->gref;
+               op->flags |= GNTCOPY_source_gref;
+       } else {
+               op->source.u.gmfn = virt_to_gfn(data);
+               op->source.domid  = DOMID_SELF;
+       }
+
+       op->source.offset = xen_offset_in_page(data);
+       op->dest.u.ref    = req->gref;
+       op->dest.domid    = queue->vif->domid;
+       op->dest.offset   = offset;
+       op->len           = len;
+
+       queue->rx_copy.idx[queue->rx_copy.num] = queue->rx.req_cons;
+       queue->rx_copy.num++;
+}
+
+static unsigned int xenvif_gso_type(struct sk_buff *skb)
+{
+       if (skb_is_gso(skb)) {
+               if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+                       return XEN_NETIF_GSO_TYPE_TCPV4;
+               else
+                       return XEN_NETIF_GSO_TYPE_TCPV6;
+       }
+       return XEN_NETIF_GSO_TYPE_NONE;
+}
+
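+/* State carried across the ring slots that make up a single packet. */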
+struct xenvif_pkt_state {
+       struct sk_buff *skb;
+       size_t remaining_len;
+       struct sk_buff *frag_iter;
+       int frag; /* frag == -1 => frag_iter->head */
+       unsigned int frag_offset;
+       struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+       unsigned int extra_count;
+       unsigned int slot;
+};
+
+static void xenvif_rx_next_skb(struct xenvif_queue *queue,
+                              struct xenvif_pkt_state *pkt)
+{
+       struct sk_buff *skb;
+       unsigned int gso_type;
+
+       skb = xenvif_rx_dequeue(queue);
+
+       queue->stats.tx_bytes += skb->len;
+       queue->stats.tx_packets++;
+
+       /* Reset packet state. */
+       memset(pkt, 0, sizeof(struct xenvif_pkt_state));
+
+       pkt->skb = skb;
+       pkt->frag_iter = skb;
+       pkt->remaining_len = skb->len;
+       pkt->frag = -1;
+
+       gso_type = xenvif_gso_type(skb);
+       if ((1 << gso_type) & queue->vif->gso_mask) {
+               struct xen_netif_extra_info *extra;
+
+               extra = &pkt->extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+               extra->u.gso.type = gso_type;
+               extra->u.gso.size = skb_shinfo(skb)->gso_size;
+               extra->u.gso.pad = 0;
+               extra->u.gso.features = 0;
+               extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
+               extra->flags = 0;
+
+               pkt->extra_count++;
+       }
+
+       if (skb->sw_hash) {
+               struct xen_netif_extra_info *extra;
+
+               extra = &pkt->extras[XEN_NETIF_EXTRA_TYPE_HASH - 1];
+
+               extra->u.hash.algorithm =
+                       XEN_NETIF_CTRL_HASH_ALGORITHM_TOEPLITZ;
+
+               if (skb->l4_hash)
+                       extra->u.hash.type =
+                               skb->protocol == htons(ETH_P_IP) ?
+                               _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP :
+                               _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP;
+               else
+                       extra->u.hash.type =
+                               skb->protocol == htons(ETH_P_IP) ?
+                               _XEN_NETIF_CTRL_HASH_TYPE_IPV4 :
+                               _XEN_NETIF_CTRL_HASH_TYPE_IPV6;
+
+               *(uint32_t *)extra->u.hash.value = skb_get_hash_raw(skb);
+
+               extra->type = XEN_NETIF_EXTRA_TYPE_HASH;
+               extra->flags = 0;
+
+               pkt->extra_count++;
+       }
+}
+
+static void xenvif_rx_complete(struct xenvif_queue *queue,
+                              struct xenvif_pkt_state *pkt)
+{
+       /* All responses are ready to be pushed. */
+       queue->rx.rsp_prod_pvt = queue->rx.req_cons;
+
+       __skb_queue_tail(queue->rx_copy.completed, pkt->skb);
+}
+
+static void xenvif_rx_next_frag(struct xenvif_pkt_state *pkt)
+{
+       struct sk_buff *frag_iter = pkt->frag_iter;
+       unsigned int nr_frags = skb_shinfo(frag_iter)->nr_frags;
+
+       pkt->frag++;
+       pkt->frag_offset = 0;
+
+       if (pkt->frag >= nr_frags) {
+               if (frag_iter == pkt->skb)
+                       pkt->frag_iter = skb_shinfo(frag_iter)->frag_list;
+               else
+                       pkt->frag_iter = frag_iter->next;
+
+               pkt->frag = -1;
+       }
+}
+
+static void xenvif_rx_next_chunk(struct xenvif_queue *queue,
+                                struct xenvif_pkt_state *pkt,
+                                unsigned int offset, void **data,
+                                size_t *len)
+{
+       struct sk_buff *frag_iter = pkt->frag_iter;
+       void *frag_data;
+       size_t frag_len, chunk_len;
+
+       BUG_ON(!frag_iter);
+
+       if (pkt->frag == -1) {
+               frag_data = frag_iter->data;
+               frag_len = skb_headlen(frag_iter);
+       } else {
+               skb_frag_t *frag = &skb_shinfo(frag_iter)->frags[pkt->frag];
+
+               frag_data = skb_frag_address(frag);
+               frag_len = skb_frag_size(frag);
+       }
+
+       frag_data += pkt->frag_offset;
+       frag_len -= pkt->frag_offset;
+
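+       /* Clamp the chunk to the space left in the destination slot and
+        * to the remainder of the source page, whichever is smaller.
+        */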
+       chunk_len = min(frag_len, XEN_PAGE_SIZE - offset);
+       chunk_len = min(chunk_len,
+                       XEN_PAGE_SIZE - xen_offset_in_page(frag_data));
+
+       pkt->frag_offset += chunk_len;
+
+       /* Advance to next frag? */
+       if (frag_len == chunk_len)
+               xenvif_rx_next_frag(pkt);
+
+       *data = frag_data;
+       *len = chunk_len;
+}
+
+static void xenvif_rx_data_slot(struct xenvif_queue *queue,
+                               struct xenvif_pkt_state *pkt,
+                               struct xen_netif_rx_request *req,
+                               struct xen_netif_rx_response *rsp)
+{
+       unsigned int offset = 0;
+       unsigned int flags;
+
+       do {
+               size_t len;
+               void *data;
+
+               xenvif_rx_next_chunk(queue, pkt, offset, &data, &len);
+               xenvif_rx_copy_add(queue, req, offset, data, len);
+
+               offset += len;
+               pkt->remaining_len -= len;
+
+       } while (offset < XEN_PAGE_SIZE && pkt->remaining_len > 0);
+
+       if (pkt->remaining_len > 0)
+               flags = XEN_NETRXF_more_data;
+       else
+               flags = 0;
+
+       if (pkt->slot == 0) {
+               struct sk_buff *skb = pkt->skb;
+
+               if (skb->ip_summed == CHECKSUM_PARTIAL)
+                       flags |= XEN_NETRXF_csum_blank |
+                                XEN_NETRXF_data_validated;
+               else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+                       flags |= XEN_NETRXF_data_validated;
+
+               if (pkt->extra_count != 0)
+                       flags |= XEN_NETRXF_extra_info;
+       }
+
+       rsp->offset = 0;
+       rsp->flags = flags;
+       rsp->id = req->id;
+       rsp->status = (s16)offset;
+}
+
+static void xenvif_rx_extra_slot(struct xenvif_queue *queue,
+                                struct xenvif_pkt_state *pkt,
+                                struct xen_netif_rx_request *req,
+                                struct xen_netif_rx_response *rsp)
+{
+       struct xen_netif_extra_info *extra = (void *)rsp;
+       unsigned int i;
+
+       pkt->extra_count--;
+
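+       /* Emit the first pending extra segment, flagging that more follow
+        * if other extras remain for this packet.
+        */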
+       for (i = 0; i < ARRAY_SIZE(pkt->extras); i++) {
+               if (pkt->extras[i].type) {
+                       *extra = pkt->extras[i];
+
+                       if (pkt->extra_count != 0)
+                               extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+
+                       pkt->extras[i].type = 0;
+                       return;
+               }
+       }
+       BUG();
+}
+
+void xenvif_rx_skb(struct xenvif_queue *queue)
+{
+       struct xenvif_pkt_state pkt;
+
+       xenvif_rx_next_skb(queue, &pkt);
+
+       do {
+               struct xen_netif_rx_request *req;
+               struct xen_netif_rx_response *rsp;
+
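+               /* Requests and responses share the same ring slots, so the
+                * response is written in place over the request being
+                * consumed.
+                */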
+               req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons);
+               rsp = RING_GET_RESPONSE(&queue->rx, queue->rx.req_cons);
+
+               /* Extras must go after the first data slot */
+               if (pkt.slot != 0 && pkt.extra_count != 0)
+                       xenvif_rx_extra_slot(queue, &pkt, req, rsp);
+               else
+                       xenvif_rx_data_slot(queue, &pkt, req, rsp);
+
+               queue->rx.req_cons++;
+               pkt.slot++;
+       } while (pkt.remaining_len > 0 || pkt.extra_count != 0);
+
+       xenvif_rx_complete(queue, &pkt);
+}
+
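+/* Maximum number of packets processed in a single xenvif_rx_action() call. */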
+#define RX_BATCH_SIZE 64
+
+void xenvif_rx_action(struct xenvif_queue *queue)
+{
+       struct sk_buff_head completed_skbs;
+       unsigned int work_done = 0;
+
+       __skb_queue_head_init(&completed_skbs);
+       queue->rx_copy.completed = &completed_skbs;
+
+       while (xenvif_rx_ring_slots_available(queue) &&
+              work_done < RX_BATCH_SIZE) {
+               xenvif_rx_skb(queue);
+               work_done++;
+       }
+
+       /* Flush any pending copies and complete all skbs. */
+       xenvif_rx_copy_flush(queue);
+}
+
+static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue)
+{
+       RING_IDX prod, cons;
+
+       prod = queue->rx.sring->req_prod;
+       cons = queue->rx.req_cons;
+
+       return !queue->stalled &&
+               prod - cons < 1 &&
+               time_after(jiffies,
+                          queue->last_rx_time + queue->vif->stall_timeout);
+}
+
+static bool xenvif_rx_queue_ready(struct xenvif_queue *queue)
+{
+       RING_IDX prod, cons;
+
+       prod = queue->rx.sring->req_prod;
+       cons = queue->rx.req_cons;
+
+       return queue->stalled && prod - cons >= 1;
+}
+
+static bool xenvif_have_rx_work(struct xenvif_queue *queue)
+{
+       return xenvif_rx_ring_slots_available(queue) ||
+               (queue->vif->stall_timeout &&
+                (xenvif_rx_queue_stalled(queue) ||
+                 xenvif_rx_queue_ready(queue))) ||
+               kthread_should_stop() ||
+               queue->vif->disabled;
+}
+
+static long xenvif_rx_queue_timeout(struct xenvif_queue *queue)
+{
+       struct sk_buff *skb;
+       long timeout;
+
+       skb = skb_peek(&queue->rx_queue);
+       if (!skb)
+               return MAX_SCHEDULE_TIMEOUT;
+
+       timeout = XENVIF_RX_CB(skb)->expires - jiffies;
+       return timeout < 0 ? 0 : timeout;
+}
+
+/* Wait until the guest Rx thread has work.
+ *
+ * The timeout needs to be adjusted based on the current head of the
+ * queue (and not just the head at the beginning).  In particular, if
+ * the queue is initially empty, an infinite timeout is used and this
+ * needs to be reduced when an skb is queued.
+ *
+ * This cannot be done with wait_event_timeout() because it only
+ * calculates the timeout once.
+ */
+static void xenvif_wait_for_rx_work(struct xenvif_queue *queue)
+{
+       DEFINE_WAIT(wait);
+
+       if (xenvif_have_rx_work(queue))
+               return;
+
+       for (;;) {
+               long ret;
+
+               prepare_to_wait(&queue->wq, &wait, TASK_INTERRUPTIBLE);
+               if (xenvif_have_rx_work(queue))
+                       break;
+               ret = schedule_timeout(xenvif_rx_queue_timeout(queue));
+               if (!ret)
+                       break;
+       }
+       finish_wait(&queue->wq, &wait);
+}
+
+static void xenvif_queue_carrier_off(struct xenvif_queue *queue)
+{
+       struct xenvif *vif = queue->vif;
+
+       queue->stalled = true;
+
+       /* At least one queue has stalled? Disable the carrier. */
+       spin_lock(&vif->lock);
+       if (vif->stalled_queues++ == 0) {
+               netdev_info(vif->dev, "Guest Rx stalled");
+               netif_carrier_off(vif->dev);
+       }
+       spin_unlock(&vif->lock);
+}
+
+static void xenvif_queue_carrier_on(struct xenvif_queue *queue)
+{
+       struct xenvif *vif = queue->vif;
+
+       queue->last_rx_time = jiffies; /* Reset Rx stall detection. */
+       queue->stalled = false;
+
+       /* All queues are ready? Enable the carrier. */
+       spin_lock(&vif->lock);
+       if (--vif->stalled_queues == 0) {
+               netdev_info(vif->dev, "Guest Rx ready");
+               netif_carrier_on(vif->dev);
+       }
+       spin_unlock(&vif->lock);
+}
+
+int xenvif_kthread_guest_rx(void *data)
+{
+       struct xenvif_queue *queue = data;
+       struct xenvif *vif = queue->vif;
+
+       if (!vif->stall_timeout)
+               xenvif_queue_carrier_on(queue);
+
+       for (;;) {
+               xenvif_wait_for_rx_work(queue);
+
+               if (kthread_should_stop())
+                       break;
+
+               /* This frontend has been found to be rogue; disable it in
+                * kthread context. Currently this is only set when netback
+                * finds that the frontend has sent a malformed packet, but
+                * we cannot disable the interface in softirq context, so we
+                * defer it here, provided this thread is associated with
+                * queue 0.
+                */
+               if (unlikely(vif->disabled && queue->id == 0)) {
+                       xenvif_carrier_off(vif);
+                       break;
+               }
+
+               if (!skb_queue_empty(&queue->rx_queue))
+                       xenvif_rx_action(queue);
+
+               /* If the guest hasn't provided any Rx slots for a
+                * while it's probably not responsive, drop the
+                * carrier so packets are dropped earlier.
+                */
+               if (vif->stall_timeout) {
+                       if (xenvif_rx_queue_stalled(queue))
+                               xenvif_queue_carrier_off(queue);
+                       else if (xenvif_rx_queue_ready(queue))
+                               xenvif_queue_carrier_on(queue);
+               }
+
+               /* Queued packets may have foreign pages from other
+                * domains.  These cannot be queued indefinitely as
+                * this would starve guests of grant refs and transmit
+                * slots.
+                */
+               xenvif_rx_queue_drop_expired(queue);
+
+               cond_resched();
+       }
+
+       /* Bin any remaining skbs */
+       xenvif_rx_queue_purge(queue);
+
+       return 0;
+}
index daf4c78..7056404 100644 (file)
@@ -1135,7 +1135,6 @@ static int read_xenbus_vif_flags(struct backend_info *be)
        vif->can_sg = !!val;
 
        vif->gso_mask = 0;
-       vif->gso_prefix_mask = 0;
 
        if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
                         "%d", &val) < 0)
@@ -1143,32 +1142,12 @@ static int read_xenbus_vif_flags(struct backend_info *be)
        if (val)
                vif->gso_mask |= GSO_BIT(TCPV4);
 
-       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
-                        "%d", &val) < 0)
-               val = 0;
-       if (val)
-               vif->gso_prefix_mask |= GSO_BIT(TCPV4);
-
        if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6",
                         "%d", &val) < 0)
                val = 0;
        if (val)
                vif->gso_mask |= GSO_BIT(TCPV6);
 
-       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6-prefix",
-                        "%d", &val) < 0)
-               val = 0;
-       if (val)
-               vif->gso_prefix_mask |= GSO_BIT(TCPV6);
-
-       if (vif->gso_mask & vif->gso_prefix_mask) {
-               xenbus_dev_fatal(dev, err,
-                                "%s: gso and gso prefix flags are not "
-                                "mutually exclusive",
-                                dev->otherend);
-               return -EOPNOTSUPP;
-       }
-
        if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
                         "%d", &val) < 0)
                val = 0;
index d5dbdb9..9ecfcdc 100644 (file)
@@ -1314,9 +1314,6 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
        if (!VALID_EVTCHN(evtchn))
                return -1;
 
-       if (!xen_support_evtchn_rebind())
-               return -1;
-
        /* Send future instances of this interrupt to other vcpu. */
        bind_vcpu.port = evtchn;
        bind_vcpu.vcpu = xen_vcpu_nr(tcpu);
@@ -1650,20 +1647,15 @@ void xen_callback_vector(void)
 {
        int rc;
        uint64_t callback_via;
-       if (xen_have_vector_callback) {
-               callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
-               rc = xen_set_callback_via(callback_via);
-               if (rc) {
-                       pr_err("Request for Xen HVM callback vector failed\n");
-                       xen_have_vector_callback = 0;
-                       return;
-               }
-               pr_info("Xen HVM callback vector for event delivery is enabled\n");
-               /* in the restore case the vector has already been allocated */
-               if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
-                       alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
-                                       xen_hvm_callback_vector);
-       }
+
+       callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
+       rc = xen_set_callback_via(callback_via);
+       BUG_ON(rc);
+       pr_info("Xen HVM callback vector for event delivery is enabled\n");
+       /* in the restore case the vector has already been allocated */
+       if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
+               alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+                               xen_hvm_callback_vector);
 }
 #else
 void xen_callback_vector(void) {}
index 266c2c7..7ef27c6 100644 (file)
@@ -418,30 +418,18 @@ static int evtchn_fifo_alloc_control_block(unsigned cpu)
        return ret;
 }
 
-static int evtchn_fifo_cpu_notification(struct notifier_block *self,
-                                                 unsigned long action,
-                                                 void *hcpu)
+static int xen_evtchn_cpu_prepare(unsigned int cpu)
 {
-       int cpu = (long)hcpu;
-       int ret = 0;
-
-       switch (action) {
-       case CPU_UP_PREPARE:
-               if (!per_cpu(cpu_control_block, cpu))
-                       ret = evtchn_fifo_alloc_control_block(cpu);
-               break;
-       case CPU_DEAD:
-               __evtchn_fifo_handle_events(cpu, true);
-               break;
-       default:
-               break;
-       }
-       return ret < 0 ? NOTIFY_BAD : NOTIFY_OK;
+       if (!per_cpu(cpu_control_block, cpu))
+               return evtchn_fifo_alloc_control_block(cpu);
+       return 0;
 }
 
-static struct notifier_block evtchn_fifo_cpu_notifier = {
-       .notifier_call  = evtchn_fifo_cpu_notification,
-};
+static int xen_evtchn_cpu_dead(unsigned int cpu)
+{
+       __evtchn_fifo_handle_events(cpu, true);
+       return 0;
+}
 
 int __init xen_evtchn_fifo_init(void)
 {
@@ -456,7 +444,9 @@ int __init xen_evtchn_fifo_init(void)
 
        evtchn_ops = &evtchn_ops_fifo;
 
-       register_cpu_notifier(&evtchn_fifo_cpu_notifier);
+       cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
+                                 "CPUHP_XEN_EVTCHN_PREPARE",
+                                 xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
 out:
        put_cpu();
        return ret;
index cf96666..b59c945 100644 (file)
@@ -42,7 +42,6 @@
 static unsigned long platform_mmio;
 static unsigned long platform_mmio_alloc;
 static unsigned long platform_mmiolen;
-static uint64_t callback_via;
 
 static unsigned long alloc_xen_mmio(unsigned long len)
 {
@@ -55,51 +54,6 @@ static unsigned long alloc_xen_mmio(unsigned long len)
        return addr;
 }
 
-static uint64_t get_callback_via(struct pci_dev *pdev)
-{
-       u8 pin;
-       int irq;
-
-       irq = pdev->irq;
-       if (irq < 16)
-               return irq; /* ISA IRQ */
-
-       pin = pdev->pin;
-
-       /* We don't know the GSI. Specify the PCI INTx line instead. */
-       return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
-               ((uint64_t)pci_domain_nr(pdev->bus) << 32) |
-               ((uint64_t)pdev->bus->number << 16) |
-               ((uint64_t)(pdev->devfn & 0xff) << 8) |
-               ((uint64_t)(pin - 1) & 3);
-}
-
-static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
-{
-       xen_hvm_evtchn_do_upcall();
-       return IRQ_HANDLED;
-}
-
-static int xen_allocate_irq(struct pci_dev *pdev)
-{
-       return request_irq(pdev->irq, do_hvm_evtchn_intr,
-                       IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
-                       "xen-platform-pci", pdev);
-}
-
-static int platform_pci_resume(struct pci_dev *pdev)
-{
-       int err;
-       if (xen_have_vector_callback)
-               return 0;
-       err = xen_set_callback_via(callback_via);
-       if (err) {
-               dev_err(&pdev->dev, "platform_pci_resume failure!\n");
-               return err;
-       }
-       return 0;
-}
-
 static int platform_pci_probe(struct pci_dev *pdev,
                              const struct pci_device_id *ent)
 {
@@ -138,21 +92,6 @@ static int platform_pci_probe(struct pci_dev *pdev,
        platform_mmio = mmio_addr;
        platform_mmiolen = mmio_len;
 
-       if (!xen_have_vector_callback) {
-               ret = xen_allocate_irq(pdev);
-               if (ret) {
-                       dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
-                       goto out;
-               }
-               callback_via = get_callback_via(pdev);
-               ret = xen_set_callback_via(callback_via);
-               if (ret) {
-                       dev_warn(&pdev->dev, "Unable to set the evtchn callback "
-                                        "err=%d\n", ret);
-                       goto out;
-               }
-       }
-
        max_nr_gframes = gnttab_max_grant_frames();
        grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
        ret = gnttab_setup_auto_xlat_frames(grant_frames);
@@ -184,9 +123,6 @@ static struct pci_driver platform_driver = {
        .name =           DRV_NAME,
        .probe =          platform_pci_probe,
        .id_table =       platform_pci_tbl,
-#ifdef CONFIG_PM
-       .resume_early =   platform_pci_resume,
-#endif
 };
 
 static int __init platform_pci_init(void)
index 6881b3c..84106f9 100644 (file)
@@ -215,7 +215,7 @@ static const struct attribute_group xen_compilation_group = {
        .attrs = xen_compile_attrs,
 };
 
-static int __init xen_compilation_init(void)
+static int __init xen_sysfs_compilation_init(void)
 {
        return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
 }
@@ -341,7 +341,7 @@ static const struct attribute_group xen_properties_group = {
        .attrs = xen_properties_attrs,
 };
 
-static int __init xen_properties_init(void)
+static int __init xen_sysfs_properties_init(void)
 {
        return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
 }
@@ -455,7 +455,7 @@ static const struct attribute_group xen_pmu_group = {
        .attrs = xen_pmu_attrs,
 };
 
-static int __init xen_pmu_init(void)
+static int __init xen_sysfs_pmu_init(void)
 {
        return sysfs_create_group(hypervisor_kobj, &xen_pmu_group);
 }
@@ -474,18 +474,18 @@ static int __init hyper_sysfs_init(void)
        ret = xen_sysfs_version_init();
        if (ret)
                goto version_out;
-       ret = xen_compilation_init();
+       ret = xen_sysfs_compilation_init();
        if (ret)
                goto comp_out;
        ret = xen_sysfs_uuid_init();
        if (ret)
                goto uuid_out;
-       ret = xen_properties_init();
+       ret = xen_sysfs_properties_init();
        if (ret)
                goto prop_out;
 #ifdef CONFIG_XEN_HAVE_VPMU
        if (xen_initial_domain()) {
-               ret = xen_pmu_init();
+               ret = xen_sysfs_pmu_init();
                if (ret) {
                        sysfs_remove_group(hypervisor_kobj,
                                           &xen_properties_group);
index 258b7c3..6331a95 100644 (file)
@@ -25,6 +25,8 @@
 #include "conf_space.h"
 #include "conf_space_quirks.h"
 
+#define PCISTUB_DRIVER_NAME "pciback"
+
 static char *pci_devs_to_hide;
 wait_queue_head_t xen_pcibk_aer_wait_queue;
 /*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops,
@@ -149,13 +151,10 @@ static inline void pcistub_device_put(struct pcistub_device *psdev)
        kref_put(&psdev->kref, pcistub_device_release);
 }
 
-static struct pcistub_device *pcistub_device_find(int domain, int bus,
-                                                 int slot, int func)
+static struct pcistub_device *pcistub_device_find_locked(int domain, int bus,
+                                                        int slot, int func)
 {
-       struct pcistub_device *psdev = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pcistub_devices_lock, flags);
+       struct pcistub_device *psdev;
 
        list_for_each_entry(psdev, &pcistub_devices, dev_list) {
                if (psdev->dev != NULL
@@ -163,15 +162,25 @@ static struct pcistub_device *pcistub_device_find(int domain, int bus,
                    && bus == psdev->dev->bus->number
                    && slot == PCI_SLOT(psdev->dev->devfn)
                    && func == PCI_FUNC(psdev->dev->devfn)) {
-                       pcistub_device_get(psdev);
-                       goto out;
+                       return psdev;
                }
        }
 
-       /* didn't find it */
-       psdev = NULL;
+       return NULL;
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+                                                 int slot, int func)
+{
+       struct pcistub_device *psdev;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+       psdev = pcistub_device_find_locked(domain, bus, slot, func);
+       if (psdev)
+               pcistub_device_get(psdev);
 
-out:
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
        return psdev;
 }
@@ -207,16 +216,9 @@ struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
 
        spin_lock_irqsave(&pcistub_devices_lock, flags);
 
-       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
-               if (psdev->dev != NULL
-                   && domain == pci_domain_nr(psdev->dev->bus)
-                   && bus == psdev->dev->bus->number
-                   && slot == PCI_SLOT(psdev->dev->devfn)
-                   && func == PCI_FUNC(psdev->dev->devfn)) {
-                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
-                       break;
-               }
-       }
+       psdev = pcistub_device_find_locked(domain, bus, slot, func);
+       if (psdev)
+               found_dev = pcistub_device_get_pci_dev(pdev, psdev);
 
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
        return found_dev;
@@ -478,15 +480,48 @@ static int __init pcistub_init_devices_late(void)
        return 0;
 }
 
-static int pcistub_seize(struct pci_dev *dev)
+static void pcistub_device_id_add_list(struct pcistub_device_id *new,
+                                      int domain, int bus, unsigned int devfn)
+{
+       struct pcistub_device_id *pci_dev_id;
+       unsigned long flags;
+       int found = 0;
+
+       spin_lock_irqsave(&device_ids_lock, flags);
+
+       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+               if (pci_dev_id->domain == domain && pci_dev_id->bus == bus &&
+                   pci_dev_id->devfn == devfn) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               new->domain = domain;
+               new->bus = bus;
+               new->devfn = devfn;
+               list_add_tail(&new->slot_list, &pcistub_device_ids);
+       }
+
+       spin_unlock_irqrestore(&device_ids_lock, flags);
+
+       if (found)
+               kfree(new);
+}
+
+static int pcistub_seize(struct pci_dev *dev,
+                        struct pcistub_device_id *pci_dev_id)
 {
        struct pcistub_device *psdev;
        unsigned long flags;
        int err = 0;
 
        psdev = pcistub_device_alloc(dev);
-       if (!psdev)
+       if (!psdev) {
+               kfree(pci_dev_id);
                return -ENOMEM;
+       }
 
        spin_lock_irqsave(&pcistub_devices_lock, flags);
 
@@ -507,8 +542,12 @@ static int pcistub_seize(struct pci_dev *dev)
 
        spin_unlock_irqrestore(&pcistub_devices_lock, flags);
 
-       if (err)
+       if (err) {
+               kfree(pci_dev_id);
                pcistub_device_put(psdev);
+       } else if (pci_dev_id)
+               pcistub_device_id_add_list(pci_dev_id, pci_domain_nr(dev->bus),
+                                          dev->bus->number, dev->devfn);
 
        return err;
 }
@@ -517,11 +556,16 @@ static int pcistub_seize(struct pci_dev *dev)
  * other functions that take the sysfs lock. */
 static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)
 {
-       int err = 0;
+       int err = 0, match;
+       struct pcistub_device_id *pci_dev_id = NULL;
 
        dev_dbg(&dev->dev, "probing...\n");
 
-       if (pcistub_match(dev)) {
+       match = pcistub_match(dev);
+
+       if ((dev->driver_override &&
+            !strcmp(dev->driver_override, PCISTUB_DRIVER_NAME)) ||
+           match) {
 
                if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
                    && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
@@ -532,8 +576,16 @@ static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)
                        goto out;
                }
 
+               if (!match) {
+                       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_ATOMIC);
+                       if (!pci_dev_id) {
+                               err = -ENOMEM;
+                               goto out;
+                       }
+               }
+
                dev_info(&dev->dev, "seizing device\n");
-               err = pcistub_seize(dev);
+               err = pcistub_seize(dev, pci_dev_id);
        } else
                /* Didn't find the device */
                err = -ENODEV;
@@ -945,7 +997,7 @@ static const struct pci_error_handlers xen_pcibk_error_handler = {
 static struct pci_driver xen_pcibk_pci_driver = {
        /* The name should be xen_pciback, but until the tools are updated
         * we will keep it as pciback. */
-       .name = "pciback",
+       .name = PCISTUB_DRIVER_NAME,
        .id_table = pcistub_ids,
        .probe = pcistub_probe,
        .remove = pcistub_remove,
@@ -1012,7 +1064,6 @@ static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
 static int pcistub_device_id_add(int domain, int bus, int slot, int func)
 {
        struct pcistub_device_id *pci_dev_id;
-       unsigned long flags;
        int rc = 0, devfn = PCI_DEVFN(slot, func);
 
        if (slot < 0) {
@@ -1042,16 +1093,10 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func)
        if (!pci_dev_id)
                return -ENOMEM;
 
-       pci_dev_id->domain = domain;
-       pci_dev_id->bus = bus;
-       pci_dev_id->devfn = devfn;
-
        pr_debug("wants to seize %04x:%02x:%02x.%d\n",
                 domain, bus, slot, func);
 
-       spin_lock_irqsave(&device_ids_lock, flags);
-       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
-       spin_unlock_irqrestore(&device_ids_lock, flags);
+       pcistub_device_id_add_list(pci_dev_id, domain, bus, devfn);
 
        return 0;
 }
index c7efddf..4c09d93 100644 (file)
@@ -89,7 +89,7 @@ config BINFMT_SCRIPT
 
 config BINFMT_FLAT
        bool "Kernel support for flat binaries"
-       depends on !MMU || M68K
+       depends on !MMU || ARM || M68K
        depends on !FRV || BROKEN
        help
          Support uClinux FLAT format binaries.
index 59bdaa7..477928b 100644 (file)
@@ -418,7 +418,7 @@ static void afs_deliver_to_call(struct afs_call *call)
                                                     &call->abort_code);
                        if (ret == -EINPROGRESS || ret == -EAGAIN)
                                return;
-                       if (ret == 1) {
+                       if (ret == 1 || ret < 0) {
                                call->state = AFS_CALL_COMPLETE;
                                goto done;
                        }
index 431fd7e..e44271d 100644 (file)
@@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
                memcpy(&wq->name, &qstr, sizeof(struct qstr));
                wq->dev = autofs4_get_dev(sbi);
                wq->ino = autofs4_get_ino(sbi);
-               wq->uid = current_uid();
-               wq->gid = current_gid();
+               wq->uid = current_real_cred()->uid;
+               wq->gid = current_real_cred()->gid;
                wq->pid = pid;
                wq->tgid = tgid;
                wq->status = -EINTR; /* Status return if interrupted */
index 993dc6f..cc025f8 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
        return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-               struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+               struct page *to, unsigned long vaddr)
 {
        struct blk_dax_ctl dax = {
-               .sector = to_sector(bh, inode),
-               .size = bh->b_size,
+               .sector = sector,
+               .size = size,
        };
-       struct block_device *bdev = bh->b_bdev;
        void *vto;
 
        if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-                       struct buffer_head *bh, void **entryp,
-                       struct vm_area_struct *vma, struct vm_fault *vmf)
+               struct block_device *bdev, sector_t sector, size_t size,
+               void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       struct block_device *bdev = bh->b_bdev;
        struct blk_dax_ctl dax = {
-               .sector = to_sector(bh, mapping->host),
-               .size = bh->b_size,
+               .sector = sector,
+               .size = size,
        };
        void *ret;
        void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
                if (buffer_written(&bh))
-                       error = copy_user_bh(new_page, inode, &bh, vaddr);
+                       error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+                                       bh.b_size, new_page, vaddr);
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
        /* Filesystem should not return unwritten buffers to us! */
        WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-       error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+       error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+                       bh.b_size, &entry, vma, vmf);
  unlock_entry:
        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
        return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       struct iov_iter *iter = data;
+       loff_t end = pos + length, done = 0;
+       ssize_t ret = 0;
+
+       if (iov_iter_rw(iter) == READ) {
+               end = min(end, i_size_read(inode));
+               if (pos >= end)
+                       return 0;
+
+               if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+                       return iov_iter_zero(min(length, end - pos), iter);
+       }
+
+       if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+               return -EIO;
+
+       while (pos < end) {
+               unsigned offset = pos & (PAGE_SIZE - 1);
+               struct blk_dax_ctl dax = { 0 };
+               ssize_t map_len;
+
+               dax.sector = iomap->blkno +
+                       (((pos & PAGE_MASK) - iomap->offset) >> 9);
+               dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+               map_len = dax_map_atomic(iomap->bdev, &dax);
+               if (map_len < 0) {
+                       ret = map_len;
+                       break;
+               }
+
+               dax.addr += offset;
+               map_len -= offset;
+               if (map_len > end - pos)
+                       map_len = end - pos;
+
+               if (iov_iter_rw(iter) == WRITE)
+                       map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+               else
+                       map_len = copy_to_iter(dax.addr, map_len, iter);
+               dax_unmap_atomic(iomap->bdev, &dax);
+               if (map_len <= 0) {
+                       ret = map_len ? map_len : -EFAULT;
+                       break;
+               }
+
+               pos += map_len;
+               length -= map_len;
+               done += map_len;
+       }
+
+       return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:      The control block for this I/O
+ * @iter:      The addresses to do I/O from or to
+ * @ops:       iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+               struct iomap_ops *ops)
+{
+       struct address_space *mapping = iocb->ki_filp->f_mapping;
+       struct inode *inode = mapping->host;
+       loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+       unsigned flags = 0;
+
+       if (iov_iter_rw(iter) == WRITE)
+               flags |= IOMAP_WRITE;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        *
+        * XXX: This is racy against mmap, and there's nothing we can do about
+        * it. We'll eventually need to shift this down even further so that
+        * we can check if we allocated blocks over a hole first.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2_range(mapping,
+                               pos >> PAGE_SHIFT,
+                               (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+               WARN_ON_ONCE(ret);
+       }
+
+       while (iov_iter_count(iter)) {
+               ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+                               iter, iomap_dax_actor);
+               if (ret <= 0)
+                       break;
+               pos += ret;
+               done += ret;
+       }
+
+       iocb->ki_pos += done;
+       return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       struct iomap_ops *ops)
+{
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned long vaddr = (unsigned long)vmf->virtual_address;
+       loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+       sector_t sector;
+       struct iomap iomap = { 0 };
+       unsigned flags = 0;
+       int error, major = 0;
+       void *entry;
+
+       /*
+        * Check whether the offset is beyond the end of the file. The caller
+        * is supposed to hold locks serializing us with truncate / punch hole,
+        * so this is a reliable test.
+        */
+       if (pos >= i_size_read(inode))
+               return VM_FAULT_SIGBUS;
+
+       entry = grab_mapping_entry(mapping, vmf->pgoff);
+       if (IS_ERR(entry)) {
+               error = PTR_ERR(entry);
+               goto out;
+       }
+
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+               flags |= IOMAP_WRITE;
+
+       /*
+        * Note that we don't bother to use iomap_apply here: DAX requires
+        * the file system block size to be equal to the page size, which means
+        * that we never have to deal with more than a single extent here.
+        */
+       error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+       if (error)
+               goto unlock_entry;
+       if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+               error = -EIO;           /* fs corruption? */
+               goto unlock_entry;
+       }
+
+       sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+       if (vmf->cow_page) {
+               switch (iomap.type) {
+               case IOMAP_HOLE:
+               case IOMAP_UNWRITTEN:
+                       clear_user_highpage(vmf->cow_page, vaddr);
+                       break;
+               case IOMAP_MAPPED:
+                       error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+                                       vmf->cow_page, vaddr);
+                       break;
+               default:
+                       WARN_ON_ONCE(1);
+                       error = -EIO;
+                       break;
+               }
+
+               if (error)
+                       goto unlock_entry;
+               if (!radix_tree_exceptional_entry(entry)) {
+                       vmf->page = entry;
+                       return VM_FAULT_LOCKED;
+               }
+               vmf->entry = entry;
+               return VM_FAULT_DAX_LOCKED;
+       }
+
+       switch (iomap.type) {
+       case IOMAP_MAPPED:
+               if (iomap.flags & IOMAP_F_NEW) {
+                       count_vm_event(PGMAJFAULT);
+                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                       major = VM_FAULT_MAJOR;
+               }
+               error = dax_insert_mapping(mapping, iomap.bdev, sector,
+                               PAGE_SIZE, &entry, vma, vmf);
+               break;
+       case IOMAP_UNWRITTEN:
+       case IOMAP_HOLE:
+               if (!(vmf->flags & FAULT_FLAG_WRITE))
+                       return dax_load_hole(mapping, entry, vmf);
+               /*FALLTHRU*/
+       default:
+               WARN_ON_ONCE(1);
+               error = -EIO;
+               break;
+       }
+
+ unlock_entry:
+       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM | major;
+       /* -EBUSY is fine, somebody else faulted on the same PTE */
+       if (error < 0 && error != -EBUSY)
+               return VM_FAULT_SIGBUS | major;
+       return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
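
As the kernel-doc above spells out, iomap_dax_rw() leaves read/write exclusion and page-cache eviction to the caller. A minimal caller sketch for a filesystem's ->read_iter, mirroring the ext2 wiring that appears further down in this patch (my_dax_read_iter and my_iomap_ops are placeholders, not real symbols):

static ssize_t my_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* nothing to read, skip the atime update */

	/* Shared inode lock provides the read-side exclusion the API expects. */
	inode_lock_shared(inode);
	ret = iomap_dax_rw(iocb, to, &my_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
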
index c634874..36bea5a 100644 (file)
@@ -1,5 +1,6 @@
 config EXT2_FS
        tristate "Second extended fs support"
+       select FS_IOMAP if FS_DAX
        help
          Ext2 is a standard Linux file system for hard disks.
 
index 06af2f9..37e2be7 100644 (file)
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
index 5efeefe..423cc01 100644 (file)
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       ssize_t ret;
+
+       if (!iov_iter_count(to))
+               return 0; /* skip atime */
+
+       inode_lock_shared(inode);
+       ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+       inode_unlock_shared(inode);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       ssize_t ret;
+
+       inode_lock(inode);
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0)
+               goto out_unlock;
+       ret = file_remove_privs(file);
+       if (ret)
+               goto out_unlock;
+       ret = file_update_time(file);
+       if (ret)
+               goto out_unlock;
+
+       ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+       if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+               i_size_write(inode, iocb->ki_pos);
+               mark_inode_dirty(inode);
+       }
+
+out_unlock:
+       inode_unlock(inode);
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
+       return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
  *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
        down_read(&ei->dax_sem);
 
-       ret = dax_fault(vma, vmf, ext2_get_block);
+       ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
        up_read(&ei->dax_sem);
        if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+       if (IS_DAX(iocb->ki_filp->f_mapping->host))
+               return ext2_dax_read_iter(iocb, to);
+#endif
+       return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+       if (IS_DAX(iocb->ki_filp->f_mapping->host))
+               return ext2_dax_write_iter(iocb, from);
+#endif
+       return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
        .llseek         = generic_file_llseek,
-       .read_iter      = generic_file_read_iter,
-       .write_iter     = generic_file_write_iter,
+       .read_iter      = ext2_file_read_iter,
+       .write_iter     = ext2_file_write_iter,
        .unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
index 303ae2b..1e72d42 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
                           sector_t iblock, unsigned long maxblocks,
-                          struct buffer_head *bh_result,
+                          u32 *bno, bool *new, bool *boundary,
                           int create)
 {
        int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
-               clear_buffer_new(bh_result); /* What's this do? */
                count++;
                /*map more blocks*/
                while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
                        mutex_unlock(&ei->truncate_mutex);
                        if (err)
                                goto cleanup;
-                       clear_buffer_new(bh_result);
                        goto got_it;
                }
        }
@@ -755,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode,
                        mutex_unlock(&ei->truncate_mutex);
                        goto cleanup;
                }
-       } else
-               set_buffer_new(bh_result);
+       } else {
+               *new = true;
+       }
 
        ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
        mutex_unlock(&ei->truncate_mutex);
 got_it:
-       map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+       *bno = le32_to_cpu(chain[depth-1].key);
        if (count > blocks_to_boundary)
-               set_buffer_boundary(bh_result);
+               *boundary = true;
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
@@ -775,19 +775,82 @@ cleanup:
        return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+               struct buffer_head *bh_result, int create)
 {
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-       int ret = ext2_get_blocks(inode, iblock, max_blocks,
-                             bh_result, create);
-       if (ret > 0) {
-               bh_result->b_size = (ret << inode->i_blkbits);
-               ret = 0;
+       bool new = false, boundary = false;
+       u32 bno;
+       int ret;
+
+       ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+                       create);
+       if (ret <= 0)
+               return ret;
+
+       map_bh(bh_result, inode->i_sb, bno);
+       bh_result->b_size = (ret << inode->i_blkbits);
+       if (new)
+               set_buffer_new(bh_result);
+       if (boundary)
+               set_buffer_boundary(bh_result);
+       return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned flags, struct iomap *iomap)
+{
+       unsigned int blkbits = inode->i_blkbits;
+       unsigned long first_block = offset >> blkbits;
+       unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+       bool new = false, boundary = false;
+       u32 bno;
+       int ret;
+
+       ret = ext2_get_blocks(inode, first_block, max_blocks,
+                       &bno, &new, &boundary, flags & IOMAP_WRITE);
+       if (ret < 0)
+               return ret;
+
+       iomap->flags = 0;
+       iomap->bdev = inode->i_sb->s_bdev;
+       iomap->offset = (u64)first_block << blkbits;
+
+       if (ret == 0) {
+               iomap->type = IOMAP_HOLE;
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->length = 1 << blkbits;
+       } else {
+               iomap->type = IOMAP_MAPPED;
+               iomap->blkno = (sector_t)bno << (blkbits - 9);
+               iomap->length = (u64)ret << blkbits;
+               iomap->flags |= IOMAP_F_MERGED;
        }
-       return ret;
 
+       if (new)
+               iomap->flags |= IOMAP_F_NEW;
+       return 0;
 }
 
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+               ssize_t written, unsigned flags, struct iomap *iomap)
+{
+       if (iomap->type == IOMAP_MAPPED &&
+           written < length &&
+           (flags & IOMAP_WRITE))
+               ext2_write_failed(inode->i_mapping, offset + length);
+       return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+       .iomap_begin            = ext2_iomap_begin,
+       .iomap_end              = ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 len)
 {
@@ -873,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        loff_t offset = iocb->ki_pos;
        ssize_t ret;
 
-       if (IS_DAX(inode))
-               ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-                               DIO_LOCKING);
-       else
-               ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+       if (WARN_ON_ONCE(IS_DAX(inode)))
+               return -EIO;
+
+       ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
        if (ret < 0 && iov_iter_rw(iter) == WRITE)
                ext2_write_failed(mapping, offset + count);
        return ret;
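
ext2_iomap_begin() above reports iomap->blkno in 512-byte sectors, hence the << (blkbits - 9) shift applied to the filesystem block number. A small standalone illustration of that conversion (the 4 KiB block size and the sample block number are assumptions made only for the example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4 KiB filesystem blocks (assumed) */
	uint32_t bno = 100;			/* filesystem block number */
	uint64_t sector = (uint64_t)bno << (blkbits - 9);	/* 512-byte sectors */
	uint64_t byte_off = (uint64_t)bno << blkbits;		/* same position in bytes */

	/* Block 100 starts at byte 409600, i.e. sector 800. */
	printf("block %u -> sector %llu (byte %llu)\n", bno,
	       (unsigned long long)sector, (unsigned long long)byte_off);
	return 0;
}
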
index 4dcc9e2..1e29630 100644 (file)
@@ -109,14 +109,16 @@ fail:
        return ERR_PTR(-EINVAL);
 }
 
-static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
+                               const struct posix_acl *acl, size_t *size)
 {
        struct f2fs_acl_header *f2fs_acl;
        struct f2fs_acl_entry *entry;
        int i;
 
-       f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
-                       sizeof(struct f2fs_acl_entry), GFP_NOFS);
+       f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) +
+                       acl->a_count * sizeof(struct f2fs_acl_entry),
+                       GFP_NOFS);
        if (!f2fs_acl)
                return ERR_PTR(-ENOMEM);
 
@@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
 
        retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
        if (retval > 0) {
-               value = f2fs_kmalloc(retval, GFP_F2FS_ZERO);
+               value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO);
                if (!value)
                        return ERR_PTR(-ENOMEM);
                retval = f2fs_getxattr(inode, name_index, "", value,
@@ -230,7 +232,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
        }
 
        if (acl) {
-               value = f2fs_acl_to_disk(acl, &size);
+               value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size);
                if (IS_ERR(value)) {
                        clear_inode_flag(inode, FI_ACL_MODE);
                        return (int)PTR_ERR(value);
index b2334d1..2c68518 100644 (file)
@@ -41,7 +41,6 @@ extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
 extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
                                                        struct page *);
 #else
-#define f2fs_check_acl NULL
 #define f2fs_get_acl   NULL
 #define f2fs_set_acl   NULL
 
index f94d01e..7e9b504 100644 (file)
@@ -28,7 +28,7 @@ struct kmem_cache *inode_entry_slab;
 
 void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
 {
-       set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+       set_ckpt_flags(sbi, CP_ERROR_FLAG);
        sbi->sb->s_flags |= MS_RDONLY;
        if (!end_io)
                f2fs_flush_merged_bios(sbi);
@@ -267,7 +267,6 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
                                struct writeback_control *wbc)
 {
        struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
-       struct blk_plug plug;
        long diff, written;
 
        /* collect a number of dirty meta pages and write together */
@@ -280,9 +279,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
        /* if mounting is failed, skip writing node pages */
        mutex_lock(&sbi->cp_mutex);
        diff = nr_pages_to_write(sbi, META, wbc);
-       blk_start_plug(&plug);
        written = sync_meta_pages(sbi, META, wbc->nr_to_write);
-       blk_finish_plug(&plug);
        mutex_unlock(&sbi->cp_mutex);
        wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
        return 0;
@@ -388,6 +385,9 @@ const struct address_space_operations f2fs_meta_aops = {
        .set_page_dirty = f2fs_set_meta_page_dirty,
        .invalidatepage = f2fs_invalidate_page,
        .releasepage    = f2fs_release_page,
+#ifdef CONFIG_MIGRATION
+       .migratepage    = f2fs_migrate_page,
+#endif
 };
 
 static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -491,7 +491,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
        spin_lock(&im->ino_lock);
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_ORPHAN)) {
+       if (time_to_inject(sbi, FAULT_ORPHAN)) {
                spin_unlock(&im->ino_lock);
                return -ENOSPC;
        }
@@ -531,8 +531,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
        struct inode *inode;
+       struct node_info ni;
+       int err = acquire_orphan_inode(sbi);
+
+       if (err) {
+               set_sbi_flag(sbi, SBI_NEED_FSCK);
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                               "%s: orphan failed (ino=%x), run fsck to fix.",
+                               __func__, ino);
+               return err;
+       }
 
-       inode = f2fs_iget(sbi->sb, ino);
+       __add_ino_entry(sbi, ino, ORPHAN_INO);
+
+       inode = f2fs_iget_retry(sbi->sb, ino);
        if (IS_ERR(inode)) {
                /*
                 * there should be a bug that we can't find the entry
@@ -546,6 +558,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
        /* truncate all the data during iput */
        iput(inode);
+
+       get_node_info(sbi, ino, &ni);
+
+       /* ENOMEM was fully retried in f2fs_evict_inode. */
+       if (ni.blk_addr != NULL_ADDR) {
+               set_sbi_flag(sbi, SBI_NEED_FSCK);
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                       "%s: orphan failed (ino=%x), run fsck to fix.",
+                               __func__, ino);
+               return -EIO;
+       }
+       __remove_ino_entry(sbi, ino, ORPHAN_INO);
        return 0;
 }
 
@@ -554,7 +578,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
        block_t start_blk, orphan_blocks, i, j;
        int err;
 
-       if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+       if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
                return 0;
 
        start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -578,7 +602,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
                f2fs_put_page(page, 1);
        }
        /* clear Orphan Flag */
-       clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+       clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
        return 0;
 }
 
@@ -639,45 +663,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
        }
 }
 
-static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
-                               block_t cp_addr, unsigned long long *version)
+static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
+               struct f2fs_checkpoint **cp_block, struct page **cp_page,
+               unsigned long long *version)
 {
-       struct page *cp_page_1, *cp_page_2 = NULL;
        unsigned long blk_size = sbi->blocksize;
-       struct f2fs_checkpoint *cp_block;
-       unsigned long long cur_version = 0, pre_version = 0;
-       size_t crc_offset;
+       size_t crc_offset = 0;
        __u32 crc = 0;
 
-       /* Read the 1st cp block in this CP pack */
-       cp_page_1 = get_meta_page(sbi, cp_addr);
+       *cp_page = get_meta_page(sbi, cp_addr);
+       *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
 
-       /* get the version number */
-       cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
-       crc_offset = le32_to_cpu(cp_block->checksum_offset);
-       if (crc_offset >= blk_size)
-               goto invalid_cp1;
+       crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
+       if (crc_offset >= blk_size) {
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                       "invalid crc_offset: %zu", crc_offset);
+               return -EINVAL;
+       }
 
-       crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
-       if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
-               goto invalid_cp1;
+       crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block
+                                                       + crc_offset)));
+       if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
+               f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
+               return -EINVAL;
+       }
 
-       pre_version = cur_cp_version(cp_block);
+       *version = cur_cp_version(*cp_block);
+       return 0;
+}
 
-       /* Read the 2nd cp block in this CP pack */
-       cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
-       cp_page_2 = get_meta_page(sbi, cp_addr);
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+                               block_t cp_addr, unsigned long long *version)
+{
+       struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
+       struct f2fs_checkpoint *cp_block = NULL;
+       unsigned long long cur_version = 0, pre_version = 0;
+       int err;
 
-       cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
-       crc_offset = le32_to_cpu(cp_block->checksum_offset);
-       if (crc_offset >= blk_size)
-               goto invalid_cp2;
+       err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+                                       &cp_page_1, version);
+       if (err)
+               goto invalid_cp1;
+       pre_version = *version;
 
-       crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
-       if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
+       cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+       err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+                                       &cp_page_2, version);
+       if (err)
                goto invalid_cp2;
-
-       cur_version = cur_cp_version(cp_block);
+       cur_version = *version;
 
        if (cur_version == pre_version) {
                *version = cur_version;
@@ -972,10 +1006,40 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
        finish_wait(&sbi->cp_wait, &wait);
 }
 
+static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+       unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
+       struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+       spin_lock(&sbi->cp_lock);
+
+       if (cpc->reason == CP_UMOUNT)
+               __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+       else
+               __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+       if (cpc->reason == CP_FASTBOOT)
+               __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+       else
+               __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+
+       if (orphan_num)
+               __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+       else
+               __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+               __set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
+       /* set this flag to activate crc|cp_ver for recovery */
+       __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
+
+       spin_unlock(&sbi->cp_lock);
+}
+
 static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
-       struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
        nid_t last_nid = nm_i->next_scan_nid;
@@ -984,19 +1048,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        __u32 crc32 = 0;
        int i;
        int cp_payload_blks = __cp_payload(sbi);
-       block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
-       bool invalidate = false;
        struct super_block *sb = sbi->sb;
        struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
        u64 kbytes_written;
 
-       /*
-        * This avoids to conduct wrong roll-forward operations and uses
-        * metapages, so should be called prior to sync_meta_pages below.
-        */
-       if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk))
-               invalidate = true;
-
        /* Flush all the NAT/SIT pages */
        while (get_pages(sbi, F2FS_DIRTY_META)) {
                sync_meta_pages(sbi, META, LONG_MAX);
@@ -1036,10 +1091,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        /* 2 cp  + n data seg summary + orphan inode blocks */
        data_sum_blocks = npages_for_summary_flush(sbi, false);
+       spin_lock(&sbi->cp_lock);
        if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
-               set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+               __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
        else
-               clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+               __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+       spin_unlock(&sbi->cp_lock);
 
        orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
        ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
@@ -1054,23 +1111,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                                cp_payload_blks + data_sum_blocks +
                                orphan_blocks);
 
-       if (cpc->reason == CP_UMOUNT)
-               set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
-       else
-               clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
-
-       if (cpc->reason == CP_FASTBOOT)
-               set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
-       else
-               clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
-
-       if (orphan_num)
-               set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
-       else
-               clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
-
-       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
-               set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+       /* update ckpt flag for checkpoint */
+       update_ckpt_flags(sbi, cpc);
 
        /* update SIT/NAT bitmap */
        get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
@@ -1137,14 +1179,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        /* wait for previous submitted meta pages writeback */
        wait_on_all_pages_writeback(sbi);
 
-       /*
-        * invalidate meta page which is used temporarily for zeroing out
-        * block at the end of warm node chain.
-        */
-       if (invalidate)
-               invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
-                                                               discard_blk);
-
        release_ino_entry(sbi, false);
 
        if (unlikely(f2fs_cp_error(sbi)))
@@ -1152,6 +1186,17 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        clear_prefree_segments(sbi, cpc);
        clear_sbi_flag(sbi, SBI_IS_DIRTY);
+       clear_sbi_flag(sbi, SBI_NEED_CP);
+
+       /*
+        * Redirty the superblock if metadata such as node pages or the inode
+        * cache was updated while writing the checkpoint.
+        */
+       if (get_pages(sbi, F2FS_DIRTY_NODES) ||
+                       get_pages(sbi, F2FS_DIRTY_IMETA))
+               set_sbi_flag(sbi, SBI_IS_DIRTY);
+
+       f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));
 
        return 0;
 }
@@ -1190,6 +1235,18 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        f2fs_flush_merged_bios(sbi);
 
+       /* this is the case of multiple fstrims without any changes */
+       if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) {
+               f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt);
+               f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries);
+               f2fs_bug_on(sbi, prefree_segments(sbi));
+               flush_sit_entries(sbi, cpc);
+               clear_prefree_segments(sbi, cpc);
+               f2fs_wait_all_discard_bio(sbi);
+               unblock_operations(sbi);
+               goto out;
+       }
+
        /*
         * update checkpoint pack index
         * Increase the version number so that
@@ -1205,6 +1262,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        /* unlock all the fs_lock[] in do_checkpoint() */
        err = do_checkpoint(sbi, cpc);
 
+       f2fs_wait_all_discard_bio(sbi);
+
        unblock_operations(sbi);
        stat_inc_cp_count(sbi->stat_info);
 
index ccb401e..0d0177c 100644 (file)
@@ -34,6 +34,11 @@ static void f2fs_read_end_io(struct bio *bio)
        struct bio_vec *bvec;
        int i;
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO))
+               bio->bi_error = -EIO;
+#endif
+
        if (f2fs_bio_encrypted(bio)) {
                if (bio->bi_error) {
                        fscrypt_release_ctx(bio->bi_private);
@@ -626,11 +631,13 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
        ssize_t ret = 0;
 
        map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
-       map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from));
-       map.m_next_pgofs = NULL;
+       map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
+       if (map.m_len > map.m_lblk)
+               map.m_len -= map.m_lblk;
+       else
+               map.m_len = 0;
 
-       if (f2fs_encrypted_inode(inode))
-               return 0;
+       map.m_next_pgofs = NULL;
 
        if (iocb->ki_flags & IOCB_DIRECT) {
                ret = f2fs_convert_inline_inode(inode);
@@ -672,6 +679,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
        bool allocated = false;
        block_t blkaddr;
 
+       if (!maxblocks)
+               return 0;
+
        map->m_len = 0;
        map->m_flags = 0;
 
@@ -783,6 +793,7 @@ skip:
                err = reserve_new_blocks(&dn, prealloc);
                if (err)
                        goto sync_out;
+               allocated = dn.node_changed;
 
                map->m_len += dn.ofs_in_node - ofs_in_node;
                if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
@@ -966,8 +977,8 @@ out:
        return ret;
 }
 
-struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
-                                                       unsigned nr_pages)
+static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
+                                unsigned nr_pages)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct fscrypt_ctx *ctx = NULL;
@@ -1284,7 +1295,7 @@ write:
 
        if (!wbc->for_reclaim)
                need_balance_fs = true;
-       else if (has_not_enough_free_secs(sbi, 0))
+       else if (has_not_enough_free_secs(sbi, 0, 0))
                goto redirty_out;
 
        err = -EAGAIN;
@@ -1344,6 +1355,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
        int cycled;
        int range_whole = 0;
        int tag;
+       int nwritten = 0;
 
        pagevec_init(&pvec, 0);
 
@@ -1418,6 +1430,8 @@ continue_unlock:
                                done_index = page->index + 1;
                                done = 1;
                                break;
+                       } else {
+                               nwritten++;
                        }
 
                        if (--wbc->nr_to_write <= 0 &&
@@ -1439,6 +1453,10 @@ continue_unlock:
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = done_index;
 
+       if (nwritten)
+               f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host,
+                                                       NULL, 0, DATA, WRITE);
+
        return ret;
 }
 
@@ -1480,7 +1498,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
         * if some pages were truncated, we cannot guarantee its mapping->host
         * to detect pending bios.
         */
-       f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
        remove_dirty_inode(inode);
        return ret;
@@ -1518,8 +1535,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
         * we already allocated all the blocks, so we don't need to get
         * the block addresses when there is no need to fill the page.
         */
-       if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
-                                       len == PAGE_SIZE)
+       if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE)
                return 0;
 
        if (f2fs_has_inline_data(inode) ||
@@ -1616,7 +1632,7 @@ repeat:
        if (err)
                goto fail;
 
-       if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+       if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
                unlock_page(page);
                f2fs_balance_fs(sbi, true);
                lock_page(page);
@@ -1633,22 +1649,12 @@ repeat:
        if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
                f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
 
-       if (len == PAGE_SIZE)
-               goto out_update;
-       if (PageUptodate(page))
-               goto out_clear;
-
-       if ((pos & PAGE_MASK) >= i_size_read(inode)) {
-               unsigned start = pos & (PAGE_SIZE - 1);
-               unsigned end = start + len;
-
-               /* Reading beyond i_size is simple: memset to zero */
-               zero_user_segments(page, 0, start, end, PAGE_SIZE);
-               goto out_update;
-       }
+       if (len == PAGE_SIZE || PageUptodate(page))
+               return 0;
 
        if (blkaddr == NEW_ADDR) {
                zero_user_segment(page, 0, PAGE_SIZE);
+               SetPageUptodate(page);
        } else {
                struct bio *bio;
 
@@ -1676,11 +1682,6 @@ repeat:
                        goto fail;
                }
        }
-out_update:
-       if (!PageUptodate(page))
-               SetPageUptodate(page);
-out_clear:
-       clear_cold_data(page);
        return 0;
 
 fail:
@@ -1698,11 +1699,26 @@ static int f2fs_write_end(struct file *file,
 
        trace_f2fs_write_end(inode, pos, len, copied);
 
+       /*
+        * This should only come from len == PAGE_SIZE, so copied is expected
+        * to be PAGE_SIZE as well. Otherwise, treat it as zero bytes copied
+        * and let generic_perform_write() try the copy again via copied=0.
+        */
+       if (!PageUptodate(page)) {
+               if (unlikely(copied != PAGE_SIZE))
+                       copied = 0;
+               else
+                       SetPageUptodate(page);
+       }
+       if (!copied)
+               goto unlock_out;
+
        set_page_dirty(page);
+       clear_cold_data(page);
 
        if (pos + copied > i_size_read(inode))
                f2fs_i_size_write(inode, pos + copied);
-
+unlock_out:
        f2fs_put_page(page, 1);
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        return copied;
@@ -1873,6 +1889,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, get_data_block_bmap);
 }
 
+#ifdef CONFIG_MIGRATION
+#include <linux/migrate.h>
+
+int f2fs_migrate_page(struct address_space *mapping,
+               struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+       int rc, extra_count;
+       struct f2fs_inode_info *fi = F2FS_I(mapping->host);
+       bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page);
+
+       BUG_ON(PageWriteback(page));
+
+       /* migrating an atomic written page is safe with inmem_lock held */
+       if (atomic_written && !mutex_trylock(&fi->inmem_lock))
+               return -EAGAIN;
+
+       /*
+        * A reference is expected if PagePrivate is set when the mapping is
+        * moved; however, F2FS breaks this rule while maintaining dirty page
+        * counts during truncation, so adjusting 'extra_count' here makes it
+        * work.
+        */
+       extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+       rc = migrate_page_move_mapping(mapping, newpage,
+                               page, NULL, mode, extra_count);
+       if (rc != MIGRATEPAGE_SUCCESS) {
+               if (atomic_written)
+                       mutex_unlock(&fi->inmem_lock);
+               return rc;
+       }
+
+       if (atomic_written) {
+               struct inmem_pages *cur;
+               list_for_each_entry(cur, &fi->inmem_pages, list)
+                       if (cur->page == page) {
+                               cur->page = newpage;
+                               break;
+                       }
+               mutex_unlock(&fi->inmem_lock);
+               put_page(page);
+               get_page(newpage);
+       }
+
+       if (PagePrivate(page))
+               SetPagePrivate(newpage);
+       set_page_private(newpage, page_private(page));
+
+       migrate_page_copy(newpage, page);
+
+       return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
 const struct address_space_operations f2fs_dblock_aops = {
        .readpage       = f2fs_read_data_page,
        .readpages      = f2fs_read_data_pages,
@@ -1885,4 +1953,7 @@ const struct address_space_operations f2fs_dblock_aops = {
        .releasepage    = f2fs_release_page,
        .direct_IO      = f2fs_direct_IO,
        .bmap           = f2fs_bmap,
+#ifdef CONFIG_MIGRATION
+       .migratepage    = f2fs_migrate_page,
+#endif
 };
index badd407..fb245bd 100644 (file)
@@ -45,6 +45,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
        si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
        si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+       si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
        si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
        si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
        si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
@@ -54,6 +55,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->rsvd_segs = reserved_segments(sbi);
        si->overp_segs = overprovision_segments(sbi);
        si->valid_count = valid_user_blocks(sbi);
+       si->discard_blks = discard_blocks(sbi);
        si->valid_node_count = valid_node_count(sbi);
        si->valid_inode_count = valid_inode_count(sbi);
        si->inline_xattr = atomic_read(&sbi->inline_xattr);
@@ -154,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
        si->base_mem += sizeof(struct sit_info);
        si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
        si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
-       si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+       si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+       if (f2fs_discard_en(sbi))
+               si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
        si->base_mem += SIT_VBLOCK_MAP_SIZE;
        if (sbi->segs_per_sec > 1)
                si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
@@ -228,8 +232,13 @@ static int stat_show(struct seq_file *s, void *v)
                           si->ssa_area_segs, si->main_area_segs);
                seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
                           si->overp_segs, si->rsvd_segs);
-               seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
-                          si->utilization, si->valid_count);
+               if (test_opt(si->sbi, DISCARD))
+                       seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
+                               si->utilization, si->valid_count, si->discard_blks);
+               else
+                       seq_printf(s, "Utilization: %u%% (%u valid blocks)\n",
+                               si->utilization, si->valid_count);
+
                seq_printf(s, "  - Node: %u (Inode: %u, ",
                           si->valid_node_count, si->valid_inode_count);
                seq_printf(s, "Other: %u)\n  - Data: %u\n",
@@ -311,6 +320,8 @@ static int stat_show(struct seq_file *s, void *v)
                           si->ndirty_data, si->ndirty_files);
                seq_printf(s, "  - meta: %4lld in %4d\n",
                           si->ndirty_meta, si->meta_pages);
+               seq_printf(s, "  - imeta: %4lld\n",
+                          si->ndirty_imeta);
                seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
                           si->dirty_nats, si->nats, si->dirty_sits, si->sits);
                seq_printf(s, "  - free_nids: %9d\n",
index 9054aea..cbf85f6 100644 (file)
@@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level)
                return 4;
 }
 
-unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
        [F2FS_FT_UNKNOWN]       = DT_UNKNOWN,
        [F2FS_FT_REG_FILE]      = DT_REG,
        [F2FS_FT_DIR]           = DT_DIR,
@@ -172,7 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
        int max_slots;
        f2fs_hash_t namehash;
 
-       namehash = f2fs_dentry_hash(&name);
+       if (fname->hash)
+               namehash = cpu_to_le32(fname->hash);
+       else
+               namehash = f2fs_dentry_hash(&name);
 
        nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
        nblock = bucket_blocks(level);
@@ -212,31 +215,17 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
        return de;
 }
 
-/*
- * Find an entry in the specified directory with the wanted name.
- * It returns the page where the entry was found (as a parameter - res_page),
- * and the entry itself. Page is returned mapped and unlocked.
- * Entry is guaranteed to be valid.
- */
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
-                       const struct qstr *child, struct page **res_page)
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
+                       struct fscrypt_name *fname, struct page **res_page)
 {
        unsigned long npages = dir_blocks(dir);
        struct f2fs_dir_entry *de = NULL;
        unsigned int max_depth;
        unsigned int level;
-       struct fscrypt_name fname;
-       int err;
-
-       err = fscrypt_setup_filename(dir, child, 1, &fname);
-       if (err) {
-               *res_page = ERR_PTR(err);
-               return NULL;
-       }
 
        if (f2fs_has_inline_dentry(dir)) {
                *res_page = NULL;
-               de = find_in_inline_dir(dir, &fname, res_page);
+               de = find_in_inline_dir(dir, fname, res_page);
                goto out;
        }
 
@@ -256,11 +245,35 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
        for (level = 0; level < max_depth; level++) {
                *res_page = NULL;
-               de = find_in_level(dir, level, &fname, res_page);
+               de = find_in_level(dir, level, fname, res_page);
                if (de || IS_ERR(*res_page))
                        break;
        }
 out:
+       return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+                       const struct qstr *child, struct page **res_page)
+{
+       struct f2fs_dir_entry *de = NULL;
+       struct fscrypt_name fname;
+       int err;
+
+       err = fscrypt_setup_filename(dir, child, 1, &fname);
+       if (err) {
+               *res_page = ERR_PTR(err);
+               return NULL;
+       }
+
+       de = __f2fs_find_entry(dir, &fname, res_page);
+
        fscrypt_free_filename(&fname);
        return de;
 }
@@ -375,7 +388,8 @@ static int make_empty_dir(struct inode *inode,
 }
 
 struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
-                       const struct qstr *name, struct page *dpage)
+                       const struct qstr *new_name, const struct qstr *orig_name,
+                       struct page *dpage)
 {
        struct page *page;
        int err;
@@ -400,7 +414,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
                if (err)
                        goto put_error;
 
-               err = f2fs_init_security(inode, dir, name, page);
+               err = f2fs_init_security(inode, dir, orig_name, page);
                if (err)
                        goto put_error;
 
@@ -417,8 +431,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
                set_cold_node(inode, page);
        }
 
-       if (name)
-               init_dent_inode(name, page);
+       if (new_name)
+               init_dent_inode(new_name, page);
 
        /*
         * This file should be checkpointed during fsync.
@@ -496,7 +510,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
        de->ino = cpu_to_le32(ino);
        set_de_type(de, mode);
        for (i = 0; i < slots; i++) {
-               test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+               __set_bit_le(bit_pos + i, (void *)d->bitmap);
                /* avoid wrong garbage data for readdir */
                if (i)
                        (de + i)->name_len = 0;
@@ -504,6 +518,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
 }
 
 int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
+                               const struct qstr *orig_name,
                                struct inode *inode, nid_t ino, umode_t mode)
 {
        unsigned int bit_pos;
@@ -530,7 +545,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
 
 start:
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_DIR_DEPTH))
+       if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH))
                return -ENOSPC;
 #endif
        if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
@@ -569,7 +584,8 @@ add_dentry:
 
        if (inode) {
                down_write(&F2FS_I(inode)->i_sem);
-               page = init_inode_metadata(inode, dir, new_name, NULL);
+               page = init_inode_metadata(inode, dir, new_name,
+                                               orig_name, NULL);
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        goto fail;
@@ -599,6 +615,26 @@ fail:
        return err;
 }
 
+int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
+                               struct inode *inode, nid_t ino, umode_t mode)
+{
+       struct qstr new_name;
+       int err = -EAGAIN;
+
+       new_name.name = fname_name(fname);
+       new_name.len = fname_len(fname);
+
+       if (f2fs_has_inline_dentry(dir))
+               err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname,
+                                                       inode, ino, mode);
+       if (err == -EAGAIN)
+               err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname,
+                                                       inode, ino, mode);
+
+       f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+       return err;
+}
+
 /*
  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
  * f2fs_unlock_op().
@@ -607,24 +643,15 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
                                struct inode *inode, nid_t ino, umode_t mode)
 {
        struct fscrypt_name fname;
-       struct qstr new_name;
        int err;
 
        err = fscrypt_setup_filename(dir, name, 0, &fname);
        if (err)
                return err;
 
-       new_name.name = fname_name(&fname);
-       new_name.len = fname_len(&fname);
-
-       err = -EAGAIN;
-       if (f2fs_has_inline_dentry(dir))
-               err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
-       if (err == -EAGAIN)
-               err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode);
+       err = __f2fs_do_add_link(dir, &fname, inode, ino, mode);
 
        fscrypt_free_filename(&fname);
-       f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
        return err;
 }
 
@@ -634,7 +661,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
        int err = 0;
 
        down_write(&F2FS_I(inode)->i_sem);
-       page = init_inode_metadata(inode, dir, NULL, NULL);
+       page = init_inode_metadata(inode, dir, NULL, NULL, NULL);
        if (IS_ERR(page)) {
                err = PTR_ERR(page);
                goto fail;
@@ -788,16 +815,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
                        int save_len = fstr->len;
                        int ret;
 
-                       de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS);
-                       if (!de_name.name)
-                               return false;
-
-                       memcpy(de_name.name, d->filename[bit_pos], de_name.len);
-
                        ret = fscrypt_fname_disk_to_usr(d->inode,
                                                (u32)de->hash_code, 0,
                                                &de_name, fstr);
-                       kfree(de_name.name);
                        if (ret < 0)
                                return true;
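
For reference, a minimal userspace sketch of the fallback pattern the new __f2fs_do_add_link() factors out above: the compact inline-dentry path is tried first, and only an -EAGAIN result falls through to the regular dentry path. add_inline(), add_regular() and do_add_link() below are hypothetical stand-ins, not f2fs functions.

/* Try a compact fast path first; fall back to the general path only when
 * the fast path reports -EAGAIN ("no room here, use the regular form"). */
#include <errno.h>
#include <stdio.h>

static int add_inline(int has_room)		/* hypothetical stand-in */
{
	return has_room ? 0 : -EAGAIN;
}

static int add_regular(void)			/* hypothetical stand-in */
{
	return 0;
}

static int do_add_link(int dir_is_inline, int has_room)
{
	int err = -EAGAIN;			/* default: fast path not tried */

	if (dir_is_inline)
		err = add_inline(has_room);
	if (err == -EAGAIN)			/* only -EAGAIN triggers fallback */
		err = add_regular();
	return err;
}

int main(void)
{
	printf("inline ok: %d\n", do_add_link(1, 1));
	printf("fallback:  %d\n", do_add_link(1, 0));
	printf("regular:   %d\n", do_add_link(0, 0));
	return 0;
}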
 
index 14f5fe2..9e8de18 100644 (file)
@@ -46,6 +46,8 @@ enum {
        FAULT_BLOCK,
        FAULT_DIR_DEPTH,
        FAULT_EVICT_INODE,
+       FAULT_IO,
+       FAULT_CHECKPOINT,
        FAULT_MAX,
 };
 
@@ -55,40 +57,8 @@ struct f2fs_fault_info {
        unsigned int inject_type;
 };
 
-extern struct f2fs_fault_info f2fs_fault;
 extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type)))
-
-static inline bool time_to_inject(int type)
-{
-       if (!f2fs_fault.inject_rate)
-               return false;
-       if (type == FAULT_KMALLOC && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_BLOCK && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type))
-               return false;
-       else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type))
-               return false;
-
-       atomic_inc(&f2fs_fault.inject_ops);
-       if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) {
-               atomic_set(&f2fs_fault.inject_ops, 0);
-               printk("%sF2FS-fs : inject %s in %pF\n",
-                               KERN_INFO,
-                               fault_name[type],
-                               __builtin_return_address(0));
-               return true;
-       }
-       return false;
-}
+#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
 #endif
 
 /*
@@ -158,7 +128,7 @@ enum {
        CP_DISCARD,
 };
 
-#define DEF_BATCHED_TRIM_SECTIONS      32
+#define DEF_BATCHED_TRIM_SECTIONS      2
 #define BATCHED_TRIM_SEGMENTS(sbi)     \
                (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
 #define BATCHED_TRIM_BLOCKS(sbi)       \
@@ -211,6 +181,13 @@ struct discard_entry {
        int len;                /* # of consecutive blocks of the discard */
 };
 
+struct bio_entry {
+       struct list_head list;
+       struct bio *bio;
+       struct completion event;
+       int error;
+};
+
 /* for the list of fsync inodes, used only during recovery */
 struct fsync_inode_entry {
        struct list_head list;  /* list head */
@@ -645,6 +622,7 @@ struct f2fs_sm_info {
 
        /* for small discard management */
        struct list_head discard_list;          /* 4KB discard list */
+       struct list_head wait_list;             /* linked with issued discard bio */
        int nr_discards;                        /* # of discards in the list */
        int max_discards;                       /* max. discards to be issued */
 
@@ -748,6 +726,7 @@ enum {
        SBI_NEED_FSCK,                          /* need fsck.f2fs to fix */
        SBI_POR_DOING,                          /* recovery is doing or not */
        SBI_NEED_SB_WRITE,                      /* need to recover superblock */
+       SBI_NEED_CP,                            /* need to checkpoint */
 };
 
 enum {
@@ -765,7 +744,7 @@ struct f2fs_sb_info {
        struct proc_dir_entry *s_proc;          /* proc entry */
        struct f2fs_super_block *raw_super;     /* raw super block pointer */
        int valid_super_block;                  /* valid super block no */
-       int s_flag;                             /* flags for sbi */
+       unsigned long s_flag;                           /* flags for sbi */
 
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
        u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
@@ -785,6 +764,7 @@ struct f2fs_sb_info {
 
        /* for checkpoint */
        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
+       spinlock_t cp_lock;                     /* for flag in ckpt */
        struct inode *meta_inode;               /* cache meta blocks */
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
        struct rw_semaphore cp_rwsem;           /* blocking FS operations */
@@ -892,8 +872,37 @@ struct f2fs_sb_info {
 
        /* Reference to checksum algorithm driver via cryptoapi */
        struct crypto_shash *s_chksum_driver;
+
+       /* For fault injection */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       struct f2fs_fault_info fault_info;
+#endif
 };
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
+{
+       struct f2fs_fault_info *ffi = &sbi->fault_info;
+
+       if (!ffi->inject_rate)
+               return false;
+
+       if (!IS_FAULT_SET(ffi, type))
+               return false;
+
+       atomic_inc(&ffi->inject_ops);
+       if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
+               atomic_set(&ffi->inject_ops, 0);
+               printk("%sF2FS-fs : inject %s in %pF\n",
+                               KERN_INFO,
+                               fault_name[type],
+                               __builtin_return_address(0));
+               return true;
+       }
+       return false;
+}
+#endif
+
 /* For write statistics. Suppose sector size is 512 bytes,
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
@@ -1034,17 +1043,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
 
 static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
 {
-       return sbi->s_flag & (0x01 << type);
+       return test_bit(type, &sbi->s_flag);
 }
 
 static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-       sbi->s_flag |= (0x01 << type);
+       set_bit(type, &sbi->s_flag);
 }
 
 static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-       sbi->s_flag &= ~(0x01 << type);
+       clear_bit(type, &sbi->s_flag);
 }
 
 static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -1052,26 +1061,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
        return le64_to_cpu(cp->checkpoint_ver);
 }
 
-static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 {
        unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+
        return ckpt_flags & f;
 }
 
-static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
 {
-       unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+       return __is_set_ckpt_flags(F2FS_CKPT(sbi), f);
+}
+
+static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+       unsigned int ckpt_flags;
+
+       ckpt_flags = le32_to_cpu(cp->ckpt_flags);
        ckpt_flags |= f;
        cp->ckpt_flags = cpu_to_le32(ckpt_flags);
 }
 
-static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
 {
-       unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+       spin_lock(&sbi->cp_lock);
+       __set_ckpt_flags(F2FS_CKPT(sbi), f);
+       spin_unlock(&sbi->cp_lock);
+}
+
+static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+       unsigned int ckpt_flags;
+
+       ckpt_flags = le32_to_cpu(cp->ckpt_flags);
        ckpt_flags &= (~f);
        cp->ckpt_flags = cpu_to_le32(ckpt_flags);
 }
 
+static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
+{
+       spin_lock(&sbi->cp_lock);
+       __clear_ckpt_flags(F2FS_CKPT(sbi), f);
+       spin_unlock(&sbi->cp_lock);
+}
+
+static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+{
+       struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+       return blk_queue_discard(q);
+}
+
 static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
 {
        down_read(&sbi->cp_rwsem);
@@ -1110,8 +1150,8 @@ static inline bool __remain_node_summaries(int reason)
 
 static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
 {
-       return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
-                       is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+       return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) ||
+                       is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG));
 }
 
 /*
@@ -1151,7 +1191,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
        blkcnt_t diff;
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_BLOCK))
+       if (time_to_inject(sbi, FAULT_BLOCK))
                return false;
 #endif
        /*
@@ -1193,6 +1233,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 {
        percpu_counter_inc(&sbi->nr_pages[count_type]);
+
+       if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
+               return;
+
        set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
@@ -1243,6 +1287,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
        return sbi->total_valid_block_count;
 }
 
+static inline block_t discard_blocks(struct f2fs_sb_info *sbi)
+{
+       return sbi->discard_blks;
+}
+
 static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
 {
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1376,7 +1425,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
        if (page)
                return page;
 
-       if (time_to_inject(FAULT_PAGE_ALLOC))
+       if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
                return NULL;
 #endif
        if (!for_write)
@@ -1804,7 +1853,7 @@ static inline int f2fs_readonly(struct super_block *sb)
 
 static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
 {
-       return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+       return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
 }
 
 static inline bool is_dot_dotdot(const struct qstr *str)
@@ -1827,10 +1876,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
        return S_ISREG(inode->i_mode);
 }
 
-static inline void *f2fs_kmalloc(size_t size, gfp_t flags)
+static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
+                                       size_t size, gfp_t flags)
 {
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_KMALLOC))
+       if (time_to_inject(sbi, FAULT_KMALLOC))
                return NULL;
 #endif
        return kmalloc(size, flags);
@@ -1885,6 +1935,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
  */
 void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
+struct inode *f2fs_iget_retry(struct super_block *, unsigned long);
 int try_to_free_nats(struct f2fs_sb_info *, int);
 int update_inode(struct inode *, struct page *);
 int update_inode_page(struct inode *);
@@ -1900,7 +1951,6 @@ struct dentry *f2fs_get_parent(struct dentry *child);
 /*
  * dir.c
  */
-extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
 void set_de_type(struct f2fs_dir_entry *, umode_t);
 unsigned char get_de_type(struct f2fs_dir_entry *);
 struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
@@ -1910,10 +1960,12 @@ bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
 void do_make_empty_dir(struct inode *, struct inode *,
                        struct f2fs_dentry_ptr *);
 struct page *init_inode_metadata(struct inode *, struct inode *,
-                       const struct qstr *, struct page *);
+               const struct qstr *, const struct qstr *, struct page *);
 void update_parent_metadata(struct inode *, struct inode *, unsigned int);
 int room_for_filename(const void *, int, int);
 void f2fs_drop_nlink(struct inode *, struct inode *);
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *,
+                                                       struct page **);
 struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *,
                                                        struct page **);
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
@@ -1924,7 +1976,9 @@ int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
 void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
                        const struct qstr *, f2fs_hash_t , unsigned int);
 int f2fs_add_regular_entry(struct inode *, const struct qstr *,
-                                               struct inode *, nid_t, umode_t);
+                       const struct qstr *, struct inode *, nid_t, umode_t);
+int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *,
+                       nid_t, umode_t);
 int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
                        umode_t);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
@@ -2010,9 +2064,9 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
 bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
+void f2fs_wait_all_discard_bio(struct f2fs_sb_info *);
 void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
 void release_discard_addrs(struct f2fs_sb_info *);
-bool discard_next_dnode(struct f2fs_sb_info *, block_t);
 int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
@@ -2095,6 +2149,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
 void f2fs_set_page_dirty_nobuffers(struct page *);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
+#ifdef CONFIG_MIGRATION
+int f2fs_migrate_page(struct address_space *, struct page *, struct page *,
+                               enum migrate_mode);
+#endif
 
 /*
  * gc.c
@@ -2123,13 +2181,14 @@ struct f2fs_stat_info {
        unsigned long long hit_largest, hit_cached, hit_rbtree;
        unsigned long long hit_total, total_ext;
        int ext_tree, zombie_tree, ext_node;
-       s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages;
+       s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+       s64 inmem_pages;
        unsigned int ndirty_dirs, ndirty_files, ndirty_all;
        int nats, dirty_nats, sits, dirty_sits, fnids;
        int total_count, utilization;
        int bg_gc, wb_bios;
        int inline_xattr, inline_inode, inline_dir, orphans;
-       unsigned int valid_count, valid_node_count, valid_inode_count;
+       unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
        unsigned int bimodal, avg_vblocks;
        int util_free, util_valid, util_invalid;
        int rsvd_segs, overp_segs;
@@ -2294,8 +2353,8 @@ bool recover_inline_data(struct inode *, struct page *);
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
                                struct fscrypt_name *, struct page **);
 int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
-                                               nid_t, umode_t);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *,
+               const struct qstr *, struct inode *, nid_t, umode_t);
 void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
                                                struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
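
For reference, a userspace sketch of the per-instance fault injector this header now carries: time_to_inject() takes the superblock info and consults a per-sb struct f2fs_fault_info holding a type bitmask plus a call counter that fires once every inject_rate calls. The kernel uses atomic_t for the counter; a plain integer is enough for this single-threaded illustration, and fault_info/time_to_inject below are local stand-ins rather than the kernel definitions.

/* Rate-based fault injection keyed on a per-instance type bitmask. */
#include <stdbool.h>
#include <stdio.h>

enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, FAULT_BLOCK, FAULT_MAX };

struct fault_info {
	unsigned int inject_ops;	/* calls seen since the last injection */
	unsigned int inject_rate;	/* inject once every N calls (0 = off) */
	unsigned int inject_type;	/* bitmask of enabled fault types */
};

static bool time_to_inject(struct fault_info *ffi, int type)
{
	if (!ffi->inject_rate)
		return false;
	if (!(ffi->inject_type & (1u << type)))
		return false;
	if (++ffi->inject_ops >= ffi->inject_rate) {
		ffi->inject_ops = 0;
		return true;
	}
	return false;
}

int main(void)
{
	struct fault_info ffi = { 0, 3, 1u << FAULT_BLOCK };
	int i;

	for (i = 0; i < 9; i++)
		printf("call %d: %s\n", i,
		       time_to_inject(&ffi, FAULT_BLOCK) ? "inject" : "ok");
	return 0;
}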
index 28f4f4c..f8b4fe0 100644 (file)
@@ -135,7 +135,7 @@ static inline bool need_do_checkpoint(struct inode *inode)
 
        if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
                need_cp = true;
-       else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino))
+       else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
                need_cp = true;
        else if (file_wrong_pino(inode))
                need_cp = true;
@@ -523,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
                return 0;
 
        if (cache_only) {
-               page = f2fs_grab_cache_page(mapping, index, false);
+               page = find_lock_page(mapping, index);
                if (page && PageUptodate(page))
                        goto truncate_out;
                f2fs_put_page(page, 1);
@@ -1454,7 +1454,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
        struct inode *inode = file_inode(filp);
        struct f2fs_inode_info *fi = F2FS_I(inode);
-       unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+       unsigned int flags;
        unsigned int oldflags;
        int ret;
 
@@ -1954,7 +1954,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
         * avoid defragment running in SSR mode when free section are allocated
         * intensively
         */
-       if (has_not_enough_free_secs(sbi, sec_num)) {
+       if (has_not_enough_free_secs(sbi, 0, sec_num)) {
                err = -EAGAIN;
                goto out;
        }
@@ -2085,6 +2085,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
                return -EOPNOTSUPP;
 
+       if (src == dst) {
+               if (pos_in == pos_out)
+                       return 0;
+               if (pos_out > pos_in && pos_out < pos_in + len)
+                       return -EINVAL;
+       }
+
        inode_lock(src);
        if (src != dst) {
                if (!inode_trylock(dst)) {
@@ -2136,8 +2143,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 
        f2fs_balance_fs(sbi, true);
        f2fs_lock_op(sbi);
-       ret = __exchange_data_block(src, dst, pos_in,
-                               pos_out, len >> F2FS_BLKSIZE_BITS, false);
+       ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
+                               pos_out >> F2FS_BLKSIZE_BITS,
+                               len >> F2FS_BLKSIZE_BITS, false);
 
        if (!ret) {
                if (dst_max_i_size)
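
For reference, a small standalone sketch of the same-file range check added to f2fs_move_file_range() above: moving a range onto itself is a no-op, and a destination that begins inside the source range is rejected, since a forward overlapping copy would overwrite source blocks before they are read. check_move() is a hypothetical helper, not an f2fs function.

/* Returns 0 for "nothing to do", -EINVAL for a forbidden overlap,
 * 1 for "go ahead with the move". */
#include <errno.h>
#include <stdio.h>

static int check_move(int same_file, long long pos_in, long long pos_out,
		      long long len)
{
	if (same_file) {
		if (pos_in == pos_out)
			return 0;
		if (pos_out > pos_in && pos_out < pos_in + len)
			return -EINVAL;
	}
	return 1;
}

int main(void)
{
	printf("%d\n", check_move(1, 4096, 4096, 8192));	/* 0: no-op */
	printf("%d\n", check_move(1, 0, 4096, 8192));		/* -EINVAL  */
	printf("%d\n", check_move(1, 8192, 0, 8192));		/* 1: ok    */
	return 0;
}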
index 8f7fa32..93985c6 100644 (file)
@@ -47,6 +47,11 @@ static int gc_thread_func(void *data)
                        continue;
                }
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+               if (time_to_inject(sbi, FAULT_CHECKPOINT))
+                       f2fs_stop_checkpoint(sbi, false);
+#endif
+
                /*
                 * [GC triggering condition]
                 * 0. GC is not conducted currently.
@@ -96,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
        dev_t dev = sbi->sb->s_bdev->bd_dev;
        int err = 0;
 
-       gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+       gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
        if (!gc_th) {
                err = -ENOMEM;
                goto out;
@@ -270,7 +275,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
        struct victim_sel_policy p;
-       unsigned int secno, max_cost, last_victim;
+       unsigned int secno, last_victim;
        unsigned int last_segment = MAIN_SEGS(sbi);
        unsigned int nsearched = 0;
 
@@ -280,7 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
        select_policy(sbi, gc_type, type, &p);
 
        p.min_segno = NULL_SEGNO;
-       p.min_cost = max_cost = get_max_cost(sbi, &p);
+       p.min_cost = get_max_cost(sbi, &p);
 
        if (p.max_search == 0)
                goto out;
@@ -423,10 +428,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
 static void gc_node_segment(struct f2fs_sb_info *sbi,
                struct f2fs_summary *sum, unsigned int segno, int gc_type)
 {
-       bool initial = true;
        struct f2fs_summary *entry;
        block_t start_addr;
        int off;
+       int phase = 0;
 
        start_addr = START_BLOCK(sbi, segno);
 
@@ -439,16 +444,24 @@ next_step:
                struct node_info ni;
 
                /* stop BG_GC if there is not enough free sections. */
-               if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+               if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
                        return;
 
                if (check_valid_map(sbi, segno, off) == 0)
                        continue;
 
-               if (initial) {
+               if (phase == 0) {
+                       ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+                                                       META_NAT, true);
+                       continue;
+               }
+
+               if (phase == 1) {
                        ra_node_page(sbi, nid);
                        continue;
                }
+
+               /* phase == 2 */
                node_page = get_node_page(sbi, nid);
                if (IS_ERR(node_page))
                        continue;
@@ -469,10 +482,8 @@ next_step:
                stat_inc_node_blk_count(sbi, 1, gc_type);
        }
 
-       if (initial) {
-               initial = false;
+       if (++phase < 3)
                goto next_step;
-       }
 }
 
 /*
@@ -706,16 +717,23 @@ next_step:
                struct node_info dni; /* dnode info for the data */
                unsigned int ofs_in_node, nofs;
                block_t start_bidx;
+               nid_t nid = le32_to_cpu(entry->nid);
 
                /* stop BG_GC if there is not enough free sections. */
-               if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+               if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
                        return;
 
                if (check_valid_map(sbi, segno, off) == 0)
                        continue;
 
                if (phase == 0) {
-                       ra_node_page(sbi, le32_to_cpu(entry->nid));
+                       ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+                                                       META_NAT, true);
+                       continue;
+               }
+
+               if (phase == 1) {
+                       ra_node_page(sbi, nid);
                        continue;
                }
 
@@ -723,14 +741,14 @@ next_step:
                if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
                        continue;
 
-               if (phase == 1) {
+               if (phase == 2) {
                        ra_node_page(sbi, dni.ino);
                        continue;
                }
 
                ofs_in_node = le16_to_cpu(entry->ofs_in_node);
 
-               if (phase == 2) {
+               if (phase == 3) {
                        inode = f2fs_iget(sb, dni.ino);
                        if (IS_ERR(inode) || is_bad_inode(inode))
                                continue;
@@ -756,7 +774,7 @@ next_step:
                        continue;
                }
 
-               /* phase 3 */
+               /* phase 4 */
                inode = find_gc_inode(gc_list, dni.ino);
                if (inode) {
                        struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -789,7 +807,7 @@ next_step:
                }
        }
 
-       if (++phase < 4)
+       if (++phase < 5)
                goto next_step;
 }
 
@@ -815,7 +833,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
        struct blk_plug plug;
        unsigned int segno = start_segno;
        unsigned int end_segno = start_segno + sbi->segs_per_sec;
-       int seg_freed = 0;
+       int sec_freed = 0;
        unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
                                                SUM_TYPE_DATA : SUM_TYPE_NODE;
 
@@ -834,8 +852,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 
        for (segno = start_segno; segno < end_segno; segno++) {
 
-               if (get_valid_blocks(sbi, segno, 1) == 0)
-                       continue;
+               if (get_valid_blocks(sbi, segno, 1) == 0 ||
+                                       unlikely(f2fs_cp_error(sbi)))
+                       goto next;
 
                /* find segment summary of victim */
                sum_page = find_get_page(META_MAPPING(sbi),
@@ -861,7 +880,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                                                                gc_type);
 
                stat_inc_seg_count(sbi, type, gc_type);
-
+next:
                f2fs_put_page(sum_page, 0);
        }
 
@@ -871,22 +890,20 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 
        blk_finish_plug(&plug);
 
-       if (gc_type == FG_GC) {
-               while (start_segno < end_segno)
-                       if (get_valid_blocks(sbi, start_segno++, 1) == 0)
-                               seg_freed++;
-       }
+       if (gc_type == FG_GC &&
+               get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
+               sec_freed = 1;
 
        stat_inc_call_count(sbi->stat_info);
 
-       return seg_freed;
+       return sec_freed;
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
 {
        unsigned int segno;
        int gc_type = sync ? FG_GC : BG_GC;
-       int sec_freed = 0, seg_freed;
+       int sec_freed = 0;
        int ret = -EINVAL;
        struct cp_control cpc;
        struct gc_inode_list gc_list = {
@@ -905,7 +922,7 @@ gc_more:
                goto stop;
        }
 
-       if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
+       if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) {
                gc_type = FG_GC;
                /*
                 * If there is no victim and no prefree segment but still not
@@ -914,10 +931,14 @@ gc_more:
                 */
                if (__get_victim(sbi, &segno, gc_type) ||
                                                prefree_segments(sbi)) {
-                       write_checkpoint(sbi, &cpc);
+                       ret = write_checkpoint(sbi, &cpc);
+                       if (ret)
+                               goto stop;
                        segno = NULL_SEGNO;
-               } else if (has_not_enough_free_secs(sbi, 0)) {
-                       write_checkpoint(sbi, &cpc);
+               } else if (has_not_enough_free_secs(sbi, 0, 0)) {
+                       ret = write_checkpoint(sbi, &cpc);
+                       if (ret)
+                               goto stop;
                }
        }
 
@@ -925,20 +946,19 @@ gc_more:
                goto stop;
        ret = 0;
 
-       seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
-
-       if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
+       if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
+                       gc_type == FG_GC)
                sec_freed++;
 
        if (gc_type == FG_GC)
                sbi->cur_victim_sec = NULL_SEGNO;
 
        if (!sync) {
-               if (has_not_enough_free_secs(sbi, sec_freed))
+               if (has_not_enough_free_secs(sbi, sec_freed, 0))
                        goto gc_more;
 
                if (gc_type == FG_GC)
-                       write_checkpoint(sbi, &cpc);
+                       ret = write_checkpoint(sbi, &cpc);
        }
 stop:
        mutex_unlock(&sbi->gc_mutex);
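
For reference, a standalone sketch of the multi-pass "phase" structure gc_node_segment() and gc_data_segment() use after this change: the same victim entries are walked several times, with the early passes only issuing readahead (NAT blocks first, then node pages) so the final pass finds everything it needs already cached. The readahead_*() and migrate() helpers below are hypothetical stand-ins.

/* Multi-pass walk: cheap readahead passes first, real work last. */
#include <stdio.h>

#define NR_ENTRIES 4

static void readahead_nat(int i)  { printf("phase 0: ra NAT  for %d\n", i); }
static void readahead_node(int i) { printf("phase 1: ra node for %d\n", i); }
static void migrate(int i)        { printf("phase 2: migrate %d\n", i); }

int main(void)
{
	int phase = 0;
	int i;

next_step:
	for (i = 0; i < NR_ENTRIES; i++) {
		if (phase == 0) {
			readahead_nat(i);
			continue;
		}
		if (phase == 1) {
			readahead_node(i);
			continue;
		}
		/* phase == 2: everything should be cached by now */
		migrate(i);
	}
	if (++phase < 3)
		goto next_step;
	return 0;
}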
index ccea873..34234d8 100644 (file)
@@ -424,7 +424,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
                ino = le32_to_cpu(de->ino);
                fake_mode = get_de_type(de) << S_SHIFT;
 
-               err = f2fs_add_regular_entry(dir, &new_name, NULL,
+               err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL,
                                                        ino, fake_mode);
                if (err)
                        goto punch_dentry_pages;
@@ -445,8 +445,8 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
        struct f2fs_inline_dentry *backup_dentry;
        int err;
 
-       backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry),
-                                                       GFP_F2FS_ZERO);
+       backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
+                       sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO);
        if (!backup_dentry) {
                f2fs_put_page(ipage, 1);
                return -ENOMEM;
@@ -488,17 +488,17 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
                return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
 }
 
-int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
-                       struct inode *inode, nid_t ino, umode_t mode)
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
+                               const struct qstr *orig_name,
+                               struct inode *inode, nid_t ino, umode_t mode)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
        struct page *ipage;
        unsigned int bit_pos;
        f2fs_hash_t name_hash;
-       size_t namelen = name->len;
        struct f2fs_inline_dentry *dentry_blk = NULL;
        struct f2fs_dentry_ptr d;
-       int slots = GET_DENTRY_SLOTS(namelen);
+       int slots = GET_DENTRY_SLOTS(new_name->len);
        struct page *page = NULL;
        int err = 0;
 
@@ -519,18 +519,21 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 
        if (inode) {
                down_write(&F2FS_I(inode)->i_sem);
-               page = init_inode_metadata(inode, dir, name, ipage);
+               page = init_inode_metadata(inode, dir, new_name,
+                                               orig_name, ipage);
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        goto fail;
                }
+               if (f2fs_encrypted_inode(dir))
+                       file_set_enc_name(inode);
        }
 
        f2fs_wait_on_page_writeback(ipage, NODE, true);
 
-       name_hash = f2fs_dentry_hash(name);
+       name_hash = f2fs_dentry_hash(new_name);
        make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
-       f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+       f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
 
        set_page_dirty(ipage);
 
@@ -563,7 +566,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
        inline_dentry = inline_data_addr(page);
        bit_pos = dentry - inline_dentry->dentry;
        for (i = 0; i < slots; i++)
-               test_and_clear_bit_le(bit_pos + i,
+               __clear_bit_le(bit_pos + i,
                                &inline_dentry->dentry_bitmap);
 
        set_page_dirty(page);
index 9ac5efc..d736989 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
 #include <linux/writeback.h>
 
 #include "f2fs.h"
@@ -234,6 +235,20 @@ bad_inode:
        return ERR_PTR(ret);
 }
 
+struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
+{
+       struct inode *inode;
+retry:
+       inode = f2fs_iget(sb, ino);
+       if (IS_ERR(inode)) {
+               if (PTR_ERR(inode) == -ENOMEM) {
+                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       goto retry;
+               }
+       }
+       return inode;
+}
+
 int update_inode(struct inode *inode, struct page *node_page)
 {
        struct f2fs_inode *ri;
@@ -354,7 +369,7 @@ void f2fs_evict_inode(struct inode *inode)
                goto no_delete;
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_EVICT_INODE))
+       if (time_to_inject(sbi, FAULT_EVICT_INODE))
                goto no_delete;
 #endif
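
For reference, a userspace sketch of the retry loop behind the new f2fs_iget_retry() introduced above in inode.c: a transient -ENOMEM is no longer treated as fatal; the caller backs off briefly (the kernel waits via congestion_wait(BLK_RW_ASYNC, HZ/50)) and tries again, while any other result is returned immediately. try_get() is a hypothetical stand-in that fails twice before succeeding.

/* Retry on transient allocation failure with a short back-off. */
#include <errno.h>
#include <stdio.h>
#include <time.h>

static int attempts;

static int try_get(void)			/* hypothetical stand-in */
{
	return ++attempts < 3 ? -ENOMEM : 0;
}

static int get_with_retry(void)
{
	int err;

	for (;;) {
		err = try_get();
		if (err != -ENOMEM)
			return err;	/* success, or a real error */
		/* back off ~20ms, like congestion_wait(BLK_RW_ASYNC, HZ/50) */
		nanosleep(&(struct timespec){ .tv_nsec = 20 * 1000 * 1000 }, NULL);
	}
}

int main(void)
{
	printf("result %d after %d attempts\n", get_with_retry(), attempts);
	return 0;
}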
 
index 73fa356..300aef8 100644 (file)
@@ -91,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
 {
        size_t slen = strlen(s);
        size_t sublen = strlen(sub);
+       int i;
 
        /*
         * filename format of multimedia file should be defined as:
-        * "filename + '.' + extension".
+        * "filename + '.' + extension + (optional: '.' + temp extension)".
         */
        if (slen < sublen + 2)
                return 0;
 
-       if (s[slen - sublen - 1] != '.')
-               return 0;
+       for (i = 1; i < slen - sublen; i++) {
+               if (s[i] != '.')
+                       continue;
+               if (!strncasecmp(s + i + 1, sub, sublen))
+                       return 1;
+       }
 
-       return !strncasecmp(s + slen - sublen, sub, sublen);
+       return 0;
 }
 
 /*
@@ -1010,7 +1015,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
        struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
        struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
        struct fscrypt_symlink_data *sd;
-       loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
        u32 max_size = inode->i_sb->s_blocksize;
        int res;
 
@@ -1025,7 +1029,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
        if (IS_ERR(cpage))
                return ERR_CAST(cpage);
        caddr = page_address(cpage);
-       caddr[size] = 0;
 
        /* Symlink is encrypted */
        sd = (struct fscrypt_symlink_data *)caddr;
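
For reference, a standalone version of the relaxed extension match above: instead of requiring the media extension to be the very last suffix, every '.'-separated component after the base name is checked, so temporary names such as "clip.mp4.part" are still detected as multimedia files. This mirrors the patched loop outside the kernel; only the test harness in main() is added.

/* Match "name.ext" and "name.ext.whatever", case-insensitively. */
#include <stdio.h>
#include <string.h>
#include <strings.h>

static int is_multimedia_file(const char *s, const char *sub)
{
	size_t slen = strlen(s);
	size_t sublen = strlen(sub);
	size_t i;

	if (slen < sublen + 2)
		return 0;

	for (i = 1; i + sublen < slen; i++) {
		if (s[i] != '.')
			continue;
		if (!strncasecmp(s + i + 1, sub, sublen))
			return 1;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", is_multimedia_file("movie.mp4", "mp4"));		/* 1 */
	printf("%d\n", is_multimedia_file("movie.mp4.partial", "mp4"));	/* 1 */
	printf("%d\n", is_multimedia_file("notes.txt", "mp4"));		/* 0 */
	return 0;
}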
index f75d197..8831035 100644 (file)
@@ -54,8 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
                if (excess_cached_nats(sbi))
                        res = false;
-               if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD)
-                       res = false;
        } else if (type == DIRTY_DENTS) {
                if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
@@ -1314,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
        struct page *last_page = NULL;
        bool marked = false;
        nid_t ino = inode->i_ino;
+       int nwritten = 0;
 
        if (atomic) {
                last_page = last_fsync_dnode(sbi, ino);
@@ -1387,7 +1386,10 @@ continue_unlock:
                                unlock_page(page);
                                f2fs_put_page(last_page, 0);
                                break;
+                       } else {
+                               nwritten++;
                        }
+
                        if (page == last_page) {
                                f2fs_put_page(page, 0);
                                marked = true;
@@ -1409,6 +1411,9 @@ continue_unlock:
                unlock_page(last_page);
                goto retry;
        }
+
+       if (nwritten)
+               f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
        return ret ? -EIO: 0;
 }
 
@@ -1418,6 +1423,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
        struct pagevec pvec;
        int step = 0;
        int nwritten = 0;
+       int ret = 0;
 
        pagevec_init(&pvec, 0);
 
@@ -1438,7 +1444,8 @@ next_step:
 
                        if (unlikely(f2fs_cp_error(sbi))) {
                                pagevec_release(&pvec);
-                               return -EIO;
+                               ret = -EIO;
+                               goto out;
                        }
 
                        /*
@@ -1489,6 +1496,8 @@ continue_unlock:
 
                        if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
                                unlock_page(page);
+                       else
+                               nwritten++;
 
                        if (--wbc->nr_to_write == 0)
                                break;
@@ -1506,7 +1515,10 @@ continue_unlock:
                step++;
                goto next_step;
        }
-       return nwritten;
+out:
+       if (nwritten)
+               f2fs_submit_merged_bio(sbi, NODE, WRITE);
+       return ret;
 }
 
 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
@@ -1672,6 +1684,9 @@ const struct address_space_operations f2fs_node_aops = {
        .set_page_dirty = f2fs_set_node_page_dirty,
        .invalidatepage = f2fs_invalidate_page,
        .releasepage    = f2fs_release_page,
+#ifdef CONFIG_MIGRATION
+       .migratepage    = f2fs_migrate_page,
+#endif
 };
 
 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1838,7 +1853,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
        struct free_nid *i = NULL;
 retry:
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (time_to_inject(FAULT_ALLOC_NID))
+       if (time_to_inject(sbi, FAULT_ALLOC_NID))
                return false;
 #endif
        if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
@@ -2015,10 +2030,12 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 
        if (unlikely(old_ni.blk_addr != NULL_ADDR))
                return -EINVAL;
-
+retry:
        ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
-       if (!ipage)
-               return -ENOMEM;
+       if (!ipage) {
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               goto retry;
+       }
 
        /* Should not use this inode from free nid list */
        remove_free_nid(NM_I(sbi), ino);
index fc76845..868bec6 100644 (file)
@@ -229,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
        f2fs_change_bit(block_off, nm_i->nat_bitmap);
 }
 
+static inline nid_t ino_of_node(struct page *node_page)
+{
+       struct f2fs_node *rn = F2FS_NODE(node_page);
+       return le32_to_cpu(rn->footer.ino);
+}
+
+static inline nid_t nid_of_node(struct page *node_page)
+{
+       struct f2fs_node *rn = F2FS_NODE(node_page);
+       return le32_to_cpu(rn->footer.nid);
+}
+
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+       struct f2fs_node *rn = F2FS_NODE(node_page);
+       unsigned flag = le32_to_cpu(rn->footer.flag);
+       return flag >> OFFSET_BIT_SHIFT;
+}
+
+static inline __u64 cpver_of_node(struct page *node_page)
+{
+       struct f2fs_node *rn = F2FS_NODE(node_page);
+       return le64_to_cpu(rn->footer.cp_ver);
+}
+
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+       struct f2fs_node *rn = F2FS_NODE(node_page);
+       return le32_to_cpu(rn->footer.next_blkaddr);
+}
+
 static inline void fill_node_footer(struct page *page, nid_t nid,
                                nid_t ino, unsigned int ofs, bool reset)
 {
@@ -259,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
 {
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
        struct f2fs_node *rn = F2FS_NODE(page);
+       size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
+       __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);
 
-       rn->footer.cp_ver = ckpt->checkpoint_ver;
+       if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
+               __u64 crc = le32_to_cpu(*((__le32 *)
+                               ((unsigned char *)ckpt + crc_offset)));
+               cp_ver |= (crc << 32);
+       }
+       rn->footer.cp_ver = cpu_to_le64(cp_ver);
        rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
 }
 
-static inline nid_t ino_of_node(struct page *node_page)
-{
-       struct f2fs_node *rn = F2FS_NODE(node_page);
-       return le32_to_cpu(rn->footer.ino);
-}
-
-static inline nid_t nid_of_node(struct page *node_page)
+static inline bool is_recoverable_dnode(struct page *page)
 {
-       struct f2fs_node *rn = F2FS_NODE(node_page);
-       return le32_to_cpu(rn->footer.nid);
-}
-
-static inline unsigned int ofs_of_node(struct page *node_page)
-{
-       struct f2fs_node *rn = F2FS_NODE(node_page);
-       unsigned flag = le32_to_cpu(rn->footer.flag);
-       return flag >> OFFSET_BIT_SHIFT;
-}
-
-static inline unsigned long long cpver_of_node(struct page *node_page)
-{
-       struct f2fs_node *rn = F2FS_NODE(node_page);
-       return le64_to_cpu(rn->footer.cp_ver);
-}
+       struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+       size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
+       __u64 cp_ver = cur_cp_version(ckpt);
 
-static inline block_t next_blkaddr_of_node(struct page *node_page)
-{
-       struct f2fs_node *rn = F2FS_NODE(node_page);
-       return le32_to_cpu(rn->footer.next_blkaddr);
+       if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
+               __u64 crc = le32_to_cpu(*((__le32 *)
+                               ((unsigned char *)ckpt + crc_offset)));
+               cp_ver |= (crc << 32);
+       }
+       return cpu_to_le64(cp_ver) == cpver_of_node(page);
 }
 
 /*
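
For reference, a sketch of the version tag used by the CP_CRC_RECOVERY path above: when the flag is set, the 32-bit checkpoint CRC is folded into the upper half of the 64-bit cp_ver written into each node footer, so is_recoverable_dnode() rejects nodes written against a different checkpoint even when the low 32 version bits happen to match. Plain host-endian integers are used below; the on-disk fields are little-endian, and these helper names are local to the sketch.

/* Pack (checkpoint version, CRC) into one 64-bit recovery tag and compare. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t tag_cp_ver(uint64_t cp_ver, uint32_t crc, bool crc_recovery)
{
	if (crc_recovery)
		cp_ver |= (uint64_t)crc << 32;
	return cp_ver;
}

static bool is_recoverable(uint64_t footer_ver, uint64_t cp_ver,
			   uint32_t crc, bool crc_recovery)
{
	return footer_ver == tag_cp_ver(cp_ver, crc, crc_recovery);
}

int main(void)
{
	uint64_t footer = tag_cp_ver(7, 0xdeadbeef, true);	/* written at CP #7 */

	printf("%d\n", is_recoverable(footer, 7, 0xdeadbeef, true));	/* 1 */
	printf("%d\n", is_recoverable(footer, 7, 0x12345678, true));	/* 0 */
	return 0;
}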
index 9e652d5..2fc84a9 100644 (file)
@@ -68,15 +68,17 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
        return NULL;
 }
 
-static struct fsync_inode_entry *add_fsync_inode(struct list_head *head,
-                                                       struct inode *inode)
+static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
+                                       struct list_head *head, nid_t ino)
 {
+       struct inode *inode;
        struct fsync_inode_entry *entry;
 
-       entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
-       if (!entry)
-               return NULL;
+       inode = f2fs_iget_retry(sbi->sb, ino);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
 
+       entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
        entry->inode = inode;
        list_add_tail(&entry->list, head);
 
@@ -96,48 +98,41 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
        struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
        nid_t pino = le32_to_cpu(raw_inode->i_pino);
        struct f2fs_dir_entry *de;
-       struct qstr name;
+       struct fscrypt_name fname;
        struct page *page;
        struct inode *dir, *einode;
        struct fsync_inode_entry *entry;
        int err = 0;
+       char *name;
 
        entry = get_fsync_inode(dir_list, pino);
        if (!entry) {
-               dir = f2fs_iget(inode->i_sb, pino);
-               if (IS_ERR(dir)) {
-                       err = PTR_ERR(dir);
-                       goto out;
-               }
-
-               entry = add_fsync_inode(dir_list, dir);
-               if (!entry) {
-                       err = -ENOMEM;
-                       iput(dir);
+               entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino);
+               if (IS_ERR(entry)) {
+                       dir = ERR_CAST(entry);
+                       err = PTR_ERR(entry);
                        goto out;
                }
        }
 
        dir = entry->inode;
 
-       if (file_enc_name(inode))
-               return 0;
+       memset(&fname, 0, sizeof(struct fscrypt_name));
+       fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen);
+       fname.disk_name.name = raw_inode->i_name;
 
-       name.len = le32_to_cpu(raw_inode->i_namelen);
-       name.name = raw_inode->i_name;
-
-       if (unlikely(name.len > F2FS_NAME_LEN)) {
+       if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) {
                WARN_ON(1);
                err = -ENAMETOOLONG;
                goto out;
        }
 retry:
-       de = f2fs_find_entry(dir, &name, &page);
+       de = __f2fs_find_entry(dir, &fname, &page);
        if (de && inode->i_ino == le32_to_cpu(de->ino))
                goto out_unmap_put;
 
        if (de) {
-               einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+               einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
                if (IS_ERR(einode)) {
                        WARN_ON(1);
                        err = PTR_ERR(einode);
@@ -156,18 +151,24 @@ retry:
        } else if (IS_ERR(page)) {
                err = PTR_ERR(page);
        } else {
-               err = __f2fs_add_link(dir, &name, inode,
+               err = __f2fs_do_add_link(dir, &fname, inode,
                                        inode->i_ino, inode->i_mode);
        }
+       if (err == -ENOMEM)
+               goto retry;
        goto out;
 
 out_unmap_put:
        f2fs_dentry_kunmap(dir, page);
        f2fs_put_page(page, 0);
 out:
+       if (file_enc_name(inode))
+               name = "<encrypted>";
+       else
+               name = raw_inode->i_name;
        f2fs_msg(inode->i_sb, KERN_NOTICE,
                        "%s: ino = %x, name = %s, dir = %lx, err = %d",
-                       __func__, ino_of_node(ipage), raw_inode->i_name,
+                       __func__, ino_of_node(ipage), name,
                        IS_ERR(dir) ? 0 : dir->i_ino, err);
        return err;
 }
@@ -223,9 +224,7 @@ static bool is_same_inode(struct inode *inode, struct page *ipage)
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 {
-       unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
        struct curseg_info *curseg;
-       struct inode *inode;
        struct page *page = NULL;
        block_t blkaddr;
        int err = 0;
@@ -242,7 +241,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 
                page = get_tmp_page(sbi, blkaddr);
 
-               if (cp_ver != cpver_of_node(page))
+               if (!is_recoverable_dnode(page))
                        break;
 
                if (!is_fsync_dnode(page))
@@ -263,23 +262,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
                         * CP | dnode(F) | inode(DF)
                         * For this case, we should not give up now.
                         */
-                       inode = f2fs_iget(sbi->sb, ino_of_node(page));
-                       if (IS_ERR(inode)) {
-                               err = PTR_ERR(inode);
+                       entry = add_fsync_inode(sbi, head, ino_of_node(page));
+                       if (IS_ERR(entry)) {
+                               err = PTR_ERR(entry);
                                if (err == -ENOENT) {
                                        err = 0;
                                        goto next;
                                }
                                break;
                        }
-
-                       /* add this fsync inode to the list */
-                       entry = add_fsync_inode(head, inode);
-                       if (!entry) {
-                               err = -ENOMEM;
-                               iput(inode);
-                               break;
-                       }
                }
                entry->blkaddr = blkaddr;
 
@@ -363,7 +354,7 @@ got_it:
 
        if (ino != dn->inode->i_ino) {
                /* Deallocate previous index in the node page */
-               inode = f2fs_iget(sbi->sb, ino);
+               inode = f2fs_iget_retry(sbi->sb, ino);
                if (IS_ERR(inode))
                        return PTR_ERR(inode);
        } else {
@@ -431,10 +422,15 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
        end = start + ADDRS_PER_PAGE(page, inode);
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-
+retry_dn:
        err = get_dnode_of_data(&dn, start, ALLOC_NODE);
-       if (err)
+       if (err) {
+               if (err == -ENOMEM) {
+                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       goto retry_dn;
+               }
                goto out;
+       }
 
        f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
 
@@ -485,11 +481,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
                                if (err)
                                        goto err;
                        }
-
+retry_prev:
                        /* Check the previous node page having this index */
                        err = check_index_in_prev_nodes(sbi, dest, &dn);
-                       if (err)
+                       if (err) {
+                               if (err == -ENOMEM) {
+                                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                                       goto retry_prev;
+                               }
                                goto err;
+                       }
 
                        /* write dummy data page */
                        f2fs_replace_block(sbi, &dn, src, dest,
@@ -514,7 +515,6 @@ out:
 static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
                                                struct list_head *dir_list)
 {
-       unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
        struct curseg_info *curseg;
        struct page *page = NULL;
        int err = 0;
@@ -534,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
 
                page = get_tmp_page(sbi, blkaddr);
 
-               if (cp_ver != cpver_of_node(page)) {
+               if (!is_recoverable_dnode(page)) {
                        f2fs_put_page(page, 1);
                        break;
                }
@@ -626,38 +626,20 @@ out:
        }
 
        clear_sbi_flag(sbi, SBI_POR_DOING);
-       if (err) {
-               bool invalidate = false;
-
-               if (test_opt(sbi, LFS)) {
-                       update_meta_page(sbi, NULL, blkaddr);
-                       invalidate = true;
-               } else if (discard_next_dnode(sbi, blkaddr)) {
-                       invalidate = true;
-               }
-
-               /* Flush all the NAT/SIT pages */
-               while (get_pages(sbi, F2FS_DIRTY_META))
-                       sync_meta_pages(sbi, META, LONG_MAX);
+       if (err)
+               set_ckpt_flags(sbi, CP_ERROR_FLAG);
+       mutex_unlock(&sbi->cp_mutex);
 
-               /* invalidate temporary meta page */
-               if (invalidate)
-                       invalidate_mapping_pages(META_MAPPING(sbi),
-                                                       blkaddr, blkaddr);
+       /* let's drop all the directory inodes for clean checkpoint */
+       destroy_fsync_dnodes(&dir_list);
 
-               set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
-               mutex_unlock(&sbi->cp_mutex);
-       } else if (need_writecp) {
+       if (!err && need_writecp) {
                struct cp_control cpc = {
                        .reason = CP_RECOVERY,
                };
-               mutex_unlock(&sbi->cp_mutex);
                err = write_checkpoint(sbi, &cpc);
-       } else {
-               mutex_unlock(&sbi->cp_mutex);
        }
 
-       destroy_fsync_dnodes(&dir_list);
        kmem_cache_destroy(fsync_entry_slab);
        return ret ? ret: err;
 }
index a46296f..fc886f0 100644 (file)
@@ -26,6 +26,7 @@
 #define __reverse_ffz(x) __reverse_ffs(~(x))
 
 static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *bio_entry_slab;
 static struct kmem_cache *sit_entry_set_slab;
 static struct kmem_cache *inmem_entry_slab;
 
@@ -344,6 +345,11 @@ int commit_inmem_pages(struct inode *inode)
  */
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 {
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(sbi, FAULT_CHECKPOINT))
+               f2fs_stop_checkpoint(sbi, false);
+#endif
+
        if (!need)
                return;
 
@@ -355,7 +361,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
         * We should do GC or end up with checkpoint, if there are so many dirty
         * dir/node pages without enough free segments.
         */
-       if (has_not_enough_free_secs(sbi, 0)) {
+       if (has_not_enough_free_secs(sbi, 0, 0)) {
                mutex_lock(&sbi->gc_mutex);
                f2fs_gc(sbi, false);
        }
@@ -580,6 +586,74 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
        mutex_unlock(&dirty_i->seglist_lock);
 }
 
+static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi,
+                                                       struct bio *bio)
+{
+       struct list_head *wait_list = &(SM_I(sbi)->wait_list);
+       struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+
+       INIT_LIST_HEAD(&be->list);
+       be->bio = bio;
+       init_completion(&be->event);
+       list_add_tail(&be->list, wait_list);
+
+       return be;
+}
+
+void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi)
+{
+       struct list_head *wait_list = &(SM_I(sbi)->wait_list);
+       struct bio_entry *be, *tmp;
+
+       list_for_each_entry_safe(be, tmp, wait_list, list) {
+               struct bio *bio = be->bio;
+               int err;
+
+               wait_for_completion_io(&be->event);
+               err = be->error;
+               if (err == -EOPNOTSUPP)
+                       err = 0;
+
+               if (err)
+                       f2fs_msg(sbi->sb, KERN_INFO,
+                               "Issue discard failed, ret: %d", err);
+
+               bio_put(bio);
+               list_del(&be->list);
+               kmem_cache_free(bio_entry_slab, be);
+       }
+}
+
+static void f2fs_submit_bio_wait_endio(struct bio *bio)
+{
+       struct bio_entry *be = (struct bio_entry *)bio->bi_private;
+
+       be->error = bio->bi_error;
+       complete(&be->event);
+}
+
+/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
+int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
+               sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+       struct block_device *bdev = sbi->sb->s_bdev;
+       struct bio *bio = NULL;
+       int err;
+
+       err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
+                       &bio);
+       if (!err && bio) {
+               struct bio_entry *be = __add_bio_entry(sbi, bio);
+
+               bio->bi_private = be;
+               bio->bi_end_io = f2fs_submit_bio_wait_endio;
+               bio->bi_opf |= REQ_SYNC;
+               submit_bio(bio);
+       }
+
+       return err;
+}
+
 static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
                                block_t blkstart, block_t blklen)
 {
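
For reference, a userspace sketch of the "issue now, wait later" pattern the new bio_entry list implements above: each submitted discard bio carries a completion that its end_io callback signals, and f2fs_wait_all_discard_bio() later walks the list, waits on each entry and collects its error. POSIX threads stand in for the block layer here; none of the names below are f2fs or block-layer APIs. Build with -lpthread.

/* Fire-and-collect: submit work asynchronously, wait for all of it later. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
	pthread_t worker;
	pthread_mutex_t lock;
	pthread_cond_t done_cv;
	int done;
	int error;
	struct req *next;
};

static void *end_io(void *arg)		/* plays the role of bi_end_io */
{
	struct req *r = arg;

	pthread_mutex_lock(&r->lock);
	r->error = 0;			/* report success */
	r->done = 1;
	pthread_cond_signal(&r->done_cv);
	pthread_mutex_unlock(&r->lock);
	return NULL;
}

static struct req *submit(struct req *head)
{
	struct req *r = calloc(1, sizeof(*r));

	if (!r)
		abort();
	pthread_mutex_init(&r->lock, NULL);
	pthread_cond_init(&r->done_cv, NULL);
	r->next = head;
	pthread_create(&r->worker, NULL, end_io, r);
	return r;			/* new list head */
}

static void wait_all(struct req *head)
{
	while (head) {
		struct req *r = head;

		pthread_mutex_lock(&r->lock);
		while (!r->done)
			pthread_cond_wait(&r->done_cv, &r->lock);
		pthread_mutex_unlock(&r->lock);
		pthread_join(r->worker, NULL);
		printf("request done, error=%d\n", r->error);
		head = r->next;
		free(r);
	}
}

int main(void)
{
	struct req *head = NULL;
	int i;

	for (i = 0; i < 3; i++)
		head = submit(head);
	wait_all(head);
	return 0;
}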
@@ -597,29 +671,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
                        sbi->discard_blks--;
        }
        trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
-       return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
-}
-
-bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
-{
-       int err = -EOPNOTSUPP;
-
-       if (test_opt(sbi, DISCARD)) {
-               struct seg_entry *se = get_seg_entry(sbi,
-                               GET_SEGNO(sbi, blkaddr));
-               unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
-
-               if (f2fs_test_bit(offset, se->discard_map))
-                       return false;
-
-               err = f2fs_issue_discard(sbi, blkaddr, 1);
-       }
-
-       if (err) {
-               update_meta_page(sbi, NULL, blkaddr);
-               return true;
-       }
-       return false;
+       return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0);
 }
 
 static void __add_discard_entry(struct f2fs_sb_info *sbi,
@@ -660,7 +712,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        bool force = (cpc->reason == CP_DISCARD);
        int i;
 
-       if (se->valid_blocks == max_blocks)
+       if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
                return;
 
        if (!force) {
@@ -719,11 +771,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        struct list_head *head = &(SM_I(sbi)->discard_list);
        struct discard_entry *entry, *this;
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+       struct blk_plug plug;
        unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
        unsigned int start = 0, end = -1;
        unsigned int secno, start_segno;
        bool force = (cpc->reason == CP_DISCARD);
 
+       blk_start_plug(&plug);
+
        mutex_lock(&dirty_i->seglist_lock);
 
        while (1) {
@@ -772,6 +827,8 @@ skip:
                SM_I(sbi)->nr_discards -= entry->len;
                kmem_cache_free(discard_entry_slab, entry);
        }
+
+       blk_finish_plug(&plug);
 }
 
 static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -818,12 +875,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
        if (del > 0) {
                if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
                        f2fs_bug_on(sbi, 1);
-               if (!f2fs_test_and_set_bit(offset, se->discard_map))
+               if (f2fs_discard_en(sbi) &&
+                       !f2fs_test_and_set_bit(offset, se->discard_map))
                        sbi->discard_blks--;
        } else {
                if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
                        f2fs_bug_on(sbi, 1);
-               if (f2fs_test_and_clear_bit(offset, se->discard_map))
+               if (f2fs_discard_en(sbi) &&
+                       f2fs_test_and_clear_bit(offset, se->discard_map))
                        sbi->discard_blks++;
        }
        if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -1202,7 +1261,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
        struct curseg_info *curseg = CURSEG_I(sbi, type);
        const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
 
-       if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+       if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0))
                return v_ops->get_victim(sbi,
                                &(curseg)->next_segno, BG_GC, type, SSR);
 
@@ -1277,6 +1336,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
        if (end <= MAIN_BLKADDR(sbi))
                goto out;
 
+       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+               f2fs_msg(sbi->sb, KERN_WARNING,
+                       "Found FS corruption, run fsck to fix.");
+               goto out;
+       }
+
        /* start/end segment number in main_area */
        start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
        end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
@@ -1301,6 +1366,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
                mutex_lock(&sbi->gc_mutex);
                err = write_checkpoint(sbi, &cpc);
                mutex_unlock(&sbi->gc_mutex);
+               if (err)
+                       break;
+
+               schedule();
        }
 out:
        range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
@@ -1391,7 +1460,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 
        /* direct_io'ed data is aligned to the segment for better performance */
        if (direct_io && curseg->next_blkoff &&
-                               !has_not_enough_free_secs(sbi, 0))
+                               !has_not_enough_free_secs(sbi, 0, 0))
                __allocate_new_segments(sbi, type);
 
        *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
@@ -1589,11 +1658,9 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
 {
        struct page *cpage;
 
-       if (blkaddr == NEW_ADDR)
+       if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
                return;
 
-       f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
-
        cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
        if (cpage) {
                f2fs_wait_on_page_writeback(cpage, DATA, true);
@@ -1739,7 +1806,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
        int type = CURSEG_HOT_DATA;
        int err;
 
-       if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+       if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
                int npages = npages_for_summary_flush(sbi, true);
 
                if (npages >= 2)
@@ -1836,7 +1903,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
 
 void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-       if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
+       if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
                write_compacted_summaries(sbi, start_blk);
        else
                write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
@@ -2127,12 +2194,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
                        = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
                sit_i->sentries[start].ckpt_valid_map
                        = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
-               sit_i->sentries[start].discard_map
-                       = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
                if (!sit_i->sentries[start].cur_valid_map ||
-                               !sit_i->sentries[start].ckpt_valid_map ||
-                               !sit_i->sentries[start].discard_map)
+                               !sit_i->sentries[start].ckpt_valid_map)
                        return -ENOMEM;
+
+               if (f2fs_discard_en(sbi)) {
+                       sit_i->sentries[start].discard_map
+                               = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+                       if (!sit_i->sentries[start].discard_map)
+                               return -ENOMEM;
+               }
        }
 
        sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
@@ -2239,6 +2310,8 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
        struct sit_info *sit_i = SIT_I(sbi);
        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
        struct f2fs_journal *journal = curseg->journal;
+       struct seg_entry *se;
+       struct f2fs_sit_entry sit;
        int sit_blk_cnt = SIT_BLK_CNT(sbi);
        unsigned int i, start, end;
        unsigned int readed, start_blk = 0;
@@ -2251,41 +2324,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
                end = (start_blk + readed) * sit_i->sents_per_block;
 
                for (; start < end && start < MAIN_SEGS(sbi); start++) {
-                       struct seg_entry *se = &sit_i->sentries[start];
                        struct f2fs_sit_block *sit_blk;
-                       struct f2fs_sit_entry sit;
                        struct page *page;
 
-                       down_read(&curseg->journal_rwsem);
-                       for (i = 0; i < sits_in_cursum(journal); i++) {
-                               if (le32_to_cpu(segno_in_journal(journal, i))
-                                                               == start) {
-                                       sit = sit_in_journal(journal, i);
-                                       up_read(&curseg->journal_rwsem);
-                                       goto got_it;
-                               }
-                       }
-                       up_read(&curseg->journal_rwsem);
-
+                       se = &sit_i->sentries[start];
                        page = get_current_sit_page(sbi, start);
                        sit_blk = (struct f2fs_sit_block *)page_address(page);
                        sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
                        f2fs_put_page(page, 1);
-got_it:
+
                        check_block_count(sbi, start, &sit);
                        seg_info_from_raw_sit(se, &sit);
 
                        /* build discard map only one time */
-                       memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
-                       sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks;
-
-                       if (sbi->segs_per_sec > 1) {
-                               struct sec_entry *e = get_sec_entry(sbi, start);
-                               e->valid_blocks += se->valid_blocks;
+                       if (f2fs_discard_en(sbi)) {
+                               memcpy(se->discard_map, se->cur_valid_map,
+                                                       SIT_VBLOCK_MAP_SIZE);
+                               sbi->discard_blks += sbi->blocks_per_seg -
+                                                       se->valid_blocks;
                        }
+
+                       if (sbi->segs_per_sec > 1)
+                               get_sec_entry(sbi, start)->valid_blocks +=
+                                                       se->valid_blocks;
                }
                start_blk += readed;
        } while (start_blk < sit_blk_cnt);
+
+       down_read(&curseg->journal_rwsem);
+       for (i = 0; i < sits_in_cursum(journal); i++) {
+               unsigned int old_valid_blocks;
+
+               start = le32_to_cpu(segno_in_journal(journal, i));
+               se = &sit_i->sentries[start];
+               sit = sit_in_journal(journal, i);
+
+               old_valid_blocks = se->valid_blocks;
+
+               check_block_count(sbi, start, &sit);
+               seg_info_from_raw_sit(se, &sit);
+
+               if (f2fs_discard_en(sbi)) {
+                       memcpy(se->discard_map, se->cur_valid_map,
+                                               SIT_VBLOCK_MAP_SIZE);
+                       sbi->discard_blks += old_valid_blocks -
+                                               se->valid_blocks;
+               }
+
+               if (sbi->segs_per_sec > 1)
+                       get_sec_entry(sbi, start)->valid_blocks +=
+                               se->valid_blocks - old_valid_blocks;
+       }
+       up_read(&curseg->journal_rwsem);
 }
 
 static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -2427,6 +2517,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
        sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
 
        INIT_LIST_HEAD(&sm_info->discard_list);
+       INIT_LIST_HEAD(&sm_info->wait_list);
        sm_info->nr_discards = 0;
        sm_info->max_discards = 0;
 
@@ -2570,10 +2661,15 @@ int __init create_segment_manager_caches(void)
        if (!discard_entry_slab)
                goto fail;
 
+       bio_entry_slab = f2fs_kmem_cache_create("bio_entry",
+                       sizeof(struct bio_entry));
+       if (!bio_entry_slab)
+               goto destroy_discard_entry;
+
        sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
                        sizeof(struct sit_entry_set));
        if (!sit_entry_set_slab)
-               goto destory_discard_entry;
+               goto destroy_bio_entry;
 
        inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
                        sizeof(struct inmem_pages));
@@ -2583,7 +2679,9 @@ int __init create_segment_manager_caches(void)
 
 destroy_sit_entry_set:
        kmem_cache_destroy(sit_entry_set_slab);
-destory_discard_entry:
+destroy_bio_entry:
+       kmem_cache_destroy(bio_entry_slab);
+destroy_discard_entry:
        kmem_cache_destroy(discard_entry_slab);
 fail:
        return -ENOMEM;
@@ -2592,6 +2690,7 @@ fail:
 void destroy_segment_manager_caches(void)
 {
        kmem_cache_destroy(sit_entry_set_slab);
+       kmem_cache_destroy(bio_entry_slab);
        kmem_cache_destroy(discard_entry_slab);
        kmem_cache_destroy(inmem_entry_slab);
 }
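
The hunks above replace the synchronous blkdev_issue_discard() call with an asynchronous submission path: each discard bio is queued as a bio_entry carrying a completion and an error code on the segment manager's new wait_list, and the entries are reaped later with -EOPNOTSUPP treated as success. A rough userspace analogue of that submit-then-reap pattern, with invented names and pthreads standing in for bios and completions (illustration only, not kernel code), could look like this:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct req_entry {                      /* stands in for struct bio_entry */
            pthread_t thread;               /* the in-flight request */
            int error;                      /* filled in on completion */
            struct req_entry *next;
    };

    static void *do_discard(void *arg)      /* stands in for the submitted bio */
    {
            struct req_entry *e = arg;

            e->error = -EOPNOTSUPP;         /* pretend the device lacks discard */
            return NULL;
    }

    int main(void)
    {
            struct req_entry *head = NULL;

            /* submit phase: queue each request on a wait list and fire it off */
            for (int i = 0; i < 4; i++) {
                    struct req_entry *e = calloc(1, sizeof(*e));

                    if (!e)
                            return 1;
                    e->next = head;
                    head = e;
                    pthread_create(&e->thread, NULL, do_discard, e);
            }

            /* reap phase: wait for every queued entry, as the wait_list code does */
            while (head) {
                    struct req_entry *e = head;
                    int err;

                    pthread_join(e->thread, NULL);  /* wait_for_completion() analogue */
                    err = e->error;
                    if (err == -EOPNOTSUPP)         /* "not supported" is not an error */
                            err = 0;
                    if (err)
                            fprintf(stderr, "discard failed: %d\n", err);
                    head = e->next;
                    free(e);
            }
            return 0;
    }
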
index b33f73e..fecb856 100644 (file)
@@ -479,7 +479,8 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
                                                reserved_sections(sbi) + 1);
 }
 
-static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
+                                       int freed, int needed)
 {
        int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
        int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
@@ -489,8 +490,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
                return false;
 
-       return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
-                                               reserved_sections(sbi));
+       return (free_sections(sbi) + freed) <=
+               (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed);
 }
 
 static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -587,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
-       f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi)
-                                       || blk_addr >= MAX_BLKADDR(sbi));
+       BUG_ON(blk_addr < SEG0_BLKADDR(sbi)
+                       || blk_addr >= MAX_BLKADDR(sbi));
 }
 
 /*
index 7f863a6..6132b4c 100644 (file)
@@ -40,7 +40,6 @@ static struct kmem_cache *f2fs_inode_cachep;
 static struct kset *f2fs_kset;
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-struct f2fs_fault_info f2fs_fault;
 
 char *fault_name[FAULT_MAX] = {
        [FAULT_KMALLOC]         = "kmalloc",
@@ -50,16 +49,21 @@ char *fault_name[FAULT_MAX] = {
        [FAULT_BLOCK]           = "no more block",
        [FAULT_DIR_DEPTH]       = "too big dir depth",
        [FAULT_EVICT_INODE]     = "evict_inode fail",
+       [FAULT_IO]              = "IO error",
+       [FAULT_CHECKPOINT]      = "checkpoint error",
 };
 
-static void f2fs_build_fault_attr(unsigned int rate)
+static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
+                                               unsigned int rate)
 {
+       struct f2fs_fault_info *ffi = &sbi->fault_info;
+
        if (rate) {
-               atomic_set(&f2fs_fault.inject_ops, 0);
-               f2fs_fault.inject_rate = rate;
-               f2fs_fault.inject_type = (1 << FAULT_MAX) - 1;
+               atomic_set(&ffi->inject_ops, 0);
+               ffi->inject_rate = rate;
+               ffi->inject_type = (1 << FAULT_MAX) - 1;
        } else {
-               memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info));
+               memset(ffi, 0, sizeof(struct f2fs_fault_info));
        }
 }
 #endif
@@ -87,6 +91,7 @@ enum {
        Opt_inline_xattr,
        Opt_inline_data,
        Opt_inline_dentry,
+       Opt_noinline_dentry,
        Opt_flush_merge,
        Opt_noflush_merge,
        Opt_nobarrier,
@@ -118,6 +123,7 @@ static match_table_t f2fs_tokens = {
        {Opt_inline_xattr, "inline_xattr"},
        {Opt_inline_data, "inline_data"},
        {Opt_inline_dentry, "inline_dentry"},
+       {Opt_noinline_dentry, "noinline_dentry"},
        {Opt_flush_merge, "flush_merge"},
        {Opt_noflush_merge, "noflush_merge"},
        {Opt_nobarrier, "nobarrier"},
@@ -167,7 +173,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
 #ifdef CONFIG_F2FS_FAULT_INJECTION
        else if (struct_type == FAULT_INFO_RATE ||
                                        struct_type == FAULT_INFO_TYPE)
-               return (unsigned char *)&f2fs_fault;
+               return (unsigned char *)&sbi->fault_info;
 #endif
        return NULL;
 }
@@ -312,6 +318,10 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(dirty_nats_ratio),
        ATTR_LIST(cp_interval),
        ATTR_LIST(idle_interval),
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       ATTR_LIST(inject_rate),
+       ATTR_LIST(inject_type),
+#endif
        ATTR_LIST(lifetime_write_kbytes),
        NULL,
 };
@@ -327,22 +337,6 @@ static struct kobj_type f2fs_ktype = {
        .release        = f2fs_sb_release,
 };
 
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-/* sysfs for f2fs fault injection */
-static struct kobject f2fs_fault_inject;
-
-static struct attribute *f2fs_fault_attrs[] = {
-       ATTR_LIST(inject_rate),
-       ATTR_LIST(inject_type),
-       NULL
-};
-
-static struct kobj_type f2fs_fault_ktype = {
-       .default_attrs  = f2fs_fault_attrs,
-       .sysfs_ops      = &f2fs_attr_ops,
-};
-#endif
-
 void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
 {
        struct va_format vaf;
@@ -370,10 +364,6 @@ static int parse_options(struct super_block *sb, char *options)
        char *p, *name;
        int arg = 0;
 
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-       f2fs_build_fault_attr(0);
-#endif
-
        if (!options)
                return 0;
 
@@ -488,6 +478,9 @@ static int parse_options(struct super_block *sb, char *options)
                case Opt_inline_dentry:
                        set_opt(sbi, INLINE_DENTRY);
                        break;
+               case Opt_noinline_dentry:
+                       clear_opt(sbi, INLINE_DENTRY);
+                       break;
                case Opt_flush_merge:
                        set_opt(sbi, FLUSH_MERGE);
                        break;
@@ -533,7 +526,7 @@ static int parse_options(struct super_block *sb, char *options)
                        if (args->from && match_int(args, &arg))
                                return -EINVAL;
 #ifdef CONFIG_F2FS_FAULT_INJECTION
-                       f2fs_build_fault_attr(arg);
+                       f2fs_build_fault_attr(sbi, arg);
 #else
                        f2fs_msg(sb, KERN_INFO,
                                "FAULT_INJECTION was not selected");
@@ -730,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb)
         * clean checkpoint again.
         */
        if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
-                       !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
+                       !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
                struct cp_control cpc = {
                        .reason = CP_UMOUNT,
                };
@@ -878,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
                seq_puts(seq, ",noinline_data");
        if (test_opt(sbi, INLINE_DENTRY))
                seq_puts(seq, ",inline_dentry");
+       else
+               seq_puts(seq, ",noinline_dentry");
        if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
                seq_puts(seq, ",flush_merge");
        if (test_opt(sbi, NOBARRIER))
@@ -946,7 +941,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
                seq_printf(seq, "%d|%-3u|", se->type,
                                        get_valid_blocks(sbi, i, 1));
                for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
-                       seq_printf(seq, "%x ", se->cur_valid_map[j]);
+                       seq_printf(seq, " %.2x", se->cur_valid_map[j]);
                seq_putc(seq, '\n');
        }
        return 0;
@@ -975,6 +970,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 
        set_opt(sbi, BG_GC);
        set_opt(sbi, INLINE_DATA);
+       set_opt(sbi, INLINE_DENTRY);
        set_opt(sbi, EXTENT_CACHE);
        sbi->sb->s_flags |= MS_LAZYTIME;
        set_opt(sbi, FLUSH_MERGE);
@@ -991,6 +987,10 @@ static void default_options(struct f2fs_sb_info *sbi)
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
        set_opt(sbi, POSIX_ACL);
 #endif
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       f2fs_build_fault_attr(sbi, 0);
+#endif
 }
 
 static int f2fs_remount(struct super_block *sb, int *flags, char *data)
@@ -1001,6 +1001,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        bool need_restart_gc = false;
        bool need_stop_gc = false;
        bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       struct f2fs_fault_info ffi = sbi->fault_info;
+#endif
 
        /*
         * Save the old mount options in case we
@@ -1096,6 +1099,9 @@ restore_gc:
 restore_opts:
        sbi->mount_opt = org_mount_opt;
        sbi->active_logs = active_logs;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       sbi->fault_info = ffi;
+#endif
        return err;
 }
 
@@ -1469,6 +1475,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
        mutex_init(&sbi->umount_mutex);
        mutex_init(&sbi->wio_mutex[NODE]);
        mutex_init(&sbi->wio_mutex[DATA]);
+       spin_lock_init(&sbi->cp_lock);
 
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
        memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
@@ -1810,7 +1817,7 @@ try_onemore:
                 * previous checkpoint was not done by clean system shutdown.
                 */
                if (bdev_read_only(sb->s_bdev) &&
-                               !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
+                               !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
                        err = -EROFS;
                        goto free_kobj;
                }
@@ -1818,6 +1825,9 @@ try_onemore:
                if (need_fsck)
                        set_sbi_flag(sbi, SBI_NEED_FSCK);
 
+               if (!retry)
+                       goto skip_recovery;
+
                err = recover_fsync_data(sbi, false);
                if (err < 0) {
                        need_fsck = true;
@@ -1835,7 +1845,7 @@ try_onemore:
                        goto free_kobj;
                }
        }
-
+skip_recovery:
        /* recover_fsync_data() cleared this already */
        clear_sbi_flag(sbi, SBI_POR_DOING);
 
@@ -1879,7 +1889,9 @@ free_root_inode:
        dput(sb->s_root);
        sb->s_root = NULL;
 free_node_inode:
+       truncate_inode_pages_final(NODE_MAPPING(sbi));
        mutex_lock(&sbi->umount_mutex);
+       release_ino_entry(sbi, true);
        f2fs_leave_shrinker(sbi);
        iput(sbi->node_inode);
        mutex_unlock(&sbi->umount_mutex);
@@ -1978,16 +1990,6 @@ static int __init init_f2fs_fs(void)
                err = -ENOMEM;
                goto free_extent_cache;
        }
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-       f2fs_fault_inject.kset = f2fs_kset;
-       f2fs_build_fault_attr(0);
-       err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype,
-                               NULL, "fault_injection");
-       if (err) {
-               f2fs_fault_inject.kset = NULL;
-               goto free_kset;
-       }
-#endif
        err = register_shrinker(&f2fs_shrinker_info);
        if (err)
                goto free_kset;
@@ -2006,10 +2008,6 @@ free_filesystem:
 free_shrinker:
        unregister_shrinker(&f2fs_shrinker_info);
 free_kset:
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-       if (f2fs_fault_inject.kset)
-               kobject_put(&f2fs_fault_inject);
-#endif
        kset_unregister(f2fs_kset);
 free_extent_cache:
        destroy_extent_cache();
@@ -2031,9 +2029,6 @@ static void __exit exit_f2fs_fs(void)
        f2fs_destroy_root_stats();
        unregister_filesystem(&f2fs_fs_type);
        unregister_shrinker(&f2fs_shrinker_info);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-       kobject_put(&f2fs_fault_inject);
-#endif
        kset_unregister(f2fs_kset);
        destroy_extent_cache();
        destroy_checkpoint_caches();
index c8898b5..1f74876 100644 (file)
@@ -217,18 +217,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
        return entry;
 }
 
-static void *read_all_xattrs(struct inode *inode, struct page *ipage)
+static int read_all_xattrs(struct inode *inode, struct page *ipage,
+                                                       void **base_addr)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_xattr_header *header;
        size_t size = PAGE_SIZE, inline_size = 0;
        void *txattr_addr;
+       int err;
 
        inline_size = inline_xattr_size(inode);
 
        txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
        if (!txattr_addr)
-               return NULL;
+               return -ENOMEM;
 
        /* read from inline xattr */
        if (inline_size) {
@@ -239,8 +241,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
                        inline_addr = inline_xattr_addr(ipage);
                } else {
                        page = get_node_page(sbi, inode->i_ino);
-                       if (IS_ERR(page))
+                       if (IS_ERR(page)) {
+                               err = PTR_ERR(page);
                                goto fail;
+                       }
                        inline_addr = inline_xattr_addr(page);
                }
                memcpy(txattr_addr, inline_addr, inline_size);
@@ -254,8 +258,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
 
                /* The inode already has an extended attribute block. */
                xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
-               if (IS_ERR(xpage))
+               if (IS_ERR(xpage)) {
+                       err = PTR_ERR(xpage);
                        goto fail;
+               }
 
                xattr_addr = page_address(xpage);
                memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
@@ -269,10 +275,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
                header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
                header->h_refcount = cpu_to_le32(1);
        }
-       return txattr_addr;
+       *base_addr = txattr_addr;
+       return 0;
 fail:
        kzfree(txattr_addr);
-       return NULL;
+       return err;
 }
 
 static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
@@ -366,9 +373,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
        if (len > F2FS_NAME_LEN)
                return -ERANGE;
 
-       base_addr = read_all_xattrs(inode, ipage);
-       if (!base_addr)
-               return -ENOMEM;
+       error = read_all_xattrs(inode, ipage, &base_addr);
+       if (error)
+               return error;
 
        entry = __find_xattr(base_addr, index, len, name);
        if (IS_XATTR_LAST_ENTRY(entry)) {
@@ -402,9 +409,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
        int error = 0;
        size_t rest = buffer_size;
 
-       base_addr = read_all_xattrs(inode, NULL);
-       if (!base_addr)
-               return -ENOMEM;
+       error = read_all_xattrs(inode, NULL, &base_addr);
+       if (error)
+               return error;
 
        list_for_each_xattr(entry, base_addr) {
                const struct xattr_handler *handler =
@@ -463,9 +470,9 @@ static int __f2fs_setxattr(struct inode *inode, int index,
        if (size > MAX_VALUE_LEN(inode))
                return -E2BIG;
 
-       base_addr = read_all_xattrs(inode, ipage);
-       if (!base_addr)
-               return -ENOMEM;
+       error = read_all_xattrs(inode, ipage, &base_addr);
+       if (error)
+               return error;
 
        /* find entry with wanted name. */
        here = __find_xattr(base_addr, index, len, name);
@@ -548,6 +555,8 @@ static int __f2fs_setxattr(struct inode *inode, int index,
                        !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
                f2fs_set_encrypted_inode(inode);
        f2fs_mark_inode_dirty_sync(inode);
+       if (!error && S_ISDIR(inode->i_mode))
+               set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
 exit:
        kzfree(base_addr);
        return error;
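
The read_all_xattrs() change above swaps a NULL-on-failure return for the usual 0/-errno plus out-parameter convention, so callers can propagate the real cause (ENOMEM vs. an error from get_node_page()) instead of always reporting -ENOMEM. A standalone, purely illustrative sketch of that calling convention (the helper name is invented):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Returns 0 on success or a negative errno, and fills *out on success,
     * instead of returning NULL for every kind of failure. */
    static int read_blob(const char *src, void **out)
    {
            char *buf;

            if (!src)
                    return -EIO;            /* distinguishable from -ENOMEM now */

            buf = malloc(strlen(src) + 1);
            if (!buf)
                    return -ENOMEM;

            strcpy(buf, src);
            *out = buf;
            return 0;
    }

    int main(void)
    {
            void *blob;
            int err = read_blob("xattr payload", &blob);

            if (err) {
                    fprintf(stderr, "read_blob failed: %d\n", err);
                    return 1;
            }
            puts(blob);
            free(blob);
            return 0;
    }
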
index ba07376..8591786 100644 (file)
@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
                    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+               void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+               unsigned flags, struct iomap_ops *ops, void *data,
+               iomap_actor_t actor);
index 706270f..013d1d3 100644 (file)
@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-               void *data, struct iomap *iomap);
-
 /*
  * Execute an iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
                struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
@@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct page *page;
+
+       page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       return page;
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       long status = 0;
+       ssize_t written = 0;
+
+       do {
+               struct page *page, *rpage;
+               unsigned long offset;   /* Offset into pagecache page */
+               unsigned long bytes;    /* Bytes to write to page */
+
+               offset = (pos & (PAGE_SIZE - 1));
+               bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+
+               rpage = __iomap_read_page(inode, pos);
+               if (IS_ERR(rpage))
+                       return PTR_ERR(rpage);
+
+               status = iomap_write_begin(inode, pos, bytes,
+                               AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
+                               &page, iomap);
+               put_page(rpage);
+               if (unlikely(status))
+                       return status;
+
+               WARN_ON_ONCE(!PageUptodate(page));
+
+               status = iomap_write_end(inode, pos, bytes, bytes, page);
+               if (unlikely(status <= 0)) {
+                       if (WARN_ON_ONCE(status == 0))
+                               return -EIO;
+                       return status;
+               }
+
+               cond_resched();
+
+               pos += status;
+               written += status;
+               length -= status;
+
+               balance_dirty_pages_ratelimited(inode->i_mapping);
+       } while (length);
+
+       return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+               struct iomap_ops *ops)
+{
+       loff_t ret;
+
+       while (len) {
+               ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+                               iomap_dirty_actor);
+               if (ret <= 0)
+                       return ret;
+               pos += ret;
+               len -= ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_file_dirty);
+
 static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
                unsigned bytes, struct iomap *iomap)
 {
@@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 
        if (iomap->flags & IOMAP_F_MERGED)
                flags |= FIEMAP_EXTENT_MERGED;
+       if (iomap->flags & IOMAP_F_SHARED)
+               flags |= FIEMAP_EXTENT_SHARED;
 
        return fiemap_fill_next_extent(fi, iomap->offset,
                        iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
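
The new iomap_file_dirty() above is a thin driver: it calls iomap_apply() in a loop, treats the return value as the number of bytes processed, advances pos and shrinks len accordingly, and stops on an error or on zero progress. The same short-count loop is easiest to see against plain write(2); the sketch below is userspace illustration only, not the iomap API:

    #include <errno.h>
    #include <stddef.h>
    #include <string.h>
    #include <unistd.h>

    /* Write all of buf[0..len): retry short writes, return 0 or a negative errno. */
    static int write_all(int fd, const char *buf, size_t len)
    {
            while (len) {
                    ssize_t ret = write(fd, buf, len);

                    if (ret < 0) {
                            if (errno == EINTR)
                                    continue;       /* transient, try again */
                            return -errno;          /* hard error, stop */
                    }
                    if (ret == 0)
                            return -EIO;            /* no progress counts as failure */
                    buf += ret;                     /* advance the position */
                    len -= ret;                     /* shrink the remaining length */
            }
            return 0;
    }

    int main(void)
    {
            static const char msg[] = "short-count write loop demo\n";

            return write_all(STDOUT_FILENO, msg, strlen(msg)) ? 1 : 0;
    }
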
index 14db05d..d2e25d7 100644 (file)
@@ -10,9 +10,12 @@ struct mnt_namespace {
        struct mount *  root;
        struct list_head        list;
        struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
        u64                     seq;    /* Sequence number to prevent loops */
        wait_queue_head_t poll;
        u64 event;
+       unsigned int            mounts; /* # of mounts in the namespace */
+       unsigned int            pending_mounts;
 };
 
 struct mnt_pcp {
index 7bb2cda..db1b5a3 100644 (file)
@@ -27,6 +27,9 @@
 #include "pnode.h"
 #include "internal.h"
 
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
 static unsigned int m_hash_mask __read_mostly;
 static unsigned int m_hash_shift __read_mostly;
 static unsigned int mp_hash_mask __read_mostly;
@@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
        list_splice(&head, n->list.prev);
 
+       n->mounts += n->pending_mounts;
+       n->pending_mounts = 0;
+
        attach_shadowed(mnt, parent, shadows);
        touch_mnt_namespace(n);
 }
@@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                propagate_umount(&tmp_list);
 
        while (!list_empty(&tmp_list)) {
+               struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
-               __touch_mnt_namespace(p->mnt_ns);
+               ns = p->mnt_ns;
+               if (ns) {
+                       ns->mounts--;
+                       __touch_mnt_namespace(ns);
+               }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
        return 0;
 }
 
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+       unsigned int max = READ_ONCE(sysctl_mount_max);
+       unsigned int mounts = 0, old, pending, sum;
+       struct mount *p;
+
+       for (p = mnt; p; p = next_mnt(p, mnt))
+               mounts++;
+
+       old = ns->mounts;
+       pending = ns->pending_mounts;
+       sum = old + pending;
+       if ((old > sum) ||
+           (pending > sum) ||
+           (max < sum) ||
+           (mounts > (max - sum)))
+               return -ENOSPC;
+
+       ns->pending_mounts = pending + mounts;
+       return 0;
+}
+
 /*
  *  @source_mnt : mount tree to be attached
  *  @nd         : place the mount tree @source_mnt is attached
@@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                        struct path *parent_path)
 {
        HLIST_HEAD(tree_list);
+       struct mnt_namespace *ns = dest_mnt->mnt_ns;
        struct mount *child, *p;
        struct hlist_node *n;
        int err;
 
+       /* Is there space to add these mounts to the mount namespace? */
+       if (!parent_path) {
+               err = count_mounts(ns, source_mnt);
+               if (err)
+                       goto out;
+       }
+
        if (IS_MNT_SHARED(dest_mnt)) {
                err = invent_group_ids(source_mnt, true);
                if (err)
@@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
  out_cleanup_ids:
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+               child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
  out:
+       ns->pending_mounts = 0;
        return err;
 }
 
@@ -2719,9 +2762,20 @@ dput_out:
        return retval;
 }
 
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
        ns_free_inum(&ns->ns);
+       dec_mnt_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        kfree(ns);
 }
@@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
        struct mnt_namespace *new_ns;
+       struct ucounts *ucounts;
        int ret;
 
+       ucounts = inc_mnt_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
        new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-       if (!new_ns)
+       if (!new_ns) {
+               dec_mnt_namespaces(ucounts);
                return ERR_PTR(-ENOMEM);
+       }
        ret = ns_alloc_inum(&new_ns->ns);
        if (ret) {
                kfree(new_ns);
+               dec_mnt_namespaces(ucounts);
                return ERR_PTR(ret);
        }
        new_ns->ns.ops = &mntns_operations;
@@ -2756,6 +2818,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
        init_waitqueue_head(&new_ns->poll);
        new_ns->event = 0;
        new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
+       new_ns->mounts = 0;
+       new_ns->pending_mounts = 0;
        return new_ns;
 }
 
@@ -2805,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
        q = new;
        while (p) {
                q->mnt_ns = new_ns;
+               new_ns->mounts++;
                if (new_fs) {
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
@@ -2843,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
                struct mount *mnt = real_mount(m);
                mnt->mnt_ns = new_ns;
                new_ns->root = mnt;
+               new_ns->mounts++;
                list_add(&mnt->mnt_list, &new_ns->list);
        } else {
                mntput(m);
@@ -3348,10 +3415,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
        return 0;
 }
 
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+       return to_mnt_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations mntns_operations = {
        .name           = "mnt",
        .type           = CLONE_NEWNS,
        .get            = mntns_get,
        .put            = mntns_put,
        .install        = mntns_install,
+       .owner          = mntns_owner,
 };
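
count_mounts() above is written to be safe against unsigned overflow: it first checks that old + pending did not wrap (either addend comparing greater than the sum proves it did), and then compares the incoming mounts against the remaining headroom max - sum rather than adding a third time. A self-contained version of just that check, for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    static bool would_exceed(unsigned int old, unsigned int pending,
                             unsigned int mounts, unsigned int max)
    {
            unsigned int sum = old + pending;

            if (old > sum || pending > sum) /* old + pending wrapped around */
                    return true;
            if (max < sum)                  /* already over the limit */
                    return true;
            return mounts > max - sum;      /* not enough headroom left */
    }

    int main(void)
    {
            printf("%d\n", would_exceed(99990, 5, 10, 100000));               /* 1 */
            printf("%d\n", would_exceed(99990, 5, 5, 100000));                /* 0 */
            printf("%d\n", would_exceed(4000000000u, 400000000u, 1, 100000)); /* 1 */
            return 0;
    }
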
index 8f20d60..30bb100 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -5,11 +5,16 @@
 #include <linux/magic.h>
 #include <linux/ktime.h>
 #include <linux/seq_file.h>
+#include <linux/user_namespace.h>
+#include <linux/nsfs.h>
 
 static struct vfsmount *nsfs_mnt;
 
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+                       unsigned long arg);
 static const struct file_operations ns_file_operations = {
        .llseek         = no_llseek,
+       .unlocked_ioctl = ns_ioctl,
 };
 
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
        ns->ops->put(ns);
 }
 
-void *ns_get_path(struct path *path, struct task_struct *task,
-                       const struct proc_ns_operations *ns_ops)
+static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
-       struct vfsmount *mnt = mntget(nsfs_mnt);
+       struct vfsmount *mnt = nsfs_mnt;
        struct qstr qname = { .name = "", };
        struct dentry *dentry;
        struct inode *inode;
-       struct ns_common *ns;
        unsigned long d;
 
-again:
-       ns = ns_ops->get(task);
-       if (!ns) {
-               mntput(mnt);
-               return ERR_PTR(-ENOENT);
-       }
        rcu_read_lock();
        d = atomic_long_read(&ns->stashed);
        if (!d)
@@ -68,17 +65,16 @@ again:
        if (!lockref_get_not_dead(&dentry->d_lockref))
                goto slow;
        rcu_read_unlock();
-       ns_ops->put(ns);
+       ns->ops->put(ns);
 got_it:
-       path->mnt = mnt;
+       path->mnt = mntget(mnt);
        path->dentry = dentry;
        return NULL;
 slow:
        rcu_read_unlock();
        inode = new_inode_pseudo(mnt->mnt_sb);
        if (!inode) {
-               ns_ops->put(ns);
-               mntput(mnt);
+               ns->ops->put(ns);
                return ERR_PTR(-ENOMEM);
        }
        inode->i_ino = ns->inum;
@@ -91,21 +87,96 @@ slow:
        dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
        if (!dentry) {
                iput(inode);
-               mntput(mnt);
                return ERR_PTR(-ENOMEM);
        }
        d_instantiate(dentry, inode);
-       dentry->d_fsdata = (void *)ns_ops;
+       dentry->d_fsdata = (void *)ns->ops;
        d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
        if (d) {
                d_delete(dentry);       /* make sure ->d_prune() does nothing */
                dput(dentry);
                cpu_relax();
-               goto again;
+               return ERR_PTR(-EAGAIN);
        }
        goto got_it;
 }
 
+void *ns_get_path(struct path *path, struct task_struct *task,
+                       const struct proc_ns_operations *ns_ops)
+{
+       struct ns_common *ns;
+       void *ret;
+
+again:
+       ns = ns_ops->get(task);
+       if (!ns)
+               return ERR_PTR(-ENOENT);
+
+       ret = __ns_get_path(path, ns);
+       if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
+               goto again;
+       return ret;
+}
+
+static int open_related_ns(struct ns_common *ns,
+                  struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+       struct path path = {};
+       struct file *f;
+       void *err;
+       int fd;
+
+       fd = get_unused_fd_flags(O_CLOEXEC);
+       if (fd < 0)
+               return fd;
+
+       while (1) {
+               struct ns_common *relative;
+
+               relative = get_ns(ns);
+               if (IS_ERR(relative)) {
+                       put_unused_fd(fd);
+                       return PTR_ERR(relative);
+               }
+
+               err = __ns_get_path(&path, relative);
+               if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
+                       continue;
+               break;
+       }
+       if (IS_ERR(err)) {
+               put_unused_fd(fd);
+               return PTR_ERR(err);
+       }
+
+       f = dentry_open(&path, O_RDONLY, current_cred());
+       path_put(&path);
+       if (IS_ERR(f)) {
+               put_unused_fd(fd);
+               fd = PTR_ERR(f);
+       } else
+               fd_install(fd, f);
+
+       return fd;
+}
+
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+                       unsigned long arg)
+{
+       struct ns_common *ns = get_proc_ns(file_inode(filp));
+
+       switch (ioctl) {
+       case NS_GET_USERNS:
+               return open_related_ns(ns, ns_get_owner);
+       case NS_GET_PARENT:
+               if (!ns->ops->get_parent)
+                       return -EINVAL;
+               return open_related_ns(ns, ns->ops->get_parent);
+       default:
+               return -ENOTTY;
+       }
+}
+
 int ns_get_name(char *buf, size_t size, struct task_struct *task,
                        const struct proc_ns_operations *ns_ops)
 {
index 00235bf..1e8fe84 100644 (file)
@@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
                }
        }
 
-       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
        ret = 1;
 out_release_op:
        op_release(new_op);
index a287a66..516ffb4 100644 (file)
 #include "orangefs-kernel.h"
 #include "orangefs-dev-proto.h"
 #include "orangefs-bufmap.h"
+#include "orangefs-debugfs.h"
 
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 
 /* this file implements the /dev/pvfs2-req device node */
 
+uint32_t orangefs_userspace_version;
+
 static int open_access_count;
 
+static DEFINE_MUTEX(devreq_mutex);
+
 #define DUMP_DEVICE_ERROR()                                                   \
 do {                                                                          \
        gossip_err("*****************************************************\n");\
@@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
 {
        int index = hash_func(op->tag, hash_table_size);
 
-       list_add_tail(&op->list, &htable_ops_in_progress[index]);
+       list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]);
 }
 
 /*
@@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
 
        index = hash_func(tag, hash_table_size);
 
-       spin_lock(&htable_ops_in_progress_lock);
+       spin_lock(&orangefs_htable_ops_in_progress_lock);
        list_for_each_entry_safe(op,
                                 next,
-                                &htable_ops_in_progress[index],
+                                &orangefs_htable_ops_in_progress[index],
                                 list) {
                if (op->tag == tag && !op_state_purged(op) &&
                    !op_state_given_up(op)) {
                        list_del_init(&op->list);
-                       spin_unlock(&htable_ops_in_progress_lock);
+                       spin_unlock(&orangefs_htable_ops_in_progress_lock);
                        return op;
                }
        }
 
-       spin_unlock(&htable_ops_in_progress_lock);
+       spin_unlock(&orangefs_htable_ops_in_progress_lock);
        return NULL;
 }
 
@@ -276,11 +281,11 @@ restart:
        if (ret != 0)
                goto error;
 
-       spin_lock(&htable_ops_in_progress_lock);
+       spin_lock(&orangefs_htable_ops_in_progress_lock);
        spin_lock(&cur_op->lock);
        if (unlikely(op_state_given_up(cur_op))) {
                spin_unlock(&cur_op->lock);
-               spin_unlock(&htable_ops_in_progress_lock);
+               spin_unlock(&orangefs_htable_ops_in_progress_lock);
                complete(&cur_op->waitq);
                goto restart;
        }
@@ -298,7 +303,7 @@ restart:
                     current->comm);
        orangefs_devreq_add_op(cur_op);
        spin_unlock(&cur_op->lock);
-       spin_unlock(&htable_ops_in_progress_lock);
+       spin_unlock(&orangefs_htable_ops_in_progress_lock);
 
        /* The client only asks to read one size buffer. */
        return MAX_DEV_REQ_UPSIZE;
@@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
                return -EPROTO;
        }
 
+       if (!orangefs_userspace_version) {
+               orangefs_userspace_version = head.version;
+       } else if (orangefs_userspace_version != head.version) {
+               gossip_err("Error: userspace version changes\n");
+               return -EPROTO;
+       }
+
        /* remove the op from the in progress hash table */
        op = orangefs_devreq_remove_op(head.tag);
        if (!op) {
@@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file)
        gossip_debug(GOSSIP_DEV_DEBUG,
                     "pvfs2-client-core: device close complete\n");
        open_access_count = 0;
+       orangefs_userspace_version = 0;
        mutex_unlock(&devreq_mutex);
        return 0;
 }
@@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
        static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
        struct ORANGEFS_dev_map_desc user_desc;
        int ret = 0;
-       struct dev_mask_info_s mask_info = { 0 };
-       struct dev_mask2_info_s mask2_info = { 0, 0 };
        int upstream_kmod = 1;
        struct orangefs_sb_info_s *orangefs_sb;
 
@@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
                 * all of the remounts are serviced (to avoid ops between
                 * mounts to fail)
                 */
-               ret = mutex_lock_interruptible(&request_mutex);
+               ret = mutex_lock_interruptible(&orangefs_request_mutex);
                if (ret < 0)
                        return ret;
                gossip_debug(GOSSIP_DEV_DEBUG,
@@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
                gossip_debug(GOSSIP_DEV_DEBUG,
                             "%s: priority remount complete\n",
                             __func__);
-               mutex_unlock(&request_mutex);
+               mutex_unlock(&orangefs_request_mutex);
                return ret;
 
        case ORANGEFS_DEV_UPSTREAM:
@@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
                        return ret;
 
        case ORANGEFS_DEV_CLIENT_MASK:
-               ret = copy_from_user(&mask2_info,
-                                    (void __user *)arg,
-                                    sizeof(struct dev_mask2_info_s));
-
-               if (ret != 0)
-                       return -EIO;
-
-               client_debug_mask.mask1 = mask2_info.mask1_value;
-               client_debug_mask.mask2 = mask2_info.mask2_value;
-
-               pr_info("%s: client debug mask has been been received "
-                       ":%llx: :%llx:\n",
-                       __func__,
-                       (unsigned long long)client_debug_mask.mask1,
-                       (unsigned long long)client_debug_mask.mask2);
-
-               return ret;
-
+               return orangefs_debugfs_new_client_mask((void __user *)arg);
        case ORANGEFS_DEV_CLIENT_STRING:
-               ret = copy_from_user(&client_debug_array_string,
-                                    (void __user *)arg,
-                                    ORANGEFS_MAX_DEBUG_STRING_LEN);
-               /*
-                * The real client-core makes an effort to ensure
-                * that actual strings that aren't too long to fit in
-                * this buffer is what we get here. We're going to use
-                * string functions on the stuff we got, so we'll make
-                * this extra effort to try and keep from
-                * flowing out of this buffer when we use the string
-                * functions, even if somehow the stuff we end up
-                * with here is garbage.
-                */
-               client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
-                       '\0';
-               
-               if (ret != 0) {
-                       pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
-                               __func__);
-                       return -EIO;
-               }
-
-               pr_info("%s: client debug array string has been received.\n",
-                       __func__);
-
-               if (!help_string_initialized) {
-
-                       /* Free the "we don't know yet" default string... */
-                       kfree(debug_help_string);
-
-                       /* build a proper debug help string */
-                       if (orangefs_prepare_debugfs_help_string(0)) {
-                               gossip_err("%s: no debug help string \n",
-                                          __func__);
-                               return -EIO;
-                       }
-
-                       /* Replace the boilerplate boot-time debug-help file. */
-                       debugfs_remove(help_file_dentry);
-
-                       help_file_dentry =
-                               debugfs_create_file(
-                                       ORANGEFS_KMOD_DEBUG_HELP_FILE,
-                                       0444,
-                                       debug_dir,
-                                       debug_help_string,
-                                       &debug_help_fops);
-
-                       if (!help_file_dentry) {
-                               gossip_err("%s: debugfs_create_file failed for"
-                                          " :%s:!\n",
-                                          __func__,
-                                          ORANGEFS_KMOD_DEBUG_HELP_FILE);
-                               return -EIO;
-                       }
-               }
-
-               debug_mask_to_string(&client_debug_mask, 1);
-
-               debugfs_remove(client_debug_dentry);
-
-               orangefs_client_debug_init();
-
-               help_string_initialized++;
-
-               return ret;
-
+               return orangefs_debugfs_new_client_string((void __user *)arg);
        case ORANGEFS_DEV_DEBUG:
-               ret = copy_from_user(&mask_info,
-                                    (void __user *)arg,
-                                    sizeof(mask_info));
-
-               if (ret != 0)
-                       return -EIO;
-
-               if (mask_info.mask_type == KERNEL_MASK) {
-                       if ((mask_info.mask_value == 0)
-                           && (kernel_mask_set_mod_init)) {
-                               /*
-                                * the kernel debug mask was set when the
-                                * kernel module was loaded; don't override
-                                * it if the client-core was started without
-                                * a value for ORANGEFS_KMODMASK.
-                                */
-                               return 0;
-                       }
-                       debug_mask_to_string(&mask_info.mask_value,
-                                            mask_info.mask_type);
-                       gossip_debug_mask = mask_info.mask_value;
-                       pr_info("%s: kernel debug mask has been modified to "
-                               ":%s: :%llx:\n",
-                               __func__,
-                               kernel_debug_string,
-                               (unsigned long long)gossip_debug_mask);
-               } else if (mask_info.mask_type == CLIENT_MASK) {
-                       debug_mask_to_string(&mask_info.mask_value,
-                                            mask_info.mask_type);
-                       pr_info("%s: client debug mask has been modified to"
-                               ":%s: :%llx:\n",
-                               __func__,
-                               client_debug_string,
-                               llu(mask_info.mask_value));
-               } else {
-                       gossip_lerr("Invalid mask type....\n");
-                       return -EINVAL;
-               }
-
-               return ret;
-
+               return orangefs_debugfs_new_debug((void __user *)arg);
        default:
                return -ENOIOCTLCMD;
        }
index 66b9921..3b8923f 100644 (file)
@@ -83,7 +83,10 @@ struct orangefs_listxattr_response {
 };
 
 struct orangefs_param_response {
-       __s64 value;
+       union {
+               __s64 value64;
+               __s32 value32[2];
+       } u;
 };
 
 #define PERF_COUNT_BUF_SIZE 4096
@@ -98,6 +101,11 @@ struct orangefs_fs_key_response {
        char fs_key[FS_KEY_BUF_SIZE];
 };
 
+/* 2.9.6 */
+struct orangefs_features_response {
+       __u64 features;
+};
+
 struct orangefs_downcall_s {
        __s32 type;
        __s32 status;
@@ -119,6 +127,7 @@ struct orangefs_downcall_s {
                struct orangefs_param_response param;
                struct orangefs_perf_count_response perf_count;
                struct orangefs_fs_key_response fs_key;
+               struct orangefs_features_response features;
        } resp;
 };
 
index 526040e..3386886 100644 (file)
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 
+static int flush_racache(struct inode *inode)
+{
+       struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+       struct orangefs_kernel_op_s *new_op;
+       int ret;
+
+       gossip_debug(GOSSIP_UTILS_DEBUG,
+           "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
+           get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
+           orangefs_inode->refn.fs_id);
+
+       new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
+       if (!new_op)
+               return -ENOMEM;
+       new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
+
+       ret = service_operation(new_op, "orangefs_flush_racache",
+           get_interruptible_flag(inode));
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
+           __func__, ret);
+
+       op_release(new_op);
+       return ret;
+}
+
 /*
  * Copy to client-core's address space from the buffers specified
  * by the iovec upto total_size bytes.
@@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode,
        size_t bufmap_size;
        ssize_t ret = -EINVAL;
 
-       g_orangefs_stats.reads++;
+       orangefs_stats.reads++;
 
        bufmap_size = orangefs_bufmap_size_query();
        if (count > bufmap_size) {
@@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
 
        gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
 
-       g_orangefs_stats.reads++;
+       orangefs_stats.reads++;
 
        rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
        iocb->ki_pos = pos;
@@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
        }
 
        iocb->ki_pos = pos;
-       g_orangefs_stats.writes++;
+       orangefs_stats.writes++;
 
 out:
 
@@ -591,15 +617,24 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
        orangefs_flush_inode(inode);
 
        /*
-        * remove all associated inode pages from the page cache and mmap
+        * remove all associated inode pages from the page cache and
         * readahead cache (if any); this forces an expensive refresh of
         * data for the next caller of mmap (or 'get_block' accesses)
         */
        if (file->f_path.dentry->d_inode &&
            file->f_path.dentry->d_inode->i_mapping &&
-           mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
+           mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) {
+               if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
+                       gossip_debug(GOSSIP_INODE_DEBUG,
+                           "calling flush_racache on %pU\n",
+                           get_khandle_from_ino(inode));
+                       flush_racache(inode);
+                       gossip_debug(GOSSIP_INODE_DEBUG,
+                           "flush_racache finished\n");
+               }
                truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
                                     0);
+       }
        return 0;
 }
 
index 62c5259..35269e3 100644 (file)
@@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir,
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
-       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
        ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
@@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
        }
 
-       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
 
        inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
        if (IS_ERR(inode)) {
@@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir,
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
-       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
        ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
@@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
-       dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+       dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
        ORANGEFS_I(inode)->getattr_time = jiffies - 1;
 
        gossip_debug(GOSSIP_NAME_DEBUG,
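Each hunk above stamps d_time with the renamed orangefs_dcache_timeout_msecs, converting milliseconds to jiffies by hand (msecs * HZ / 1000, integer division). A stand-alone sketch of that arithmetic with the 50 ms default; the HZ value is assumed for the example:

#include <stdio.h>

#define HZ 250                                  /* assumed tick rate */
static int orangefs_dcache_timeout_msecs = 50;  /* default from this series */

int main(void)
{
        unsigned long jiffies = 100000;         /* pretend current tick count */
        unsigned long d_time;

        /* 50 * 250 / 1000 = 12 jiffies, i.e. the entry stays valid ~48 ms. */
        d_time = jiffies + orangefs_dcache_timeout_msecs * HZ / 1000;
        printf("d_time = %lu (%lu ticks from now)\n", d_time, d_time - jiffies);
        return 0;
}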
index b6edbe9..aa3830b 100644 (file)
@@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
                        return "OP_STATFS";
                else if (type == ORANGEFS_VFS_OP_TRUNCATE)
                        return "OP_TRUNCATE";
-               else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
-                       return "OP_MMAP_RA_FLUSH";
+               else if (type == ORANGEFS_VFS_OP_RA_FLUSH)
+                       return "OP_RA_FLUSH";
                else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
                        return "OP_FS_MOUNT";
                else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
@@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
                        return "OP_FSYNC";
                else if (type == ORANGEFS_VFS_OP_FSKEY)
                        return "OP_FSKEY";
+               else if (type == ORANGEFS_VFS_OP_FEATURES)
+                       return "OP_FEATURES";
        }
        return "OP_UNKNOWN?";
 }
index 1714a73..9b24107 100644 (file)
 #include "protocol.h"
 #include "orangefs-kernel.h"
 
-static int orangefs_debug_disabled = 1;
-
-static int orangefs_debug_help_open(struct inode *, struct file *);
+#define DEBUG_HELP_STRING_SIZE 4096
+#define HELP_STRING_UNINITIALIZED \
+       "Client Debug Keywords are unknown until the first time\n" \
+       "the client is started after boot.\n"
+#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
+#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
+#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
+#define ORANGEFS_VERBOSE "verbose"
+#define ORANGEFS_ALL "all"
 
-const struct file_operations debug_help_fops = {
-       .open           = orangefs_debug_help_open,
-       .read           = seq_read,
-       .release        = seq_release,
-       .llseek         = seq_lseek,
+/*
+ * An array of client_debug_mask will be built to hold debug keyword/mask
+ * values fetched from userspace.
+ */
+struct client_debug_mask {
+       char *keyword;
+       __u64 mask1;
+       __u64 mask2;
 };
 
+static int orangefs_kernel_debug_init(void);
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
 static void *help_start(struct seq_file *, loff_t *);
 static void *help_next(struct seq_file *, void *, loff_t *);
 static void help_stop(struct seq_file *, void *);
 static int help_show(struct seq_file *, void *);
 
-static const struct seq_operations help_debug_ops = {
-       .start  = help_start,
-       .next   = help_next,
-       .stop   = help_stop,
-       .show   = help_show,
-};
-
-/*
- * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
- * ORANGEFS_KMOD_DEBUG_FILE.
- */
-static DEFINE_MUTEX(orangefs_debug_lock);
-
-int orangefs_debug_open(struct inode *, struct file *);
+static int orangefs_debug_open(struct inode *, struct file *);
 
 static ssize_t orangefs_debug_read(struct file *,
                                 char __user *,
@@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *,
                                  size_t,
                                  loff_t *);
 
+static int orangefs_prepare_cdm_array(char *);
+static void debug_mask_to_string(void *, int);
+static void do_k_string(void *, int);
+static void do_c_string(void *, int);
+static int keyword_is_amalgam(char *);
+static int check_amalgam_keyword(void *, int);
+static void debug_string_to_mask(char *, void *, int);
+static void do_c_mask(int, char *, struct client_debug_mask **);
+static void do_k_mask(int, char *, __u64 **);
+
+static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
+static char *debug_help_string;
+static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+
+static struct dentry *help_file_dentry;
+static struct dentry *client_debug_dentry;
+static struct dentry *debug_dir;
+
+static unsigned int kernel_mask_set_mod_init;
+static int orangefs_debug_disabled = 1;
+static int help_string_initialized;
+
+static const struct seq_operations help_debug_ops = {
+       .start  = help_start,
+       .next   = help_next,
+       .stop   = help_stop,
+       .show   = help_show,
+};
+
+const struct file_operations debug_help_fops = {
+       .open           = orangefs_debug_help_open,
+       .read           = seq_read,
+       .release        = seq_release,
+       .llseek         = seq_lseek,
+};
+
 static const struct file_operations kernel_debug_fops = {
        .open           = orangefs_debug_open,
        .read           = orangefs_debug_read,
@@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = {
        .llseek         = generic_file_llseek,
 };
 
+static int client_all_index;
+static int client_verbose_index;
+
+static struct client_debug_mask *cdm_array;
+static int cdm_element_count;
+
+static struct client_debug_mask client_debug_mask;
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_CLIENT_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
 /*
  * initialize kmod debug operations, create orangefs debugfs dir and
  * ORANGEFS_KMOD_DEBUG_HELP_FILE.
  */
-int orangefs_debugfs_init(void)
+int orangefs_debugfs_init(int debug_mask)
 {
-
        int rc = -ENOMEM;
 
+       /* convert input debug mask to a 64-bit unsigned integer */
+       orangefs_gossip_debug_mask = (unsigned long long)debug_mask;
+
+       /*
+        * set the kernel's gossip debug string; invalid mask values will
+        * be ignored.
+        */
+       debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
+
+       /* remove any invalid values from the mask */
+       debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask,
+           0);
+
+       /*
+        * if the mask has a non-zero value, then indicate that the mask
+        * was set when the kernel module was loaded.  The orangefs dev ioctl
+        * command will look at this boolean to determine if the kernel's
+        * debug mask should be overwritten when the client-core is started.
+        */
+       if (orangefs_gossip_debug_mask != 0)
+               kernel_mask_set_mod_init = true;
+
+       pr_info("%s: called with debug mask: :%s: :%llx:\n",
+               __func__,
+               kernel_debug_string,
+               (unsigned long long)orangefs_gossip_debug_mask);
+
        debug_dir = debugfs_create_dir("orangefs", NULL);
        if (!debug_dir) {
                pr_info("%s: debugfs_create_dir failed.\n", __func__);
@@ -117,13 +193,58 @@ int orangefs_debugfs_init(void)
        }
 
        orangefs_debug_disabled = 0;
+
+       rc = orangefs_kernel_debug_init();
+
+out:
+
+       return rc;
+}
+
+/*
+ * initialize the kernel-debug file.
+ */
+static int orangefs_kernel_debug_init(void)
+{
+       int rc = -ENOMEM;
+       struct dentry *ret;
+       char *k_buffer = NULL;
+
+       gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+       k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+       if (!k_buffer)
+               goto out;
+
+       if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+               strcpy(k_buffer, kernel_debug_string);
+               strcat(k_buffer, "\n");
+       } else {
+               strcpy(k_buffer, "none\n");
+               pr_info("%s: overflow 1!\n", __func__);
+       }
+
+       ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
+                                 0444,
+                                 debug_dir,
+                                 k_buffer,
+                                 &kernel_debug_fops);
+       if (!ret) {
+               pr_info("%s: failed to create %s.\n",
+                       __func__,
+                       ORANGEFS_KMOD_DEBUG_FILE);
+               goto out;
+       }
+
        rc = 0;
 
 out:
 
+       gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
        return rc;
 }
 
+
 void orangefs_debugfs_cleanup(void)
 {
        debugfs_remove_recursive(debug_dir);
@@ -195,49 +316,6 @@ static int help_show(struct seq_file *m, void *v)
        return 0;
 }
 
-/*
- * initialize the kernel-debug file.
- */
-int orangefs_kernel_debug_init(void)
-{
-       int rc = -ENOMEM;
-       struct dentry *ret;
-       char *k_buffer = NULL;
-
-       gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
-
-       k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
-       if (!k_buffer)
-               goto out;
-
-       if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
-               strcpy(k_buffer, kernel_debug_string);
-               strcat(k_buffer, "\n");
-       } else {
-               strcpy(k_buffer, "none\n");
-               pr_info("%s: overflow 1!\n", __func__);
-       }
-
-       ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
-                                 0444,
-                                 debug_dir,
-                                 k_buffer,
-                                 &kernel_debug_fops);
-       if (!ret) {
-               pr_info("%s: failed to create %s.\n",
-                       __func__,
-                       ORANGEFS_KMOD_DEBUG_FILE);
-               goto out;
-       }
-
-       rc = 0;
-
-out:
-
-       gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-       return rc;
-}
-
 /*
  * initialize the client-debug file.
  */
@@ -282,7 +360,7 @@ out:
 }
 
 /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
-int orangefs_debug_open(struct inode *inode, struct file *file)
+static int orangefs_debug_open(struct inode *inode, struct file *file)
 {
        int rc = -ENODEV;
 
@@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file,
         */
        if (!strcmp(file->f_path.dentry->d_name.name,
                    ORANGEFS_KMOD_DEBUG_FILE)) {
-               debug_string_to_mask(buf, &gossip_debug_mask, 0);
-               debug_mask_to_string(&gossip_debug_mask, 0);
+               debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0);
+               debug_mask_to_string(&orangefs_gossip_debug_mask, 0);
                debug_string = kernel_debug_string;
                gossip_debug(GOSSIP_DEBUGFS_DEBUG,
                             "New kernel debug string is %s\n",
@@ -452,3 +530,546 @@ out:
        kfree(buf);
        return rc;
 }
+
+/*
+ * After obtaining a string representation of the client's debug
+ * keywords and their associated masks, this function is called to build an
+ * array of these values.
+ */
+static int orangefs_prepare_cdm_array(char *debug_array_string)
+{
+       int i;
+       int rc = -EINVAL;
+       char *cds_head = NULL;
+       char *cds_delimiter = NULL;
+       int keyword_len = 0;
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+       /*
+        * figure out how many elements the cdm_array needs.
+        */
+       for (i = 0; i < strlen(debug_array_string); i++)
+               if (debug_array_string[i] == '\n')
+                       cdm_element_count++;
+
+       if (!cdm_element_count) {
+               pr_info("No elements in client debug array string!\n");
+               goto out;
+       }
+
+       cdm_array =
+               kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
+                       GFP_KERNEL);
+       if (!cdm_array) {
+               pr_info("malloc failed for cdm_array!\n");
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       cds_head = debug_array_string;
+
+       for (i = 0; i < cdm_element_count; i++) {
+               cds_delimiter = strchr(cds_head, '\n');
+               *cds_delimiter = '\0';
+
+               keyword_len = strcspn(cds_head, " ");
+
+               cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
+               if (!cdm_array[i].keyword) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               sscanf(cds_head,
+                      "%s %llx %llx",
+                      cdm_array[i].keyword,
+                      (unsigned long long *)&(cdm_array[i].mask1),
+                      (unsigned long long *)&(cdm_array[i].mask2));
+
+               if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
+                       client_verbose_index = i;
+
+               if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
+                       client_all_index = i;
+
+               cds_head = cds_delimiter + 1;
+       }
+
+       rc = cdm_element_count;
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+
+out:
+
+       return rc;
+
+}
+
+/*
+ * /sys/kernel/debug/orangefs/debug-help can be catted to
+ * see all the available kernel and client debug keywords.
+ *
+ * When the kernel boots, we have no idea what keywords the
+ * client supports, nor their associated masks.
+ *
+ * We pass through this function once at boot and stamp a
+ * boilerplate "we don't know" message for the client in the
+ * debug-help file. We pass through here again when the client
+ * starts and then we can fill out the debug-help file fully.
+ *
+ * The client might be restarted any number of times between
+ * reboots; we only build the debug-help file the first time.
+ */
+int orangefs_prepare_debugfs_help_string(int at_boot)
+{
+       int rc = -EINVAL;
+       int i;
+       int byte_count = 0;
+       char *client_title = "Client Debug Keywords:\n";
+       char *kernel_title = "Kernel Debug Keywords:\n";
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+       if (at_boot) {
+               byte_count += strlen(HELP_STRING_UNINITIALIZED);
+               client_title = HELP_STRING_UNINITIALIZED;
+       } else {
+               /*
+                * fill the client keyword/mask array and remember
+                * how many elements there were.
+                */
+               cdm_element_count =
+                       orangefs_prepare_cdm_array(client_debug_array_string);
+               if (cdm_element_count <= 0)
+                       goto out;
+
+               /* Count the bytes destined for debug_help_string. */
+               byte_count += strlen(client_title);
+
+               for (i = 0; i < cdm_element_count; i++) {
+                       byte_count += strlen(cdm_array[i].keyword) + 2;
+                       if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+                               pr_info("%s: overflow 1!\n", __func__);
+                               goto out;
+                       }
+               }
+
+               gossip_debug(GOSSIP_UTILS_DEBUG,
+                            "%s: cdm_element_count:%d:\n",
+                            __func__,
+                            cdm_element_count);
+       }
+
+       byte_count += strlen(kernel_title);
+       for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+               byte_count +=
+                       strlen(s_kmod_keyword_mask_map[i].keyword) + 2;
+               if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+                       pr_info("%s: overflow 2!\n", __func__);
+                       goto out;
+               }
+       }
+
+       /* build debug_help_string. */
+       debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+       if (!debug_help_string) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       strcat(debug_help_string, client_title);
+
+       if (!at_boot) {
+               for (i = 0; i < cdm_element_count; i++) {
+                       strcat(debug_help_string, "\t");
+                       strcat(debug_help_string, cdm_array[i].keyword);
+                       strcat(debug_help_string, "\n");
+               }
+       }
+
+       strcat(debug_help_string, "\n");
+       strcat(debug_help_string, kernel_title);
+
+       for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+               strcat(debug_help_string, "\t");
+               strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
+               strcat(debug_help_string, "\n");
+       }
+
+       rc = 0;
+
+out:
+
+       return rc;
+
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+static void debug_mask_to_string(void *mask, int type)
+{
+       int i;
+       int len = 0;
+       char *debug_string;
+       int element_count = 0;
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+       if (type) {
+               debug_string = client_debug_string;
+               element_count = cdm_element_count;
+       } else {
+               debug_string = kernel_debug_string;
+               element_count = num_kmod_keyword_mask_map;
+       }
+
+       memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+
+       /*
+        * Some keywords, like "all" or "verbose", are amalgams of
+        * numerous other keywords. Make a special check for those
+        * before grinding through the whole mask only to find out
+        * later...
+        */
+       if (check_amalgam_keyword(mask, type))
+               goto out;
+
+       /* Build the debug string. */
+       for (i = 0; i < element_count; i++)
+               if (type)
+                       do_c_string(mask, i);
+               else
+                       do_k_string(mask, i);
+
+       len = strlen(debug_string);
+
+       if ((len) && (type))
+               client_debug_string[len - 1] = '\0';
+       else if (len)
+               kernel_debug_string[len - 1] = '\0';
+       else if (type)
+               strcpy(client_debug_string, "none");
+       else
+               strcpy(kernel_debug_string, "none");
+
+out:
+       gossip_debug(GOSSIP_UTILS_DEBUG,
+                    "%s: string:%s:\n", __func__, debug_string);
+
+       return;
+
+}
+
+static void do_k_string(void *k_mask, int index)
+{
+       __u64 *mask = (__u64 *) k_mask;
+
+       if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
+               goto out;
+
+       if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
+               if ((strlen(kernel_debug_string) +
+                    strlen(s_kmod_keyword_mask_map[index].keyword))
+                       < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+                               strcat(kernel_debug_string,
+                                      s_kmod_keyword_mask_map[index].keyword);
+                               strcat(kernel_debug_string, ",");
+                       } else {
+                               gossip_err("%s: overflow!\n", __func__);
+                               strcpy(kernel_debug_string, ORANGEFS_ALL);
+                               goto out;
+                       }
+       }
+
+out:
+
+       return;
+}
+
+static void do_c_string(void *c_mask, int index)
+{
+       struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
+
+       if (keyword_is_amalgam(cdm_array[index].keyword))
+               goto out;
+
+       if ((mask->mask1 & cdm_array[index].mask1) ||
+           (mask->mask2 & cdm_array[index].mask2)) {
+               if ((strlen(client_debug_string) +
+                    strlen(cdm_array[index].keyword) + 1)
+                       < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+                               strcat(client_debug_string,
+                                      cdm_array[index].keyword);
+                               strcat(client_debug_string, ",");
+                       } else {
+                               gossip_err("%s: overflow!\n", __func__);
+                               strcpy(client_debug_string, ORANGEFS_ALL);
+                               goto out;
+                       }
+       }
+out:
+       return;
+}
+
+static int keyword_is_amalgam(char *keyword)
+{
+       int rc = 0;
+
+       if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
+               rc = 1;
+
+       return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ *
+ * return 1 if we found an amalgam.
+ */
+static int check_amalgam_keyword(void *mask, int type)
+{
+       __u64 *k_mask;
+       struct client_debug_mask *c_mask;
+       int k_all_index = num_kmod_keyword_mask_map - 1;
+       int rc = 0;
+
+       if (type) {
+               c_mask = (struct client_debug_mask *) mask;
+
+               if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
+                   (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
+                       strcpy(client_debug_string, ORANGEFS_ALL);
+                       rc = 1;
+                       goto out;
+               }
+
+               if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
+                   (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
+                       strcpy(client_debug_string, ORANGEFS_VERBOSE);
+                       rc = 1;
+                       goto out;
+               }
+
+       } else {
+               k_mask = (__u64 *) mask;
+
+               if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
+                       strcpy(kernel_debug_string, ORANGEFS_ALL);
+                       rc = 1;
+                       goto out;
+               }
+       }
+
+out:
+
+       return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+static void debug_string_to_mask(char *debug_string, void *mask, int type)
+{
+       char *unchecked_keyword;
+       int i;
+       char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
+       char *original_pointer;
+       int element_count = 0;
+       struct client_debug_mask *c_mask = NULL;
+       __u64 *k_mask = NULL;
+
+       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+       if (type) {
+               c_mask = (struct client_debug_mask *)mask;
+               element_count = cdm_element_count;
+       } else {
+               k_mask = (__u64 *)mask;
+               *k_mask = 0;
+               element_count = num_kmod_keyword_mask_map;
+       }
+
+       original_pointer = strsep_fodder;
+       while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
+               if (strlen(unchecked_keyword)) {
+                       for (i = 0; i < element_count; i++)
+                               if (type)
+                                       do_c_mask(i,
+                                                 unchecked_keyword,
+                                                 &c_mask);
+                               else
+                                       do_k_mask(i,
+                                                 unchecked_keyword,
+                                                 &k_mask);
+               }
+
+       kfree(original_pointer);
+}
+
+static void do_c_mask(int i, char *unchecked_keyword,
+    struct client_debug_mask **sane_mask)
+{
+
+       if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
+               (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
+               (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
+       }
+}
+
+static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
+{
+
+       if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
+               **sane_mask = (**sane_mask) |
+                               s_kmod_keyword_mask_map[i].mask_val;
+}
+
+int orangefs_debugfs_new_client_mask(void __user *arg)
+{
+       struct dev_mask2_info_s mask2_info = {0};
+       int ret;
+
+       ret = copy_from_user(&mask2_info,
+                            (void __user *)arg,
+                            sizeof(struct dev_mask2_info_s));
+
+       if (ret != 0)
+               return -EIO;
+
+       client_debug_mask.mask1 = mask2_info.mask1_value;
+       client_debug_mask.mask2 = mask2_info.mask2_value;
+
+       pr_info("%s: client debug mask has been received "
+               ":%llx: :%llx:\n",
+               __func__,
+               (unsigned long long)client_debug_mask.mask1,
+               (unsigned long long)client_debug_mask.mask2);
+
+       return ret;
+}
+
+int orangefs_debugfs_new_client_string(void __user *arg)
+{
+       int ret;
+
+       ret = copy_from_user(&client_debug_array_string,
+                                     (void __user *)arg,
+                                     ORANGEFS_MAX_DEBUG_STRING_LEN);
+       if (ret != 0)
+               return -EIO;
+
+       /*
+        * The real client-core makes an effort to ensure
+        * that actual strings that aren't too long to fit in
+        * this buffer are what we get here. We're going to use
+        * string functions on the stuff we got, so we'll make
+        * this extra effort to try and keep from
+        * flowing out of this buffer when we use the string
+        * functions, even if somehow the stuff we end up
+        * with here is garbage.
+        */
+       client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
+               '\0';
+
+       pr_info("%s: client debug array string has been received.\n",
+               __func__);
+
+       if (!help_string_initialized) {
+
+               /* Free the "we don't know yet" default string... */
+               kfree(debug_help_string);
+
+               /* build a proper debug help string */
+               if (orangefs_prepare_debugfs_help_string(0)) {
+                       gossip_err("%s: no debug help string\n",
+                                  __func__);
+                       return -EIO;
+               }
+
+               /* Replace the boilerplate boot-time debug-help file. */
+               debugfs_remove(help_file_dentry);
+
+               help_file_dentry =
+                       debugfs_create_file(
+                               ORANGEFS_KMOD_DEBUG_HELP_FILE,
+                               0444,
+                               debug_dir,
+                               debug_help_string,
+                               &debug_help_fops);
+
+               if (!help_file_dentry) {
+                       gossip_err("%s: debugfs_create_file failed for"
+                                  " :%s:!\n",
+                                  __func__,
+                                  ORANGEFS_KMOD_DEBUG_HELP_FILE);
+                       return -EIO;
+               }
+       }
+
+       debug_mask_to_string(&client_debug_mask, 1);
+
+       debugfs_remove(client_debug_dentry);
+
+       orangefs_client_debug_init();
+
+       help_string_initialized++;
+
+       return ret;
+}
+
+int orangefs_debugfs_new_debug(void __user *arg)
+{
+       struct dev_mask_info_s mask_info = {0};
+       int ret;
+
+       ret = copy_from_user(&mask_info,
+                            (void __user *)arg,
+                            sizeof(mask_info));
+
+       if (ret != 0)
+               return -EIO;
+
+       if (mask_info.mask_type == KERNEL_MASK) {
+               if ((mask_info.mask_value == 0)
+                   && (kernel_mask_set_mod_init)) {
+                       /*
+                        * the kernel debug mask was set when the
+                        * kernel module was loaded; don't override
+                        * it if the client-core was started without
+                        * a value for ORANGEFS_KMODMASK.
+                        */
+                       return 0;
+               }
+               debug_mask_to_string(&mask_info.mask_value,
+                                    mask_info.mask_type);
+               orangefs_gossip_debug_mask = mask_info.mask_value;
+               pr_info("%s: kernel debug mask has been modified to "
+                       ":%s: :%llx:\n",
+                       __func__,
+                       kernel_debug_string,
+                       (unsigned long long)orangefs_gossip_debug_mask);
+       } else if (mask_info.mask_type == CLIENT_MASK) {
+               debug_mask_to_string(&mask_info.mask_value,
+                                    mask_info.mask_type);
+               pr_info("%s: client debug mask has been modified to "
+                       ":%s: :%llx:\n",
+                       __func__,
+                       client_debug_string,
+                       llu(mask_info.mask_value));
+       } else {
+               gossip_lerr("Invalid mask type....\n");
+               return -EINVAL;
+       }
+
+       return ret;
+}
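Much of the new debugfs code above converts between comma-separated keyword strings and 64-bit masks by walking a keyword/mask table and OR-ing together the entries that match (debug_string_to_mask plus do_k_mask on the kernel side). A stand-alone sketch of that pattern follows; the three-entry table is made up, since the real s_kmod_keyword_mask_map contents are not shown in this diff, and strtok stands in for the kernel's strsep:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical keyword/mask table standing in for s_kmod_keyword_mask_map. */
static const struct { const char *keyword; uint64_t mask; } kw_map[] = {
        { "super", 1ULL << 0 },
        { "inode", 1ULL << 1 },
        { "file",  1ULL << 2 },
};

/* OR together the mask bits of every recognized keyword in a ","-list. */
static uint64_t string_to_mask(char *s)
{
        uint64_t mask = 0;
        size_t i;
        char *tok;

        for (tok = strtok(s, ","); tok; tok = strtok(NULL, ","))
                for (i = 0; i < sizeof(kw_map) / sizeof(kw_map[0]); i++)
                        if (!strcmp(tok, kw_map[i].keyword))
                                mask |= kw_map[i].mask;
        return mask;
}

int main(void)
{
        char buf[] = "inode,file,bogus";

        /* "bogus" is ignored, just as unknown keywords are in the patch. */
        printf("mask = 0x%llx\n", (unsigned long long)string_to_mask(buf));
        return 0;
}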
index e4828c0..8035172 100644 (file)
@@ -1,3 +1,7 @@
-int orangefs_debugfs_init(void);
-int orangefs_kernel_debug_init(void);
+int orangefs_debugfs_init(int);
 void orangefs_debugfs_cleanup(void);
+int orangefs_client_debug_init(void);
+int orangefs_prepare_debugfs_help_string(int);
+int orangefs_debugfs_new_client_mask(void __user *);
+int orangefs_debugfs_new_client_string(void __user *);
+int orangefs_debugfs_new_debug(void __user *);
index 9eac9d9..a3d84ff 100644 (file)
@@ -28,7 +28,7 @@
 #define ORANGEFS_VFS_OP_RENAME         0xFF00000A
 #define ORANGEFS_VFS_OP_STATFS         0xFF00000B
 #define ORANGEFS_VFS_OP_TRUNCATE       0xFF00000C
-#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH  0xFF00000D
+#define ORANGEFS_VFS_OP_RA_FLUSH       0xFF00000D
 #define ORANGEFS_VFS_OP_FS_MOUNT       0xFF00000E
 #define ORANGEFS_VFS_OP_FS_UMOUNT      0xFF00000F
 #define ORANGEFS_VFS_OP_GETXATTR       0xFF000010
 #define ORANGEFS_VFS_OP_FSYNC          0xFF00EE01
 #define ORANGEFS_VFS_OP_FSKEY             0xFF00EE02
 #define ORANGEFS_VFS_OP_READDIRPLUS       0xFF00EE03
+#define ORANGEFS_VFS_OP_FEATURES       0xFF00EE05 /* 2.9.6 */
+
+/* features is a 64-bit unsigned bitmask */
+#define ORANGEFS_FEATURE_READAHEAD 1
 
 /*
  * Misc constants. Please retain them as multiples of 8!
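ORANGEFS_VFS_OP_FEATURES and the feature bitmask let the kernel probe what the running client-core supports before relying on newer upcalls; the readahead flush added earlier in this series is the first consumer. A small sketch of testing such a bitmask; the second bit is invented purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define ORANGEFS_FEATURE_READAHEAD      1
#define EXAMPLE_FEATURE_FUTURE          2       /* hypothetical, not in the patch */

int main(void)
{
        /* Pretend this was copied out of a features downcall response. */
        uint64_t orangefs_features = ORANGEFS_FEATURE_READAHEAD;

        if (orangefs_features & ORANGEFS_FEATURE_READAHEAD)
                printf("client-core advertises readahead flush support\n");
        if (!(orangefs_features & EXAMPLE_FEATURE_FUTURE))
                printf("hypothetical future feature not advertised\n");
        return 0;
}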
index 633c07a..0a82048 100644 (file)
@@ -99,16 +99,6 @@ enum orangefs_vfs_op_states {
        OP_VFS_STATE_GIVEN_UP = 16,
 };
 
-/*
- * An array of client_debug_mask will be built to hold debug keyword/mask
- * values fetched from userspace.
- */
-struct client_debug_mask {
-       char *keyword;
-       __u64 mask1;
-       __u64 mask2;
-};
-
 /*
  * orangefs kernel memory related flags
  */
@@ -119,29 +109,6 @@ struct client_debug_mask {
 #define ORANGEFS_CACHE_CREATE_FLAGS 0
 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
 
-/* these functions are defined in orangefs-utils.c */
-int orangefs_prepare_cdm_array(char *debug_array_string);
-int orangefs_prepare_debugfs_help_string(int);
-
-/* defined in orangefs-debugfs.c */
-int orangefs_client_debug_init(void);
-
-void debug_string_to_mask(char *, void *, int);
-void do_c_mask(int, char *, struct client_debug_mask **);
-void do_k_mask(int, char *, __u64 **);
-
-void debug_mask_to_string(void *, int);
-void do_k_string(void *, int);
-void do_c_string(void *, int);
-int check_amalgam_keyword(void *, int);
-int keyword_is_amalgam(char *);
-
-/*these variables are defined in orangefs-mod.c */
-extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-extern unsigned int kernel_mask_set_mod_init;
-
 extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
 extern const struct xattr_handler *orangefs_xattr_handlers[];
 
@@ -331,7 +298,7 @@ struct orangefs_stats {
        unsigned long writes;
 };
 
-extern struct orangefs_stats g_orangefs_stats;
+extern struct orangefs_stats orangefs_stats;
 
 /*
  * NOTE: See Documentation/filesystems/porting for information
@@ -447,6 +414,8 @@ void purge_waiting_ops(void);
 /*
  * defined in super.c
  */
+extern uint64_t orangefs_features;
+
 struct dentry *orangefs_mount(struct file_system_type *fst,
                           int flags,
                           const char *devname,
@@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode,
 /*
  * defined in devorangefs-req.c
  */
+extern uint32_t orangefs_userspace_version;
+
 int orangefs_dev_init(void);
 void orangefs_dev_cleanup(void);
 int is_daemon_in_service(void);
@@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
 
 int orangefs_normalize_to_errno(__s32 error_code);
 
-extern struct mutex devreq_mutex;
-extern struct mutex request_mutex;
-extern int debug;
+extern struct mutex orangefs_request_mutex;
 extern int op_timeout_secs;
 extern int slot_timeout_secs;
-extern int dcache_timeout_msecs;
-extern int getattr_timeout_msecs;
+extern int orangefs_dcache_timeout_msecs;
+extern int orangefs_getattr_timeout_msecs;
 extern struct list_head orangefs_superblocks;
 extern spinlock_t orangefs_superblocks_lock;
 extern struct list_head orangefs_request_list;
 extern spinlock_t orangefs_request_list_lock;
 extern wait_queue_head_t orangefs_request_list_waitq;
-extern struct list_head *htable_ops_in_progress;
-extern spinlock_t htable_ops_in_progress_lock;
+extern struct list_head *orangefs_htable_ops_in_progress;
+extern spinlock_t orangefs_htable_ops_in_progress_lock;
 extern int hash_table_size;
 
 extern const struct address_space_operations orangefs_address_operations;
index e9fd575..2e5b030 100644 (file)
  * global variables declared here
  */
 
-/* array of client debug keyword/mask values */
-struct client_debug_mask *cdm_array;
-int cdm_element_count;
-
-char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
-char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
-
-char *debug_help_string;
-int help_string_initialized;
-struct dentry *help_file_dentry;
-struct dentry *client_debug_dentry;
-struct dentry *debug_dir;
-int client_verbose_index;
-int client_all_index;
-struct orangefs_stats g_orangefs_stats;
+struct orangefs_stats orangefs_stats;
 
 /* the size of the hash tables for ops in progress */
 int hash_table_size = 509;
 
 static ulong module_parm_debug_mask;
-__u64 gossip_debug_mask;
-struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
-unsigned int kernel_mask_set_mod_init; /* implicitly false */
+__u64 orangefs_gossip_debug_mask;
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
-int dcache_timeout_msecs = 50;
-int getattr_timeout_msecs = 50;
+int orangefs_dcache_timeout_msecs = 50;
+int orangefs_getattr_timeout_msecs = 50;
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("ORANGEFS Development Team");
@@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644);
 module_param(op_timeout_secs, int, 0);
 module_param(slot_timeout_secs, int, 0);
 
-/* synchronizes the request device file */
-DEFINE_MUTEX(devreq_mutex);
-
 /*
  * Blocks non-priority requests from being queued for servicing.  This
  * could be used for protecting the request list data structure, but
  * for now it's only being used to stall the op addition to the request
  * list
  */
-DEFINE_MUTEX(request_mutex);
+DEFINE_MUTEX(orangefs_request_mutex);
 
 /* hash table for storing operations waiting for matching downcall */
-struct list_head *htable_ops_in_progress;
-DEFINE_SPINLOCK(htable_ops_in_progress_lock);
+struct list_head *orangefs_htable_ops_in_progress;
+DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock);
 
 /* list for queueing upcall operations */
 LIST_HEAD(orangefs_request_list);
@@ -100,32 +80,6 @@ static int __init orangefs_init(void)
        int ret = -1;
        __u32 i = 0;
 
-       /* convert input debug mask to a 64-bit unsigned integer */
-       gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
-
-       /*
-        * set the kernel's gossip debug string; invalid mask values will
-        * be ignored.
-        */
-       debug_mask_to_string(&gossip_debug_mask, 0);
-
-       /* remove any invalid values from the mask */
-       debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
-
-       /*
-        * if the mask has a non-zero value, then indicate that the mask
-        * was set when the kernel module was loaded.  The orangefs dev ioctl
-        * command will look at this boolean to determine if the kernel's
-        * debug mask should be overwritten when the client-core is started.
-        */
-       if (gossip_debug_mask != 0)
-               kernel_mask_set_mod_init = true;
-
-       pr_info("%s: called with debug mask: :%s: :%llx:\n",
-               __func__,
-               kernel_debug_string,
-               (unsigned long long)gossip_debug_mask);
-
        ret = bdi_init(&orangefs_backing_dev_info);
 
        if (ret)
@@ -146,9 +100,9 @@ static int __init orangefs_init(void)
        if (ret < 0)
                goto cleanup_op;
 
-       htable_ops_in_progress =
+       orangefs_htable_ops_in_progress =
            kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
-       if (!htable_ops_in_progress) {
+       if (!orangefs_htable_ops_in_progress) {
                gossip_err("Failed to initialize op hashtable");
                ret = -ENOMEM;
                goto cleanup_inode;
@@ -156,7 +110,7 @@ static int __init orangefs_init(void)
 
        /* initialize a doubly linked list at each hash table index */
        for (i = 0; i < hash_table_size; i++)
-               INIT_LIST_HEAD(&htable_ops_in_progress[i]);
+               INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]);
 
        ret = fsid_key_table_initialize();
        if (ret < 0)
@@ -179,14 +133,10 @@ static int __init orangefs_init(void)
        if (ret)
                goto cleanup_key_table;
 
-       ret = orangefs_debugfs_init();
+       ret = orangefs_debugfs_init(module_parm_debug_mask);
        if (ret)
                goto debugfs_init_failed;
 
-       ret = orangefs_kernel_debug_init();
-       if (ret)
-               goto kernel_debug_init_failed;
-
        ret = orangefs_sysfs_init();
        if (ret)
                goto sysfs_init_failed;
@@ -214,8 +164,6 @@ cleanup_device:
 
 sysfs_init_failed:
 
-kernel_debug_init_failed:
-
 debugfs_init_failed:
        orangefs_debugfs_cleanup();
 
@@ -223,7 +171,7 @@ cleanup_key_table:
        fsid_key_table_finalize();
 
 cleanup_progress_table:
-       kfree(htable_ops_in_progress);
+       kfree(orangefs_htable_ops_in_progress);
 
 cleanup_inode:
        orangefs_inode_cache_finalize();
@@ -250,12 +198,12 @@ static void __exit orangefs_exit(void)
        orangefs_dev_cleanup();
        BUG_ON(!list_empty(&orangefs_request_list));
        for (i = 0; i < hash_table_size; i++)
-               BUG_ON(!list_empty(&htable_ops_in_progress[i]));
+               BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i]));
 
        orangefs_inode_cache_finalize();
        op_cache_finalize();
 
-       kfree(htable_ops_in_progress);
+       kfree(orangefs_htable_ops_in_progress);
 
        bdi_destroy(&orangefs_backing_dev_info);
 
@@ -274,10 +222,10 @@ void purge_inprogress_ops(void)
                struct orangefs_kernel_op_s *op;
                struct orangefs_kernel_op_s *next;
 
-               spin_lock(&htable_ops_in_progress_lock);
+               spin_lock(&orangefs_htable_ops_in_progress_lock);
                list_for_each_entry_safe(op,
                                         next,
-                                        &htable_ops_in_progress[i],
+                                        &orangefs_htable_ops_in_progress[i],
                                         list) {
                        set_op_state_purged(op);
                        gossip_debug(GOSSIP_DEV_DEBUG,
@@ -287,7 +235,7 @@ void purge_inprogress_ops(void)
                                     op->op_state,
                                     current->comm);
                }
-               spin_unlock(&htable_ops_in_progress_lock);
+               spin_unlock(&orangefs_htable_ops_in_progress_lock);
        }
 }
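The renamed in-progress table above is just an array of list heads, one bucket per hash slot, with hash_table_size defaulting to 509. A generic stand-alone sketch of that bucket layout; the tag-to-bucket hash here is invented, since the real one is defined elsewhere in orangefs:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HASH_TABLE_SIZE 509     /* default from orangefs-mod.c */

struct op {
        uint64_t tag;
        struct op *next;        /* stand-in for the kernel's list_head */
};

/* Hypothetical bucket selection; orangefs defines its own hash elsewhere. */
static unsigned int hash_tag(uint64_t tag)
{
        return (unsigned int)(tag % HASH_TABLE_SIZE);
}

int main(void)
{
        struct op **table = calloc(HASH_TABLE_SIZE, sizeof(*table));
        struct op op = { .tag = 12345, .next = NULL };
        unsigned int bucket;

        if (!table)
                return 1;
        bucket = hash_tag(op.tag);
        op.next = table[bucket];        /* push onto the bucket's chain */
        table[bucket] = &op;
        printf("tag %llu -> bucket %u\n", (unsigned long long)op.tag, bucket);
        free(table);
        return 0;
}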
 
index 375708c..a799546 100644 (file)
  * Description:
  *                     Time getattr is valid in milliseconds.
  *
+ * What:               /sys/fs/orangefs/readahead_count
+ * Date:               Aug 2016
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Readahead cache buffer count.
+ *
+ * What:               /sys/fs/orangefs/readahead_size
+ * Date:               Aug 2016
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Readahead cache buffer size.
+ *
+ * What:               /sys/fs/orangefs/readahead_count_size
+ * Date:               Aug 2016
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Readahead cache buffer count and size.
+ *
  * What:               /sys/fs/orangefs/acache/...
  * Date:               Jun 2015
  * Contact:            Martin Brandenburg <martin@omnibond.com>
 #define PC_KOBJ_ID "pc"
 #define STATS_KOBJ_ID "stats"
 
-struct orangefs_obj {
-       struct kobject kobj;
-       int op_timeout_secs;
-       int perf_counter_reset;
-       int perf_history_size;
-       int perf_time_interval_secs;
-       int slot_timeout_secs;
-       int dcache_timeout_msecs;
-       int getattr_timeout_msecs;
-};
-
-struct acache_orangefs_obj {
-       struct kobject kobj;
-       int hard_limit;
-       int reclaim_percentage;
-       int soft_limit;
-       int timeout_msecs;
-};
-
-struct capcache_orangefs_obj {
-       struct kobject kobj;
-       int hard_limit;
-       int reclaim_percentage;
-       int soft_limit;
-       int timeout_secs;
-};
-
-struct ccache_orangefs_obj {
-       struct kobject kobj;
-       int hard_limit;
-       int reclaim_percentage;
-       int soft_limit;
-       int timeout_secs;
-};
-
-struct ncache_orangefs_obj {
-       struct kobject kobj;
-       int hard_limit;
-       int reclaim_percentage;
-       int soft_limit;
-       int timeout_msecs;
-};
-
-struct pc_orangefs_obj {
-       struct kobject kobj;
-       char *acache;
-       char *capcache;
-       char *ncache;
-};
-
-struct stats_orangefs_obj {
-       struct kobject kobj;
-       int reads;
-       int writes;
-};
+/*
+ * Every item calls orangefs_attr_show and orangefs_attr_store through
+ * orangefs_sysfs_ops. They look at the orangefs_attributes further below to
+ * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or
+ * sysfs_service_op_store.
+ */
 
 struct orangefs_attribute {
        struct attribute attr;
-       ssize_t (*show)(struct orangefs_obj *orangefs_obj,
+       ssize_t (*show)(struct kobject *kobj,
                        struct orangefs_attribute *attr,
                        char *buf);
-       ssize_t (*store)(struct orangefs_obj *orangefs_obj,
+       ssize_t (*store)(struct kobject *kobj,
                         struct orangefs_attribute *attr,
                         const char *buf,
                         size_t count);
 };
 
-struct acache_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
-                       struct acache_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
-                        struct acache_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
-struct capcache_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
-                       struct capcache_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
-                        struct capcache_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
-struct ccache_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
-                       struct ccache_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
-                        struct ccache_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
-struct ncache_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
-                       struct ncache_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
-                        struct ncache_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
-struct pc_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
-                       struct pc_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
-                        struct pc_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
-struct stats_orangefs_attribute {
-       struct attribute attr;
-       ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
-                       struct stats_orangefs_attribute *attr,
-                       char *buf);
-       ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
-                        struct stats_orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count);
-};
-
 static ssize_t orangefs_attr_show(struct kobject *kobj,
                                  struct attribute *attr,
                                  char *buf)
 {
        struct orangefs_attribute *attribute;
-       struct orangefs_obj *orangefs_obj;
-       int rc;
 
        attribute = container_of(attr, struct orangefs_attribute, attr);
-       orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(orangefs_obj, attribute, buf);
-
-out:
-       return rc;
+       if (!attribute->show)
+               return -EIO;
+       return attribute->show(kobj, attribute, buf);
 }
 
 static ssize_t orangefs_attr_store(struct kobject *kobj,
@@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj,
                                   size_t len)
 {
        struct orangefs_attribute *attribute;
-       struct orangefs_obj *orangefs_obj;
-       int rc;
 
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "orangefs_attr_store: start\n");
+       if (!strcmp(kobj->name, PC_KOBJ_ID) ||
+           !strcmp(kobj->name, STATS_KOBJ_ID))
+               return -EPERM;
 
        attribute = container_of(attr, struct orangefs_attribute, attr);
-       orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-
-       if (!attribute->store) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->store(orangefs_obj, attribute, buf, len);
-
-out:
-       return rc;
+       if (!attribute->store)
+               return -EIO;
+       return attribute->store(kobj, attribute, buf, len);
 }
 
 static const struct sysfs_ops orangefs_sysfs_ops = {
@@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = {
        .store = orangefs_attr_store,
 };
 
-static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
-                                        struct attribute *attr,
-                                        char *buf)
-{
-       struct acache_orangefs_attribute *attribute;
-       struct acache_orangefs_obj *acache_orangefs_obj;
-       int rc;
-
-       attribute = container_of(attr, struct acache_orangefs_attribute, attr);
-       acache_orangefs_obj =
-               container_of(kobj, struct acache_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(acache_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
-                                         struct attribute *attr,
-                                         const char *buf,
-                                         size_t len)
-{
-       struct acache_orangefs_attribute *attribute;
-       struct acache_orangefs_obj *acache_orangefs_obj;
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "acache_orangefs_attr_store: start\n");
-
-       attribute = container_of(attr, struct acache_orangefs_attribute, attr);
-       acache_orangefs_obj =
-               container_of(kobj, struct acache_orangefs_obj, kobj);
-
-       if (!attribute->store) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops acache_orangefs_sysfs_ops = {
-       .show = acache_orangefs_attr_show,
-       .store = acache_orangefs_attr_store,
-};
-
-static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
-                                          struct attribute *attr,
-                                          char *buf)
-{
-       struct capcache_orangefs_attribute *attribute;
-       struct capcache_orangefs_obj *capcache_orangefs_obj;
-       int rc;
-
-       attribute =
-               container_of(attr, struct capcache_orangefs_attribute, attr);
-       capcache_orangefs_obj =
-               container_of(kobj, struct capcache_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(capcache_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
-                                           struct attribute *attr,
-                                           const char *buf,
-                                           size_t len)
-{
-       struct capcache_orangefs_attribute *attribute;
-       struct capcache_orangefs_obj *capcache_orangefs_obj;
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "capcache_orangefs_attr_store: start\n");
-
-       attribute =
-               container_of(attr, struct capcache_orangefs_attribute, attr);
-       capcache_orangefs_obj =
-               container_of(kobj, struct capcache_orangefs_obj, kobj);
-
-       if (!attribute->store) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
-       .show = capcache_orangefs_attr_show,
-       .store = capcache_orangefs_attr_store,
-};
-
-static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
-                                        struct attribute *attr,
-                                        char *buf)
-{
-       struct ccache_orangefs_attribute *attribute;
-       struct ccache_orangefs_obj *ccache_orangefs_obj;
-       int rc;
-
-       attribute =
-               container_of(attr, struct ccache_orangefs_attribute, attr);
-       ccache_orangefs_obj =
-               container_of(kobj, struct ccache_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(ccache_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
-                                         struct attribute *attr,
-                                         const char *buf,
-                                         size_t len)
-{
-       struct ccache_orangefs_attribute *attribute;
-       struct ccache_orangefs_obj *ccache_orangefs_obj;
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "ccache_orangefs_attr_store: start\n");
-
-       attribute =
-               container_of(attr, struct ccache_orangefs_attribute, attr);
-       ccache_orangefs_obj =
-               container_of(kobj, struct ccache_orangefs_obj, kobj);
-
-       if (!attribute->store) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
-       .show = ccache_orangefs_attr_show,
-       .store = ccache_orangefs_attr_store,
-};
-
-static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
-                                        struct attribute *attr,
-                                        char *buf)
-{
-       struct ncache_orangefs_attribute *attribute;
-       struct ncache_orangefs_obj *ncache_orangefs_obj;
-       int rc;
-
-       attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
-       ncache_orangefs_obj =
-               container_of(kobj, struct ncache_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(ncache_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
-                                         struct attribute *attr,
-                                         const char *buf,
-                                         size_t len)
-{
-       struct ncache_orangefs_attribute *attribute;
-       struct ncache_orangefs_obj *ncache_orangefs_obj;
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "ncache_orangefs_attr_store: start\n");
-
-       attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
-       ncache_orangefs_obj =
-               container_of(kobj, struct ncache_orangefs_obj, kobj);
-
-       if (!attribute->store) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
-       .show = ncache_orangefs_attr_show,
-       .store = ncache_orangefs_attr_store,
-};
-
-static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
-                                    struct attribute *attr,
-                                    char *buf)
-{
-       struct pc_orangefs_attribute *attribute;
-       struct pc_orangefs_obj *pc_orangefs_obj;
-       int rc;
-
-       attribute = container_of(attr, struct pc_orangefs_attribute, attr);
-       pc_orangefs_obj =
-               container_of(kobj, struct pc_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(pc_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops pc_orangefs_sysfs_ops = {
-       .show = pc_orangefs_attr_show,
-};
-
-static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
-                                       struct attribute *attr,
-                                       char *buf)
-{
-       struct stats_orangefs_attribute *attribute;
-       struct stats_orangefs_obj *stats_orangefs_obj;
-       int rc;
-
-       attribute = container_of(attr, struct stats_orangefs_attribute, attr);
-       stats_orangefs_obj =
-               container_of(kobj, struct stats_orangefs_obj, kobj);
-
-       if (!attribute->show) {
-               rc = -EIO;
-               goto out;
-       }
-
-       rc = attribute->show(stats_orangefs_obj, attribute, buf);
-
-out:
-       return rc;
-}
-
-static const struct sysfs_ops stats_orangefs_sysfs_ops = {
-       .show = stats_orangefs_attr_show,
-};
-
-static void orangefs_release(struct kobject *kobj)
-{
-       struct orangefs_obj *orangefs_obj;
-
-       orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
-       kfree(orangefs_obj);
-}
-
-static void acache_orangefs_release(struct kobject *kobj)
-{
-       struct acache_orangefs_obj *acache_orangefs_obj;
-
-       acache_orangefs_obj =
-               container_of(kobj, struct acache_orangefs_obj, kobj);
-       kfree(acache_orangefs_obj);
-}
-
-static void capcache_orangefs_release(struct kobject *kobj)
-{
-       struct capcache_orangefs_obj *capcache_orangefs_obj;
-
-       capcache_orangefs_obj =
-               container_of(kobj, struct capcache_orangefs_obj, kobj);
-       kfree(capcache_orangefs_obj);
-}
-
-static void ccache_orangefs_release(struct kobject *kobj)
-{
-       struct ccache_orangefs_obj *ccache_orangefs_obj;
-
-       ccache_orangefs_obj =
-               container_of(kobj, struct ccache_orangefs_obj, kobj);
-       kfree(ccache_orangefs_obj);
-}
-
-static void ncache_orangefs_release(struct kobject *kobj)
-{
-       struct ncache_orangefs_obj *ncache_orangefs_obj;
-
-       ncache_orangefs_obj =
-               container_of(kobj, struct ncache_orangefs_obj, kobj);
-       kfree(ncache_orangefs_obj);
-}
-
-static void pc_orangefs_release(struct kobject *kobj)
-{
-       struct pc_orangefs_obj *pc_orangefs_obj;
-
-       pc_orangefs_obj =
-               container_of(kobj, struct pc_orangefs_obj, kobj);
-       kfree(pc_orangefs_obj);
-}
-
-static void stats_orangefs_release(struct kobject *kobj)
-{
-       struct stats_orangefs_obj *stats_orangefs_obj;
-
-       stats_orangefs_obj =
-               container_of(kobj, struct stats_orangefs_obj, kobj);
-       kfree(stats_orangefs_obj);
-}
-
-static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_int_show(struct kobject *kobj,
+    struct orangefs_attribute *attr, char *buf)
 {
        int rc = -EIO;
-       struct orangefs_attribute *orangefs_attr;
-       struct stats_orangefs_attribute *stats_orangefs_attr;
 
-       gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
+       gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n",
+           kobj->name);
 
-       if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-               orangefs_attr = (struct orangefs_attribute *)attr;
-
-               if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
+       if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "op_timeout_secs")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%d\n",
                                       op_timeout_secs);
                        goto out;
-               } else if (!strcmp(orangefs_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "slot_timeout_secs")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%d\n",
                                       slot_timeout_secs);
                        goto out;
-               } else if (!strcmp(orangefs_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "dcache_timeout_msecs")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%d\n",
-                                      dcache_timeout_msecs);
+                                      orangefs_dcache_timeout_msecs);
                        goto out;
-               } else if (!strcmp(orangefs_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "getattr_timeout_msecs")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%d\n",
-                                      getattr_timeout_msecs);
+                                      orangefs_getattr_timeout_msecs);
                        goto out;
                } else {
                        goto out;
                }
 
-       } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
-               stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
-
-               if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
+       } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "reads")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%lu\n",
-                                      g_orangefs_stats.reads);
+                                      orangefs_stats.reads);
                        goto out;
-               } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
+               } else if (!strcmp(attr->attr.name, "writes")) {
                        rc = scnprintf(buf,
                                       PAGE_SIZE,
                                       "%lu\n",
-                                      g_orangefs_stats.writes);
+                                      orangefs_stats.writes);
                        goto out;
                } else {
                        goto out;
@@ -714,45 +254,13 @@ out:
        return rc;
 }
 
-static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
-                                struct orangefs_attribute *attr,
-                                char *buf)
-{
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "int_orangefs_show:start attr->attr.name:%s:\n",
-                    attr->attr.name);
-
-       rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
-       return rc;
-}
-
-static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
-                       struct stats_orangefs_attribute *attr,
-                       char *buf)
-{
-       int rc;
-
-       gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "int_stats_show:start attr->attr.name:%s:\n",
-                    attr->attr.name);
-
-       rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
-
-       return rc;
-}
-
-static ssize_t int_store(struct orangefs_obj *orangefs_obj,
-                        struct orangefs_attribute *attr,
-                        const char *buf,
-                        size_t count)
+static ssize_t sysfs_int_store(struct kobject *kobj,
+    struct orangefs_attribute *attr, const char *buf, size_t count)
 {
        int rc = 0;
 
        gossip_debug(GOSSIP_SYSFS_DEBUG,
-                    "int_store: start attr->attr.name:%s: buf:%s:\n",
+                    "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n",
                     attr->attr.name, buf);
 
        if (!strcmp(attr->attr.name, "op_timeout_secs")) {
@@ -762,10 +270,10 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
                rc = kstrtoint(buf, 0, &slot_timeout_secs);
                goto out;
        } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
-               rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+               rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
                goto out;
        } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
-               rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+               rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs);
                goto out;
        } else {
                goto out;
@@ -783,24 +291,19 @@ out:
 /*
  * obtain attribute values from userspace with a service operation.
  */
-static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
+static ssize_t sysfs_service_op_show(struct kobject *kobj,
+    struct orangefs_attribute *attr, char *buf)
 {
        struct orangefs_kernel_op_s *new_op = NULL;
        int rc = 0;
        char *ser_op_type = NULL;
-       struct orangefs_attribute *orangefs_attr;
-       struct acache_orangefs_attribute *acache_attr;
-       struct capcache_orangefs_attribute *capcache_attr;
-       struct ccache_orangefs_attribute *ccache_attr;
-       struct ncache_orangefs_attribute *ncache_attr;
-       struct pc_orangefs_attribute *pc_attr;
        __u32 op_alloc_type;
 
        gossip_debug(GOSSIP_SYSFS_DEBUG,
                     "sysfs_service_op_show: id:%s:\n",
-                    kobj_id);
+                    kobj->name);
 
-       if (strcmp(kobj_id, PC_KOBJ_ID))
+       if (strcmp(kobj->name, PC_KOBJ_ID))
                op_alloc_type = ORANGEFS_VFS_OP_PARAM;
        else
                op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
@@ -818,124 +321,135 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
                goto out;
        }
 
-       if (strcmp(kobj_id, PC_KOBJ_ID))
+       if (strcmp(kobj->name, PC_KOBJ_ID))
                new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
 
-       if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-               orangefs_attr = (struct orangefs_attribute *)attr;
+       if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+               /* Drop unsupported requests first. */
+               if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+                   (!strcmp(attr->attr.name, "readahead_count") ||
+                   !strcmp(attr->attr.name, "readahead_size") ||
+                   !strcmp(attr->attr.name, "readahead_count_size"))) {
+                       rc = -EINVAL;
+                       goto out;
+               }
 
-               if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
+               if (!strcmp(attr->attr.name, "perf_history_size"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
-               else if (!strcmp(orangefs_attr->attr.name,
+               else if (!strcmp(attr->attr.name,
                                 "perf_time_interval_secs"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
-               else if (!strcmp(orangefs_attr->attr.name,
+               else if (!strcmp(attr->attr.name,
                                 "perf_counter_reset"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
 
-       } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
-               acache_attr = (struct acache_orangefs_attribute *)attr;
+               else if (!strcmp(attr->attr.name,
+                                "readahead_count"))
+                       new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+
+               else if (!strcmp(attr->attr.name,
+                                "readahead_size"))
+                       new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
 
-               if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
+               else if (!strcmp(attr->attr.name,
+                                "readahead_count_size"))
+                       new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+       } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "timeout_msecs"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
 
-               if (!strcmp(acache_attr->attr.name, "hard_limit"))
+               if (!strcmp(attr->attr.name, "hard_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
 
-               if (!strcmp(acache_attr->attr.name, "soft_limit"))
+               if (!strcmp(attr->attr.name, "soft_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
 
-               if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
+               if (!strcmp(attr->attr.name, "reclaim_percentage"))
                        new_op->upcall.req.param.op =
                          ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
 
-       } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
-               capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
-               if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
+       } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "timeout_secs"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
 
-               if (!strcmp(capcache_attr->attr.name, "hard_limit"))
+               if (!strcmp(attr->attr.name, "hard_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
 
-               if (!strcmp(capcache_attr->attr.name, "soft_limit"))
+               if (!strcmp(attr->attr.name, "soft_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
 
-               if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
+               if (!strcmp(attr->attr.name, "reclaim_percentage"))
                        new_op->upcall.req.param.op =
                          ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
 
-       } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
-               ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
-               if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
+       } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "timeout_secs"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
 
-               if (!strcmp(ccache_attr->attr.name, "hard_limit"))
+               if (!strcmp(attr->attr.name, "hard_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
 
-               if (!strcmp(ccache_attr->attr.name, "soft_limit"))
+               if (!strcmp(attr->attr.name, "soft_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
 
-               if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
+               if (!strcmp(attr->attr.name, "reclaim_percentage"))
                        new_op->upcall.req.param.op =
                          ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
 
-       } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
-               ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
-               if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
+       } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "timeout_msecs"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
 
-               if (!strcmp(ncache_attr->attr.name, "hard_limit"))
+               if (!strcmp(attr->attr.name, "hard_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
 
-               if (!strcmp(ncache_attr->attr.name, "soft_limit"))
+               if (!strcmp(attr->attr.name, "soft_limit"))
                        new_op->upcall.req.param.op =
                                ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
 
-               if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
+               if (!strcmp(attr->attr.name, "reclaim_percentage"))
                        new_op->upcall.req.param.op =
                          ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
 
-       } else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
-               pc_attr = (struct pc_orangefs_attribute *)attr;
-
-               if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
+       } else if (!strcmp(kobj->name, PC_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID))
                        new_op->upcall.req.perf_count.type =
                                ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
 
-               if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
+               if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID))
                        new_op->upcall.req.perf_count.type =
                                ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
 
-               if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
+               if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID))
                        new_op->upcall.req.perf_count.type =
                                ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
 
        } else {
                gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
-                          kobj_id);
+                          kobj->name);
                rc = -EINVAL;
                goto out;
        }
 
 
-       if (strcmp(kobj_id, PC_KOBJ_ID))
+       if (strcmp(kobj->name, PC_KOBJ_ID))
                ser_op_type = "orangefs_param";
        else
                ser_op_type = "orangefs_perf_count";
@@ -948,11 +462,18 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
 
 out:
        if (!rc) {
-               if (strcmp(kobj_id, PC_KOBJ_ID)) {
-                       rc = scnprintf(buf,
-                                      PAGE_SIZE,
-                                      "%d\n",
-                                      (int)new_op->downcall.resp.param.value);
+               if (strcmp(kobj->name, PC_KOBJ_ID)) {
+                       if (new_op->upcall.req.param.op ==
+                           ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
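+                               /* readahead_count_size returns two values. */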
+                               rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+                                   (int)new_op->downcall.resp.param.u.
+                                   value32[0],
+                                   (int)new_op->downcall.resp.param.u.
+                                   value32[1]);
+                       } else {
+                               rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+                                   (int)new_op->downcall.resp.param.u.value64);
+                       }
                } else {
                        rc = scnprintf(
                                buf,
@@ -968,77 +489,6 @@ out:
 
 }
 
-static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
-                                    struct orangefs_attribute *attr,
-                                    char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
-static ssize_t
-       service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
-                           struct acache_orangefs_attribute *attr,
-                           char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
-static ssize_t service_capcache_show(struct capcache_orangefs_obj
-                                       *capcache_orangefs_obj,
-                                    struct capcache_orangefs_attribute *attr,
-                                    char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
-static ssize_t service_ccache_show(struct ccache_orangefs_obj
-                                       *ccache_orangefs_obj,
-                                  struct ccache_orangefs_attribute *attr,
-                                  char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
-static ssize_t
-       service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
-                           struct ncache_orangefs_attribute *attr,
-                           char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
-static ssize_t
-       service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
-                           struct pc_orangefs_attribute *attr,
-                           char *buf)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
-
-       return rc;
-}
-
 /*
  * pass attribute values back to userspace with a service operation.
  *
@@ -1050,20 +500,16 @@ static ssize_t
  * We want to return 1 if we think everything went OK, and
  * EINVAL if not.
  */
-static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
+static ssize_t sysfs_service_op_store(struct kobject *kobj,
+    struct orangefs_attribute *attr, const char *buf, size_t count)
 {
        struct orangefs_kernel_op_s *new_op = NULL;
        int val = 0;
        int rc = 0;
-       struct orangefs_attribute *orangefs_attr;
-       struct acache_orangefs_attribute *acache_attr;
-       struct capcache_orangefs_attribute *capcache_attr;
-       struct ccache_orangefs_attribute *ccache_attr;
-       struct ncache_orangefs_attribute *ncache_attr;
 
        gossip_debug(GOSSIP_SYSFS_DEBUG,
                     "sysfs_service_op_store: id:%s:\n",
-                    kobj_id);
+                    kobj->name);
 
        new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
        if (!new_op)
@@ -1079,16 +525,29 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
        }
 
        /*
-        * The value we want to send back to userspace is in buf.
+        * The value we want to send back to userspace is in buf, unless
+        * there are two parameters, which are handled specially below.
         */
-       rc = kstrtoint(buf, 0, &val);
-       if (rc)
-               goto out;
+       if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) ||
+           strcmp(attr->attr.name, "readahead_count_size")) {
+               rc = kstrtoint(buf, 0, &val);
+               if (rc)
+                       goto out;
+       }
 
-       if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
-               orangefs_attr = (struct orangefs_attribute *)attr;
+       new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+
+       if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
+               /* Drop unsupported requests first. */
+               if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) &&
+                   (!strcmp(attr->attr.name, "readahead_count") ||
+                   !strcmp(attr->attr.name, "readahead_size") ||
+                   !strcmp(attr->attr.name, "readahead_count_size"))) {
+                       rc = -EINVAL;
+                       goto out;
+               }
 
-               if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
+               if (!strcmp(attr->attr.name, "perf_history_size")) {
                        if (val > 0) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
@@ -1096,7 +555,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(orangefs_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "perf_time_interval_secs")) {
                        if (val > 0) {
                                new_op->upcall.req.param.op =
@@ -1105,7 +564,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(orangefs_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "perf_counter_reset")) {
                        if ((val == 0) || (val == 1)) {
                                new_op->upcall.req.param.op =
@@ -1114,12 +573,55 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
+               } else if (!strcmp(attr->attr.name,
+                                  "readahead_count")) {
+                       if ((val >= 0)) {
+                               new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+                       } else {
+                               rc = 0;
+                               goto out;
+                       }
+               } else if (!strcmp(attr->attr.name,
+                                  "readahead_size")) {
+                       if ((val >= 0)) {
+                               new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
+                       } else {
+                               rc = 0;
+                               goto out;
+                       }
+               } else if (!strcmp(attr->attr.name,
+                                  "readahead_count_size")) {
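+                       /* Two integers ("count size") are expected in buf. */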
+                       int val1, val2;
+                       rc = sscanf(buf, "%d %d", &val1, &val2);
+                       if (rc < 2) {
+                               rc = 0;
+                               goto out;
+                       }
+                       if ((val1 >= 0) && (val2 >= 0)) {
+                               new_op->upcall.req.param.op =
+                               ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+                       } else {
+                               rc = 0;
+                               goto out;
+                       }
+                       new_op->upcall.req.param.u.value32[0] = val1;
+                       new_op->upcall.req.param.u.value32[1] = val2;
+                       goto value_set;
                }
 
-       } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
-               acache_attr = (struct acache_orangefs_attribute *)attr;
-
-               if (!strcmp(acache_attr->attr.name, "hard_limit")) {
+       } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "hard_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
@@ -1127,7 +629,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
+               } else if (!strcmp(attr->attr.name, "soft_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
@@ -1135,7 +637,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(acache_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "reclaim_percentage")) {
                        if ((val > -1) && (val < 101)) {
                                new_op->upcall.req.param.op =
@@ -1144,7 +646,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
+               } else if (!strcmp(attr->attr.name, "timeout_msecs")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
@@ -1154,10 +656,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                        }
                }
 
-       } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
-               capcache_attr = (struct capcache_orangefs_attribute *)attr;
-
-               if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
+       } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "hard_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
@@ -1165,7 +665,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
+               } else if (!strcmp(attr->attr.name, "soft_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
@@ -1173,7 +673,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(capcache_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "reclaim_percentage")) {
                        if ((val > -1) && (val < 101)) {
                                new_op->upcall.req.param.op =
@@ -1182,7 +682,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
+               } else if (!strcmp(attr->attr.name, "timeout_secs")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
@@ -1192,10 +692,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                        }
                }
 
-       } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
-               ccache_attr = (struct ccache_orangefs_attribute *)attr;
-
-               if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
+       } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "hard_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
@@ -1203,7 +701,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
+               } else if (!strcmp(attr->attr.name, "soft_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
@@ -1211,7 +709,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ccache_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "reclaim_percentage")) {
                        if ((val > -1) && (val < 101)) {
                                new_op->upcall.req.param.op =
@@ -1220,7 +718,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
+               } else if (!strcmp(attr->attr.name, "timeout_secs")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
@@ -1230,10 +728,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                        }
                }
 
-       } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
-               ncache_attr = (struct ncache_orangefs_attribute *)attr;
-
-               if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
+       } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) {
+               if (!strcmp(attr->attr.name, "hard_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
@@ -1241,7 +737,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
+               } else if (!strcmp(attr->attr.name, "soft_limit")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
@@ -1249,7 +745,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ncache_attr->attr.name,
+               } else if (!strcmp(attr->attr.name,
                                   "reclaim_percentage")) {
                        if ((val > -1) && (val < 101)) {
                                new_op->upcall.req.param.op =
@@ -1258,7 +754,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                                rc = 0;
                                goto out;
                        }
-               } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
+               } else if (!strcmp(attr->attr.name, "timeout_msecs")) {
                        if (val > -1) {
                                new_op->upcall.req.param.op =
                                  ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
@@ -1270,14 +766,13 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
 
        } else {
                gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
-                          kobj_id);
+                          kobj->name);
                rc = -EINVAL;
                goto out;
        }
 
-       new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
-
-       new_op->upcall.req.param.value = val;
+       new_op->upcall.req.param.u.value64 = val;
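+       /* The readahead_count_size case fills value32[] itself and jumps here. */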
+value_set:
 
        /*
         * The service_operation will return a errno return code on
@@ -1290,7 +785,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
                        rc);
                rc = 0;
        } else {
-               rc = 1;
+               rc = count;
        }
 
 out:
@@ -1302,127 +797,56 @@ out:
        return rc;
 }
 
-static ssize_t
-       service_orangefs_store(struct orangefs_obj *orangefs_obj,
-                              struct orangefs_attribute *attr,
-                              const char *buf,
-                              size_t count)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
-
-       /* rc should have an errno value if the service_op went bad. */
-       if (rc == 1)
-               rc = count;
-
-       return rc;
-}
-
-static ssize_t
-       service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
-                            struct acache_orangefs_attribute *attr,
-                            const char *buf,
-                            size_t count)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
-
-       /* rc should have an errno value if the service_op went bad. */
-       if (rc == 1)
-               rc = count;
-
-       return rc;
-}
-
-static ssize_t
-       service_capcache_store(struct capcache_orangefs_obj
-                               *capcache_orangefs_obj,
-                              struct capcache_orangefs_attribute *attr,
-                              const char *buf,
-                              size_t count)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
-
-       /* rc should have an errno value if the service_op went bad. */
-       if (rc == 1)
-               rc = count;
-
-       return rc;
-}
-
-static ssize_t service_ccache_store(struct ccache_orangefs_obj
-                                       *ccache_orangefs_obj,
-                                   struct ccache_orangefs_attribute *attr,
-                                   const char *buf,
-                                   size_t count)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
-
-       /* rc should have an errno value if the service_op went bad. */
-       if (rc == 1)
-               rc = count;
-
-       return rc;
-}
-
-static ssize_t
-       service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
-                            struct ncache_orangefs_attribute *attr,
-                            const char *buf,
-                            size_t count)
-{
-       int rc = 0;
-
-       rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
-
-       /* rc should have an errno value if the service_op went bad. */
-       if (rc == 1)
-               rc = count;
-
-       return rc;
-}
-
 static struct orangefs_attribute op_timeout_secs_attribute =
-       __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
+       __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute slot_timeout_secs_attribute =
-       __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+       __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute dcache_timeout_msecs_attribute =
-       __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+       __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
 
 static struct orangefs_attribute getattr_timeout_msecs_attribute =
-       __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+       __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
+static struct orangefs_attribute readahead_count_attribute =
+       __ATTR(readahead_count, 0664, sysfs_service_op_show,
+              sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_size_attribute =
+       __ATTR(readahead_size, 0664, sysfs_service_op_show,
+              sysfs_service_op_store);
+
+static struct orangefs_attribute readahead_count_size_attribute =
+       __ATTR(readahead_count_size, 0664, sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct orangefs_attribute perf_counter_reset_attribute =
        __ATTR(perf_counter_reset,
               0664,
-              service_orangefs_show,
-              service_orangefs_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct orangefs_attribute perf_history_size_attribute =
        __ATTR(perf_history_size,
               0664,
-              service_orangefs_show,
-              service_orangefs_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct orangefs_attribute perf_time_interval_secs_attribute =
        __ATTR(perf_time_interval_secs,
               0664,
-              service_orangefs_show,
-              service_orangefs_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct attribute *orangefs_default_attrs[] = {
        &op_timeout_secs_attribute.attr,
        &slot_timeout_secs_attribute.attr,
        &dcache_timeout_msecs_attribute.attr,
        &getattr_timeout_msecs_attribute.attr,
+       &readahead_count_attribute.attr,
+       &readahead_size_attribute.attr,
+       &readahead_count_size_attribute.attr,
        &perf_counter_reset_attribute.attr,
        &perf_history_size_attribute.attr,
        &perf_time_interval_secs_attribute.attr,
@@ -1431,33 +855,32 @@ static struct attribute *orangefs_default_attrs[] = {
 
 static struct kobj_type orangefs_ktype = {
        .sysfs_ops = &orangefs_sysfs_ops,
-       .release = orangefs_release,
        .default_attrs = orangefs_default_attrs,
 };
 
-static struct acache_orangefs_attribute acache_hard_limit_attribute =
+static struct orangefs_attribute acache_hard_limit_attribute =
        __ATTR(hard_limit,
               0664,
-              service_acache_show,
-              service_acache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
+static struct orangefs_attribute acache_reclaim_percent_attribute =
        __ATTR(reclaim_percentage,
               0664,
-              service_acache_show,
-              service_acache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_soft_limit_attribute =
+static struct orangefs_attribute acache_soft_limit_attribute =
        __ATTR(soft_limit,
               0664,
-              service_acache_show,
-              service_acache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
+static struct orangefs_attribute acache_timeout_msecs_attribute =
        __ATTR(timeout_msecs,
               0664,
-              service_acache_show,
-              service_acache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct attribute *acache_orangefs_default_attrs[] = {
        &acache_hard_limit_attribute.attr,
@@ -1468,34 +891,33 @@ static struct attribute *acache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type acache_orangefs_ktype = {
-       .sysfs_ops = &acache_orangefs_sysfs_ops,
-       .release = acache_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = acache_orangefs_default_attrs,
 };
 
-static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
+static struct orangefs_attribute capcache_hard_limit_attribute =
        __ATTR(hard_limit,
               0664,
-              service_capcache_show,
-              service_capcache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
+static struct orangefs_attribute capcache_reclaim_percent_attribute =
        __ATTR(reclaim_percentage,
               0664,
-              service_capcache_show,
-              service_capcache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
+static struct orangefs_attribute capcache_soft_limit_attribute =
        __ATTR(soft_limit,
               0664,
-              service_capcache_show,
-              service_capcache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
+static struct orangefs_attribute capcache_timeout_secs_attribute =
        __ATTR(timeout_secs,
               0664,
-              service_capcache_show,
-              service_capcache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct attribute *capcache_orangefs_default_attrs[] = {
        &capcache_hard_limit_attribute.attr,
@@ -1506,34 +928,33 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type capcache_orangefs_ktype = {
-       .sysfs_ops = &capcache_orangefs_sysfs_ops,
-       .release = capcache_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = capcache_orangefs_default_attrs,
 };
 
-static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
+static struct orangefs_attribute ccache_hard_limit_attribute =
        __ATTR(hard_limit,
               0664,
-              service_ccache_show,
-              service_ccache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
+static struct orangefs_attribute ccache_reclaim_percent_attribute =
        __ATTR(reclaim_percentage,
               0664,
-              service_ccache_show,
-              service_ccache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
+static struct orangefs_attribute ccache_soft_limit_attribute =
        __ATTR(soft_limit,
               0664,
-              service_ccache_show,
-              service_ccache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
+static struct orangefs_attribute ccache_timeout_secs_attribute =
        __ATTR(timeout_secs,
               0664,
-              service_ccache_show,
-              service_ccache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct attribute *ccache_orangefs_default_attrs[] = {
        &ccache_hard_limit_attribute.attr,
@@ -1544,34 +965,33 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type ccache_orangefs_ktype = {
-       .sysfs_ops = &ccache_orangefs_sysfs_ops,
-       .release = ccache_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = ccache_orangefs_default_attrs,
 };
 
-static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
+static struct orangefs_attribute ncache_hard_limit_attribute =
        __ATTR(hard_limit,
               0664,
-              service_ncache_show,
-              service_ncache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
+static struct orangefs_attribute ncache_reclaim_percent_attribute =
        __ATTR(reclaim_percentage,
               0664,
-              service_ncache_show,
-              service_ncache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
+static struct orangefs_attribute ncache_soft_limit_attribute =
        __ATTR(soft_limit,
               0664,
-              service_ncache_show,
-              service_ncache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
-static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
+static struct orangefs_attribute ncache_timeout_msecs_attribute =
        __ATTR(timeout_msecs,
               0664,
-              service_ncache_show,
-              service_ncache_store);
+              sysfs_service_op_show,
+              sysfs_service_op_store);
 
 static struct attribute *ncache_orangefs_default_attrs[] = {
        &ncache_hard_limit_attribute.attr,
@@ -1582,27 +1002,26 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type ncache_orangefs_ktype = {
-       .sysfs_ops = &ncache_orangefs_sysfs_ops,
-       .release = ncache_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = ncache_orangefs_default_attrs,
 };
 
-static struct pc_orangefs_attribute pc_acache_attribute =
+static struct orangefs_attribute pc_acache_attribute =
        __ATTR(acache,
               0664,
-              service_pc_show,
+              sysfs_service_op_show,
               NULL);
 
-static struct pc_orangefs_attribute pc_capcache_attribute =
+static struct orangefs_attribute pc_capcache_attribute =
        __ATTR(capcache,
               0664,
-              service_pc_show,
+              sysfs_service_op_show,
               NULL);
 
-static struct pc_orangefs_attribute pc_ncache_attribute =
+static struct orangefs_attribute pc_ncache_attribute =
        __ATTR(ncache,
               0664,
-              service_pc_show,
+              sysfs_service_op_show,
               NULL);
 
 static struct attribute *pc_orangefs_default_attrs[] = {
@@ -1613,21 +1032,20 @@ static struct attribute *pc_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type pc_orangefs_ktype = {
-       .sysfs_ops = &pc_orangefs_sysfs_ops,
-       .release = pc_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = pc_orangefs_default_attrs,
 };
 
-static struct stats_orangefs_attribute stats_reads_attribute =
+static struct orangefs_attribute stats_reads_attribute =
        __ATTR(reads,
               0664,
-              int_stats_show,
+              sysfs_int_show,
               NULL);
 
-static struct stats_orangefs_attribute stats_writes_attribute =
+static struct orangefs_attribute stats_writes_attribute =
        __ATTR(writes,
               0664,
-              int_stats_show,
+              sysfs_int_show,
               NULL);
 
 static struct attribute *stats_orangefs_default_attrs[] = {
@@ -1637,18 +1055,17 @@ static struct attribute *stats_orangefs_default_attrs[] = {
 };
 
 static struct kobj_type stats_orangefs_ktype = {
-       .sysfs_ops = &stats_orangefs_sysfs_ops,
-       .release = stats_orangefs_release,
+       .sysfs_ops = &orangefs_sysfs_ops,
        .default_attrs = stats_orangefs_default_attrs,
 };
 
-static struct orangefs_obj *orangefs_obj;
-static struct acache_orangefs_obj *acache_orangefs_obj;
-static struct capcache_orangefs_obj *capcache_orangefs_obj;
-static struct ccache_orangefs_obj *ccache_orangefs_obj;
-static struct ncache_orangefs_obj *ncache_orangefs_obj;
-static struct pc_orangefs_obj *pc_orangefs_obj;
-static struct stats_orangefs_obj *stats_orangefs_obj;
+static struct kobject *orangefs_obj;
+static struct kobject *acache_orangefs_obj;
+static struct kobject *capcache_orangefs_obj;
+static struct kobject *ccache_orangefs_obj;
+static struct kobject *ncache_orangefs_obj;
+static struct kobject *pc_orangefs_obj;
+static struct kobject *stats_orangefs_obj;
 
 int orangefs_sysfs_init(void)
 {
@@ -1661,7 +1078,7 @@ int orangefs_sysfs_init(void)
        if (!orangefs_obj)
                goto out;
 
-       rc = kobject_init_and_add(&orangefs_obj->kobj,
+       rc = kobject_init_and_add(orangefs_obj,
                                  &orangefs_ktype,
                                  fs_kobj,
                                  ORANGEFS_KOBJ_ID);
@@ -1669,7 +1086,7 @@ int orangefs_sysfs_init(void)
        if (rc)
                goto ofs_obj_bail;
 
-       kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/acache. */
        acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
@@ -1678,15 +1095,15 @@ int orangefs_sysfs_init(void)
                goto ofs_obj_bail;
        }
 
-       rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
+       rc = kobject_init_and_add(acache_orangefs_obj,
                                  &acache_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  ACACHE_KOBJ_ID);
 
        if (rc)
                goto acache_obj_bail;
 
-       kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(acache_orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/capcache. */
        capcache_orangefs_obj =
@@ -1696,14 +1113,14 @@ int orangefs_sysfs_init(void)
                goto acache_obj_bail;
        }
 
-       rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
+       rc = kobject_init_and_add(capcache_orangefs_obj,
                                  &capcache_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  CAPCACHE_KOBJ_ID);
        if (rc)
                goto capcache_obj_bail;
 
-       kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(capcache_orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/ccache. */
        ccache_orangefs_obj =
@@ -1713,14 +1130,14 @@ int orangefs_sysfs_init(void)
                goto capcache_obj_bail;
        }
 
-       rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
+       rc = kobject_init_and_add(ccache_orangefs_obj,
                                  &ccache_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  CCACHE_KOBJ_ID);
        if (rc)
                goto ccache_obj_bail;
 
-       kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(ccache_orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/ncache. */
        ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
@@ -1729,15 +1146,15 @@ int orangefs_sysfs_init(void)
                goto ccache_obj_bail;
        }
 
-       rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
+       rc = kobject_init_and_add(ncache_orangefs_obj,
                                  &ncache_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  NCACHE_KOBJ_ID);
 
        if (rc)
                goto ncache_obj_bail;
 
-       kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(ncache_orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/perf_counters. */
        pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
@@ -1746,15 +1163,15 @@ int orangefs_sysfs_init(void)
                goto ncache_obj_bail;
        }
 
-       rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
+       rc = kobject_init_and_add(pc_orangefs_obj,
                                  &pc_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  "perf_counters");
 
        if (rc)
                goto pc_obj_bail;
 
-       kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(pc_orangefs_obj, KOBJ_ADD);
 
        /* create /sys/fs/orangefs/stats. */
        stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
@@ -1763,37 +1180,31 @@ int orangefs_sysfs_init(void)
                goto pc_obj_bail;
        }
 
-       rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
+       rc = kobject_init_and_add(stats_orangefs_obj,
                                  &stats_orangefs_ktype,
-                                 &orangefs_obj->kobj,
+                                 orangefs_obj,
                                  STATS_KOBJ_ID);
 
        if (rc)
                goto stats_obj_bail;
 
-       kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
+       kobject_uevent(stats_orangefs_obj, KOBJ_ADD);
        goto out;
 
 stats_obj_bail:
-               kobject_put(&stats_orangefs_obj->kobj);
-
+               kobject_put(stats_orangefs_obj);
 pc_obj_bail:
-               kobject_put(&pc_orangefs_obj->kobj);
-
+               kobject_put(pc_orangefs_obj);
 ncache_obj_bail:
-               kobject_put(&ncache_orangefs_obj->kobj);
-
+               kobject_put(ncache_orangefs_obj);
 ccache_obj_bail:
-               kobject_put(&ccache_orangefs_obj->kobj);
-
+               kobject_put(ccache_orangefs_obj);
 capcache_obj_bail:
-               kobject_put(&capcache_orangefs_obj->kobj);
-
+               kobject_put(capcache_orangefs_obj);
 acache_obj_bail:
-               kobject_put(&acache_orangefs_obj->kobj);
-
+               kobject_put(acache_orangefs_obj);
 ofs_obj_bail:
-               kobject_put(&orangefs_obj->kobj);
+               kobject_put(orangefs_obj);
 out:
        return rc;
 }
@@ -1801,13 +1212,11 @@ out:
 void orangefs_sysfs_exit(void)
 {
        gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
-
-       kobject_put(&acache_orangefs_obj->kobj);
-       kobject_put(&capcache_orangefs_obj->kobj);
-       kobject_put(&ccache_orangefs_obj->kobj);
-       kobject_put(&ncache_orangefs_obj->kobj);
-       kobject_put(&pc_orangefs_obj->kobj);
-       kobject_put(&stats_orangefs_obj->kobj);
-
-       kobject_put(&orangefs_obj->kobj);
+       kobject_put(acache_orangefs_obj);
+       kobject_put(capcache_orangefs_obj);
+       kobject_put(ccache_orangefs_obj);
+       kobject_put(ncache_orangefs_obj);
+       kobject_put(pc_orangefs_obj);
+       kobject_put(stats_orangefs_obj);
+       kobject_put(orangefs_obj);
 }
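
The sysfs rework above drops the per-directory wrapper structs and keeps bare struct kobject pointers, so setup and teardown reduce to the stock kobject calls. A minimal sketch of that lifecycle, using hypothetical example_* names and a caller-supplied parent (not part of the patch; the exact release semantics depend on the ktype actually used):

#include <linux/kobject.h>
#include <linux/slab.h>

static struct kobject *example_obj;

static int example_sysfs_init(struct kobject *parent, struct kobj_type *ktype)
{
	int rc;

	example_obj = kzalloc(sizeof(*example_obj), GFP_KERNEL);
	if (!example_obj)
		return -ENOMEM;

	/* Creates /sys/.../example and hooks up the ktype's sysfs_ops. */
	rc = kobject_init_and_add(example_obj, ktype, parent, "example");
	if (rc) {
		/* Drop the reference; the ktype's release frees the object. */
		kobject_put(example_obj);
		return rc;
	}

	kobject_uevent(example_obj, KOBJ_ADD);
	return 0;
}

static void example_sysfs_exit(void)
{
	kobject_put(example_obj);
}
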
index d13c729..06af81f 100644 (file)
@@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op)
                case ORANGEFS_VFS_OP_TRUNCATE:
                        fsid = op->upcall.req.truncate.refn.fs_id;
                        break;
-               case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+               case ORANGEFS_VFS_OP_RA_FLUSH:
                        fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
                        break;
                case ORANGEFS_VFS_OP_FS_UMOUNT:
@@ -347,7 +347,8 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
        inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
            orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
 
-       orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
+       orangefs_inode->getattr_time = jiffies +
+           orangefs_getattr_timeout_msecs*HZ/1000;
        ret = 0;
 out:
        op_release(new_op);
@@ -656,401 +657,3 @@ __s32 ORANGEFS_util_translate_mode(int mode)
        return ret;
 }
 #undef NUM_MODES
-
-/*
- * After obtaining a string representation of the client's debug
- * keywords and their associated masks, this function is called to build an
- * array of these values.
- */
-int orangefs_prepare_cdm_array(char *debug_array_string)
-{
-       int i;
-       int rc = -EINVAL;
-       char *cds_head = NULL;
-       char *cds_delimiter = NULL;
-       int keyword_len = 0;
-
-       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-       /*
-        * figure out how many elements the cdm_array needs.
-        */
-       for (i = 0; i < strlen(debug_array_string); i++)
-               if (debug_array_string[i] == '\n')
-                       cdm_element_count++;
-
-       if (!cdm_element_count) {
-               pr_info("No elements in client debug array string!\n");
-               goto out;
-       }
-
-       cdm_array =
-               kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
-                       GFP_KERNEL);
-       if (!cdm_array) {
-               pr_info("malloc failed for cdm_array!\n");
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       cds_head = debug_array_string;
-
-       for (i = 0; i < cdm_element_count; i++) {
-               cds_delimiter = strchr(cds_head, '\n');
-               *cds_delimiter = '\0';
-
-               keyword_len = strcspn(cds_head, " ");
-
-               cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
-               if (!cdm_array[i].keyword) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-
-               sscanf(cds_head,
-                      "%s %llx %llx",
-                      cdm_array[i].keyword,
-                      (unsigned long long *)&(cdm_array[i].mask1),
-                      (unsigned long long *)&(cdm_array[i].mask2));
-
-               if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
-                       client_verbose_index = i;
-
-               if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
-                       client_all_index = i;
-
-               cds_head = cds_delimiter + 1;
-       }
-
-       rc = cdm_element_count;
-
-       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
-
-out:
-
-       return rc;
-
-}
-
-/*
- * /sys/kernel/debug/orangefs/debug-help can be catted to
- * see all the available kernel and client debug keywords.
- *
- * When the kernel boots, we have no idea what keywords the
- * client supports, nor their associated masks.
- *
- * We pass through this function once at boot and stamp a
- * boilerplate "we don't know" message for the client in the
- * debug-help file. We pass through here again when the client
- * starts and then we can fill out the debug-help file fully.
- *
- * The client might be restarted any number of times between
- * reboots, we only build the debug-help file the first time.
- */
-int orangefs_prepare_debugfs_help_string(int at_boot)
-{
-       int rc = -EINVAL;
-       int i;
-       int byte_count = 0;
-       char *client_title = "Client Debug Keywords:\n";
-       char *kernel_title = "Kernel Debug Keywords:\n";
-
-       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-       if (at_boot) {
-               byte_count += strlen(HELP_STRING_UNINITIALIZED);
-               client_title = HELP_STRING_UNINITIALIZED;
-       } else {
-               /*
-                * fill the client keyword/mask array and remember
-                * how many elements there were.
-                */
-               cdm_element_count =
-                       orangefs_prepare_cdm_array(client_debug_array_string);
-               if (cdm_element_count <= 0)
-                       goto out;
-
-               /* Count the bytes destined for debug_help_string. */
-               byte_count += strlen(client_title);
-
-               for (i = 0; i < cdm_element_count; i++) {
-                       byte_count += strlen(cdm_array[i].keyword + 2);
-                       if (byte_count >= DEBUG_HELP_STRING_SIZE) {
-                               pr_info("%s: overflow 1!\n", __func__);
-                               goto out;
-                       }
-               }
-
-               gossip_debug(GOSSIP_UTILS_DEBUG,
-                            "%s: cdm_element_count:%d:\n",
-                            __func__,
-                            cdm_element_count);
-       }
-
-       byte_count += strlen(kernel_title);
-       for (i = 0; i < num_kmod_keyword_mask_map; i++) {
-               byte_count +=
-                       strlen(s_kmod_keyword_mask_map[i].keyword + 2);
-               if (byte_count >= DEBUG_HELP_STRING_SIZE) {
-                       pr_info("%s: overflow 2!\n", __func__);
-                       goto out;
-               }
-       }
-
-       /* build debug_help_string. */
-       debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
-       if (!debug_help_string) {
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       strcat(debug_help_string, client_title);
-
-       if (!at_boot) {
-               for (i = 0; i < cdm_element_count; i++) {
-                       strcat(debug_help_string, "\t");
-                       strcat(debug_help_string, cdm_array[i].keyword);
-                       strcat(debug_help_string, "\n");
-               }
-       }
-
-       strcat(debug_help_string, "\n");
-       strcat(debug_help_string, kernel_title);
-
-       for (i = 0; i < num_kmod_keyword_mask_map; i++) {
-               strcat(debug_help_string, "\t");
-               strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
-               strcat(debug_help_string, "\n");
-       }
-
-       rc = 0;
-
-out:
-
-       return rc;
-
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_mask_to_string(void *mask, int type)
-{
-       int i;
-       int len = 0;
-       char *debug_string;
-       int element_count = 0;
-
-       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-       if (type) {
-               debug_string = client_debug_string;
-               element_count = cdm_element_count;
-       } else {
-               debug_string = kernel_debug_string;
-               element_count = num_kmod_keyword_mask_map;
-       }
-
-       memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
-
-       /*
-        * Some keywords, like "all" or "verbose", are amalgams of
-        * numerous other keywords. Make a special check for those
-        * before grinding through the whole mask only to find out
-        * later...
-        */
-       if (check_amalgam_keyword(mask, type))
-               goto out;
-
-       /* Build the debug string. */
-       for (i = 0; i < element_count; i++)
-               if (type)
-                       do_c_string(mask, i);
-               else
-                       do_k_string(mask, i);
-
-       len = strlen(debug_string);
-
-       if ((len) && (type))
-               client_debug_string[len - 1] = '\0';
-       else if (len)
-               kernel_debug_string[len - 1] = '\0';
-       else if (type)
-               strcpy(client_debug_string, "none");
-       else
-               strcpy(kernel_debug_string, "none");
-
-out:
-gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
-
-       return;
-
-}
-
-void do_k_string(void *k_mask, int index)
-{
-       __u64 *mask = (__u64 *) k_mask;
-
-       if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
-               goto out;
-
-       if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
-               if ((strlen(kernel_debug_string) +
-                    strlen(s_kmod_keyword_mask_map[index].keyword))
-                       < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
-                               strcat(kernel_debug_string,
-                                      s_kmod_keyword_mask_map[index].keyword);
-                               strcat(kernel_debug_string, ",");
-                       } else {
-                               gossip_err("%s: overflow!\n", __func__);
-                               strcpy(kernel_debug_string, ORANGEFS_ALL);
-                               goto out;
-                       }
-       }
-
-out:
-
-       return;
-}
-
-void do_c_string(void *c_mask, int index)
-{
-       struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
-
-       if (keyword_is_amalgam(cdm_array[index].keyword))
-               goto out;
-
-       if ((mask->mask1 & cdm_array[index].mask1) ||
-           (mask->mask2 & cdm_array[index].mask2)) {
-               if ((strlen(client_debug_string) +
-                    strlen(cdm_array[index].keyword) + 1)
-                       < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
-                               strcat(client_debug_string,
-                                      cdm_array[index].keyword);
-                               strcat(client_debug_string, ",");
-                       } else {
-                               gossip_err("%s: overflow!\n", __func__);
-                               strcpy(client_debug_string, ORANGEFS_ALL);
-                               goto out;
-                       }
-       }
-out:
-       return;
-}
-
-int keyword_is_amalgam(char *keyword)
-{
-       int rc = 0;
-
-       if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
-               rc = 1;
-
-       return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- *
- * return 1 if we found an amalgam.
- */
-int check_amalgam_keyword(void *mask, int type)
-{
-       __u64 *k_mask;
-       struct client_debug_mask *c_mask;
-       int k_all_index = num_kmod_keyword_mask_map - 1;
-       int rc = 0;
-
-       if (type) {
-               c_mask = (struct client_debug_mask *) mask;
-
-               if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
-                   (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
-                       strcpy(client_debug_string, ORANGEFS_ALL);
-                       rc = 1;
-                       goto out;
-               }
-
-               if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
-                   (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
-                       strcpy(client_debug_string, ORANGEFS_VERBOSE);
-                       rc = 1;
-                       goto out;
-               }
-
-       } else {
-               k_mask = (__u64 *) mask;
-
-               if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
-                       strcpy(kernel_debug_string, ORANGEFS_ALL);
-                       rc = 1;
-                       goto out;
-               }
-       }
-
-out:
-
-       return rc;
-}
-
-/*
- * kernel = type 0
- * client = type 1
- */
-void debug_string_to_mask(char *debug_string, void *mask, int type)
-{
-       char *unchecked_keyword;
-       int i;
-       char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
-       char *original_pointer;
-       int element_count = 0;
-       struct client_debug_mask *c_mask;
-       __u64 *k_mask;
-
-       gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
-
-       if (type) {
-               c_mask = (struct client_debug_mask *)mask;
-               element_count = cdm_element_count;
-       } else {
-               k_mask = (__u64 *)mask;
-               *k_mask = 0;
-               element_count = num_kmod_keyword_mask_map;
-       }
-
-       original_pointer = strsep_fodder;
-       while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
-               if (strlen(unchecked_keyword)) {
-                       for (i = 0; i < element_count; i++)
-                               if (type)
-                                       do_c_mask(i,
-                                                 unchecked_keyword,
-                                                 &c_mask);
-                               else
-                                       do_k_mask(i,
-                                                 unchecked_keyword,
-                                                 &k_mask);
-               }
-
-       kfree(original_pointer);
-}
-
-void do_c_mask(int i,
-              char *unchecked_keyword,
-              struct client_debug_mask **sane_mask)
-{
-
-       if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
-               (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
-               (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
-       }
-}
-
-void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
-{
-
-       if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
-               **sane_mask = (**sane_mask) |
-                               s_kmod_keyword_mask_map[i].mask_val;
-}
index 3d7418c..971307a 100644 (file)
@@ -4,26 +4,6 @@
 #include <linux/slab.h>
 #include <linux/ioctl.h>
 
-extern struct client_debug_mask *cdm_array;
-extern char *debug_help_string;
-extern int help_string_initialized;
-extern struct dentry *debug_dir;
-extern struct dentry *help_file_dentry;
-extern struct dentry *client_debug_dentry;
-extern const struct file_operations debug_help_fops;
-extern int client_all_index;
-extern int client_verbose_index;
-extern int cdm_element_count;
-#define DEBUG_HELP_STRING_SIZE 4096
-#define HELP_STRING_UNINITIALIZED \
-       "Client Debug Keywords are unknown until the first time\n" \
-       "the client is started after boot.\n"
-#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
-#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
-#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
-#define ORANGEFS_VERBOSE "verbose"
-#define ORANGEFS_ALL "all"
-
 /* pvfs2-config.h ***********************************************************/
 #define ORANGEFS_VERSION_MAJOR 2
 #define ORANGEFS_VERSION_MINOR 9
@@ -426,13 +406,12 @@ do {                                                                      \
                printk(KERN_DEBUG fmt, ##__VA_ARGS__);                  \
 } while (0)
 #else
-extern __u64 gossip_debug_mask;
-extern struct client_debug_mask client_debug_mask;
+extern __u64 orangefs_gossip_debug_mask;
 
 /* try to avoid function call overhead by checking masks in macro */
 #define gossip_debug(mask, fmt, ...)                                   \
 do {                                                                   \
-       if (gossip_debug_mask & (mask))                                 \
+       if (orangefs_gossip_debug_mask & (mask))                        \
                printk(KERN_DEBUG fmt, ##__VA_ARGS__);                  \
 } while (0)
 #endif /* GOSSIP_DISABLE_DEBUG */
index b9da9a0..c48859f 100644 (file)
@@ -33,6 +33,7 @@ static const match_table_t tokens = {
        { Opt_err,      NULL }
 };
 
+uint64_t orangefs_features;
 
 static int parse_mount_options(struct super_block *sb, char *options,
                int silent)
@@ -249,6 +250,19 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
        }
 
        op_release(new_op);
+
+       if (orangefs_userspace_version >= 20906) {
+               new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+               if (!new_op)
+                       return -ENOMEM;
+               new_op->upcall.req.features.features = 0;
+               ret = service_operation(new_op, "orangefs_features", 0);
+               orangefs_features = new_op->downcall.resp.features.features;
+               op_release(new_op);
+       } else {
+               orangefs_features = 0;
+       }
+
        return ret;
 }
 
@@ -492,6 +506,19 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
        list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
        spin_unlock(&orangefs_superblocks_lock);
        op_release(new_op);
+
+       if (orangefs_userspace_version >= 20906) {
+               new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
+               if (!new_op)
+                       return ERR_PTR(-ENOMEM);
+               new_op->upcall.req.features.features = 0;
+               ret = service_operation(new_op, "orangefs_features", 0);
+               orangefs_features = new_op->downcall.resp.features.features;
+               op_release(new_op);
+       } else {
+               orangefs_features = 0;
+       }
+
        return dget(sb->s_root);
 
 free_op:
@@ -530,8 +557,8 @@ void orangefs_kill_sb(struct super_block *sb)
         * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
         * gets completed before we free the dang thing.
         */
-       mutex_lock(&request_mutex);
-       mutex_unlock(&request_mutex);
+       mutex_lock(&orangefs_request_mutex);
+       mutex_unlock(&orangefs_request_mutex);
 
        /* free the orangefs superblock private data */
        kfree(ORANGEFS_SB(sb));
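
Both the mount and remount paths above only issue the new ORANGEFS_VFS_OP_FEATURES upcall when the userspace daemon reports version 20906 (2.9.6) or newer, and otherwise treat the feature mask as empty. The gate itself, reduced to a standalone C helper with stand-in names:

#include <stdint.h>

#define FEATURES_MIN_VERSION 20906	/* first userspace that understands the op */

/* Query the server's feature bits only if it is new enough to answer;
 * older servers implicitly advertise no optional features. */
static uint64_t negotiate_features(uint32_t userspace_version,
				   uint64_t (*query_features)(void))
{
	if (userspace_version >= FEATURES_MIN_VERSION)
		return query_features();
	return 0;
}
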
index 001b202..af0b0e3 100644 (file)
@@ -98,7 +98,7 @@ struct orangefs_truncate_request_s {
        __s64 size;
 };
 
-struct orangefs_mmap_ra_cache_flush_request_s {
+struct orangefs_ra_cache_flush_request_s {
        struct orangefs_object_kref refn;
 };
 
@@ -179,12 +179,18 @@ enum orangefs_param_request_op {
        ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
        ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
        ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+       ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26,
+       ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27,
+       ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28,
 };
 
 struct orangefs_param_request_s {
        enum orangefs_param_request_type type;
        enum orangefs_param_request_op op;
-       __s64 value;
+       union {
+               __s64 value64;
+               __s32 value32[2];
+       } u;
        char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
 };
 
@@ -204,6 +210,11 @@ struct orangefs_fs_key_request_s {
        __s32 __pad1;
 };
 
+/* 2.9.6 */
+struct orangefs_features_request_s {
+       __u64 features;
+};
+
 struct orangefs_upcall_s {
        __s32 type;
        __u32 uid;
@@ -228,7 +239,7 @@ struct orangefs_upcall_s {
                struct orangefs_rename_request_s rename;
                struct orangefs_statfs_request_s statfs;
                struct orangefs_truncate_request_s truncate;
-               struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+               struct orangefs_ra_cache_flush_request_s ra_cache_flush;
                struct orangefs_fs_mount_request_s fs_mount;
                struct orangefs_fs_umount_request_s fs_umount;
                struct orangefs_getxattr_request_s getxattr;
@@ -240,6 +251,7 @@ struct orangefs_upcall_s {
                struct orangefs_param_request_s param;
                struct orangefs_perf_count_request_s perf_count;
                struct orangefs_fs_key_request_s fs_key;
+               struct orangefs_features_request_s features;
        } req;
 };
 
index 31635bc..abcfa3f 100644 (file)
@@ -87,9 +87,9 @@ retry_servicing:
         */
        if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
                if (flags & ORANGEFS_OP_INTERRUPTIBLE)
-                       ret = mutex_lock_interruptible(&request_mutex);
+                       ret = mutex_lock_interruptible(&orangefs_request_mutex);
                else
-                       ret = mutex_lock_killable(&request_mutex);
+                       ret = mutex_lock_killable(&orangefs_request_mutex);
                /*
                 * check to see if we were interrupted while waiting for
                 * mutex
@@ -129,7 +129,7 @@ retry_servicing:
        spin_unlock(&orangefs_request_list_lock);
 
        if (!(flags & ORANGEFS_OP_NO_MUTEX))
-               mutex_unlock(&request_mutex);
+               mutex_unlock(&orangefs_request_mutex);
 
        ret = wait_for_matching_downcall(op, timeout,
                                         flags & ORANGEFS_OP_INTERRUPTIBLE);
@@ -272,9 +272,9 @@ static void
        } else if (op_state_in_progress(op)) {
                /* op must be removed from the in progress htable */
                spin_unlock(&op->lock);
-               spin_lock(&htable_ops_in_progress_lock);
+               spin_lock(&orangefs_htable_ops_in_progress_lock);
                list_del_init(&op->list);
-               spin_unlock(&htable_ops_in_progress_lock);
+               spin_unlock(&orangefs_htable_ops_in_progress_lock);
                gossip_debug(GOSSIP_WAIT_DEBUG,
                             "Interrupted: Removed op %p"
                             " from htable_ops_in_progress\n",
index 9989970..234a9ac 100644 (file)
@@ -259,7 +259,7 @@ static int propagate_one(struct mount *m)
                read_sequnlock_excl(&mount_lock);
        }
        hlist_add_head(&child->mnt_hash, list);
-       return 0;
+       return count_mounts(m->mnt_ns, child);
 }
 
 /*
index 0fcdbe7..550f5a8 100644 (file)
@@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
                         const struct path *root);
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
 #endif /* _LINUX_PNODE_H */
index 2ed3d71..71025b9 100644 (file)
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
 
 static void drop_sysctl_table(struct ctl_table_header *header);
 static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces);
+       struct ctl_table **pentry);
 static int insert_links(struct ctl_table_header *head);
 static void put_links(struct ctl_table_header *header);
 
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
 }
 
 static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
 {
        struct ctl_table_set *set = &root->default_set;
        if (root->lookup)
-               set = root->lookup(root, namespaces);
+               set = root->lookup(root);
        return set;
 }
 
@@ -496,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
 
        if (S_ISLNK(p->mode)) {
-               ret = sysctl_follow_link(&h, &p, current->nsproxy);
+               ret = sysctl_follow_link(&h, &p);
                err = ERR_PTR(ret);
                if (ret)
                        goto out;
@@ -664,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
 
        if (S_ISLNK(table->mode)) {
                /* It is not an error if we can not follow the link ignore it */
-               int err = sysctl_follow_link(&head, &table, current->nsproxy);
+               int err = sysctl_follow_link(&head, &table);
                if (err)
                        goto out;
        }
@@ -981,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
 }
 
 static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces)
+       struct ctl_table **pentry)
 {
        struct ctl_table_header *head;
        struct ctl_table_root *root;
@@ -993,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
        ret = 0;
        spin_lock(&sysctl_lock);
        root = (*pentry)->data;
-       set = lookup_header_set(root, namespaces);
+       set = lookup_header_set(root);
        dir = xlate_dir(set, (*phead)->parent);
        if (IS_ERR(dir))
                ret = PTR_ERR(dir);
index 16ecca5..14984d9 100644 (file)
@@ -623,6 +623,40 @@ static int pstore_write_compat(enum pstore_type_id type,
                             size, psi);
 }
 
+static int pstore_write_buf_user_compat(enum pstore_type_id type,
+                              enum kmsg_dump_reason reason,
+                              u64 *id, unsigned int part,
+                              const char __user *buf,
+                              bool compressed, size_t size,
+                              struct pstore_info *psi)
+{
+       unsigned long flags = 0;
+       size_t i, bufsize = size;
+       long ret = 0;
+
+       if (unlikely(!access_ok(VERIFY_READ, buf, size)))
+               return -EFAULT;
+       if (bufsize > psinfo->bufsize)
+               bufsize = psinfo->bufsize;
+       spin_lock_irqsave(&psinfo->buf_lock, flags);
+       for (i = 0; i < size; ) {
+               size_t c = min(size - i, bufsize);
+
+               ret = __copy_from_user(psinfo->buf, buf + i, c);
+               if (unlikely(ret != 0)) {
+                       ret = -EFAULT;
+                       break;
+               }
+               ret = psi->write_buf(type, reason, id, part, psinfo->buf,
+                                    compressed, c, psi);
+               if (unlikely(ret < 0))
+                       break;
+               i += c;
+       }
+       spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+       return unlikely(ret < 0) ? ret : size;
+}
+
 /*
  * platform specific persistent storage driver registers with
  * us here. If pstore is already mounted, call the platform
@@ -645,6 +679,8 @@ int pstore_register(struct pstore_info *psi)
 
        if (!psi->write)
                psi->write = pstore_write_compat;
+       if (!psi->write_buf_user)
+               psi->write_buf_user = pstore_write_buf_user_compat;
        psinfo = psi;
        mutex_init(&psinfo->read_mutex);
        spin_unlock(&pstore_lock);
@@ -659,13 +695,14 @@ int pstore_register(struct pstore_info *psi)
        if (pstore_is_mounted())
                pstore_get_records(0);
 
-       pstore_register_kmsg();
-
-       if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+       if (psi->flags & PSTORE_FLAGS_DMESG)
+               pstore_register_kmsg();
+       if (psi->flags & PSTORE_FLAGS_CONSOLE)
                pstore_register_console();
+       if (psi->flags & PSTORE_FLAGS_FTRACE)
                pstore_register_ftrace();
+       if (psi->flags & PSTORE_FLAGS_PMSG)
                pstore_register_pmsg();
-       }
 
        if (pstore_update_ms >= 0) {
                pstore_timer.expires = jiffies +
@@ -689,12 +726,14 @@ EXPORT_SYMBOL_GPL(pstore_register);
 
 void pstore_unregister(struct pstore_info *psi)
 {
-       if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+       if (psi->flags & PSTORE_FLAGS_PMSG)
                pstore_unregister_pmsg();
+       if (psi->flags & PSTORE_FLAGS_FTRACE)
                pstore_unregister_ftrace();
+       if (psi->flags & PSTORE_FLAGS_CONSOLE)
                pstore_unregister_console();
-       }
-       pstore_unregister_kmsg();
+       if (psi->flags & PSTORE_FLAGS_DMESG)
+               pstore_unregister_kmsg();
 
        free_buf_for_compression();
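
pstore_write_buf_user_compat() above bridges user-space writes to backends that only implement write_buf by copying through psinfo->buf in chunks no larger than the backend buffer. The chunking loop in self-contained C, with illustrative names standing in for __copy_from_user() and the backend callback:

#include <stddef.h>
#include <string.h>

/* Push 'size' bytes from 'src' through a fixed-size bounce buffer,
 * handing each chunk to 'emit'. Mirrors the loop shape of the compat
 * helper; stops and reports the first error. */
static long copy_in_chunks(char *bounce, size_t bounce_size,
			   const char *src, size_t size,
			   long (*emit)(const char *buf, size_t len))
{
	size_t i;

	for (i = 0; i < size; ) {
		size_t c = size - i < bounce_size ? size - i : bounce_size;
		long ret;

		memcpy(bounce, src + i, c);	/* stands in for __copy_from_user() */
		ret = emit(bounce, c);		/* stands in for psi->write_buf() */
		if (ret < 0)
			return ret;
		i += c;
	}
	return (long)size;
}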
 
index 7de20cd..78f6176 100644 (file)
 #include "internal.h"
 
 static DEFINE_MUTEX(pmsg_lock);
-#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
 
 static ssize_t write_pmsg(struct file *file, const char __user *buf,
                          size_t count, loff_t *ppos)
 {
-       size_t i, buffer_size;
-       char *buffer;
+       u64 id;
+       int ret;
 
        if (!count)
                return 0;
 
+       /* check outside lock, page in any data. write_buf_user also checks */
        if (!access_ok(VERIFY_READ, buf, count))
                return -EFAULT;
 
-       buffer_size = count;
-       if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
-               buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
-       buffer = vmalloc(buffer_size);
-       if (!buffer)
-               return -ENOMEM;
-
        mutex_lock(&pmsg_lock);
-       for (i = 0; i < count; ) {
-               size_t c = min(count - i, buffer_size);
-               u64 id;
-               long ret;
-
-               ret = __copy_from_user(buffer, buf + i, c);
-               if (unlikely(ret != 0)) {
-                       mutex_unlock(&pmsg_lock);
-                       vfree(buffer);
-                       return -EFAULT;
-               }
-               psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
-                                 psinfo);
-
-               i += c;
-       }
-
+       ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
+                                    psinfo);
        mutex_unlock(&pmsg_lock);
-       vfree(buffer);
-       return count;
+       return ret ? ret : count;
 }
 
 static const struct file_operations pmsg_fops = {
index 7a034d6..6ad831b 100644 (file)
@@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
        return 0;
 }
 
+static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
+                                                enum kmsg_dump_reason reason,
+                                                u64 *id, unsigned int part,
+                                                const char __user *buf,
+                                                bool compressed, size_t size,
+                                                struct pstore_info *psi)
+{
+       if (type == PSTORE_TYPE_PMSG) {
+               struct ramoops_context *cxt = psi->data;
+
+               if (!cxt->mprz)
+                       return -ENOMEM;
+               return persistent_ram_write_user(cxt->mprz, buf, size);
+       }
+
+       return -EINVAL;
+}
+
 static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
                                struct timespec time, struct pstore_info *psi)
 {
@@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = {
                .open   = ramoops_pstore_open,
                .read   = ramoops_pstore_read,
                .write_buf      = ramoops_pstore_write_buf,
+               .write_buf_user = ramoops_pstore_write_buf_user,
                .erase  = ramoops_pstore_erase,
        },
 };
@@ -377,13 +396,14 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
 {
        int i;
 
-       cxt->max_dump_cnt = 0;
        if (!cxt->przs)
                return;
 
-       for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++)
+       for (i = 0; i < cxt->max_dump_cnt; i++)
                persistent_ram_free(cxt->przs[i]);
+
        kfree(cxt->przs);
+       cxt->max_dump_cnt = 0;
 }
 
 static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
@@ -408,7 +428,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
                             GFP_KERNEL);
        if (!cxt->przs) {
                dev_err(dev, "failed to initialize a prz array for dumps\n");
-               goto fail_prz;
+               goto fail_mem;
        }
 
        for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -419,6 +439,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
                        err = PTR_ERR(cxt->przs[i]);
                        dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
                                cxt->record_size, (unsigned long long)*paddr, err);
+
+                       while (i > 0) {
+                               i--;
+                               persistent_ram_free(cxt->przs[i]);
+                       }
                        goto fail_prz;
                }
                *paddr += cxt->record_size;
@@ -426,7 +451,9 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
 
        return 0;
 fail_prz:
-       ramoops_free_przs(cxt);
+       kfree(cxt->przs);
+fail_mem:
+       cxt->max_dump_cnt = 0;
        return err;
 }
 
@@ -608,12 +635,20 @@ static int ramoops_probe(struct platform_device *pdev)
                cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
        cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
        cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
-       spin_lock_init(&cxt->pstore.buf_lock);
        if (!cxt->pstore.buf) {
                pr_err("cannot allocate pstore buffer\n");
                err = -ENOMEM;
                goto fail_clear;
        }
+       spin_lock_init(&cxt->pstore.buf_lock);
+
+       cxt->pstore.flags = PSTORE_FLAGS_DMESG;
+       if (cxt->console_size)
+               cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+       if (cxt->ftrace_size)
+               cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
+       if (cxt->pmsg_size)
+               cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
 
        err = pstore_register(&cxt->pstore);
        if (err) {
@@ -659,7 +694,6 @@ static int ramoops_remove(struct platform_device *pdev)
        struct ramoops_context *cxt = &oops_cxt;
 
        pstore_unregister(&cxt->pstore);
-       cxt->max_dump_cnt = 0;
 
        kfree(cxt->pstore.buf);
        cxt->pstore.bufsize = 0;
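
The ramoops_init_przs() error path above now unwinds only the zones that were actually created and clears max_dump_cnt after the array is gone, so a later free cannot walk stale pointers. The same partial-initialization unwind, as a standalone sketch with stand-in types:

#include <stdlib.h>

struct zone { int placeholder; };	/* stands in for persistent_ram_zone */

static struct zone *zone_new(void) { return calloc(1, sizeof(struct zone)); }
static void zone_free(struct zone *z) { free(z); }

/* Build an array of 'want' zones; on failure free exactly the ones that
 * were created and report a zero count so teardown has nothing to do. */
static int init_zones(struct zone ***zones, size_t *count, size_t want)
{
	struct zone **arr = calloc(want, sizeof(*arr));
	size_t i;

	if (!arr)
		goto fail_mem;

	for (i = 0; i < want; i++) {
		arr[i] = zone_new();
		if (!arr[i]) {
			while (i > 0)
				zone_free(arr[--i]);	/* unwind what exists */
			free(arr);
			goto fail_mem;
		}
	}

	*zones = arr;
	*count = want;
	return 0;

fail_mem:
	*count = 0;
	return -1;
}
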
index 76c3f80..3975dee 100644 (file)
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/errno.h>
-#include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/memblock.h>
+#include <linux/pstore_ram.h>
 #include <linux/rslib.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
-#include <linux/pstore_ram.h>
 #include <asm/page.h>
 
 struct persistent_ram_buffer {
@@ -47,43 +48,10 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
        return atomic_read(&prz->buffer->start);
 }
 
-/* increase and wrap the start pointer, returning the old value */
-static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
-{
-       int old;
-       int new;
-
-       do {
-               old = atomic_read(&prz->buffer->start);
-               new = old + a;
-               while (unlikely(new >= prz->buffer_size))
-                       new -= prz->buffer_size;
-       } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
-
-       return old;
-}
-
-/* increase the size counter until it hits the max size */
-static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
-{
-       size_t old;
-       size_t new;
-
-       if (atomic_read(&prz->buffer->size) == prz->buffer_size)
-               return;
-
-       do {
-               old = atomic_read(&prz->buffer->size);
-               new = old + a;
-               if (new > prz->buffer_size)
-                       new = prz->buffer_size;
-       } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
-}
-
 static DEFINE_RAW_SPINLOCK(buffer_lock);
 
 /* increase and wrap the start pointer, returning the old value */
-static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
 {
        int old;
        int new;
@@ -103,7 +71,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
 }
 
 /* increase the size counter until it hits the max size */
-static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
 {
        size_t old;
        size_t new;
@@ -124,9 +92,6 @@ exit:
        raw_spin_unlock_irqrestore(&buffer_lock, flags);
 }
 
-static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
-static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
-
 static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
        uint8_t *data, size_t len, uint8_t *ecc)
 {
@@ -299,10 +264,20 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz,
        const void *s, unsigned int start, unsigned int count)
 {
        struct persistent_ram_buffer *buffer = prz->buffer;
-       memcpy(buffer->data + start, s, count);
+       memcpy_toio(buffer->data + start, s, count);
        persistent_ram_update_ecc(prz, start, count);
 }
 
+static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz,
+       const void __user *s, unsigned int start, unsigned int count)
+{
+       struct persistent_ram_buffer *buffer = prz->buffer;
+       int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ?
+               -EFAULT : 0;
+       persistent_ram_update_ecc(prz, start, count);
+       return ret;
+}
+
 void persistent_ram_save_old(struct persistent_ram_zone *prz)
 {
        struct persistent_ram_buffer *buffer = prz->buffer;
@@ -322,8 +297,8 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
        }
 
        prz->old_log_size = size;
-       memcpy(prz->old_log, &buffer->data[start], size - start);
-       memcpy(prz->old_log + size - start, &buffer->data[0], start);
+       memcpy_fromio(prz->old_log, &buffer->data[start], size - start);
+       memcpy_fromio(prz->old_log + size - start, &buffer->data[0], start);
 }
 
 int notrace persistent_ram_write(struct persistent_ram_zone *prz,
@@ -356,6 +331,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz,
        return count;
 }
 
+int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
+       const void __user *s, unsigned int count)
+{
+       int rem, ret = 0, c = count;
+       size_t start;
+
+       if (unlikely(!access_ok(VERIFY_READ, s, count)))
+               return -EFAULT;
+       if (unlikely(c > prz->buffer_size)) {
+               s += c - prz->buffer_size;
+               c = prz->buffer_size;
+       }
+
+       buffer_size_add(prz, c);
+
+       start = buffer_start_add(prz, c);
+
+       rem = prz->buffer_size - start;
+       if (unlikely(rem < c)) {
+               ret = persistent_ram_update_user(prz, s, start, rem);
+               s += rem;
+               c -= rem;
+               start = 0;
+       }
+       if (likely(!ret))
+               ret = persistent_ram_update_user(prz, s, start, c);
+
+       persistent_ram_update_header_ecc(prz);
+
+       return unlikely(ret) ? ret : count;
+}
+
 size_t persistent_ram_old_size(struct persistent_ram_zone *prz)
 {
        return prz->old_log_size;
@@ -426,9 +433,6 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size,
                return NULL;
        }
 
-       buffer_start_add = buffer_start_add_locked;
-       buffer_size_add = buffer_size_add_locked;
-
        if (memtype)
                va = ioremap(start, size);
        else
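
persistent_ram_write_user() above clamps an oversized write to the zone size, keeping only the tail, and splits a write that crosses the end of the buffer into two copies. The wrap-around arithmetic on its own, in self-contained C with illustrative names (the kernel version additionally reserves the range atomically and updates ECC):

#include <stddef.h>
#include <string.h>

/* Write 'count' bytes into a ring of 'ring_size' bytes at offset 'start',
 * wrapping once if needed; oversized writes keep only the last
 * 'ring_size' bytes. Returns the offset just past the written data. */
static size_t ring_write(char *ring, size_t ring_size, size_t start,
			 const char *src, size_t count)
{
	size_t c = count;

	if (c > ring_size) {
		src += c - ring_size;	/* keep only the tail that fits */
		c = ring_size;
	}

	if (ring_size - start < c) {
		size_t first = ring_size - start;

		memcpy(ring + start, src, first);
		memcpy(ring, src + first, c - first);
		return c - first;
	}

	memcpy(ring + start, src, c);
	start += c;
	return start == ring_size ? 0 : start;
}
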
index fc593c8..584e87e 100644 (file)
@@ -52,6 +52,7 @@ xfs-y                         += $(addprefix libxfs/, \
                                   xfs_inode_fork.o \
                                   xfs_inode_buf.o \
                                   xfs_log_rlimit.o \
+                                  xfs_ag_resv.o \
                                   xfs_rmap.o \
                                   xfs_rmap_btree.o \
                                   xfs_sb.o \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644 (file)
index 0000000..e3ae0f2
--- /dev/null
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC, because an allocation failure
+ * in the middle of a btree split would force the filesystem offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG.  However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion.  Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior:  First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken.  Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting.  Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations.  Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions.  In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function.  It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction.  The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks.  Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks?  For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type)
+{
+       xfs_extlen_t                    avail;
+       xfs_extlen_t                    orig;
+
+       switch (type) {
+       case XFS_AG_RESV_METADATA:
+               avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+               orig = pag->pag_meta_resv.ar_asked;
+               break;
+       case XFS_AG_RESV_AGFL:
+               avail = pag->pagf_freeblks + pag->pagf_flcount -
+                       pag->pag_meta_resv.ar_reserved;
+               orig = pag->pag_agfl_resv.ar_asked;
+               break;
+       default:
+               ASSERT(0);
+               return false;
+       }
+
+       trace_xfs_ag_resv_critical(pag, type, avail);
+
+       /* Critically low if less than 10% or max btree height remains. */
+       return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type)
+{
+       xfs_extlen_t                    len;
+
+       len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+       switch (type) {
+       case XFS_AG_RESV_METADATA:
+       case XFS_AG_RESV_AGFL:
+               len -= xfs_perag_resv(pag, type)->ar_reserved;
+               break;
+       case XFS_AG_RESV_NONE:
+               /* empty */
+               break;
+       default:
+               ASSERT(0);
+       }
+
+       trace_xfs_ag_resv_needed(pag, type, len);
+
+       return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type)
+{
+       struct xfs_ag_resv              *resv;
+       xfs_extlen_t                    oldresv;
+       int                             error;
+
+       trace_xfs_ag_resv_free(pag, type, 0);
+
+       resv = xfs_perag_resv(pag, type);
+       pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+       /*
+        * AGFL blocks are always considered "free", so whatever
+        * was reserved at mount time must be given back at umount.
+        */
+       if (type == XFS_AG_RESV_AGFL)
+               oldresv = resv->ar_orig_reserved;
+       else
+               oldresv = resv->ar_reserved;
+       error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+       resv->ar_reserved = 0;
+       resv->ar_asked = 0;
+
+       if (error)
+               trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+                               error, _RET_IP_);
+       return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+       struct xfs_perag                *pag)
+{
+       int                             error;
+       int                             err2;
+
+       error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+       err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+       if (err2 && !error)
+               error = err2;
+       return error;
+}
+
+static int
+__xfs_ag_resv_init(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type,
+       xfs_extlen_t                    ask,
+       xfs_extlen_t                    used)
+{
+       struct xfs_mount                *mp = pag->pag_mount;
+       struct xfs_ag_resv              *resv;
+       int                             error;
+
+       resv = xfs_perag_resv(pag, type);
+       if (used > ask)
+               ask = used;
+       resv->ar_asked = ask;
+       resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+       mp->m_ag_max_usable -= ask;
+
+       trace_xfs_ag_resv_init(pag, type, ask);
+
+       error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+       if (error)
+               trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+                               error, _RET_IP_);
+
+       return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+       struct xfs_perag                *pag)
+{
+       xfs_extlen_t                    ask;
+       xfs_extlen_t                    used;
+       int                             error = 0;
+
+       /* Create the metadata reservation. */
+       if (pag->pag_meta_resv.ar_asked == 0) {
+               ask = used = 0;
+
+               error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+                               ask, used);
+               if (error)
+                       goto out;
+       }
+
+       /* Create the AGFL metadata reservation */
+       if (pag->pag_agfl_resv.ar_asked == 0) {
+               ask = used = 0;
+
+               error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+               if (error)
+                       goto out;
+       }
+
+out:
+       return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type,
+       struct xfs_alloc_arg            *args)
+{
+       struct xfs_ag_resv              *resv;
+       xfs_extlen_t                    len;
+       uint                            field;
+
+       trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+       switch (type) {
+       case XFS_AG_RESV_METADATA:
+       case XFS_AG_RESV_AGFL:
+               resv = xfs_perag_resv(pag, type);
+               break;
+       default:
+               ASSERT(0);
+               /* fall through */
+       case XFS_AG_RESV_NONE:
+               field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+                                      XFS_TRANS_SB_FDBLOCKS;
+               xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+               return;
+       }
+
+       len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+       resv->ar_reserved -= len;
+       if (type == XFS_AG_RESV_AGFL)
+               return;
+       /* Allocations of reserved blocks only need on-disk sb updates... */
+       xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+       /* ...but non-reserved blocks need in-core and on-disk updates. */
+       if (args->len > len)
+               xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+                               -((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+       struct xfs_perag                *pag,
+       enum xfs_ag_resv_type           type,
+       struct xfs_trans                *tp,
+       xfs_extlen_t                    len)
+{
+       xfs_extlen_t                    leftover;
+       struct xfs_ag_resv              *resv;
+
+       trace_xfs_ag_resv_free_extent(pag, type, len);
+
+       switch (type) {
+       case XFS_AG_RESV_METADATA:
+       case XFS_AG_RESV_AGFL:
+               resv = xfs_perag_resv(pag, type);
+               break;
+       default:
+               ASSERT(0);
+               /* fall through */
+       case XFS_AG_RESV_NONE:
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+               return;
+       }
+
+       leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+       resv->ar_reserved += leftover;
+       if (type == XFS_AG_RESV_AGFL)
+               return;
+       /* Freeing into the reserved pool only requires on-disk update... */
+       xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+       /* ...but freeing beyond that requires in-core and on-disk update. */
+       if (len > leftover)
+               xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
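
The per-AG reservation above reduces to a little arithmetic per AG: ask is the most blocks the structure may ever need, used is what it already holds, and ar_reserved = ask - used has to stay off limits to ordinary allocation. A self-contained sketch of how allocation and free adjust that counter, mirroring the clamping in xfs_ag_resv_alloc_extent()/xfs_ag_resv_free_extent() (illustrative only):

#include <stdint.h>

struct ag_resv {
	uint32_t ask;		/* blocks the structure may ultimately need */
	uint32_t reserved;	/* blocks currently held back for it */
};

/* Charge 'len' newly allocated blocks against the reservation first;
 * whatever exceeds it must come out of the general free-space pool. */
static uint32_t resv_alloc(struct ag_resv *r, uint32_t len)
{
	uint32_t from_resv = len < r->reserved ? len : r->reserved;

	r->reserved -= from_resv;
	return len - from_resv;		/* blocks taken from the general pool */
}

/* Refill the reservation with 'len' freed blocks until it is whole again;
 * the leftover goes back to the general free-space pool. */
static uint32_t resv_free(struct ag_resv *r, uint32_t len)
{
	uint32_t room = r->ask - r->reserved;
	uint32_t to_resv = len < room ? len : room;

	r->reserved += to_resv;
	return len - to_resv;		/* blocks returned to the general pool */
}
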
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644 (file)
index 0000000..8d6c687
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define        __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+               enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+               struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+               struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif /* __XFS_AG_RESV_H__ */
index 05b5243..ca75dc9 100644 (file)
@@ -37,6 +37,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_ag_resv.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
  * extents need to be actually allocated. To get around this, we explicitly set
  * aside a few blocks which will not be reserved in delayed allocation.
  *
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block.  The bmbt block
- * might not be in the same AG as the file data extent.  In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
  */
 unsigned int
 xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
        unsigned int            blocks;
 
        blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-       if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-               blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
        return blocks;
 }
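
For a sense of scale: on a hypothetical filesystem with 16 allocation groups, the set-aside works out to 4 + 16 * XFS_ALLOC_AGFL_RESERVE = 4 + 16 * 4 = 68 blocks that are never handed out to delayed allocation.
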
 
@@ -265,7 +258,7 @@ xfs_alloc_compute_diff(
        xfs_agblock_t   wantbno,        /* target starting block */
        xfs_extlen_t    wantlen,        /* target length */
        xfs_extlen_t    alignment,      /* target alignment */
-       char            userdata,       /* are we allocating data? */
+       int             datatype,       /* are we allocating data? */
        xfs_agblock_t   freebno,        /* freespace's starting block */
        xfs_extlen_t    freelen,        /* freespace's length */
        xfs_agblock_t   *newbnop)       /* result: best start block from free */
@@ -276,6 +269,7 @@ xfs_alloc_compute_diff(
        xfs_extlen_t    newlen1=0;      /* length with newbno1 */
        xfs_extlen_t    newlen2=0;      /* length with newbno2 */
        xfs_agblock_t   wantend;        /* end of target extent */
+       bool            userdata = xfs_alloc_is_userdata(datatype);
 
        ASSERT(freelen >= wantlen);
        freeend = freebno + freelen;
@@ -680,12 +674,29 @@ xfs_alloc_ag_vextent(
        xfs_alloc_arg_t *args)  /* argument structure for allocation */
 {
        int             error=0;
+       xfs_extlen_t    reservation;
+       xfs_extlen_t    oldmax;
 
        ASSERT(args->minlen > 0);
        ASSERT(args->maxlen > 0);
        ASSERT(args->minlen <= args->maxlen);
        ASSERT(args->mod < args->prod);
        ASSERT(args->alignment > 0);
+
+       /*
+        * Clamp maxlen to the amount of free space minus any reservations
+        * that have been made.
+        */
+       oldmax = args->maxlen;
+       reservation = xfs_ag_resv_needed(args->pag, args->resv);
+       if (args->maxlen > args->pag->pagf_freeblks - reservation)
+               args->maxlen = args->pag->pagf_freeblks - reservation;
+       if (args->maxlen == 0) {
+               args->agbno = NULLAGBLOCK;
+               args->maxlen = oldmax;
+               return 0;
+       }
+
        /*
         * Branch to correct routine based on the type.
         */
@@ -705,12 +716,14 @@ xfs_alloc_ag_vextent(
                /* NOTREACHED */
        }
 
+       args->maxlen = oldmax;
+
        if (error || args->agbno == NULLAGBLOCK)
                return error;
 
        ASSERT(args->len >= args->minlen);
        ASSERT(args->len <= args->maxlen);
-       ASSERT(!args->wasfromfl || !args->isfl);
+       ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
        ASSERT(args->agbno % args->alignment == 0);
 
        /* if not file data, insert new block into the reverse map btree */
@@ -732,12 +745,7 @@ xfs_alloc_ag_vextent(
                                              args->agbno, args->len));
        }
 
-       if (!args->isfl) {
-               xfs_trans_mod_sb(args->tp, args->wasdel ?
-                                XFS_TRANS_SB_RES_FDBLOCKS :
-                                XFS_TRANS_SB_FDBLOCKS,
-                                -((long)(args->len)));
-       }
+       xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
 
        XFS_STATS_INC(args->mp, xs_allocx);
        XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -917,7 +925,7 @@ xfs_alloc_find_best_extent(
 
                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
                                                       args->alignment,
-                                                      args->userdata, *sbnoa,
+                                                      args->datatype, *sbnoa,
                                                       *slena, &new);
 
                        /*
@@ -1101,7 +1109,7 @@ restart:
                        if (args->len < blen)
                                continue;
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, ltbnoa,
+                               args->alignment, args->datatype, ltbnoa,
                                ltlena, &ltnew);
                        if (ltnew != NULLAGBLOCK &&
                            (args->len > blen || ltdiff < bdiff)) {
@@ -1254,7 +1262,7 @@ restart:
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, ltbnoa,
+                               args->alignment, args->datatype, ltbnoa,
                                ltlena, &ltnew);
 
                        error = xfs_alloc_find_best_extent(args,
@@ -1271,7 +1279,7 @@ restart:
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, args->userdata, gtbnoa,
+                               args->alignment, args->datatype, gtbnoa,
                                gtlena, &gtnew);
 
                        error = xfs_alloc_find_best_extent(args,
@@ -1331,7 +1339,7 @@ restart:
        }
        rlen = args->len;
        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-                                    args->userdata, ltbnoa, ltlena, &ltnew);
+                                    args->datatype, ltbnoa, ltlena, &ltnew);
        ASSERT(ltnew >= ltbno);
        ASSERT(ltnew + rlen <= ltbnoa + ltlena);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small(
        int             *stat)  /* status: 0-freelist, 1-normal/none */
 {
        struct xfs_owner_info   oinfo;
+       struct xfs_perag        *pag;
        int             error;
        xfs_agblock_t   fbno;
        xfs_extlen_t    flen;
@@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small(
         * to respect minleft even when pulling from the
         * freelist.
         */
-       else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+       else if (args->minlen == 1 && args->alignment == 1 &&
+                args->resv != XFS_AG_RESV_AGFL &&
                 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
                  > args->minleft)) {
                error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small(
                        goto error0;
                if (fbno != NULLAGBLOCK) {
                        xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
-                                            args->userdata);
+                             xfs_alloc_allow_busy_reuse(args->datatype));
 
-                       if (args->userdata) {
+                       if (xfs_alloc_is_userdata(args->datatype)) {
                                xfs_buf_t       *bp;
 
                                bp = xfs_btree_get_bufs(args->mp, args->tp,
@@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small(
                        /*
                         * If we're feeding an AGFL block to something that
                         * doesn't live in the free space, we need to clear
-                        * out the OWN_AG rmap.
+                        * out the OWN_AG rmap and add the block back to
+                        * the AGFL per-AG reservation.
                         */
                        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
                        error = xfs_rmap_free(args->tp, args->agbp, args->agno,
                                        fbno, 1, &oinfo);
                        if (error)
                                goto error0;
+                       pag = xfs_perag_get(args->mp, args->agno);
+                       xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
+                                       args->tp, 1);
+                       xfs_perag_put(pag);
 
                        *stat = 0;
                        return 0;
@@ -1683,7 +1698,7 @@ xfs_free_ag_extent(
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        struct xfs_owner_info   *oinfo,
-       int                     isfl)
+       enum xfs_ag_resv_type   type)
 {
        xfs_btree_cur_t *bno_cur;       /* cursor for by-block btree */
        xfs_btree_cur_t *cnt_cur;       /* cursor for by-size btree */
@@ -1911,21 +1926,22 @@ xfs_free_ag_extent(
         */
        pag = xfs_perag_get(mp, agno);
        error = xfs_alloc_update_counters(tp, pag, agbp, len);
+       xfs_ag_resv_free_extent(pag, type, tp, len);
        xfs_perag_put(pag);
        if (error)
                goto error0;
 
-       if (!isfl)
-               xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
        XFS_STATS_INC(mp, xs_freex);
        XFS_STATS_ADD(mp, xs_freeb, len);
 
-       trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+       trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+                       haveleft, haveright);
 
        return 0;
 
  error0:
-       trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+       trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
+                       -1, -1);
        if (bno_cur)
                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
        if (cnt_cur)
@@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels(
 }
 
 /*
- * Find the length of the longest extent in an AG.
+ * Find the length of the longest extent in an AG.  The 'need' parameter
+ * specifies how much space we're going to need for the AGFL and the
+ * 'reserved' parameter tells us how many blocks in this AG are reserved for
+ * other callers.
  */
 xfs_extlen_t
 xfs_alloc_longest_free_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
-       xfs_extlen_t            need)
+       xfs_extlen_t            need,
+       xfs_extlen_t            reserved)
 {
        xfs_extlen_t            delta = 0;
 
+       /*
+        * If the AGFL needs a recharge, we'll have to subtract that from the
+        * longest extent.
+        */
        if (need > pag->pagf_flcount)
                delta = need - pag->pagf_flcount;
 
+       /*
+        * If we cannot maintain others' reservations with space from the
+        * not-longest freesp extents, we'll have to subtract /that/ from
+        * the longest extent too.
+        */
+       if (pag->pagf_freeblks - pag->pagf_longest < reserved)
+               delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
+
+       /*
+        * If the longest extent is long enough to satisfy all the
+        * reservations and AGFL rules in place, we can return this extent.
+        */
        if (pag->pagf_longest > delta)
                return pag->pagf_longest - delta;
+
+       /* Otherwise, let the caller try for 1 block if there's space. */
        return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
 }
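
A hypothetical worked example of the new calculation: with pagf_longest = 100, pagf_freeblks = 120, an AGFL that is 6 blocks short of 'need', and 30 blocks reserved by other callers, delta becomes 6 + (30 - (120 - 100)) = 16, so the function reports a usable longest extent of 100 - 16 = 84 blocks.
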
 
@@ -2004,20 +2042,24 @@ xfs_alloc_space_available(
 {
        struct xfs_perag        *pag = args->pag;
        xfs_extlen_t            longest;
+       xfs_extlen_t            reservation; /* blocks that are still reserved */
        int                     available;
 
        if (flags & XFS_ALLOC_FLAG_FREEING)
                return true;
 
+       reservation = xfs_ag_resv_needed(pag, args->resv);
+
        /* do we have enough contiguous free space for the allocation? */
-       longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+       longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
+                       reservation);
        if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
                return false;
 
-       /* do have enough free space remaining for the allocation? */
+       /* do we have enough free space remaining for the allocation? */
        available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-                         min_free - args->total);
-       if (available < (int)args->minleft)
+                         reservation - min_free - args->total);
+       if (available < (int)args->minleft || available <= 0)
                return false;
 
        return true;
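
With hypothetical numbers, the remaining-space check now reads: pagf_freeblks = 200, pagf_flcount = 6, reservation = 40, min_free = 10 and args->total = 50 give available = 200 + 6 - 40 - 10 - 50 = 106, so the allocation may proceed as long as args->minleft does not exceed 106.
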
@@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist(
         * somewhere else if we are not being asked to try harder at this
         * point
         */
-       if (pag->pagf_metadata && args->userdata &&
+       if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
            (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
                ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
                goto out_agbp_relse;
@@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist(
                if (error)
                        goto out_agbp_relse;
                error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
-                                          &targs.oinfo, 1);
+                                          &targs.oinfo, XFS_AG_RESV_AGFL);
                if (error)
                        goto out_agbp_relse;
                bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist(
        targs.mp = mp;
        targs.agbp = agbp;
        targs.agno = args->agno;
-       targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+       targs.alignment = targs.minlen = targs.prod = 1;
        targs.type = XFS_ALLOCTYPE_THIS_AG;
        targs.pag = pag;
        error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist(
        while (pag->pagf_flcount < need) {
                targs.agbno = 0;
                targs.maxlen = need - pag->pagf_flcount;
+               targs.resv = XFS_AG_RESV_AGFL;
 
                /* Allocate as many blocks as possible at once. */
                error = xfs_alloc_ag_vextent(&targs);
@@ -2633,7 +2676,7 @@ xfs_alloc_vextent(
                 * Try near allocation first, then anywhere-in-ag after
                 * the first a.g. fails.
                 */
-               if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
+               if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
                    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
                        args->fsbno = XFS_AGB_TO_FSB(mp,
                                        ((mp->m_agfrotor / rotorstep) %
@@ -2766,7 +2809,7 @@ xfs_alloc_vextent(
 #endif
 
                /* Zero the extent if we were asked to do so */
-               if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+               if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
                        error = xfs_zero_extent(args->ip, args->fsbno, args->len);
                        if (error)
                                goto error0;
@@ -2825,7 +2868,8 @@ xfs_free_extent(
        struct xfs_trans        *tp,    /* transaction pointer */
        xfs_fsblock_t           bno,    /* starting block number of extent */
        xfs_extlen_t            len,    /* length of extent */
-       struct xfs_owner_info   *oinfo) /* extent owner */
+       struct xfs_owner_info   *oinfo, /* extent owner */
+       enum xfs_ag_resv_type   type)   /* block reservation type */
 {
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *agbp;
@@ -2834,6 +2878,7 @@ xfs_free_extent(
        int                     error;
 
        ASSERT(len != 0);
+       ASSERT(type != XFS_AG_RESV_AGFL);
 
        if (XFS_TEST_ERROR(false, mp,
                        XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2896,7 @@ xfs_free_extent(
                agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
                                err);
 
-       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
+       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
        if (error)
                goto err;
 
index 6fe2d6b..7c404a6 100644 (file)
@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
        xfs_extlen_t    len;            /* output: actual size of extent */
        xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
        xfs_alloctype_t otype;          /* original allocation type */
+       int             datatype;       /* mask defining data type treatment */
        char            wasdel;         /* set if allocation was prev delayed */
        char            wasfromfl;      /* set if allocation is from freelist */
-       char            isfl;           /* set if is freelist blocks - !acctg */
-       char            userdata;       /* mask defining userdata treatment */
        xfs_fsblock_t   firstblock;     /* io first block allocated */
        struct xfs_owner_info   oinfo;  /* owner of blocks being allocated */
+       enum xfs_ag_resv_type   resv;   /* block reservation to use */
 } xfs_alloc_arg_t;
 
 /*
- * Defines for userdata
+ * Defines for datatype
  */
 #define XFS_ALLOC_USERDATA             (1 << 0)/* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA    (1 << 1)/* special case start of file */
 #define XFS_ALLOC_USERDATA_ZERO                (1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY               (1 << 3)/* Busy extents not allowed */
+
+static inline bool
+xfs_alloc_is_userdata(int datatype)
+{
+       return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
+}
+
+static inline bool
+xfs_alloc_allow_busy_reuse(int datatype)
+{
+       return (datatype & XFS_ALLOC_NOBUSY) == 0;
+}
 
 /* freespace limit calculations */
 #define XFS_ALLOC_AGFL_RESERVE 4
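
A small standalone illustration (hypothetical, compiled outside the kernel) of how the datatype mask above feeds the two helpers:

#include <assert.h>

#define XFS_ALLOC_USERDATA		(1 << 0)
#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)
#define XFS_ALLOC_USERDATA_ZERO	(1 << 2)
#define XFS_ALLOC_NOBUSY		(1 << 3)

int main(void)
{
	/* Allocation that must avoid busy extents but is not user data. */
	int attr_like = XFS_ALLOC_NOBUSY;
	/* First user-data allocation in a file, busy extents also avoided. */
	int first_data = XFS_ALLOC_NOBUSY | XFS_ALLOC_INITIAL_USER_DATA;

	assert((attr_like & ~XFS_ALLOC_NOBUSY) == 0);	/* is_userdata: false */
	assert((first_data & ~XFS_ALLOC_NOBUSY) != 0);	/* is_userdata: true */
	assert((first_data & XFS_ALLOC_NOBUSY) != 0);	/* allow_busy_reuse: false */
	return 0;
}
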
@@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
 unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
 
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
-               struct xfs_perag *pag, xfs_extlen_t need);
+               struct xfs_perag *pag, xfs_extlen_t need,
+               xfs_extlen_t reserved);
 unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
                struct xfs_perag *pag);
 
@@ -184,7 +198,8 @@ xfs_free_extent(
        struct xfs_trans        *tp,    /* transaction pointer */
        xfs_fsblock_t           bno,    /* starting block number of extent */
        xfs_extlen_t            len,    /* length of extent */
-       struct xfs_owner_info   *oinfo);/* extent owner */
+       struct xfs_owner_info   *oinfo, /* extent owner */
+       enum xfs_ag_resv_type   type);  /* block reservation type */
 
 int                            /* error */
 xfs_alloc_lookup_ge(
index b060bca..9d7f61d 100644 (file)
@@ -47,6 +47,7 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 
 kmem_zone_t            *xfs_bmap_free_item_zone;
@@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents(
  * Else, *lastxp will be set to the index of the found
  * entry; *gotp will contain the entry.
  */
-STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
 xfs_bmap_search_extents(
        xfs_inode_t     *ip,            /* incore inode pointer */
        xfs_fileoff_t   bno,            /* block number searched for */
@@ -3347,7 +3348,8 @@ xfs_bmap_adjacent(
 
        mp = ap->ip->i_mount;
        nullfb = *ap->firstblock == NULLFSBLOCK;
-       rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+       rt = XFS_IS_REALTIME_INODE(ap->ip) &&
+               xfs_alloc_is_userdata(ap->datatype);
        fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
        /*
         * If allocating at eof, and there's a previous real block,
@@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent(
        }
 
        longest = xfs_alloc_longest_free_extent(mp, pag,
-                                       xfs_alloc_min_freelist(mp, pag));
+                               xfs_alloc_min_freelist(mp, pag),
+                               xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
        if (*blen < longest)
                *blen = longest;
 
@@ -3622,7 +3625,7 @@ xfs_bmap_btalloc(
 {
        xfs_mount_t     *mp;            /* mount point structure */
        xfs_alloctype_t atype = 0;      /* type for allocation routines */
-       xfs_extlen_t    align;          /* minimum allocation alignment */
+       xfs_extlen_t    align = 0;      /* minimum allocation alignment */
        xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
        xfs_agnumber_t  ag;
        xfs_alloc_arg_t args;
@@ -3645,7 +3648,8 @@ xfs_bmap_btalloc(
        else if (mp->m_dalign)
                stripe_align = mp->m_dalign;
 
-       align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+       if (xfs_alloc_is_userdata(ap->datatype))
+               align = xfs_get_extsz_hint(ap->ip);
        if (unlikely(align)) {
                error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
                                                align, 0, ap->eof, 0, ap->conv,
@@ -3658,7 +3662,8 @@ xfs_bmap_btalloc(
        nullfb = *ap->firstblock == NULLFSBLOCK;
        fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
        if (nullfb) {
-               if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+               if (xfs_alloc_is_userdata(ap->datatype) &&
+                   xfs_inode_is_filestream(ap->ip)) {
                        ag = xfs_filestream_lookup_ag(ap->ip);
                        ag = (ag != NULLAGNUMBER) ? ag : 0;
                        ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
@@ -3698,7 +3703,8 @@ xfs_bmap_btalloc(
                 * enough for the request.  If one isn't found, then adjust
                 * the minimum allocation size to the largest space found.
                 */
-               if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+               if (xfs_alloc_is_userdata(ap->datatype) &&
+                   xfs_inode_is_filestream(ap->ip))
                        error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
                else
                        error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
@@ -3781,9 +3787,9 @@ xfs_bmap_btalloc(
        }
        args.minleft = ap->minleft;
        args.wasdel = ap->wasdel;
-       args.isfl = 0;
-       args.userdata = ap->userdata;
-       if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+       args.resv = XFS_AG_RESV_NONE;
+       args.datatype = ap->datatype;
+       if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
                args.ip = ap->ip;
 
        error = xfs_alloc_vextent(&args);
@@ -3877,7 +3883,8 @@ STATIC int
 xfs_bmap_alloc(
        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
 {
-       if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+       if (XFS_IS_REALTIME_INODE(ap->ip) &&
+           xfs_alloc_is_userdata(ap->datatype))
                return xfs_bmap_rtalloc(ap);
        return xfs_bmap_btalloc(ap);
 }
@@ -4074,7 +4081,7 @@ xfs_bmapi_read(
        return 0;
 }
 
-STATIC int
+int
 xfs_bmapi_reserve_delalloc(
        struct xfs_inode        *ip,
        xfs_fileoff_t           aoff,
@@ -4170,91 +4177,6 @@ out_unreserve_quota:
        return error;
 }
 
-/*
- * Map file blocks to filesystem blocks, adding delayed allocations as needed.
- */
-int
-xfs_bmapi_delay(
-       struct xfs_inode        *ip,    /* incore inode */
-       xfs_fileoff_t           bno,    /* starting file offs. mapped */
-       xfs_filblks_t           len,    /* length to map in file */
-       struct xfs_bmbt_irec    *mval,  /* output: map values */
-       int                     *nmap,  /* i/o: mval size/count */
-       int                     flags)  /* XFS_BMAPI_... */
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-       struct xfs_bmbt_irec    got;    /* current file extent record */
-       struct xfs_bmbt_irec    prev;   /* previous file extent record */
-       xfs_fileoff_t           obno;   /* old block number (offset) */
-       xfs_fileoff_t           end;    /* end of mapped file region */
-       xfs_extnum_t            lastx;  /* last useful extent number */
-       int                     eof;    /* we've hit the end of extents */
-       int                     n = 0;  /* current extent index */
-       int                     error = 0;
-
-       ASSERT(*nmap >= 1);
-       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
-       ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
-               return -EFSCORRUPTED;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-
-       XFS_STATS_INC(mp, xs_blk_mapw);
-
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
-               if (error)
-                       return error;
-       }
-
-       xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
-       end = bno + len;
-       obno = bno;
-
-       while (bno < end && n < *nmap) {
-               if (eof || got.br_startoff > bno) {
-                       error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
-                                                          &prev, &lastx, eof);
-                       if (error) {
-                               if (n == 0) {
-                                       *nmap = 0;
-                                       return error;
-                               }
-                               break;
-                       }
-               }
-
-               /* set up the extent map to return. */
-               xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
-               xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
-
-               /* If we're done, stop now. */
-               if (bno >= end || n >= *nmap)
-                       break;
-
-               /* Else go on to the next record. */
-               prev = got;
-               if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
-                       xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
-               else
-                       eof = 1;
-       }
-
-       *nmap = n;
-       return 0;
-}
-
-
 static int
 xfs_bmapi_allocate(
        struct xfs_bmalloca     *bma)
@@ -4287,15 +4209,21 @@ xfs_bmapi_allocate(
        }
 
        /*
-        * Indicate if this is the first user data in the file, or just any
-        * user data. And if it is userdata, indicate whether it needs to
-        * be initialised to zero during allocation.
+        * Set the data type being allocated. For the data fork, the first data
+        * in the file is treated differently to all other allocations. For the
+        * attribute fork, we only need to ensure the allocated range is not on
+        * the busy list.
         */
        if (!(bma->flags & XFS_BMAPI_METADATA)) {
-               bma->userdata = (bma->offset == 0) ?
-                       XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+               bma->datatype = XFS_ALLOC_NOBUSY;
+               if (whichfork == XFS_DATA_FORK) {
+                       if (bma->offset == 0)
+                               bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+                       else
+                               bma->datatype |= XFS_ALLOC_USERDATA;
+               }
                if (bma->flags & XFS_BMAPI_ZERO)
-                       bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
+                       bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
        }
 
        bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4565,7 +4493,7 @@ xfs_bmapi_write(
        bma.tp = tp;
        bma.ip = ip;
        bma.total = total;
-       bma.userdata = 0;
+       bma.datatype = 0;
        bma.dfops = dfops;
        bma.firstblock = firstblock;
 
index 254034f..8395f6e 100644 (file)
@@ -54,7 +54,7 @@ struct xfs_bmalloca {
        bool                    wasdel; /* replacing a delayed allocation */
        bool                    aeof;   /* allocated space at eof */
        bool                    conv;   /* overwriting unwritten extents */
-       char                    userdata;/* userdata mask */
+       int                     datatype;/* data type being allocated */
        int                     flags;
 };
 
@@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
 int    xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
                xfs_filblks_t len, struct xfs_bmbt_irec *mval,
                int *nmap, int flags);
-int    xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
-               xfs_filblks_t len, struct xfs_bmbt_irec *mval,
-               int *nmap, int flags);
 int    xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_fileoff_t bno, xfs_filblks_t len, int flags,
                xfs_fsblock_t *firstblock, xfs_extlen_t total,
@@ -202,5 +199,12 @@ int        xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
                struct xfs_defer_ops *dfops, enum shift_direction direction,
                int num_exts);
 int    xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+struct xfs_bmbt_rec_host *
+       xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
+               int fork, int *eofp, xfs_extnum_t *lastxp,
+               struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
+int    xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
+               xfs_filblks_t len, struct xfs_bmbt_irec *got,
+               struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
 
 #endif /* __XFS_BMAP_H__ */
index 0856979..aa1752f 100644 (file)
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
        struct xfs_buf          *bp0,
        bool                    force_all)
 {
-       union xfs_btree_bigkey  key;    /* keys from current level */
+       union xfs_btree_key     key;    /* keys from current level */
        union xfs_btree_key     *lkey;  /* keys from the next level up */
        union xfs_btree_key     *hkey;
        union xfs_btree_key     *nlkey; /* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
 
        trace_xfs_btree_updkeys(cur, level, bp0);
 
-       lkey = (union xfs_btree_key *)&key;
+       lkey = &key;
        hkey = xfs_btree_high_key_from_key(cur, lkey);
        xfs_btree_get_keys(cur, block, lkey);
        for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
        struct xfs_buf          *bp;    /* buffer for block */
        union xfs_btree_ptr     nptr;   /* new block ptr */
        struct xfs_btree_cur    *ncur;  /* new btree cursor */
-       union xfs_btree_bigkey  nkey;   /* new block key */
+       union xfs_btree_key     nkey;   /* new block key */
        union xfs_btree_key     *lkey;
        int                     optr;   /* old key/record index */
        int                     ptr;    /* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
        XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
 
        ncur = NULL;
-       lkey = (union xfs_btree_key *)&nkey;
+       lkey = &nkey;
 
        /*
         * If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
        union xfs_btree_ptr     nptr;   /* new block number (split result) */
        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
-       union xfs_btree_bigkey  bkey;   /* key of block to insert */
+       union xfs_btree_key     bkey;   /* key of block to insert */
        union xfs_btree_key     *key;
        union xfs_btree_rec     rec;    /* record to insert */
 
        level = 0;
        ncur = NULL;
        pcur = cur;
-       key = (union xfs_btree_key *)&bkey;
+       key = &bkey;
 
        xfs_btree_set_ptr_null(cur, &nptr);
 
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
                        fn, priv);
 }
+
+/*
+ * Calculate the number of blocks needed to store a given number of records
+ * in a short-format (per-AG metadata) btree.
+ */
+xfs_extlen_t
+xfs_btree_calc_size(
+       struct xfs_mount        *mp,
+       uint                    *limits,
+       unsigned long long      len)
+{
+       int                     level;
+       int                     maxrecs;
+       xfs_extlen_t            rval;
+
+       maxrecs = limits[0];
+       for (level = 0, rval = 0; len > 1; level++) {
+               len += maxrecs - 1;
+               do_div(len, maxrecs);
+               maxrecs = limits[1];
+               rval += len;
+       }
+       return rval;
+}
+
+int
+xfs_btree_count_blocks_helper(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       void                    *data)
+{
+       xfs_extlen_t            *blocks = data;
+       (*blocks)++;
+
+       return 0;
+}
+
+/* Count the blocks in a btree and return the result in *blocks. */
+int
+xfs_btree_count_blocks(
+       struct xfs_btree_cur    *cur,
+       xfs_extlen_t            *blocks)
+{
+       *blocks = 0;
+       return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
+                       blocks);
+}
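
A hypothetical worked example of xfs_btree_calc_size(): with limits[0] = 16 records per leaf block, limits[1] = 10 keys per node block and len = 1000 records, the loop adds ceil(1000/16) = 63 leaf blocks, then ceil(63/10) = 7 node blocks, then ceil(7/10) = 1 root block, for a total of 71 blocks.
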
index 04d0865..3f8556a 100644 (file)
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
        __be64                  l;      /* long form ptr */
 };
 
-union xfs_btree_key {
-       struct xfs_bmbt_key             bmbt;
-       xfs_bmdr_key_t                  bmbr;   /* bmbt root block */
-       xfs_alloc_key_t                 alloc;
-       struct xfs_inobt_key            inobt;
-       struct xfs_rmap_key             rmap;
-};
-
 /*
- * In-core key that holds both low and high keys for overlapped btrees.
- * The two keys are packed next to each other on disk, so do the same
- * in memory.  Preserve the existing xfs_btree_key as a single key to
- * avoid the mental model breakage that would happen if we passed a
- * bigkey into a function that operates on a single key.
+ * The in-core btree key.  Overlapping btrees actually store two keys
+ * per pointer, so we reserve enough memory to hold both.  The __*bigkey
+ * items should never be accessed directly.
  */
-union xfs_btree_bigkey {
+union xfs_btree_key {
        struct xfs_bmbt_key             bmbt;
        xfs_bmdr_key_t                  bmbr;   /* bmbt root block */
        xfs_alloc_key_t                 alloc;
        struct xfs_inobt_key            inobt;
-       struct {
-               struct xfs_rmap_key     rmap;
-               struct xfs_rmap_key     rmap_hi;
-       };
+       struct xfs_rmap_key             rmap;
+       struct xfs_rmap_key             __rmap_bigkey[2];
 };
 
 union xfs_btree_rec {
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
 uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
                                 unsigned long len);
+xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
+               unsigned long long len);
 
 /* return codes */
 #define XFS_BTREE_QUERY_RANGE_CONTINUE 0       /* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
 int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
                xfs_btree_visit_blocks_fn fn, void *data);
 
+int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
+
 #endif /* __XFS_BTREE_H__ */
index c221d0e..613c5cf 100644 (file)
  *   - For each work item attached to the log intent item,
  *     * Perform the described action.
  *     * Attach the work item to the log done item.
+ *     * If the result of doing the work was -EAGAIN, ->finish_item
+ *       wants a new transaction.  See the "Requesting a Fresh
+ *       Transaction while Finishing Deferred Work" section below for
+ *       details.
  *
  * The key here is that we must log an intent item for all pending
  * work items every time we roll the transaction, and that we must log
  * we can perform complex remapping operations, chaining intent items
  * as needed.
  *
+ * Requesting a Fresh Transaction while Finishing Deferred Work
+ *
+ * If ->finish_item decides that it needs a fresh transaction to
+ * finish the work, it must ask its caller (xfs_defer_finish) for a
+ * continuation.  The most likely cause of this circumstance is the
+ * refcount adjust functions deciding that they've logged enough items
+ * to be at risk of exceeding the transaction reservation.
+ *
+ * To get a fresh transaction, we want to log the existing log done
+ * item to prevent the log intent item from replaying, immediately log
+ * a new log intent item with the unfinished work items, roll the
+ * transaction, and re-call ->finish_item wherever it left off.  The
+ * log done item and the new log intent item must be in the same
+ * transaction or atomicity cannot be guaranteed; defer_finish ensures
+ * that this happens.
+ *
+ * This requires some coordination between ->finish_item and
+ * defer_finish.  Upon deciding to request a new transaction,
+ * ->finish_item should update the current work item to reflect the
+ * unfinished work.  Next, it should reset the log done item's list
+ * count to the number of items finished, and return -EAGAIN.
+ * defer_finish sees the -EAGAIN, logs the new log intent item
+ * with the remaining work items, and leaves the xfs_defer_pending
+ * item at the head of the dop_work queue.  Then it rolls the
+ * transaction and picks up processing where it left off.
+ * ->finish_item must take care to leave enough transaction
+ * reservation to fit the new log intent item.
+ *
  * This is an example of remapping the extent (E, E+B) into file X at
  * offset A and dealing with the extent (C, C+B) already being mapped
  * there:
  * | Intent to add rmap (X, E, A, B)                 |
  * +-------------------------------------------------+
  * | Reduce refcount for extent (C, B)               | t2
- * | Done reducing refcount for extent (C, B)        |
+ * | Done reducing refcount for extent (C, 9)        |
+ * | Intent to reduce refcount for extent (C+9, B-9) |
+ * | (ran out of space after 9 refcount updates)     |
+ * +-------------------------------------------------+
+ * | Reduce refcount for extent (C+9, B-9)           | t3
+ * | Done reducing refcount for extent (C+9, B-9)    |
  * | Increase refcount for extent (E, B)             |
  * | Done increasing refcount for extent (E, B)      |
  * | Intent to free extent (C, B)                    |
  * | Intent to free extent (F, 1) (refcountbt block) |
  * | Intent to remove rmap (F, 1, REFC)              |
  * +-------------------------------------------------+
- * | Remove rmap (X, C, A, B)                        | t3
+ * | Remove rmap (X, C, A, B)                        | t4
  * | Done removing rmap (X, C, A, B)                 |
  * | Add rmap (X, E, A, B)                           |
  * | Done adding rmap (X, E, A, B)                   |
  * | Remove rmap (F, 1, REFC)                        |
  * | Done removing rmap (F, 1, REFC)                 |
  * +-------------------------------------------------+
- * | Free extent (C, B)                              | t4
+ * | Free extent (C, B)                              | t5
  * | Done freeing extent (C, B)                      |
  * | Free extent (D, 1)                              |
  * | Done freeing extent (D, 1)                      |
  * - Intent to free extent (C, B)
  * - Intent to free extent (F, 1) (refcountbt block)
  * - Intent to remove rmap (F, 1, REFC)
+ *
+ * Note that the continuation requested between t2 and t3 is likely to
+ * reoccur.
  */
 
 static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
                        dfp->dfp_count--;
                        error = dfp->dfp_type->finish_item(*tp, dop, li,
                                        dfp->dfp_done, &state);
-                       if (error) {
+                       if (error == -EAGAIN) {
+                               /*
+                                * Caller wants a fresh transaction;
+                                * put the work item back on the list
+                                * and jump out.
+                                */
+                               list_add(li, &dfp->dfp_work);
+                               dfp->dfp_count++;
+                               break;
+                       } else if (error) {
                                /*
                                 * Clean up after ourselves and jump out.
                                 * xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
                                goto out;
                        }
                }
-               /* Done with the dfp, free it. */
-               list_del(&dfp->dfp_list);
-               kmem_free(dfp);
+               if (error == -EAGAIN) {
+                       /*
+                        * Caller wants a fresh transaction, so log a
+                        * new log intent item to replace the old one
+                        * and roll the transaction.  See "Requesting
+                        * a Fresh Transaction while Finishing
+                        * Deferred Work" above.
+                        */
+                       dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+                                       dfp->dfp_count);
+                       dfp->dfp_done = NULL;
+                       list_for_each(li, &dfp->dfp_work)
+                               dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
+                                               li);
+               } else {
+                       /* Done with the dfp, free it. */
+                       list_del(&dfp->dfp_list);
+                       kmem_free(dfp);
+               }
 
                if (cleanup_fn)
                        cleanup_fn(*tp, state, error);
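
To illustrate the -EAGAIN continuation described above, here is a small standalone model (hypothetical types and names; it does not use the real xfs_defer_op_type callbacks) of a finish routine that exhausts its per-transaction budget and hands control back to the driver loop, which re-logs the remaining work and rolls:

#include <errno.h>
#include <stdio.h>

#define BUDGET_PER_TRANS 9	/* cf. "ran out of space after 9 refcount updates" */

/* Hypothetical work item: a run of refcount updates still to apply. */
struct work_item {
	int start;
	int count;
};

/* Model of a ->finish_item-style routine: do at most BUDGET_PER_TRANS updates. */
static int finish_item(struct work_item *wi)
{
	int todo = wi->count < BUDGET_PER_TRANS ? wi->count : BUDGET_PER_TRANS;

	wi->start += todo;
	wi->count -= todo;
	if (wi->count > 0)
		return -EAGAIN;	/* caller must log a new intent and roll */
	return 0;
}

int main(void)
{
	struct work_item wi = { .start = 0, .count = 20 };	/* extent (C, B = 20) */
	int trans = 1;

	/* Model of the defer_finish loop: re-log the leftover work and roll. */
	while (finish_item(&wi) == -EAGAIN)
		printf("t%d: done item + new intent for (C+%d, %d), rolling\n",
		       trans++, wi.start, wi.count);
	printf("t%d: all work finished\n", trans);
	return 0;
}
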
index 31ca220..eab68ae 100644 (file)
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
        return xfs_free_extent(cur->bc_tp,
                        XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
-                       &oinfo);
+                       &oinfo, XFS_AG_RESV_NONE);
 }
 
 STATIC int
index a6eed43..fc5eef8 100644 (file)
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
        __uint16_t              rui_size;       /* size of this item */
        __uint32_t              rui_nextents;   /* # extents to free */
        __uint64_t              rui_id;         /* rui identifier */
-       struct xfs_map_extent   rui_extents[1]; /* array of extents to rmap */
+       struct xfs_map_extent   rui_extents[];  /* array of extents to rmap */
 };
 
+static inline size_t
+xfs_rui_log_format_sizeof(
+       unsigned int            nr)
+{
+       return sizeof(struct xfs_rui_log_format) +
+                       nr * sizeof(struct xfs_map_extent);
+}
+
 /*
  * This is the structure used to lay out an rud log item in the
  * log.  The rud_extents array is a variable size array whose
index 7575cfc..4a28fa9 100644 (file)
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
  * Update on-disk file size now that data has been written to disk.
  */
 STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
        struct xfs_inode        *ip,
        struct xfs_trans        *tp,
        xfs_off_t               offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
        return xfs_trans_commit(tp);
 }
 
+int
+xfs_setfilesize(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       size_t                  size)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+       if (error)
+               return error;
+
+       return __xfs_setfilesize(ip, tp, offset, size);
+}
+
 STATIC int
 xfs_setfilesize_ioend(
        struct xfs_ioend        *ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
                return error;
        }
 
-       return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+       return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
 /*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
 {
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
        uintptr_t               flags = (uintptr_t)private;
        int                     error = 0;
 
        trace_xfs_end_io_direct_write(ip, offset, size);
 
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
        if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
 
                error = xfs_iomap_write_unwritten(ip, offset, size);
        } else if (flags & XFS_DIO_FLAG_APPEND) {
-               struct xfs_trans *tp;
-
                trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-                               &tp);
-               if (!error)
-                       error = xfs_setfilesize(ip, tp, offset, size);
+               error = xfs_setfilesize(ip, offset, size);
        }
 
        return error;
index bf2d9a1..1950e3b 100644 (file)
@@ -62,6 +62,7 @@ int   xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 
 int    xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
                ssize_t size, void *private);
+int    xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
index 4ece4f2..e827d65 100644 (file)
@@ -182,7 +182,7 @@ xfs_bmap_rtalloc(
                                        XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
 
                /* Zero the extent if we were asked to do so */
-               if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+               if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
                        error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
                        if (error)
                                return error;
index e455f90..2975cb2 100644 (file)
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
         */
        if (bit) {
                end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
-               mask = ((1 << (end_bit - bit)) - 1) << bit;
+               mask = ((1U << (end_bit - bit)) - 1) << bit;
                *wordp |= mask;
                wordp++;
                bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
         */
        end_bit = bits_to_set - bits_set;
        if (end_bit) {
-               mask = (1 << end_bit) - 1;
+               mask = (1U << end_bit) - 1;
                *wordp |= mask;
        }
 }
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
             bp->b_last_error != bp->b_error) {
                bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
                bp->b_last_error = bp->b_error;
-               if (cfg->retry_timeout && !bp->b_first_retry_time)
+               if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+                   !bp->b_first_retry_time)
                        bp->b_first_retry_time = jiffies;
 
                xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
            ++bp->b_retries > cfg->max_retries)
                        goto permanent_error;
-       if (cfg->retry_timeout &&
+       if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
            time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
                        goto permanent_error;
 
index c263e07..162dc18 100644 (file)
@@ -384,7 +384,7 @@ restart:
                 * If this is a metadata allocation, try to reuse the busy
                 * extent instead of trimming the allocation.
                 */
-               if (!args->userdata &&
+               if (!xfs_alloc_is_userdata(args->datatype) &&
                    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
                        if (!xfs_extent_busy_update_extent(args->mp, args->pag,
                                                          busyp, fbno, flen,
index e612a02..c68517b 100644 (file)
@@ -269,6 +269,8 @@ xfs_file_dio_aio_read(
                return -EINVAL;
        }
 
+       file_accessed(iocb->ki_filp);
+
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
         * IO, we effectively serialise all new concurrent read IO to this file
@@ -323,7 +325,6 @@ xfs_file_dio_aio_read(
        }
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
-       file_accessed(iocb->ki_filp);
        return ret;
 }
 
@@ -332,10 +333,7 @@ xfs_file_dax_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
 {
-       struct address_space    *mapping = iocb->ki_filp->f_mapping;
-       struct inode            *inode = mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct iov_iter         data = *to;
+       struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        size_t                  count = iov_iter_count(to);
        ssize_t                 ret = 0;
 
@@ -345,11 +343,7 @@ xfs_file_dax_read(
                return 0; /* skip atime */
 
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-       if (ret > 0) {
-               iocb->ki_pos += ret;
-               iov_iter_advance(to, ret);
-       }
+       ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
        file_accessed(iocb->ki_filp);
@@ -711,70 +705,32 @@ xfs_file_dax_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
 {
-       struct address_space    *mapping = iocb->ki_filp->f_mapping;
-       struct inode            *inode = mapping->host;
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 ret = 0;
-       int                     unaligned_io = 0;
-       int                     iolock;
-       struct iov_iter         data;
+       int                     iolock = XFS_IOLOCK_EXCL;
+       ssize_t                 ret, error = 0;
+       size_t                  count;
+       loff_t                  pos;
 
-       /* "unaligned" here means not aligned to a filesystem block */
-       if ((iocb->ki_pos & mp->m_blockmask) ||
-           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-               unaligned_io = 1;
-               iolock = XFS_IOLOCK_EXCL;
-       } else if (mapping->nrpages) {
-               iolock = XFS_IOLOCK_EXCL;
-       } else {
-               iolock = XFS_IOLOCK_SHARED;
-       }
        xfs_rw_ilock(ip, iolock);
-
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
 
-       /*
-        * Yes, even DAX files can have page cache attached to them:  A zeroed
-        * page is inserted into the pagecache when we have to serve a write
-        * fault on a hole.  It should never be dirtied and can simply be
-        * dropped from the pagecache once we get real data for the page.
-        *
-        * XXX: This is racy against mmap, and there's nothing we can do about
-        * it. dax_do_io() should really do this invalidation internally as
-        * it will know if we've allocated over a hole for this specific IO and
-        * if so it needs to update the mapping tree and invalidate existing
-        * PTEs over the newly allocated range. Remove this invalidation when
-        * dax_do_io() is fixed up.
-        */
-       if (mapping->nrpages) {
-               loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+       pos = iocb->ki_pos;
+       count = iov_iter_count(from);
 
-               ret = invalidate_inode_pages2_range(mapping,
-                                                   iocb->ki_pos >> PAGE_SHIFT,
-                                                   end >> PAGE_SHIFT);
-               WARN_ON_ONCE(ret);
-       }
+       trace_xfs_file_dax_write(ip, count, pos);
 
-       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               iolock = XFS_IOLOCK_SHARED;
+       ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+       if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+               i_size_write(inode, iocb->ki_pos);
+               error = xfs_setfilesize(ip, pos, ret);
        }
 
-       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-       data = *from;
-       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-                       xfs_end_io_direct_write, 0);
-       if (ret > 0) {
-               iocb->ki_pos += ret;
-               iov_iter_advance(from, ret);
-       }
 out:
        xfs_rw_iunlock(ip, iolock);
-       return ret;
+       return error ? error : ret;
 }
 
 STATIC ssize_t
@@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
        } else {
                ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1503,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
index 4a33a33..043ca38 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
+#include "xfs_ag_resv.h"
 
 struct xfs_fstrm_item {
        struct xfs_mru_cache_elem       mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
                }
 
                longest = xfs_alloc_longest_free_extent(mp, pag,
-                                       xfs_alloc_min_freelist(mp, pag));
+                               xfs_alloc_min_freelist(mp, pag),
+                               xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
                if (((minlen && longest >= minlen) ||
                     (!minlen && pag->pagf_freeblks >= minfree)) &&
                    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
@@ -369,7 +371,8 @@ xfs_filestream_new_ag(
        struct xfs_mount        *mp = ip->i_mount;
        xfs_extlen_t            minlen = ap->length;
        xfs_agnumber_t          startag = 0;
-       int                     flags, err = 0;
+       int                     flags = 0;
+       int                     err = 0;
        struct xfs_mru_cache_elem *mru;
 
        *agp = NULLAGNUMBER;
@@ -385,8 +388,10 @@ xfs_filestream_new_ag(
                startag = (item->ag + 1) % mp->m_sb.sb_agcount;
        }
 
-       flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
-               (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
+       if (xfs_alloc_is_userdata(ap->datatype))
+               flags |= XFS_PICK_USERDATA;
+       if (ap->dfops->dop_low)
+               flags |= XFS_PICK_LOWSPACE;
 
        err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
 
index 0b7f986..94ac06f 100644 (file)
@@ -553,7 +553,7 @@ xfs_growfs_data_private(
                error = xfs_free_extent(tp,
                                XFS_AGB_TO_FSB(mp, agno,
                                        be32_to_cpu(agf->agf_length) - new),
-                               new, &oinfo);
+                               new, &oinfo, XFS_AG_RESV_NONE);
                if (error)
                        goto error0;
        }
index fb39a66..65b2e3f 100644 (file)
@@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag(
        struct xfs_perag *pag;
        int tagged;
 
+       /*
+        * Don't bother locking the AG and looking up in the radix trees
+        * if we already know that we have the tag set.
+        */
+       if (ip->i_flags & XFS_IEOFBLOCKS)
+               return;
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags |= XFS_IEOFBLOCKS;
+       spin_unlock(&ip->i_flags_lock);
+
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
        spin_lock(&pag->pag_ici_lock);
        trace_xfs_inode_set_eofblocks_tag(ip);
@@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag(
        struct xfs_mount *mp = ip->i_mount;
        struct xfs_perag *pag;
 
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags &= ~XFS_IEOFBLOCKS;
+       spin_unlock(&ip->i_flags_lock);
+
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
        spin_lock(&pag->pag_ici_lock);
        trace_xfs_inode_clear_eofblocks_tag(ip);
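Aside (not in the patch): the unlocked XFS_IEOFBLOCKS test added above is the usual "cheap flag check before taking locks" fast path. A minimal standalone sketch of the pattern, with invented demo_* names rather than XFS internals:

    #include <pthread.h>

    struct demo_inode {
    	pthread_spinlock_t	flags_lock;
    	unsigned int		flags;
    };

    #define DEMO_TAGGED	(1u << 0)

    /* Cheap unlocked test first; only lock and do the heavy work when needed. */
    static void demo_set_tag(struct demo_inode *ip)
    {
    	if (ip->flags & DEMO_TAGGED)		/* unlocked fast path */
    		return;

    	pthread_spin_lock(&ip->flags_lock);
    	ip->flags |= DEMO_TAGGED;
    	pthread_spin_unlock(&ip->flags_lock);

    	/* ...only now take the per-AG lock and tag the radix tree... */
    }

    int main(void)
    {
    	struct demo_inode ip = { .flags = 0 };

    	pthread_spin_init(&ip.flags_lock, PTHREAD_PROCESS_PRIVATE);
    	demo_set_tag(&ip);
    	demo_set_tag(&ip);	/* second call returns via the fast path */
    	pthread_spin_destroy(&ip.flags_lock);
    	return 0;
    }

As in the patch, the flag is only a hint to skip redundant work; the locked update still provides the real serialisation.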
index e1a411e..8f30d25 100644 (file)
@@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 #define __XFS_IPINNED_BIT      8        /* wakeup key for zero pin count */
 #define XFS_IPINNED            (1 << __XFS_IPINNED_BIT)
 #define XFS_IDONTCACHE         (1 << 9) /* don't cache the inode long term */
+#define XFS_IEOFBLOCKS         (1 << 10)/* has the preallocblocks tag set */
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
index 2af0dda..c08253e 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
 
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
-#define XFS_WRITE_IMAPS                XFS_BMAP_MAX_NMAP
 
-STATIC int
-xfs_iomap_eof_align_last_fsb(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *ip,
-       xfs_extlen_t    extsize,
-       xfs_fileoff_t   *last_fsb)
+void
+xfs_bmbt_to_iomap(
+       struct xfs_inode        *ip,
+       struct iomap            *iomap,
+       struct xfs_bmbt_irec    *imap)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (imap->br_startblock == HOLESTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_HOLE;
+       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_DELALLOC;
+       } else {
+               iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+               if (imap->br_state == XFS_EXT_UNWRITTEN)
+                       iomap->type = IOMAP_UNWRITTEN;
+               else
+                       iomap->type = IOMAP_MAPPED;
+       }
+       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static xfs_extlen_t
+xfs_eof_alignment(
+       struct xfs_inode        *ip,
+       xfs_extlen_t            extsize)
 {
-       xfs_extlen_t    align = 0;
-       int             eof, error;
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_extlen_t            align = 0;
 
        if (!XFS_IS_REALTIME_INODE(ip)) {
                /*
@@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb(
                        align = extsize;
        }
 
+       return align;
+}
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+       struct xfs_inode        *ip,
+       xfs_extlen_t            extsize,
+       xfs_fileoff_t           *last_fsb)
+{
+       xfs_extlen_t            align = xfs_eof_alignment(ip, extsize);
+
        if (align) {
                xfs_fileoff_t   new_last_fsb = roundup_64(*last_fsb, align);
+               int             eof, error;
+
                error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
                if (error)
                        return error;
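To make the alignment step above concrete: xfs_iomap_eof_align_last_fsb() rounds the last block of the mapping up to a multiple of the alignment returned by xfs_eof_alignment(). The roundup arithmetic, with made-up example values:

    #include <stdio.h>
    #include <stdint.h>

    /* Same arithmetic as a generic roundup: round x up to a multiple of y. */
    static uint64_t demo_roundup(uint64_t x, uint64_t y)
    {
    	return ((x + y - 1) / y) * y;
    }

    int main(void)
    {
    	/* e.g. a mapping ending at block 131 with a 16-block extent size hint */
    	printf("%llu\n", (unsigned long long)demo_roundup(131, 16));	/* prints 144 */
    	return 0;
    }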
@@ -154,7 +191,7 @@ xfs_iomap_write_direct(
                 */
                ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
                                                                XFS_IFEXTENTS);
-               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+               error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
                if (error)
                        goto out_unlock;
        } else {
@@ -274,130 +311,6 @@ out_trans_cancel:
        goto out_unlock;
 }
 
-/*
- * If the caller is doing a write at the end of the file, then extend the
- * allocation out to the file system's write iosize.  We clean up any extra
- * space left over when the file is closed in xfs_inactive().
- *
- * If we find we already have delalloc preallocation beyond EOF, don't do more
- * preallocation as it is not needed.
- */
-STATIC int
-xfs_iomap_eof_want_preallocate(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *ip,
-       xfs_off_t       offset,
-       size_t          count,
-       xfs_bmbt_irec_t *imap,
-       int             nimaps,
-       int             *prealloc)
-{
-       xfs_fileoff_t   start_fsb;
-       xfs_filblks_t   count_fsb;
-       int             n, error, imaps;
-       int             found_delalloc = 0;
-
-       *prealloc = 0;
-       if (offset + count <= XFS_ISIZE(ip))
-               return 0;
-
-       /*
-        * If the file is smaller than the minimum prealloc and we are using
-        * dynamic preallocation, don't do any preallocation at all as it is
-        * likely this is the only write to the file that is going to be done.
-        */
-       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
-           XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
-               return 0;
-
-       /*
-        * If there are any real blocks past eof, then don't
-        * do any speculative allocation.
-        */
-       start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
-       count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-       while (count_fsb > 0) {
-               imaps = nimaps;
-               error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
-                                      0);
-               if (error)
-                       return error;
-               for (n = 0; n < imaps; n++) {
-                       if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
-                           (imap[n].br_startblock != DELAYSTARTBLOCK))
-                               return 0;
-                       start_fsb += imap[n].br_blockcount;
-                       count_fsb -= imap[n].br_blockcount;
-
-                       if (imap[n].br_startblock == DELAYSTARTBLOCK)
-                               found_delalloc = 1;
-               }
-       }
-       if (!found_delalloc)
-               *prealloc = 1;
-       return 0;
-}
-
-/*
- * Determine the initial size of the preallocation. We are beyond the current
- * EOF here, but we need to take into account whether this is a sparse write or
- * an extending write when determining the preallocation size.  Hence we need to
- * look up the extent that ends at the current write offset and use the result
- * to determine the preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceding data extent as the basis for the
- * preallocation size. If the size of the extent is greater than half the
- * maximum extent length, then use the current offset as the basis. This ensures
- * that for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width alignment of
- * real extents.
- */
-STATIC xfs_fsblock_t
-xfs_iomap_eof_prealloc_initial_size(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_bmbt_irec_t         *imap,
-       int                     nimaps)
-{
-       xfs_fileoff_t   start_fsb;
-       int             imaps = 1;
-       int             error;
-
-       ASSERT(nimaps >= imaps);
-
-       /* if we are using a specific prealloc size, return now */
-       if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
-               return 0;
-
-       /* If the file is small, then use the minimum prealloc */
-       if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
-               return 0;
-
-       /*
-        * As we write multiple pages, the offset will always align to the
-        * start of a page and hence point to a hole at EOF. i.e. if the size is
-        * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
-        * will return FSB 1. Hence if there are blocks in the file, we want to
-        * point to the block prior to the EOF block and not the hole that maps
-        * directly at @offset.
-        */
-       start_fsb = XFS_B_TO_FSB(mp, offset);
-       if (start_fsb)
-               start_fsb--;
-       error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
-       if (error)
-               return 0;
-
-       ASSERT(imaps == 1);
-       if (imap[0].br_startblock == HOLESTARTBLOCK)
-               return 0;
-       if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
-               return imap[0].br_blockcount << 1;
-       return XFS_B_TO_FSB(mp, offset);
-}
-
 STATIC bool
 xfs_quota_need_throttle(
        struct xfs_inode *ip,
@@ -459,27 +372,76 @@ xfs_quota_calc_throttle(
 }
 
 /*
+ * If we are doing a write at the end of the file and there are no allocations
+ * past this one, then extend the allocation out to the file system's write
+ * iosize.
+ *
  * If we don't have a user specified preallocation size, dynamically increase
- * the preallocation size as the size of the file grows. Cap the maximum size
+ * the preallocation size as the size of the file grows.  Cap the maximum size
  * at a single extent or less if the filesystem is near full. The closer the
  * filesystem is to full, the smaller the maximum preallocation.
+ *
+ * As an exception we don't do any preallocation at all if the file is smaller
+ * than the minimum preallocation and we are using the default dynamic
+ * preallocation scheme, as it is likely this is the only write to the file that
+ * is going to be done.
+ *
+ * We clean up any extra space left over when the file is closed in
+ * xfs_inactive().
  */
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
-       struct xfs_mount        *mp,
        struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       struct xfs_bmbt_irec    *imap,
-       int                     nimaps)
+       loff_t                  offset,
+       loff_t                  count,
+       xfs_extnum_t            idx,
+       struct xfs_bmbt_irec    *prev)
 {
-       xfs_fsblock_t           alloc_blocks = 0;
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        int                     shift = 0;
        int64_t                 freesp;
        xfs_fsblock_t           qblocks;
        int                     qshift = 0;
+       xfs_fsblock_t           alloc_blocks = 0;
+
+       if (offset + count <= XFS_ISIZE(ip))
+               return 0;
 
-       alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
-                                                          imap, nimaps);
+       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+           (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+               return 0;
+
+       /*
+        * If an explicit allocsize is set, the file is small, or we
+        * are writing behind a hole, then use the minimum prealloc:
+        */
+       if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+           XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+           idx == 0 ||
+           prev->br_startoff + prev->br_blockcount < offset_fsb)
+               return mp->m_writeio_blocks;
+
+       /*
+        * Determine the initial size of the preallocation. We are beyond the
+        * current EOF here, but we need to take into account whether this is
+        * a sparse write or an extending write when determining the
+        * preallocation size.  Hence we need to look up the extent that ends
+        * at the current write offset and use the result to determine the
+        * preallocation size.
+        *
+        * If the extent is a hole, then preallocation is essentially disabled.
+        * Otherwise we take the size of the preceding data extent as the basis
+        * for the preallocation size. If the size of the extent is greater than
+        * half the maximum extent length, then use the current offset as the
+        * basis. This ensures that for large files the preallocation size
+        * always extends to MAXEXTLEN rather than falling short due to things
+        * like stripe unit/width alignment of real extents.
+        */
+       if (prev->br_blockcount <= (MAXEXTLEN >> 1))
+               alloc_blocks = prev->br_blockcount << 1;
+       else
+               alloc_blocks = XFS_B_TO_FSB(mp, offset);
        if (!alloc_blocks)
                goto check_writeio;
        qblocks = alloc_blocks;
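A worked example of the sizing rule in the comment above, using made-up numbers (DEMO_MAXEXTLEN stands in for XFS's MAXEXTLEN of 2^21 - 1 blocks): double the preceding extent while it is at most half the maximum extent length, otherwise fall back to the write offset in blocks.

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_MAXEXTLEN	((1u << 21) - 1)	/* stand-in for MAXEXTLEN */

    static uint64_t demo_initial_prealloc(uint64_t prev_blocks, uint64_t offset_blocks)
    {
    	if (prev_blocks <= (DEMO_MAXEXTLEN >> 1))
    		return prev_blocks << 1;	/* double the preceding extent */
    	return offset_blocks;			/* large file: size from the offset */
    }

    int main(void)
    {
    	/* small preceding extent: 8 blocks -> 16-block initial prealloc */
    	printf("%llu\n", (unsigned long long)demo_initial_prealloc(8, 1000));
    	/* huge preceding extent: prealloc tracks the offset instead */
    	printf("%llu\n", (unsigned long long)demo_initial_prealloc(1500000, 4000000));
    	return 0;
    }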
@@ -550,120 +512,145 @@ xfs_iomap_prealloc_size(
         */
        while (alloc_blocks && alloc_blocks >= freesp)
                alloc_blocks >>= 4;
-
 check_writeio:
        if (alloc_blocks < mp->m_writeio_blocks)
                alloc_blocks = mp->m_writeio_blocks;
-
        trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
                                      mp->m_writeio_blocks);
-
        return alloc_blocks;
 }
 
-int
-xfs_iomap_write_delay(
-       xfs_inode_t     *ip,
-       xfs_off_t       offset,
-       size_t          count,
-       xfs_bmbt_irec_t *ret_imap)
+static int
+xfs_file_iomap_begin_delay(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  count,
+       unsigned                flags,
+       struct iomap            *iomap)
 {
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_fileoff_t   offset_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_off_t       aligned_offset;
-       xfs_fileoff_t   ioalign;
-       xfs_extlen_t    extsz;
-       int             nimaps;
-       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-       int             prealloc;
-       int             error;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       /*
-        * Make sure that the dquots are there. This doesn't hold
-        * the ilock across a disk read.
-        */
-       error = xfs_qm_dqattach_locked(ip, 0);
-       if (error)
-               return error;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           maxbytes_fsb =
+               XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+       xfs_fileoff_t           end_fsb, orig_end_fsb;
+       int                     error = 0, eof = 0;
+       struct xfs_bmbt_irec    got;
+       struct xfs_bmbt_irec    prev;
+       xfs_extnum_t            idx;
 
-       extsz = xfs_get_extsz_hint(ip);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       ASSERT(!XFS_IS_REALTIME_INODE(ip));
+       ASSERT(!xfs_get_extsz_hint(ip));
 
-       error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-                               imap, XFS_WRITE_IMAPS, &prealloc);
-       if (error)
-               return error;
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-retry:
-       if (prealloc) {
-               xfs_fsblock_t   alloc_blocks;
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+               error = -EFSCORRUPTED;
+               goto out_unlock;
+       }
 
-               alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
-                                                      XFS_WRITE_IMAPS);
+       XFS_STATS_INC(mp, xs_blk_mapw);
 
-               aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
-               ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-               last_fsb = ioalign + alloc_blocks;
-       } else {
-               last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               if (error)
+                       goto out_unlock;
        }
 
-       if (prealloc || extsz) {
-               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
-               if (error)
-                       return error;
+       xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
+                       &got, &prev);
+       if (!eof && got.br_startoff <= offset_fsb) {
+               trace_xfs_iomap_found(ip, offset, count, 0, &got);
+               goto done;
        }
 
+       error = xfs_qm_dqattach_locked(ip, 0);
+       if (error)
+               goto out_unlock;
+
        /*
-        * Make sure preallocation does not create extents beyond the range we
-        * actually support in this filesystem.
+        * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+        * to keep the chunks of work done here somewhat symmetric with the
+        * work writeback does. This is a completely arbitrary number pulled
+        * out of thin air as a best guess for initial testing.
+        *
+        * Note that the value needs to be less than 32 bits wide until
+        * the lower level functions are updated.
         */
-       if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
-               last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+       count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+       end_fsb = orig_end_fsb =
+               min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+       if (eof) {
+               xfs_fsblock_t   prealloc_blocks;
 
-       ASSERT(last_fsb > offset_fsb);
+               prealloc_blocks =
+                       xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+               if (prealloc_blocks) {
+                       xfs_extlen_t    align;
+                       xfs_off_t       end_offset;
 
-       nimaps = XFS_WRITE_IMAPS;
-       error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
-                               imap, &nimaps, XFS_BMAPI_ENTIRE);
+                       end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
+                       end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+                               prealloc_blocks;
+
+                       align = xfs_eof_alignment(ip, 0);
+                       if (align)
+                               end_fsb = roundup_64(end_fsb, align);
+
+                       end_fsb = min(end_fsb, maxbytes_fsb);
+                       ASSERT(end_fsb > offset_fsb);
+               }
+       }
+
+retry:
+       error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
+                       end_fsb - offset_fsb, &got,
+                       &prev, &idx, eof);
        switch (error) {
        case 0:
+               break;
        case -ENOSPC:
        case -EDQUOT:
-               break;
-       default:
-               return error;
-       }
-
-       /*
-        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
-        * without EOF preallocation.
-        */
-       if (nimaps == 0) {
+               /* retry without any preallocation */
                trace_xfs_delalloc_enospc(ip, offset, count);
-               if (prealloc) {
-                       prealloc = 0;
-                       error = 0;
+               if (end_fsb != orig_end_fsb) {
+                       end_fsb = orig_end_fsb;
                        goto retry;
                }
-               return error ? error : -ENOSPC;
+               /*FALLTHRU*/
+       default:
+               goto out_unlock;
        }
 
-       if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
-               return xfs_alert_fsblock_zero(ip, &imap[0]);
-
        /*
         * Tag the inode as speculatively preallocated so we can reclaim this
         * space on demand, if necessary.
         */
-       if (prealloc)
+       if (end_fsb != orig_end_fsb)
                xfs_inode_set_eofblocks_tag(ip);
 
-       *ret_imap = imap[0];
-       return 0;
+       trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+done:
+       if (isnullstartblock(got.br_startblock))
+               got.br_startblock = DELAYSTARTBLOCK;
+
+       if (!got.br_startblock) {
+               error = xfs_alert_fsblock_zero(ip, &got);
+               if (error)
+                       goto out_unlock;
+       }
+
+       xfs_bmbt_to_iomap(ip, iomap, &got);
+
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
 }
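Two contextual notes on the new delalloc path, outside the diff itself: with 4 KiB pages the 1024 * PAGE_SIZE cap above works out to 4 MiB of delalloc reservation per call, and the function is reached from the buffered write path through the generic iomap layer (in this kernel, roughly iomap_file_buffered_write(iocb, from, &xfs_iomap_ops)). The ENOSPC/EDQUOT retry can also be modelled in a few lines of standalone C, with hypothetical numbers:

    #include <stdio.h>
    #include <errno.h>

    /* Toy model of the retry above: on ENOSPC, drop the speculative
     * preallocation (end back to orig_end) and reserve again. */
    static long demo_free_blocks = 20;	/* hypothetical free space */

    static int demo_reserve(long blocks)
    {
    	if (blocks > demo_free_blocks)
    		return -ENOSPC;
    	demo_free_blocks -= blocks;
    	return 0;
    }

    int main(void)
    {
    	long orig_end = 16, end = 64;	/* 16 blocks needed, 64 with prealloc */
    	int error;

    retry:
    	error = demo_reserve(end);
    	if (error == -ENOSPC && end != orig_end) {
    		end = orig_end;		/* retry without any preallocation */
    		goto retry;
    	}
    	printf("reserved %ld blocks, error %d\n", end, error);
    	return 0;
    }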
 
 /*
@@ -947,37 +934,13 @@ error_on_bmapi_transaction:
        return error;
 }
 
-void
-xfs_bmbt_to_iomap(
-       struct xfs_inode        *ip,
-       struct iomap            *iomap,
-       struct xfs_bmbt_irec    *imap)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       if (imap->br_startblock == HOLESTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_HOLE;
-       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_DELALLOC;
-       } else {
-               iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
-               if (imap->br_state == XFS_EXT_UNWRITTEN)
-                       iomap->type = IOMAP_UNWRITTEN;
-               else
-                       iomap->type = IOMAP_MAPPED;
-       }
-       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-}
-
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+               struct xfs_bmbt_irec *imap, int nimaps)
 {
        return !nimaps ||
                imap->br_startblock == HOLESTARTBLOCK ||
-               imap->br_startblock == DELAYSTARTBLOCK;
+               imap->br_startblock == DELAYSTARTBLOCK ||
+               (IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 
 static int
@@ -993,11 +956,18 @@ xfs_file_iomap_begin(
        struct xfs_bmbt_irec    imap;
        xfs_fileoff_t           offset_fsb, end_fsb;
        int                     nimaps = 1, error = 0;
+       unsigned                lockmode;
 
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if ((flags & IOMAP_WRITE) &&
+           !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+               return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+                               iomap);
+       }
+
+       lockmode = xfs_ilock_data_map_shared(ip);
 
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -1008,11 +978,11 @@ xfs_file_iomap_begin(
        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
                               &nimaps, XFS_BMAPI_ENTIRE);
        if (error) {
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_iunlock(ip, lockmode);
                return error;
        }
 
-       if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+       if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
                /*
                 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
                 * pages to keep the chunks of work done here somewhat symmetric
@@ -1024,27 +994,23 @@ xfs_file_iomap_begin(
                 * the lower level functions are updated.
                 */
                length = min_t(loff_t, length, 1024 * PAGE_SIZE);
-               if (xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
-                       error = xfs_iomap_write_direct(ip, offset, length, &imap,
-                                       nimaps);
-               } else {
-                       error = xfs_iomap_write_delay(ip, offset, length, &imap);
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
+               error = xfs_iomap_write_direct(ip, offset, length, &imap,
+                               nimaps);
                if (error)
                        return error;
 
+               iomap->flags = IOMAP_F_NEW;
                trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
        } else {
                ASSERT(nimaps);
 
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_iunlock(ip, lockmode);
                trace_xfs_iomap_found(ip, offset, length, 0, &imap);
        }
 
index fb8aca3..6498be4 100644 (file)
@@ -25,8 +25,6 @@ struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
                        struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
-                       struct xfs_bmbt_irec *);
 int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
                        struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
index 765f084..2b6eec5 100644 (file)
@@ -413,7 +413,8 @@ struct xlog {
        /* log record crc error injection factor */
        uint32_t                l_badcrc_factor;
 #endif
-
+       /* log recovery lsn tracking (for buffer submission) */
+       xfs_lsn_t               l_recovery_lsn;
 };
 
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
index e8638fd..846483d 100644 (file)
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_dir2.h"
 #include "xfs_rmap_item.h"
+#include "xfs_buf_item.h"
 
 #define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
 
@@ -381,6 +382,15 @@ xlog_recover_iodone(
                                                SHUTDOWN_META_IO_ERROR);
                }
        }
+
+       /*
+        * On v5 supers, a bli could be attached to update the metadata LSN.
+        * Clean it up.
+        */
+       if (bp->b_fspriv)
+               xfs_buf_item_relse(bp);
+       ASSERT(bp->b_fspriv == NULL);
+
        bp->b_iodone = NULL;
        xfs_buf_ioend(bp);
 }
@@ -2360,12 +2370,14 @@ static void
 xlog_recover_validate_buf_type(
        struct xfs_mount        *mp,
        struct xfs_buf          *bp,
-       xfs_buf_log_format_t    *buf_f)
+       xfs_buf_log_format_t    *buf_f,
+       xfs_lsn_t               current_lsn)
 {
        struct xfs_da_blkinfo   *info = bp->b_addr;
        __uint32_t              magic32;
        __uint16_t              magic16;
        __uint16_t              magicda;
+       char                    *warnmsg = NULL;
 
        /*
         * We can only do post recovery validation on items on CRC enabled
@@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type(
                        bp->b_ops = &xfs_rmapbt_buf_ops;
                        break;
                default:
-                       xfs_warn(mp, "Bad btree block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad btree block magic!";
                        break;
                }
                break;
        case XFS_BLFT_AGF_BUF:
                if (magic32 != XFS_AGF_MAGIC) {
-                       xfs_warn(mp, "Bad AGF block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGF block magic!";
                        break;
                }
                bp->b_ops = &xfs_agf_buf_ops;
                break;
        case XFS_BLFT_AGFL_BUF:
                if (magic32 != XFS_AGFL_MAGIC) {
-                       xfs_warn(mp, "Bad AGFL block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGFL block magic!";
                        break;
                }
                bp->b_ops = &xfs_agfl_buf_ops;
                break;
        case XFS_BLFT_AGI_BUF:
                if (magic32 != XFS_AGI_MAGIC) {
-                       xfs_warn(mp, "Bad AGI block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad AGI block magic!";
                        break;
                }
                bp->b_ops = &xfs_agi_buf_ops;
@@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_GDQUOT_BUF:
 #ifdef CONFIG_XFS_QUOTA
                if (magic16 != XFS_DQUOT_MAGIC) {
-                       xfs_warn(mp, "Bad DQUOT block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad DQUOT block magic!";
                        break;
                }
                bp->b_ops = &xfs_dquot_buf_ops;
@@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type(
                break;
        case XFS_BLFT_DINO_BUF:
                if (magic16 != XFS_DINODE_MAGIC) {
-                       xfs_warn(mp, "Bad INODE block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad INODE block magic!";
                        break;
                }
                bp->b_ops = &xfs_inode_buf_ops;
                break;
        case XFS_BLFT_SYMLINK_BUF:
                if (magic32 != XFS_SYMLINK_MAGIC) {
-                       xfs_warn(mp, "Bad symlink block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad symlink block magic!";
                        break;
                }
                bp->b_ops = &xfs_symlink_buf_ops;
@@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_BLOCK_BUF:
                if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
                    magic32 != XFS_DIR3_BLOCK_MAGIC) {
-                       xfs_warn(mp, "Bad dir block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir block magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_block_buf_ops;
@@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_DATA_BUF:
                if (magic32 != XFS_DIR2_DATA_MAGIC &&
                    magic32 != XFS_DIR3_DATA_MAGIC) {
-                       xfs_warn(mp, "Bad dir data magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir data magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_FREE_BUF:
                if (magic32 != XFS_DIR2_FREE_MAGIC &&
                    magic32 != XFS_DIR3_FREE_MAGIC) {
-                       xfs_warn(mp, "Bad dir3 free magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir3 free magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_free_buf_ops;
@@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_LEAF1_BUF:
                if (magicda != XFS_DIR2_LEAF1_MAGIC &&
                    magicda != XFS_DIR3_LEAF1_MAGIC) {
-                       xfs_warn(mp, "Bad dir leaf1 magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir leaf1 magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_leaf1_buf_ops;
@@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DIR_LEAFN_BUF:
                if (magicda != XFS_DIR2_LEAFN_MAGIC &&
                    magicda != XFS_DIR3_LEAFN_MAGIC) {
-                       xfs_warn(mp, "Bad dir leafn magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad dir leafn magic!";
                        break;
                }
                bp->b_ops = &xfs_dir3_leafn_buf_ops;
@@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_DA_NODE_BUF:
                if (magicda != XFS_DA_NODE_MAGIC &&
                    magicda != XFS_DA3_NODE_MAGIC) {
-                       xfs_warn(mp, "Bad da node magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad da node magic!";
                        break;
                }
                bp->b_ops = &xfs_da3_node_buf_ops;
@@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type(
        case XFS_BLFT_ATTR_LEAF_BUF:
                if (magicda != XFS_ATTR_LEAF_MAGIC &&
                    magicda != XFS_ATTR3_LEAF_MAGIC) {
-                       xfs_warn(mp, "Bad attr leaf magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad attr leaf magic!";
                        break;
                }
                bp->b_ops = &xfs_attr3_leaf_buf_ops;
                break;
        case XFS_BLFT_ATTR_RMT_BUF:
                if (magic32 != XFS_ATTR3_RMT_MAGIC) {
-                       xfs_warn(mp, "Bad attr remote magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad attr remote magic!";
                        break;
                }
                bp->b_ops = &xfs_attr3_rmt_buf_ops;
                break;
        case XFS_BLFT_SB_BUF:
                if (magic32 != XFS_SB_MAGIC) {
-                       xfs_warn(mp, "Bad SB block magic!");
-                       ASSERT(0);
+                       warnmsg = "Bad SB block magic!";
                        break;
                }
                bp->b_ops = &xfs_sb_buf_ops;
@@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type(
                         xfs_blft_from_flags(buf_f));
                break;
        }
+
+       /*
+        * Nothing else to do in the case of a NULL current LSN as this means
+        * the buffer is more recent than the change in the log and will be
+        * skipped.
+        */
+       if (current_lsn == NULLCOMMITLSN)
+               return;
+
+       if (warnmsg) {
+               xfs_warn(mp, warnmsg);
+               ASSERT(0);
+       }
+
+       /*
+        * We must update the metadata LSN of the buffer as it is written out to
+        * ensure that older transactions never replay over this one and corrupt
+        * the buffer. This can occur if log recovery is interrupted at some
+        * point after the current transaction completes, at which point a
+        * subsequent mount starts recovery from the beginning.
+        *
+        * Write verifiers update the metadata LSN from log items attached to
+        * the buffer. Therefore, initialize a bli purely to carry the LSN to
+        * the verifier. We'll clean it up in our ->iodone() callback.
+        */
+       if (bp->b_ops) {
+               struct xfs_buf_log_item *bip;
+
+               ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
+               bp->b_iodone = xlog_recover_iodone;
+               xfs_buf_item_init(bp, mp);
+               bip = bp->b_fspriv;
+               bip->bli_item.li_lsn = current_lsn;
+       }
 }
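The ordering rule behind the LSN-carrying bli above reduces to a single comparison during replay: a logged change is applied only to a buffer whose recorded metadata LSN is older than the change, and skipped otherwise (as the pass2 hunk below does). A toy model, with invented names:

    #include <stdio.h>
    #include <stdint.h>

    /* Toy model: replay a logged change only into a stale buffer. */
    static int demo_should_replay(int64_t buf_lsn, int64_t item_lsn)
    {
    	return buf_lsn < item_lsn;
    }

    int main(void)
    {
    	printf("%d\n", demo_should_replay(100, 200));	/* 1: buffer is stale */
    	printf("%d\n", demo_should_replay(200, 200));	/* 0: already up to date */
    	return 0;
    }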
 
 /*
@@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer(
        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
        struct xfs_buf          *bp,
-       xfs_buf_log_format_t    *buf_f)
+       xfs_buf_log_format_t    *buf_f,
+       xfs_lsn_t               current_lsn)
 {
        int                     i;
        int                     bit;
@@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer(
        /* Shouldn't be any more regions */
        ASSERT(i == item->ri_total);
 
-       xlog_recover_validate_buf_type(mp, bp, buf_f);
+       xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
 }
 
 /*
@@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer(
        if (log->l_quotaoffs_flag & type)
                return false;
 
-       xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+       xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
        return true;
 }
 
@@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2(
         */
        lsn = xlog_recover_get_buf_lsn(mp, bp);
        if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
-               xlog_recover_validate_buf_type(mp, bp, buf_f);
+               trace_xfs_log_recover_buf_skip(log, buf_f);
+               xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
                goto out_release;
        }
 
@@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2(
                if (!dirty)
                        goto out_release;
        } else {
-               xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+               xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
        }
 
        /*
@@ -3846,14 +3878,13 @@ STATIC int
 xlog_recover_commit_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                             error = 0;
-       int                             error2;
        int                             items_queued = 0;
        struct xlog_recover_item        *item;
        struct xlog_recover_item        *next;
-       LIST_HEAD                       (buffer_list);
        LIST_HEAD                       (ra_list);
        LIST_HEAD                       (done_list);
 
@@ -3876,7 +3907,7 @@ xlog_recover_commit_trans(
                        items_queued++;
                        if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
                                error = xlog_recover_items_pass2(log, trans,
-                                               &buffer_list, &ra_list);
+                                               buffer_list, &ra_list);
                                list_splice_tail_init(&ra_list, &done_list);
                                items_queued = 0;
                        }
@@ -3894,15 +3925,14 @@ out:
        if (!list_empty(&ra_list)) {
                if (!error)
                        error = xlog_recover_items_pass2(log, trans,
-                                       &buffer_list, &ra_list);
+                                       buffer_list, &ra_list);
                list_splice_tail_init(&ra_list, &done_list);
        }
 
        if (!list_empty(&done_list))
                list_splice_init(&done_list, &trans->r_itemq);
 
-       error2 = xfs_buf_delwri_submit(&buffer_list);
-       return error ? error : error2;
+       return error;
 }
 
 STATIC void
@@ -4085,7 +4115,8 @@ xlog_recovery_process_trans(
        char                    *dp,
        unsigned int            len,
        unsigned int            flags,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                     error = 0;
        bool                    freeit = false;
@@ -4109,7 +4140,8 @@ xlog_recovery_process_trans(
                error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
                break;
        case XLOG_COMMIT_TRANS:
-               error = xlog_recover_commit_trans(log, trans, pass);
+               error = xlog_recover_commit_trans(log, trans, pass,
+                                                 buffer_list);
                /* success or fail, we are now done with this transaction. */
                freeit = true;
                break;
@@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr(
        struct xlog_op_header   *ohead,
        char                    *dp,
        char                    *end,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        struct xlog_recover     *trans;
        unsigned int            len;
+       int                     error;
 
        /* Do we understand who wrote this op? */
        if (ohead->oh_clientid != XFS_TRANSACTION &&
@@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr(
                return 0;
        }
 
+       /*
+        * The recovered buffer queue is drained only once we know that all
+        * recovery items for the current LSN have been processed. This is
+        * required because:
+        *
+        * - Buffer write submission updates the metadata LSN of the buffer.
+        * - Log recovery skips items with a metadata LSN >= the current LSN of
+        *   the recovery item.
+        * - Separate recovery items against the same metadata buffer can share
+        *   a current LSN. I.e., consider that the LSN of a recovery item is
+        *   defined as the starting LSN of the first record in which its
+        *   transaction appears, that a record can hold multiple transactions,
+        *   and/or that a transaction can span multiple records.
+        *
+        * In other words, we are allowed to submit a buffer from log recovery
+        * once per current LSN. Otherwise, we may incorrectly skip recovery
+        * items and cause corruption.
+        *
+        * We don't know up front whether buffers are updated multiple times per
+        * LSN. Therefore, track the current LSN of each commit log record as it
+        * is processed and drain the queue when it changes. Use commit records
+        * because they are ordered correctly by the logging code.
+        */
+       if (log->l_recovery_lsn != trans->r_lsn &&
+           ohead->oh_flags & XLOG_COMMIT_TRANS) {
+               error = xfs_buf_delwri_submit(buffer_list);
+               if (error)
+                       return error;
+               log->l_recovery_lsn = trans->r_lsn;
+       }
+
        return xlog_recovery_process_trans(log, trans, dp, len,
-                                          ohead->oh_flags, pass);
+                                          ohead->oh_flags, pass, buffer_list);
 }
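A stripped-down standalone model of the drain-once-per-LSN rule described in the comment above (all names invented): buffers queued while replaying one commit LSN are only submitted once a commit record with a different LSN shows up.

    #include <stdio.h>
    #include <stdint.h>

    /* Toy model: submit queued buffers only when the commit LSN advances. */
    static int64_t recovery_lsn = -1;
    static int queued;

    static void demo_queue_buffer(void)	{ queued++; }
    static void demo_submit_queued(void)	{ printf("submit %d bufs\n", queued); queued = 0; }

    static void demo_commit_record(int64_t lsn)
    {
    	if (recovery_lsn != lsn) {	/* new LSN: drain the previous batch */
    		demo_submit_queued();
    		recovery_lsn = lsn;
    	}
    }

    int main(void)
    {
    	demo_commit_record(100);	/* first commit at LSN 100 */
    	demo_queue_buffer();		/* items for LSN 100 queue a buffer */
    	demo_commit_record(100);	/* same LSN: buffer stays queued */
    	demo_queue_buffer();
    	demo_commit_record(200);	/* LSN advanced: drain before LSN 200 items */
    	return 0;
    }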
 
 /*
@@ -4240,7 +4305,8 @@ xlog_recover_process_data(
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        char                    *dp,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        struct xlog_op_header   *ohead;
        char                    *end;
@@ -4254,6 +4320,7 @@ xlog_recover_process_data(
        if (xlog_header_check_recover(log->l_mp, rhead))
                return -EIO;
 
+       trace_xfs_log_recover_record(log, rhead, pass);
        while ((dp < end) && num_logops) {
 
                ohead = (struct xlog_op_header *)dp;
@@ -4262,7 +4329,7 @@ xlog_recover_process_data(
 
                /* errors will abort recovery */
                error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
-                                                   dp, end, pass);
+                                                  dp, end, pass, buffer_list);
                if (error)
                        return error;
 
@@ -4685,7 +4752,8 @@ xlog_recover_process(
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        char                    *dp,
-       int                     pass)
+       int                     pass,
+       struct list_head        *buffer_list)
 {
        int                     error;
        __le32                  crc;
@@ -4732,7 +4800,8 @@ xlog_recover_process(
        if (error)
                return error;
 
-       return xlog_recover_process_data(log, rhash, rhead, dp, pass);
+       return xlog_recover_process_data(log, rhash, rhead, dp, pass,
+                                        buffer_list);
 }
 
 STATIC int
@@ -4793,9 +4862,11 @@ xlog_do_recovery_pass(
        char                    *offset;
        xfs_buf_t               *hbp, *dbp;
        int                     error = 0, h_size, h_len;
+       int                     error2 = 0;
        int                     bblks, split_bblks;
        int                     hblks, split_hblks, wrapped_hblks;
        struct hlist_head       rhash[XLOG_RHASH_SIZE];
+       LIST_HEAD               (buffer_list);
 
        ASSERT(head_blk != tail_blk);
        rhead_blk = 0;
@@ -4981,7 +5052,7 @@ xlog_do_recovery_pass(
                        }
 
                        error = xlog_recover_process(log, rhash, rhead, offset,
-                                                    pass);
+                                                    pass, &buffer_list);
                        if (error)
                                goto bread_err2;
 
@@ -5012,7 +5083,8 @@ xlog_do_recovery_pass(
                if (error)
                        goto bread_err2;
 
-               error = xlog_recover_process(log, rhash, rhead, offset, pass);
+               error = xlog_recover_process(log, rhash, rhead, offset, pass,
+                                            &buffer_list);
                if (error)
                        goto bread_err2;
 
@@ -5025,10 +5097,17 @@ xlog_do_recovery_pass(
  bread_err1:
        xlog_put_bp(hbp);
 
+       /*
+        * Submit buffers that have been added from the last record processed,
+        * regardless of error status.
+        */
+       if (!list_empty(&buffer_list))
+               error2 = xfs_buf_delwri_submit(&buffer_list);
+
        if (error && first_bad)
                *first_bad = rhead_blk;
 
-       return error;
+       return error ? error : error2;
 }
 
 /*
index faeead6..56e85a6 100644 (file)
@@ -933,6 +933,20 @@ xfs_mountfs(
                goto out_rtunmount;
        }
 
+       /*
+        * Now the log is fully replayed, we can transition to full read-only
+        * mode for read-only mounts. This will sync all the metadata and clean
+        * the log so that the recovery we just performed does not have to be
+        * replayed again on the next mount.
+        *
+        * We use the same quiesce mechanism as the rw->ro remount, as they are
+        * semantically identical operations.
+        */
+       if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
+                                                       XFS_MOUNT_RDONLY) {
+               xfs_quiesce_attr(mp);
+       }
+
        /*
         * Complete the quota initialisation, post-log-replay component.
         */
index b36676c..041d949 100644 (file)
@@ -57,10 +57,16 @@ enum {
 
 #define XFS_ERR_RETRY_FOREVER  -1
 
+/*
+ * Although retry_timeout is in jiffies which is normally an unsigned long,
+ * we limit the retry timeout to 86400 seconds, or one day.  So even a
+ * signed 32-bit long is sufficient for a HZ value up to 24855.  Making it
+ * signed lets us store the special "-1" value, meaning retry forever.
+ */
 struct xfs_error_cfg {
        struct xfs_kobj kobj;
        int             max_retries;
-       unsigned long   retry_timeout;  /* in jiffies, 0 = no timeout */
+       long            retry_timeout;  /* in jiffies, -1 = infinite */
 };
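The bound quoted in the comment above checks out: with a signed 32-bit long, LONG_MAX is 2^31 - 1 = 2147483647 jiffies, and 2147483647 / 86400 = 24855 (integer division), so a one-day timeout stays representable for any HZ up to 24855. For instance:

    #include <stdio.h>

    int main(void)
    {
    	long max_jiffies = 2147483647L;	/* LONG_MAX for a 32-bit long */
    	long max_timeout_s = 86400;	/* one day */

    	printf("max HZ = %ld\n", max_jiffies / max_timeout_s);	/* 24855 */
    	return 0;
    }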
 
 typedef struct xfs_mount {
@@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
 }
 #endif
 
+/* per-AG block reservation data structures */
+enum xfs_ag_resv_type {
+       XFS_AG_RESV_NONE = 0,
+       XFS_AG_RESV_METADATA,
+       XFS_AG_RESV_AGFL,
+};
+
+struct xfs_ag_resv {
+       /* number of blocks originally reserved here */
+       xfs_extlen_t                    ar_orig_reserved;
+       /* number of blocks reserved here */
+       xfs_extlen_t                    ar_reserved;
+       /* number of blocks originally asked for */
+       xfs_extlen_t                    ar_asked;
+};
+
 /*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
@@ -372,8 +394,28 @@ typedef struct xfs_perag {
        /* for rcu-safe freeing */
        struct rcu_head rcu_head;
        int             pagb_count;     /* pagb slots in use */
+
+       /* Blocks reserved for all kinds of metadata. */
+       struct xfs_ag_resv      pag_meta_resv;
+       /* Blocks reserved for just AGFL-based metadata. */
+       struct xfs_ag_resv      pag_agfl_resv;
 } xfs_perag_t;
 
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+       struct xfs_perag        *pag,
+       enum xfs_ag_resv_type   type)
+{
+       switch (type) {
+       case XFS_AG_RESV_METADATA:
+               return &pag->pag_meta_resv;
+       case XFS_AG_RESV_AGFL:
+               return &pag->pag_agfl_resv;
+       default:
+               return NULL;
+       }
+}
+
 extern void    xfs_uuid_table_free(void);
 extern int     xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
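Based only on the declarations above, a caller selects one of the two per-AG pools by reservation type and gets NULL back for XFS_AG_RESV_NONE. A standalone model of that lookup, using demo_* stand-ins rather than the real structures:

    #include <stdio.h>

    /* Standalone model of the per-AG reservation lookup declared above. */
    enum demo_resv_type { DEMO_RESV_NONE = 0, DEMO_RESV_METADATA, DEMO_RESV_AGFL };

    struct demo_resv  { unsigned int ar_reserved; };
    struct demo_perag { struct demo_resv meta_resv, agfl_resv; };

    static struct demo_resv *
    demo_perag_resv(struct demo_perag *pag, enum demo_resv_type type)
    {
    	switch (type) {
    	case DEMO_RESV_METADATA:	return &pag->meta_resv;
    	case DEMO_RESV_AGFL:		return &pag->agfl_resv;
    	default:			return NULL;	/* NONE: no pool */
    	}
    }

    int main(void)
    {
    	struct demo_perag pag = { .meta_resv = { 64 }, .agfl_resv = { 8 } };

    	printf("%u\n", demo_perag_resv(&pag, DEMO_RESV_METADATA)->ar_reserved);
    	return 0;
    }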
index 2500f28..0432a45 100644 (file)
@@ -51,28 +51,16 @@ xfs_rui_item_free(
                kmem_zone_free(xfs_rui_zone, ruip);
 }
 
-/*
- * This returns the number of iovecs needed to log the given rui item.
- * We only need 1 iovec for an rui item.  It just logs the rui_log_format
- * structure.
- */
-static inline int
-xfs_rui_item_sizeof(
-       struct xfs_rui_log_item *ruip)
-{
-       return sizeof(struct xfs_rui_log_format) +
-                       (ruip->rui_format.rui_nextents - 1) *
-                       sizeof(struct xfs_map_extent);
-}
-
 STATIC void
 xfs_rui_item_size(
        struct xfs_log_item     *lip,
        int                     *nvecs,
        int                     *nbytes)
 {
+       struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
+
        *nvecs += 1;
-       *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip));
+       *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
 }
 
 /*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
        ruip->rui_format.rui_size = 1;
 
        xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
-                       xfs_rui_item_sizeof(ruip));
+                       xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
 }
 
 /*
@@ -205,16 +193,12 @@ xfs_rui_init(
 
 {
        struct xfs_rui_log_item         *ruip;
-       uint                            size;
 
        ASSERT(nextents > 0);
-       if (nextents > XFS_RUI_MAX_FAST_EXTENTS) {
-               size = (uint)(sizeof(struct xfs_rui_log_item) +
-                       ((nextents - 1) * sizeof(struct xfs_map_extent)));
-               ruip = kmem_zalloc(size, KM_SLEEP);
-       } else {
+       if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+               ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+       else
                ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
-       }
 
        xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
        ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
        uint                            len;
 
        src_rui_fmt = buf->i_addr;
-       len = sizeof(struct xfs_rui_log_format) +
-                       (src_rui_fmt->rui_nextents - 1) *
-                       sizeof(struct xfs_map_extent);
+       len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
 
        if (buf->i_len != len)
                return -EFSCORRUPTED;
 
-       memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len);
+       memcpy(dst_rui_fmt, src_rui_fmt, len);
        return 0;
 }
 
index aefcc3a..340c968 100644 (file)
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
        struct xfs_rui_log_format       rui_format;
 };
 
+static inline size_t
+xfs_rui_log_item_sizeof(
+       unsigned int            nr)
+{
+       return offsetof(struct xfs_rui_log_item, rui_format) +
+                       xfs_rui_log_format_sizeof(nr);
+}
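The helper above uses the standard "fixed header plus variable-length tail" sizing idiom: offsetof() up to the trailing log format, plus the format's own size for nr extents (what the xfs_rui_log_format_sizeof() calls elsewhere in this diff compute). A self-contained illustration with simplified stand-in types:

    #include <stdio.h>
    #include <stddef.h>

    /* Simplified stand-ins for the RUI structures. */
    struct demo_extent { unsigned long long start, len; };
    struct demo_format {
    	unsigned int		nextents;
    	struct demo_extent	extents[1];	/* variable-length tail */
    };
    struct demo_item {
    	int			refcount;
    	struct demo_format	format;		/* must be last */
    };

    static size_t demo_format_sizeof(unsigned int nr)
    {
    	return sizeof(struct demo_format) +
    		(nr - 1) * sizeof(struct demo_extent);
    }

    static size_t demo_item_sizeof(unsigned int nr)
    {
    	return offsetof(struct demo_item, format) + demo_format_sizeof(nr);
    }

    int main(void)
    {
    	printf("%zu\n", demo_item_sizeof(4));
    	return 0;
    }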
+
 /*
  * This is the "rmap update done" log item.  It is used to log the fact that
  * some rmapbt updates mentioned in an earlier rui item have been performed.
index fd6be45..2d092f9 100644 (file)
@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  * Note: xfs_log_quiesce() stops background log work - the callers must ensure
  * it is started again when appropriate.
  */
-static void
+void
 xfs_quiesce_attr(
        struct xfs_mount        *mp)
 {
@@ -1782,9 +1782,8 @@ xfs_init_zones(void)
        if (!xfs_rud_zone)
                goto out_destroy_icreate_zone;
 
-       xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) +
-                       ((XFS_RUI_MAX_FAST_EXTENTS - 1) *
-                               sizeof(struct xfs_map_extent))),
+       xfs_rui_zone = kmem_zone_init(
+                       xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
                        "xfs_rui_item");
        if (!xfs_rui_zone)
                goto out_destroy_rud_zone;
index 529bce9..b6418ab 100644 (file)
@@ -61,6 +61,7 @@ struct xfs_mount;
 struct xfs_buftarg;
 struct block_device;
 
+extern void xfs_quiesce_attr(struct xfs_mount *mp);
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
index 79cfd3f..5f8d55d 100644 (file)
@@ -393,9 +393,15 @@ max_retries_show(
        struct kobject  *kobject,
        char            *buf)
 {
+       int             retries;
        struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-       return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+       if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
+               retries = -1;
+       else
+               retries = cfg->max_retries;
+
+       return snprintf(buf, PAGE_SIZE, "%d\n", retries);
 }
 
 static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
        if (val < -1)
                return -EINVAL;
 
-       cfg->max_retries = val;
+       if (val == -1)
+               cfg->max_retries = XFS_ERR_RETRY_FOREVER;
+       else
+               cfg->max_retries = val;
        return count;
 }
 XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
        struct kobject  *kobject,
        char            *buf)
 {
+       int             timeout;
        struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-       return snprintf(buf, PAGE_SIZE, "%ld\n",
-                       jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+       if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+               timeout = -1;
+       else
+               timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
+
+       return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
 }
 
 static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
        if (ret)
                return ret;
 
-       /* 1 day timeout maximum */
-       if (val < 0 || val > 86400)
+       /* 1 day timeout maximum, -1 means infinite */
+       if (val < -1 || val > 86400)
                return -EINVAL;
 
-       cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+       if (val == -1)
+               cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+       else {
+               cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+               ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+       }
        return count;
 }
 XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -519,18 +538,19 @@ struct xfs_error_init {
 static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
        { .name = "default",
          .max_retries = XFS_ERR_RETRY_FOREVER,
-         .retry_timeout = 0,
+         .retry_timeout = XFS_ERR_RETRY_FOREVER,
        },
        { .name = "EIO",
          .max_retries = XFS_ERR_RETRY_FOREVER,
-         .retry_timeout = 0,
+         .retry_timeout = XFS_ERR_RETRY_FOREVER,
        },
        { .name = "ENOSPC",
          .max_retries = XFS_ERR_RETRY_FOREVER,
-         .retry_timeout = 0,
+         .retry_timeout = XFS_ERR_RETRY_FOREVER,
        },
        { .name = "ENODEV",
-         .max_retries = 0,
+         .max_retries = 0,     /* We can't recover from devices disappearing */
+         .retry_timeout = 0,
        },
 };
 
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
                        goto out_error;
 
                cfg->max_retries = init[i].max_retries;
-               cfg->retry_timeout = msecs_to_jiffies(
+               if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
+                       cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+               else
+                       cfg->retry_timeout = msecs_to_jiffies(
                                        init[i].retry_timeout * MSEC_PER_SEC);
        }
        return 0;
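
With the max_retries and retry_timeout_seconds changes above, writing -1 to either attribute now selects XFS_ERR_RETRY_FOREVER, and reads report -1 for that state. A user-space sketch of setting it; the device name and exact sysfs path below are assumptions for illustration only:

/* Sketch: request "retry forever" for one error class (path is hypothetical). */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/xfs/sda1/error/metadata/EIO/max_retries";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("-1\n", f);	/* -1 is mapped to XFS_ERR_RETRY_FOREVER above */
	return fclose(f) ? 1 : 0;
}
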
index d303a66..c6b2b1d 100644 (file)
@@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf,
 
 TRACE_EVENT(xfs_free_extent,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-                xfs_extlen_t len, bool isfl, int haveleft, int haveright),
-       TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+                xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
+                int haveright),
+       TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_agnumber_t, agno)
                __field(xfs_agblock_t, agbno)
                __field(xfs_extlen_t, len)
-               __field(int, isfl)
+               __field(int, resv)
                __field(int, haveleft)
                __field(int, haveright)
        ),
@@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent,
                __entry->agno = agno;
                __entry->agbno = agbno;
                __entry->len = len;
-               __entry->isfl = isfl;
+               __entry->resv = resv;
                __entry->haveleft = haveleft;
                __entry->haveright = haveright;
        ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+       TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->agbno,
                  __entry->len,
-                 __entry->isfl,
+                 __entry->resv,
                  __entry->haveleft ?
                        (__entry->haveright ? "both" : "left") :
                        (__entry->haveright ? "right" : "none"))
@@ -1622,8 +1623,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
                __field(short, otype)
                __field(char, wasdel)
                __field(char, wasfromfl)
-               __field(char, isfl)
-               __field(char, userdata)
+               __field(int, resv)
+               __field(int, datatype)
                __field(xfs_fsblock_t, firstblock)
        ),
        TP_fast_assign(
@@ -1643,14 +1644,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
                __entry->otype = args->otype;
                __entry->wasdel = args->wasdel;
                __entry->wasfromfl = args->wasfromfl;
-               __entry->isfl = args->isfl;
-               __entry->userdata = args->userdata;
+               __entry->resv = args->resv;
+               __entry->datatype = args->datatype;
                __entry->firstblock = args->firstblock;
        ),
        TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
                  "prod %u minleft %u total %u alignment %u minalignslop %u "
-                 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
-                 "userdata %d firstblock 0x%llx",
+                 "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
+                 "datatype 0x%x firstblock 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->agbno,
@@ -1667,8 +1668,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
                  __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
                  __entry->wasdel,
                  __entry->wasfromfl,
-                 __entry->isfl,
-                 __entry->userdata,
+                 __entry->resv,
+                 __entry->datatype,
                  (unsigned long long)__entry->firstblock)
 )
 
@@ -1984,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
+TRACE_EVENT(xfs_log_recover_record,
+       TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+       TP_ARGS(log, rhead, pass),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_lsn_t, lsn)
+               __field(int, len)
+               __field(int, num_logops)
+               __field(int, pass)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->lsn = be64_to_cpu(rhead->h_lsn);
+               __entry->len = be32_to_cpu(rhead->h_len);
+               __entry->num_logops = be32_to_cpu(rhead->h_num_logops);
+               __entry->pass = pass;
+       ),
+       TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->lsn, __entry->len, __entry->num_logops,
+                 __entry->pass)
+)
+
 DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
        TP_PROTO(struct xlog *log, struct xlog_recover *trans,
                struct xlog_recover_item *item, int pass),
@@ -1992,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
                __field(dev_t, dev)
                __field(unsigned long, item)
                __field(xlog_tid_t, tid)
+               __field(xfs_lsn_t, lsn)
                __field(int, type)
                __field(int, pass)
                __field(int, count)
@@ -2001,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
                __entry->dev = log->l_mp->m_super->s_dev;
                __entry->item = (unsigned long)item;
                __entry->tid = trans->r_log_tid;
+               __entry->lsn = trans->r_lsn;
                __entry->type = ITEM_TYPE(item);
                __entry->pass = pass;
                __entry->count = item->ri_cnt;
                __entry->total = item->ri_total;
        ),
-       TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
-                 "item region count/total %d/%d",
+       TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, "
+                 "item type %s item region count/total %d/%d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tid,
+                 __entry->lsn,
                  __entry->pass,
                  (void *)__entry->item,
                  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
@@ -2068,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
@@ -2558,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
+/* per-AG reservation */
+DECLARE_EVENT_CLASS(xfs_ag_resv_class,
+       TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
+                xfs_extlen_t len),
+       TP_ARGS(pag, resv, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(int, resv)
+               __field(xfs_extlen_t, freeblks)
+               __field(xfs_extlen_t, flcount)
+               __field(xfs_extlen_t, reserved)
+               __field(xfs_extlen_t, asked)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               struct xfs_ag_resv      *r = xfs_perag_resv(pag, resv);
+
+               __entry->dev = pag->pag_mount->m_super->s_dev;
+               __entry->agno = pag->pag_agno;
+               __entry->resv = resv;
+               __entry->freeblks = pag->pagf_freeblks;
+               __entry->flcount = pag->pagf_flcount;
+               __entry->reserved = r ? r->ar_reserved : 0;
+               __entry->asked = r ? r->ar_asked : 0;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->resv,
+                 __entry->freeblks,
+                 __entry->flcount,
+                 __entry->reserved,
+                 __entry->asked,
+                 __entry->len)
+)
+#define DEFINE_AG_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_ag_resv_class, name, \
+       TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
+                xfs_extlen_t len), \
+       TP_ARGS(pag, type, len))
+
+/* per-AG reservation tracepoints */
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
+DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
+DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
index 5f3d33d..70f42ea 100644 (file)
@@ -217,7 +217,7 @@ undo_log:
 
 undo_blocks:
        if (blocks > 0) {
-               xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+               xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
                tp->t_blk_res = 0;
        }
 
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
                 * in-core superblock's counter.  This should only
                 * be applied to the on-disk superblock.
                 */
-               ASSERT(delta < 0);
                tp->t_res_fdblocks_delta += delta;
                if (xfs_sb_version_haslazysbcount(&mp->m_sb))
                        flags &= ~XFS_TRANS_SB_DIRTY;
index 459ddec..ab43864 100644 (file)
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
 
        trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
 
-       error = xfs_free_extent(tp, start_block, ext_len, oinfo);
+       error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+                       XFS_AG_RESV_NONE);
 
        /*
         * Mark the transaction dirty, even on error. This ensures the
index ea62245..6290093 100644 (file)
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
        arraytop = context->count + prefix_len + namelen + 1;
        if (arraytop > context->firstu) {
                context->count = -1;    /* insufficient space */
+               context->seen_enough = 1;
                return 0;
        }
        offset = (char *)context->alist + context->count;
index 19b698e..002f092 100644 (file)
 #include <linux/kvm.h>
 #include <linux/irqreturn.h>
 #include <linux/spinlock.h>
+#include <linux/static_key.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
 #include <linux/list.h>
+#include <linux/jump_label.h>
 
 #define VGIC_V3_MAX_CPUS       255
 #define VGIC_V2_MAX_CPUS       8
@@ -49,6 +51,9 @@ struct vgic_global {
        /* Physical address of vgic virtual cpu interface */
        phys_addr_t             vcpu_base;
 
+       /* GICV mapping */
+       void __iomem            *vcpu_base_va;
+
        /* virtual control interface mapping */
        void __iomem            *vctrl_base;
 
@@ -63,6 +68,9 @@ struct vgic_global {
 
        /* Only needed for the legacy KVM_CREATE_IRQCHIP */
        bool                    can_emulate_gicv2;
+
+       /* GIC system register CPU interface */
+       struct static_key_false gicv3_cpuif;
 };
 
 extern struct vgic_global kvm_vgic_global_state;
@@ -217,7 +225,6 @@ struct vgic_v2_cpu_if {
 };
 
 struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        u32             vgic_hcr;
        u32             vgic_vmcr;
        u32             vgic_sre;       /* Restored only, change ignored */
@@ -227,7 +234,6 @@ struct vgic_v3_cpu_if {
        u32             vgic_ap0r[4];
        u32             vgic_ap1r[4];
        u64             vgic_lr[VGIC_V3_MAX_LRS];
-#endif
 };
 
 struct vgic_cpu {
@@ -265,6 +271,8 @@ struct vgic_cpu {
        bool lpis_enabled;
 };
 
+extern struct static_key_false vgic_v2_cpuif_trap;
+
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
@@ -294,13 +302,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
 
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
index 2b08e79..09751d3 100644 (file)
 
 #include <linux/types.h>
 
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupts.
+ */
+struct amd_iommu_pi_data {
+       u32 ga_tag;
+       u32 prev_ga_tag;
+       u64 base;
+       bool is_guest_mode;
+       struct vcpu_data *vcpu_data;
+       void *ir_data;
+};
+
 #ifdef CONFIG_AMD_IOMMU
 
 struct task_struct;
@@ -168,11 +182,34 @@ typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid);
 
 extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
                                           amd_iommu_invalidate_ctx cb);
-
-#else
+#else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
 
-#endif
+#endif /* CONFIG_AMD_IOMMU */
+
+#if defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP)
+
+/* IOMMU AVIC Function */
+extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
+
+extern int
+amd_iommu_update_ga(int cpu, bool is_run, void *data);
+
+#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
+
+static inline int
+amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+{
+       return 0;
+}
+
+static inline int
+amd_iommu_update_ga(int cpu, bool is_run, void *data)
+{
+       return 0;
+}
+
+#endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
 
 #endif /* _ASM_X86_AMD_IOMMU_H */
index a4414a1..440a721 100644 (file)
@@ -644,6 +644,7 @@ struct cgroup_namespace {
        atomic_t                count;
        struct ns_common        ns;
        struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
        struct css_set          *root_cset;
 };
 
index 7b6c446..9b207a8 100644 (file)
@@ -43,6 +43,8 @@ enum cpuhp_state {
        CPUHP_CPUIDLE_COUPLED_PREPARE,
        CPUHP_POWERPC_PMAC_PREPARE,
        CPUHP_POWERPC_MMU_CTX_PREPARE,
+       CPUHP_XEN_PREPARE,
+       CPUHP_XEN_EVTCHN_PREPARE,
        CPUHP_NOTIFY_PREPARE,
        CPUHP_ARM_SHMOBILE_SCU_PREPARE,
        CPUHP_SH_SH3X_PREPARE,
@@ -114,6 +116,7 @@ enum cpuhp_state {
        CPUHP_AP_PERF_S390_SF_ONLINE,
        CPUHP_AP_PERF_ARM_CCI_ONLINE,
        CPUHP_AP_PERF_ARM_CCN_ONLINE,
+       CPUHP_AP_PERF_ARM_L2X0_ONLINE,
        CPUHP_AP_WORKQUEUE_ONLINE,
        CPUHP_AP_RCUTREE_ONLINE,
        CPUHP_AP_NOTIFY_ONLINE,
index 9c6dc77..add6c4b 100644 (file)
@@ -6,13 +6,19 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+struct iomap_ops;
+
 /* We use lowest available exceptional entry bit for locking */
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+               struct iomap_ops *ops);
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
                  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       struct iomap_ops *ops);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
index 4c02c65..422630b 100644 (file)
@@ -100,6 +100,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_CRC_RECOVERY_FLAG   0x00000040
 #define CP_FASTBOOT_FLAG       0x00000020
 #define CP_FSCK_FLAG           0x00000010
 #define CP_ERROR_FLAG          0x00000008
index 6f93ac4..b3d34d3 100644 (file)
@@ -794,7 +794,9 @@ struct ftrace_ret_stack {
        unsigned long ret;
        unsigned long func;
        unsigned long long calltime;
+#ifdef CONFIG_FUNCTION_PROFILER
        unsigned long long subtime;
+#endif
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        unsigned long fp;
 #endif
index dca7bf8..4ec2c9b 100644 (file)
@@ -3,11 +3,34 @@
 
 
 #ifdef CONFIG_FTRACE_NMI_ENTER
-extern void ftrace_nmi_enter(void);
-extern void ftrace_nmi_exit(void);
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
 #else
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
 #endif
 
+#ifdef CONFIG_HWLAT_TRACER
+extern bool trace_hwlat_callback_enabled;
+extern void trace_hwlat_callback(bool enter);
+#endif
+
+static inline void ftrace_nmi_enter(void)
+{
+#ifdef CONFIG_HWLAT_TRACER
+       if (trace_hwlat_callback_enabled)
+               trace_hwlat_callback(true);
+#endif
+       arch_ftrace_nmi_enter();
+}
+
+static inline void ftrace_nmi_exit(void)
+{
+       arch_ftrace_nmi_exit();
+#ifdef CONFIG_HWLAT_TRACER
+       if (trace_hwlat_callback_enabled)
+               trace_hwlat_callback(false);
+#endif
+}
+
 #endif /* _LINUX_FTRACE_IRQ_H */
index 174f43f..c05216a 100644 (file)
@@ -245,7 +245,7 @@ static inline struct team_port *team_get_port_by_index(struct team *team,
        return NULL;
 }
 
-static inline int team_num_to_port_index(struct team *team, int num)
+static inline int team_num_to_port_index(struct team *team, unsigned int num)
 {
        int en_port_count = ACCESS_ONCE(team->en_port_count);
 
index 3d70ece..e63e288 100644 (file)
@@ -22,6 +22,8 @@ struct vm_fault;
  * Flags for iomap mappings:
  */
 #define IOMAP_F_MERGED 0x01    /* contains multiple blocks/extents */
+#define IOMAP_F_SHARED 0x02    /* block shared with another file */
+#define IOMAP_F_NEW    0x04    /* blocks have been newly allocated */
 
 /*
  * Magic value for blkno:
@@ -64,6 +66,8 @@ struct iomap_ops {
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
                struct iomap_ops *ops);
+int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+               struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
                bool *did_zero, struct iomap_ops *ops);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
index d10e54f..848e579 100644 (file)
@@ -58,6 +58,7 @@ struct ipc_namespace {
 
        /* user_ns which owns the ipc ns */
        struct user_namespace *user_ns;
+       struct ucounts *ucounts;
 
        struct ns_common ns;
 };
index 9c28b4d..01c0b9c 100644 (file)
@@ -265,6 +265,7 @@ struct kvm_vcpu {
 #endif
        bool preempted;
        struct kvm_vcpu_arch arch;
+       struct dentry *debugfs_dentry;
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -749,6 +750,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
+bool kvm_arch_has_vcpu_debugfs(void);
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
 int kvm_arch_hardware_enable(void);
 void kvm_arch_hardware_disable(void);
 int kvm_arch_hardware_setup(void);
index 54a594d..1172cce 100644 (file)
@@ -96,4 +96,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern dev_t name_to_dev_t(const char *name);
 
+extern unsigned int sysctl_mount_max;
+
 #endif /* _LINUX_MOUNT_H */
index 918b117..34cce96 100644 (file)
@@ -40,6 +40,7 @@ struct pid_namespace {
        struct fs_pin *bacct;
 #endif
        struct user_namespace *user_ns;
+       struct ucounts *ucounts;
        struct work_struct proc_work;
        kgid_t pid_gid;
        int hide_pid;
index de0e771..12cb8bd 100644 (file)
@@ -18,6 +18,8 @@ struct proc_ns_operations {
        struct ns_common *(*get)(struct task_struct *task);
        void (*put)(struct ns_common *ns);
        int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+       struct user_namespace *(*owner)(struct ns_common *ns);
+       struct ns_common *(*get_parent)(struct ns_common *ns);
 };
 
 extern const struct proc_ns_operations netns_operations;
index 899e95e..92013cc 100644 (file)
 #ifndef _LINUX_PSTORE_H
 #define _LINUX_PSTORE_H
 
-#include <linux/time.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
 #include <linux/kmsg_dump.h>
 #include <linux/mutex.h>
-#include <linux/types.h>
 #include <linux/spinlock.h>
-#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/types.h>
 
 /* types */
 enum pstore_type_id {
@@ -68,13 +69,21 @@ struct pstore_info {
                        enum kmsg_dump_reason reason, u64 *id,
                        unsigned int part, const char *buf, bool compressed,
                        size_t size, struct pstore_info *psi);
+       int             (*write_buf_user)(enum pstore_type_id type,
+                       enum kmsg_dump_reason reason, u64 *id,
+                       unsigned int part, const char __user *buf,
+                       bool compressed, size_t size, struct pstore_info *psi);
        int             (*erase)(enum pstore_type_id type, u64 id,
                        int count, struct timespec time,
                        struct pstore_info *psi);
        void            *data;
 };
 
-#define        PSTORE_FLAGS_FRAGILE    1
+#define PSTORE_FLAGS_DMESG     (1 << 0)
+#define PSTORE_FLAGS_FRAGILE   PSTORE_FLAGS_DMESG
+#define PSTORE_FLAGS_CONSOLE   (1 << 1)
+#define PSTORE_FLAGS_FTRACE    (1 << 2)
+#define PSTORE_FLAGS_PMSG      (1 << 3)
 
 extern int pstore_register(struct pstore_info *);
 extern void pstore_unregister(struct pstore_info *);
index 4660aaa..c668c86 100644 (file)
 #ifndef __LINUX_PSTORE_RAM_H__
 #define __LINUX_PSTORE_RAM_H__
 
+#include <linux/compiler.h>
 #include <linux/device.h>
+#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/types.h>
-#include <linux/init.h>
 
 struct persistent_ram_buffer;
 struct rs_control;
@@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz);
 void persistent_ram_zap(struct persistent_ram_zone *prz);
 
 int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
-       unsigned int count);
+                        unsigned int count);
+int persistent_ram_write_user(struct persistent_ram_zone *prz,
+                             const void __user *s, unsigned int count);
 
 void persistent_ram_save_old(struct persistent_ram_zone *prz);
 size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
index ecc3e07..adf4e51 100644 (file)
@@ -158,8 +158,7 @@ struct ctl_table_set {
 
 struct ctl_table_root {
        struct ctl_table_set default_set;
-       struct ctl_table_set *(*lookup)(struct ctl_table_root *root,
-                                          struct nsproxy *namespaces);
+       struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
        void (*set_ownership)(struct ctl_table_header *head,
                              struct ctl_table *table,
                              kuid_t *uid, kgid_t *gid);
index 9217169..eb209d4 100644 (file)
@@ -22,6 +22,19 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
+struct ucounts;
+
+enum ucount_type {
+       UCOUNT_USER_NAMESPACES,
+       UCOUNT_PID_NAMESPACES,
+       UCOUNT_UTS_NAMESPACES,
+       UCOUNT_IPC_NAMESPACES,
+       UCOUNT_NET_NAMESPACES,
+       UCOUNT_MNT_NAMESPACES,
+       UCOUNT_CGROUP_NAMESPACES,
+       UCOUNT_COUNTS,
+};
+
 struct user_namespace {
        struct uid_gid_map      uid_map;
        struct uid_gid_map      gid_map;
@@ -39,10 +52,30 @@ struct user_namespace {
        struct key              *persistent_keyring_register;
        struct rw_semaphore     persistent_keyring_register_sem;
 #endif
+       struct work_struct      work;
+#ifdef CONFIG_SYSCTL
+       struct ctl_table_set    set;
+       struct ctl_table_header *sysctls;
+#endif
+       struct ucounts          *ucounts;
+       int ucount_max[UCOUNT_COUNTS];
+};
+
+struct ucounts {
+       struct hlist_node node;
+       struct user_namespace *ns;
+       kuid_t uid;
+       atomic_t count;
+       atomic_t ucount[UCOUNT_COUNTS];
 };
 
 extern struct user_namespace init_user_ns;
 
+bool setup_userns_sysctls(struct user_namespace *ns);
+void retire_userns_sysctls(struct user_namespace *ns);
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+
 #ifdef CONFIG_USER_NS
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -54,12 +87,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 
 extern int create_user_ns(struct cred *new);
 extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
-extern void free_user_ns(struct user_namespace *ns);
+extern void __put_user_ns(struct user_namespace *ns);
 
 static inline void put_user_ns(struct user_namespace *ns)
 {
        if (ns && atomic_dec_and_test(&ns->count))
-               free_user_ns(ns);
+               __put_user_ns(ns);
 }
 
 struct seq_operations;
@@ -73,6 +106,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t,
 extern int proc_setgroups_show(struct seq_file *m, void *v);
 extern bool userns_may_setgroups(const struct user_namespace *ns);
 extern bool current_in_userns(const struct user_namespace *target_ns);
+
+struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -106,6 +141,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns)
 {
        return true;
 }
+
+static inline struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+       return ERR_PTR(-EPERM);
+}
 #endif
 
 #endif /* _LINUX_USER_H */
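
The inc_ucount()/dec_ucount() pair declared above implements per-user, per-type namespace limits; the ipc, pid and cgroup namespace hunks later in this pull all follow the same create/destroy pairing. A condensed kernel-side sketch of that pattern (the namespace type chosen here is only an example):

/* Condensed sketch of the pattern used by the namespace hunks below. */
static struct uts_namespace *example_create_ns(struct user_namespace *user_ns)
{
	struct ucounts *ucounts;
	struct uts_namespace *ns;

	ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_UTS_NAMESPACES);
	if (!ucounts)
		return ERR_PTR(-ENOSPC);	/* per-user limit reached */

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns) {
		dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
		return ERR_PTR(-ENOMEM);
	}
	ns->ucounts = ucounts;	/* dropped via dec_ucount() in the free path */
	return ns;
}
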
index 5093f58..60f0bb8 100644 (file)
@@ -24,6 +24,7 @@ struct uts_namespace {
        struct kref kref;
        struct new_utsname name;
        struct user_namespace *user_ns;
+       struct ucounts *ucounts;
        struct ns_common ns;
 };
 extern struct uts_namespace init_uts_ns;
index 0933c74..fc4f757 100644 (file)
@@ -60,6 +60,7 @@ struct net {
        struct list_head        exit_list;      /* Use only net_mutex */
 
        struct user_namespace   *user_ns;       /* Owning user namespace */
+       struct ucounts          *ucounts;
        spinlock_t              nsid_lock;
        struct idr              netns_ids;
 
index ff95fd0..903a091 100644 (file)
@@ -58,16 +58,12 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 #define F2FS_BIO_FLAG_MASK(t)  (t & (REQ_RAHEAD | WRITE_FLUSH_FUA))
 #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO))
 
-#define show_bio_type(op, op_flags) show_bio_op(op),                   \
-                       show_bio_op_flags(op_flags), show_bio_extra(op_flags)
-
-#define show_bio_op(op)                                                        \
-       __print_symbolic(op,                                            \
-               { READ,                 "READ" },                       \
-               { WRITE,                "WRITE" })
+#define show_bio_type(op_flags)        show_bio_op_flags(op_flags),            \
+                                               show_bio_extra(op_flags)
 
 #define show_bio_op_flags(flags)                                       \
        __print_symbolic(F2FS_BIO_FLAG_MASK(flags),                     \
+               { 0,                    "WRITE" },                      \
                { REQ_RAHEAD,           "READAHEAD" },                  \
                { READ_SYNC,            "READ_SYNC" },                  \
                { WRITE_SYNC,           "WRITE_SYNC" },                 \
@@ -754,12 +750,12 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
        ),
 
        TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
-               "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%si%s, type = %s",
+               "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s",
                show_dev_ino(__entry),
                (unsigned long)__entry->index,
                (unsigned long long)__entry->old_blkaddr,
                (unsigned long long)__entry->new_blkaddr,
-               show_bio_type(__entry->op, __entry->op_flags),
+               show_bio_type(__entry->op_flags),
                show_block_type(__entry->type))
 );
 
@@ -806,9 +802,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio,
                __entry->size           = bio->bi_iter.bi_size;
        ),
 
-       TP_printk("dev = (%d,%d), %s%s%s, %s, sector = %lld, size = %u",
+       TP_printk("dev = (%d,%d), rw = %s%s, %s, sector = %lld, size = %u",
                show_dev(__entry),
-               show_bio_type(__entry->op, __entry->op_flags),
+               show_bio_type(__entry->op_flags),
                show_block_type(__entry->type),
                (unsigned long long)__entry->sector,
                __entry->size)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
new file mode 100644 (file)
index 0000000..3af6172
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef __LINUX_NSFS_H
+#define __LINUX_NSFS_H
+
+#include <linux/ioctl.h>
+
+#define NSIO   0xb7
+
+/* Returns a file descriptor that refers to an owning user namespace */
+#define NS_GET_USERNS  _IO(NSIO, 0x1)
+/* Returns a file descriptor that refers to a parent namespace */
+#define NS_GET_PARENT  _IO(NSIO, 0x2)
+
+#endif /* __LINUX_NSFS_H */
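
Both ioctls operate on an open namespace file descriptor (for example one obtained from /proc/<pid>/ns/) and return a new descriptor for the owning user namespace or the parent namespace, matching the .owner and .get_parent operations added to proc_ns_operations. A minimal user-space sketch, assuming this uapi header is installed:

/* Sketch: find the user namespace that owns the current pid namespace. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nsfs.h>

int main(void)
{
	int nsfd = open("/proc/self/ns/pid", O_RDONLY);
	int ownerfd;

	if (nsfd < 0) {
		perror("open");
		return 1;
	}
	ownerfd = ioctl(nsfd, NS_GET_USERNS);	/* new fd for the owning user ns */
	if (ownerfd < 0)
		perror("ioctl(NS_GET_USERNS)");
	else
		close(ownerfd);
	close(nsfd);
	return 0;
}
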
index 0c0e3ef..f0f0252 100644 (file)
@@ -38,8 +38,7 @@ extern enum xen_domain_type xen_domain_type;
  */
 #include <xen/features.h>
 #define xen_pvh_domain() (xen_pv_domain() && \
-                         xen_feature(XENFEAT_auto_translated_physmap) && \
-                         xen_have_vector_callback)
+                         xen_feature(XENFEAT_auto_translated_physmap))
 #else
 #define xen_pvh_domain()       (0)
 #endif
index d87e6ba..0abdea4 100644 (file)
 
 #include "util.h"
 
+static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
+}
+
+static void dec_ipc_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
+}
+
 static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                                           struct ipc_namespace *old_ns)
 {
        struct ipc_namespace *ns;
+       struct ucounts *ucounts;
        int err;
 
+       err = -ENOSPC;
+       ucounts = inc_ipc_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
        ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
        if (ns == NULL)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
 
        err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
        ns->ns.ops = &ipcns_operations;
 
        atomic_set(&ns->count, 1);
        ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
 
        err = mq_init_ns(ns);
-       if (err) {
-               put_user_ns(ns->user_ns);
-               ns_free_inum(&ns->ns);
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_put;
 
        sem_init_ns(ns);
        msg_init_ns(ns);
        shm_init_ns(ns);
 
        return ns;
+
+fail_put:
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_ipc_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
 }
 
 struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -96,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
        msg_exit_ns(ns);
        shm_exit_ns(ns);
 
+       dec_ipc_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
@@ -165,10 +188,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
        return 0;
 }
 
+static struct user_namespace *ipcns_owner(struct ns_common *ns)
+{
+       return to_ipc_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations ipcns_operations = {
        .name           = "ipc",
        .type           = CLONE_NEWIPC,
        .get            = ipcns_get,
        .put            = ipcns_put,
        .install        = ipcns_install,
+       .owner          = ipcns_owner,
 };
index e2ec54e..eb26e12 100644 (file)
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
            extable.o params.o \
            kthread.o sys_ni.o nsproxy.o \
            notifier.o ksysfs.o cred.o reboot.o \
-           async.o range.o smpboot.o
+           async.o range.o smpboot.o ucount.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
index 9ba2831..4406615 100644 (file)
@@ -6328,6 +6328,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 /* cgroup namespaces */
 
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
 static struct cgroup_namespace *alloc_cgroup_ns(void)
 {
        struct cgroup_namespace *new_ns;
@@ -6349,6 +6359,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
 void free_cgroup_ns(struct cgroup_namespace *ns)
 {
        put_css_set(ns->root_cset);
+       dec_cgroup_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
@@ -6360,6 +6371,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct cgroup_namespace *old_ns)
 {
        struct cgroup_namespace *new_ns;
+       struct ucounts *ucounts;
        struct css_set *cset;
 
        BUG_ON(!old_ns);
@@ -6373,6 +6385,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
+       ucounts = inc_cgroup_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
        /* It is not safe to take cgroup_mutex here */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
@@ -6382,10 +6398,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
        new_ns = alloc_cgroup_ns();
        if (IS_ERR(new_ns)) {
                put_css_set(cset);
+               dec_cgroup_namespaces(ucounts);
                return new_ns;
        }
 
        new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
        new_ns->root_cset = cset;
 
        return new_ns;
@@ -6436,12 +6454,18 @@ static void cgroupns_put(struct ns_common *ns)
        put_cgroup_ns(to_cg_ns(ns));
 }
 
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+       return to_cg_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations cgroupns_operations = {
        .name           = "cgroup",
        .type           = CLONE_NEWCGROUP,
        .get            = cgroupns_get,
        .put            = cgroupns_put,
        .install        = cgroupns_install,
+       .owner          = cgroupns_owner,
 };
 
 static __init int cgroup_namespaces_init(void)
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644 (file)
index 0000000..8d96437
--- /dev/null
@@ -0,0 +1,32 @@
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_BLK_DEV=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_TTY=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_BINFMT_ELF=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_9P_FS=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
index c060c7e..9a05bd9 100644 (file)
@@ -418,6 +418,7 @@ int arch_task_struct_size __read_mostly;
 
 void __init fork_init(void)
 {
+       int i;
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
@@ -437,6 +438,10 @@ void __init fork_init(void)
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
+
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               init_user_ns.ucount_max[i] = max_threads/2;
+       }
 }
 
 int __weak arch_dup_task_struct(struct task_struct *dst,
index a65ba13..df9e8e9 100644 (file)
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
 #define MAX_PID_NS_LEVEL 32
 
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
+}
+
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
 static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
        struct pid_namespace *parent_pid_ns)
 {
        struct pid_namespace *ns;
        unsigned int level = parent_pid_ns->level + 1;
+       struct ucounts *ucounts;
        int i;
        int err;
 
-       if (level > MAX_PID_NS_LEVEL) {
-               err = -EINVAL;
+       err = -ENOSPC;
+       if (level > MAX_PID_NS_LEVEL)
+               goto out;
+       ucounts = inc_pid_namespaces(user_ns);
+       if (!ucounts)
                goto out;
-       }
 
        err = -ENOMEM;
        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
        if (ns == NULL)
-               goto out;
+               goto out_dec;
 
        ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
        if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
        ns->level = level;
        ns->parent = get_pid_ns(parent_pid_ns);
        ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
        ns->nr_hashed = PIDNS_HASH_ADDING;
        INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
@@ -129,6 +143,8 @@ out_free_map:
        kfree(ns->pidmap[0].page);
 out_free:
        kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+       dec_pid_namespaces(ucounts);
 out:
        return ERR_PTR(err);
 }
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
        ns_free_inum(&ns->ns);
        for (i = 0; i < PIDMAP_ENTRIES; i++)
                kfree(ns->pidmap[i].page);
+       dec_pid_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        call_rcu(&ns->rcu, delayed_free_pidns);
 }
@@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
        return 0;
 }
 
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+       struct pid_namespace *active = task_active_pid_ns(current);
+       struct pid_namespace *pid_ns, *p;
+
+       /* See if the parent is in the current namespace */
+       pid_ns = p = to_pid_ns(ns)->parent;
+       for (;;) {
+               if (!p)
+                       return ERR_PTR(-EPERM);
+               if (p == active)
+                       break;
+               p = p->parent;
+       }
+
+       return &get_pid_ns(pid_ns)->ns;
+}
+
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+       return to_pid_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations pidns_operations = {
        .name           = "pid",
        .type           = CLONE_NEWPID,
        .get            = pidns_get,
        .put            = pidns_put,
        .install        = pidns_install,
+       .owner          = pidns_owner,
+       .get_parent     = pidns_get_parent,
 };
 
 static __init int pid_namespaces_init(void)
index a13bbda..a43775c 100644 (file)
@@ -65,6 +65,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
 #include <linux/bpf.h>
+#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax,
        },
+       {
+               .procname       = "mount-max",
+               .data           = &sysctl_mount_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+       },
        { }
 };
 
index ba33267..2a96b06 100644 (file)
@@ -216,6 +216,41 @@ config SCHED_TRACER
          This tracer tracks the latency of the highest priority task
          to be scheduled in, starting from the point it has woken up.
 
+config HWLAT_TRACER
+       bool "Tracer to detect hardware latencies (like SMIs)"
+       select GENERIC_TRACER
+       help
+        This tracer, when enabled, will create one or more kernel threads,
+        depending on what the cpumask file is set to, with each thread
+        spinning in a loop looking for interruptions caused by
+        something other than the kernel. For example, if a
+        System Management Interrupt (SMI) takes a noticeable amount of
+        time, this tracer will detect it. This is useful for testing
+        if a system is reliable for Real Time tasks.
+
+        Some files are created in the tracing directory when this
+        is enabled:
+
+          hwlat_detector/width   - time in usecs to spin in each iteration
+          hwlat_detector/window  - time in usecs between the start of each
+                                    iteration
+
+        A kernel thread is created that will spin with interrupts disabled
+        for "width" microseconds in every "window" cycle. It will not spin
+        for "window - width" microseconds, where the system can
+        continue to operate.
+
+        The output will appear in the trace and trace_pipe files.
+
+        When the tracer is not running, it has no effect on the system,
+        but when it is running, it can cause the system to be
+        periodically non-responsive. Do not run this tracer on a
+        production system.
+
+        To enable this tracer, echo "hwlat" into the current_tracer
+        file. Every time a latency is greater than tracing_thresh, it will
+        be recorded into the ring buffer.
+
 config ENABLE_DEFAULT_TRACERS
        bool "Trace process context switches and events"
        depends on !GENERIC_TRACER
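
The HWLAT_TRACER help text above describes the run-time interface: select "hwlat" in current_tracer, set tracing_thresh (in usecs), and tune hwlat_detector/width and hwlat_detector/window. A user-space sketch of enabling it; the tracefs mount point below is an assumption and varies between systems:

/* Sketch: enable the hwlat tracer with a 20 usec reporting threshold.
 * Assumes tracefs is mounted at /sys/kernel/debug/tracing. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/tracing";
	char path[256];

	snprintf(path, sizeof(path), "%s/tracing_thresh", dir);
	write_str(path, "20\n");	/* record latencies above 20 usecs */

	snprintf(path, sizeof(path), "%s/current_tracer", dir);
	return write_str(path, "hwlat\n") ? 1 : 0;
}
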
index d0a1617..992ab9d 100644 (file)
@@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
index 84752c8..2050a76 100644 (file)
@@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int profile_graph_entry(struct ftrace_graph_ent *trace)
 {
+       int index = trace->depth;
+
        function_profile_call(trace->func, 0, NULL, NULL);
+
+       if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
+               current->ret_stack[index].subtime = 0;
+
        return 1;
 }
 
index 37824d9..8696ce6 100644 (file)
@@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)
  *
  * Shows real state of the ring buffer if it is enabled or not.
  */
-static int tracer_tracing_is_on(struct trace_array *tr)
+int tracer_tracing_is_on(struct trace_array *tr)
 {
        if (tr->trace_buffer.buffer)
                return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -4969,7 +4969,7 @@ out:
        return ret;
 }
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 
 static ssize_t
 tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -5892,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = {
        .llseek         = generic_file_llseek,
 };
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
 static const struct file_operations tracing_max_lat_fops = {
        .open           = tracing_open_generic,
        .read           = tracing_max_lat_read,
@@ -7222,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 
        create_trace_options_dir(tr);
 
-#ifdef CONFIG_TRACER_MAX_TRACE
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
        trace_create_file("tracing_max_latency", 0644, d_tracer,
                        &tr->max_latency, &tracing_max_lat_fops);
 #endif
index f783df4..fd24b1f 100644 (file)
@@ -38,6 +38,7 @@ enum trace_type {
        TRACE_USER_STACK,
        TRACE_BLK,
        TRACE_BPUTS,
+       TRACE_HWLAT,
 
        __TRACE_LAST_TYPE,
 };
@@ -213,6 +214,8 @@ struct trace_array {
         */
        struct trace_buffer     max_buffer;
        bool                    allocated_snapshot;
+#endif
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
        unsigned long           max_latency;
 #endif
        struct trace_pid_list   __rcu *filtered_pids;
@@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void);
                IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);   \
                IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
                IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);   \
+               IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);   \
                IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,          \
                          TRACE_MMIO_RW);                               \
                IF_ASSIGN(var, ent, struct trace_mmiotrace_map,         \
@@ -571,6 +575,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
+int tracer_tracing_is_on(struct trace_array *tr);
 struct dentry *trace_create_file(const char *name,
                                 umode_t mode,
                                 struct dentry *parent,
index 5c30efc..d1cc37e 100644 (file)
@@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,
        FILTER_OTHER
 );
 
+
+FTRACE_ENTRY(hwlat, hwlat_entry,
+
+       TRACE_HWLAT,
+
+       F_STRUCT(
+               __field(        u64,                    duration        )
+               __field(        u64,                    outer_duration  )
+               __field(        u64,                    nmi_total_ts    )
+               __field_struct( struct timespec,        timestamp       )
+               __field_desc(   long,   timestamp,      tv_sec          )
+               __field_desc(   long,   timestamp,      tv_nsec         )
+               __field(        unsigned int,           nmi_count       )
+               __field(        unsigned int,           seqnum          )
+       ),
+
+       F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
+                __entry->seqnum,
+                __entry->tv_sec,
+                __entry->tv_nsec,
+                __entry->duration,
+                __entry->outer_duration,
+                __entry->nmi_total_ts,
+                __entry->nmi_count),
+
+       FILTER_OTHER
+);
index a975571..6721a1e 100644 (file)
@@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {
 static struct event_command trigger_traceoff_cmd = {
        .name                   = "traceoff",
        .trigger_type           = ETT_TRACE_ONOFF,
+       .flags                  = EVENT_CMD_FL_POST_TRIGGER,
        .func                   = event_trigger_callback,
        .reg                    = register_trigger,
        .unreg                  = unregister_trigger,
index 0cbe38a..4e480e8 100644 (file)
@@ -170,7 +170,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
        current->ret_stack[index].ret = ret;
        current->ret_stack[index].func = func;
        current->ret_stack[index].calltime = calltime;
-       current->ret_stack[index].subtime = 0;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        current->ret_stack[index].fp = frame_pointer;
 #endif
@@ -1183,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
        trace_seq_puts(s, "/* ");
 
        switch (iter->ent->type) {
+       case TRACE_BPUTS:
+               ret = trace_print_bputs_msg_only(iter);
+               if (ret != TRACE_TYPE_HANDLED)
+                       return ret;
+               break;
        case TRACE_BPRINT:
                ret = trace_print_bprintk_msg_only(iter);
                if (ret != TRACE_TYPE_HANDLED)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
new file mode 100644 (file)
index 0000000..b97286c
--- /dev/null
@@ -0,0 +1,633 @@
+/*
+ * trace_hwlat.c - A simple Hardware Latency detector.
+ *
+ * Use this tracer to detect large system latencies induced by the behavior of
+ * certain underlying system hardware or firmware, independent of Linux itself.
+ * The code was developed originally to detect the presence of SMIs on Intel
+ * and AMD systems, although there is no dependency upon x86 herein.
+ *
+ * The classical example usage of this tracer is in detecting the presence of
+ * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
+ * somewhat special form of hardware interrupt spawned from earlier CPU debug
+ * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
+ * LPC (or other device) to generate a special interrupt under certain
+ * circumstances, for example, upon expiration of a special SMI timer device,
+ * due to certain external thermal readings, on certain I/O address accesses,
+ * and other situations. An SMI hits a special CPU pin, triggers a special
+ * SMI mode (complete with special memory map), and the OS is unaware.
+ *
+ * Although certain hardware-inducing latencies are necessary (for example,
+ * a modern system often requires an SMI handler for correct thermal control
+ * and remote management) they can wreak havoc upon any OS-level performance
+ * guarantees toward low-latency, especially when the OS is not even made
+ * aware of the presence of these interrupts. For this reason, we need a
+ * somewhat brute force mechanism to detect these interrupts. In this case,
+ * we do it by hogging all of the CPU(s) for configurable timer intervals,
+ * sampling the built-in CPU timer, looking for discontiguous readings.
+ *
+ * WARNING: This implementation necessarily introduces latencies. Therefore,
+ *          you should NEVER use this tracer while running in a production
+ *          environment requiring any kind of low-latency performance
+ *          guarantee(s).
+ *
+ * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *
+ * Includes useful feedback from Clark Williams <clark@redhat.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include "trace.h"
+
+static struct trace_array      *hwlat_trace;
+
+#define U64STR_SIZE            22                      /* 20 digits max */
+
+#define BANNER                 "hwlat_detector: "
+#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
+#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
+#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
+
+/* sampling thread */
+static struct task_struct *hwlat_kthread;
+
+static struct dentry *hwlat_sample_width;      /* sample width us */
+static struct dentry *hwlat_sample_window;     /* sample window us */
+
+/* Save the previous tracing_thresh value */
+static unsigned long save_tracing_thresh;
+
+/* NMI timestamp counters */
+static u64 nmi_ts_start;
+static u64 nmi_total_ts;
+static int nmi_count;
+static int nmi_cpu;
+
+/* Tells NMIs to call back to the hwlat tracer to record timestamps */
+bool trace_hwlat_callback_enabled;
+
+/* If the user changed threshold, remember it */
+static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC;
+
+/* Individual latency samples are stored here when detected. */
+struct hwlat_sample {
+       u64             seqnum;         /* unique sequence */
+       u64             duration;       /* delta */
+       u64             outer_duration; /* delta (outer loop) */
+       u64             nmi_total_ts;   /* Total time spent in NMIs */
+       struct timespec timestamp;      /* wall time */
+       int             nmi_count;      /* # NMIs during this sample */
+};
+
+/* keep the global state somewhere. */
+static struct hwlat_data {
+
+       struct mutex lock;              /* protect changes */
+
+       u64     count;                  /* total since reset */
+
+       u64     sample_window;          /* total sampling window (on+off) */
+       u64     sample_width;           /* active sampling portion of window */
+
+} hwlat_data = {
+       .sample_window          = DEFAULT_SAMPLE_WINDOW,
+       .sample_width           = DEFAULT_SAMPLE_WIDTH,
+};
+
+static void trace_hwlat_sample(struct hwlat_sample *sample)
+{
+       struct trace_array *tr = hwlat_trace;
+       struct trace_event_call *call = &event_hwlat;
+       struct ring_buffer *buffer = tr->trace_buffer.buffer;
+       struct ring_buffer_event *event;
+       struct hwlat_entry *entry;
+       unsigned long flags;
+       int pc;
+
+       pc = preempt_count();
+       local_save_flags(flags);
+
+       event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
+                                         flags, pc);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       entry->seqnum                   = sample->seqnum;
+       entry->duration                 = sample->duration;
+       entry->outer_duration           = sample->outer_duration;
+       entry->timestamp                = sample->timestamp;
+       entry->nmi_total_ts             = sample->nmi_total_ts;
+       entry->nmi_count                = sample->nmi_count;
+
+       if (!call_filter_check_discard(call, entry, buffer, event))
+               __buffer_unlock_commit(buffer, event);
+}
+
+/* Macros to encapsulate the time capturing infrastructure */
+#define time_type      u64
+#define time_get()     trace_clock_local()
+#define time_to_us(x)  div_u64(x, 1000)
+#define time_sub(a, b) ((a) - (b))
+#define init_time(a, b)        (a = b)
+#define time_u64(a)    a
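+
+/*
+ * For example, trace_clock_local() returns nanoseconds, so a delta of
+ * 12345 ns between two reads becomes time_to_us(12345) == 12 us; all
+ * comparisons against the threshold below are done in usecs.
+ */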
+
+void trace_hwlat_callback(bool enter)
+{
+       if (smp_processor_id() != nmi_cpu)
+               return;
+
+       /*
+        * Currently trace_clock_local() calls sched_clock() and the
+        * generic version is not NMI safe.
+        */
+       if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+               if (enter)
+                       nmi_ts_start = time_get();
+               else
+                       nmi_total_ts = time_get() - nmi_ts_start;
+       }
+
+       if (enter)
+               nmi_count++;
+}
+
+/**
+ * get_sample - sample the CPU TSC and look for likely hardware latencies
+ *
+ * Used to repeatedly capture the CPU TSC (or similar), looking for potential
+ * hardware-induced latency. Called with interrupts disabled and with
+ * hwlat_data.lock held.
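+ *
+ * Two back-to-back timestamp reads (t1, t2) bound the "inner" latency,
+ * while the gap between the previous iteration's t2 and the current t1 is
+ * the "outer" latency, so a hardware stall that lands between loop
+ * iterations is still observed.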
+ */
+static int get_sample(void)
+{
+       struct trace_array *tr = hwlat_trace;
+       time_type start, t1, t2, last_t2;
+       s64 diff, total, last_total = 0;
+       u64 sample = 0;
+       u64 thresh = tracing_thresh;
+       u64 outer_sample = 0;
+       int ret = -1;
+
+       do_div(thresh, NSEC_PER_USEC); /* modifies thresh in place: nsecs to usecs */
+
+       nmi_cpu = smp_processor_id();
+       nmi_total_ts = 0;
+       nmi_count = 0;
+       /* Make sure NMIs see this first */
+       barrier();
+
+       trace_hwlat_callback_enabled = true;
+
+       init_time(last_t2, 0);
+       start = time_get(); /* start timestamp */
+
+       do {
+
+               t1 = time_get();        /* we'll look for a discontinuity */
+               t2 = time_get();
+
+               if (time_u64(last_t2)) {
+                       /* Check the delta from outer loop (t2 to next t1) */
+                       diff = time_to_us(time_sub(t1, last_t2));
+                       /* This shouldn't happen */
+                       if (diff < 0) {
+                               pr_err(BANNER "time running backwards\n");
+                               goto out;
+                       }
+                       if (diff > outer_sample)
+                               outer_sample = diff;
+               }
+               last_t2 = t2;
+
+               total = time_to_us(time_sub(t2, start)); /* sample width */
+
+               /* Check for possible overflows */
+               if (total < last_total) {
+                       pr_err("Time total overflowed\n");
+                       break;
+               }
+               last_total = total;
+
+               /* This checks the inner loop (t1 to t2) */
+               diff = time_to_us(time_sub(t2, t1));     /* current diff */
+
+               /* This shouldn't happen */
+               if (diff < 0) {
+                       pr_err(BANNER "time running backwards\n");
+                       goto out;
+               }
+
+               if (diff > sample)
+                       sample = diff; /* only want highest value */
+
+       } while (total <= hwlat_data.sample_width);
+
+       barrier(); /* finish the sampling above before NMIs see the disable below */
+       trace_hwlat_callback_enabled = false;
+       barrier(); /* Make sure nmi_total_ts is no longer updated */
+
+       ret = 0;
+
+       /* If we exceed the threshold value, we have found a hardware latency */
+       if (sample > thresh || outer_sample > thresh) {
+               struct hwlat_sample s;
+
+               ret = 1;
+
+               /* We read in microseconds */
+               if (nmi_total_ts)
+                       do_div(nmi_total_ts, NSEC_PER_USEC);
+
+               hwlat_data.count++;
+               s.seqnum = hwlat_data.count;
+               s.duration = sample;
+               s.outer_duration = outer_sample;
+               s.timestamp = CURRENT_TIME;
+               s.nmi_total_ts = nmi_total_ts;
+               s.nmi_count = nmi_count;
+               trace_hwlat_sample(&s);
+
+               /* Keep a running maximum ever recorded hardware latency */
+               if (sample > tr->max_latency)
+                       tr->max_latency = sample;
+       }
+
+out:
+       return ret;
+}
+
+static struct cpumask save_cpumask;
+static bool disable_migrate;
+
+static void move_to_next_cpu(void)
+{
+       static struct cpumask *current_mask;
+       int next_cpu;
+
+       if (disable_migrate)
+               return;
+
+       /* Just pick the first CPU on first iteration */
+       if (!current_mask) {
+               current_mask = &save_cpumask;
+               get_online_cpus();
+               cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+               put_online_cpus();
+               next_cpu = cpumask_first(current_mask);
+               goto set_affinity;
+       }
+
+       /*
+        * If for some reason the user modifies the CPU affinity
+        * of this thread, then stop migrating for the duration
+        * of the current test.
+        */
+       if (!cpumask_equal(current_mask, &current->cpus_allowed))
+               goto disable;
+
+       get_online_cpus();
+       cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+       next_cpu = cpumask_next(smp_processor_id(), current_mask);
+       put_online_cpus();
+
+       if (next_cpu >= nr_cpu_ids)
+               next_cpu = cpumask_first(current_mask);
+
+ set_affinity:
+       if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
+               goto disable;
+
+       cpumask_clear(current_mask);
+       cpumask_set_cpu(next_cpu, current_mask);
+
+       sched_setaffinity(0, current_mask);
+       return;
+
+ disable:
+       disable_migrate = true;
+}
+
+/*
+ * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
+ *
+ * Used to periodically sample the CPU TSC via a call to get_sample. We
+ * disable interrupts, which does (intentionally) introduce latency since we
+ * need to ensure nothing else might be running (and thus preempting).
+ * Obviously this should never be used in production environments.
+ *
+ * Currently this runs on whichever CPU it was scheduled on, but most
+ * real-world hardware latency situations occur across several CPUs;
+ * we might later generalize this if we find there are any actual
+ * systems with alternate SMI delivery or other hardware latencies.
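+ *
+ * As a worked example with the defaults: a window of 1,000,000 us and a
+ * width of 500,000 us means get_sample() spins for roughly half a second,
+ * after which the thread sleeps for (1,000,000 - 500,000) / 1,000 = 500
+ * milliseconds before the next window begins.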
+ */
+static int kthread_fn(void *data)
+{
+       u64 interval;
+
+       while (!kthread_should_stop()) {
+
+               move_to_next_cpu();
+
+               local_irq_disable();
+               get_sample();
+               local_irq_enable();
+
+               mutex_lock(&hwlat_data.lock);
+               interval = hwlat_data.sample_window - hwlat_data.sample_width;
+               mutex_unlock(&hwlat_data.lock);
+
+               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
+
+               /* Always sleep for at least 1ms */
+               if (interval < 1)
+                       interval = 1;
+
+               if (msleep_interruptible(interval))
+                       break;
+       }
+
+       return 0;
+}
+
+/**
+ * start_kthread - Kick off the hardware latency sampling/detector kthread
+ *
+ * This starts the kernel thread that will sit and sample the CPU timestamp
+ * counter (TSC or similar) and look for potential hardware latencies.
+ */
+static int start_kthread(struct trace_array *tr)
+{
+       struct task_struct *kthread;
+
+       kthread = kthread_create(kthread_fn, NULL, "hwlatd");
+       if (IS_ERR(kthread)) {
+               pr_err(BANNER "could not start sampling thread\n");
+               return -ENOMEM;
+       }
+       hwlat_kthread = kthread;
+       wake_up_process(kthread);
+
+       return 0;
+}
+
+/**
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ */
+static void stop_kthread(void)
+{
+       if (!hwlat_kthread)
+               return;
+       kthread_stop(hwlat_kthread);
+       hwlat_kthread = NULL;
+}
+
+/*
+ * hwlat_read - Wrapper read function for reading both window and width
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * This function provides a generic read implementation for the global state
+ * "hwlat_data" structure filesystem entries.
+ */
+static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
+                         size_t cnt, loff_t *ppos)
+{
+       char buf[U64STR_SIZE];
+       u64 *entry = filp->private_data;
+       u64 val;
+       int len;
+
+       if (!entry)
+               return -EFAULT;
+
+       if (cnt > sizeof(buf))
+               cnt = sizeof(buf);
+
+       val = *entry;
+
+       len = snprintf(buf, sizeof(buf), "%llu\n", val);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/**
+ * hwlat_width_write - Write function for "width" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "width" interface
+ * to the hardware latency detector. It can be used to configure how many
+ * us of the total window we will actively sample for hardware-induced
+ * latency periods. Obviously, it is not possible to sample constantly and
+ * still have the system respond to a sample reader, or, worse, to do so
+ * without having the system appear to have gone out to lunch. It is
+ * enforced that the width is less than the total window size.
+ */
+static ssize_t
+hwlat_width_write(struct file *filp, const char __user *ubuf,
+                 size_t cnt, loff_t *ppos)
+{
+       u64 val;
+       int err;
+
+       err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+       if (err)
+               return err;
+
+       mutex_lock(&hwlat_data.lock);
+       if (val < hwlat_data.sample_window)
+               hwlat_data.sample_width = val;
+       else
+               err = -EINVAL;
+       mutex_unlock(&hwlat_data.lock);
+
+       if (err)
+               return err;
+
+       return cnt;
+}
+
+/**
+ * hwlat_window_write - Write function for "window" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "window" interface
+ * to the hardware latency detector. The window is the total time
+ * in us that will be considered one sample period. Conceptually, windows
+ * occur back-to-back and contain a sample width period during which
+ * actual sampling occurs. It can be used to write a new total window size.
+ * It is enforced that any value written must be greater than the sample
+ * width, or an error results.
+ */
+static ssize_t
+hwlat_window_write(struct file *filp, const char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+       u64 val;
+       int err;
+
+       err = kstrtoull_from_user(ubuf, cnt, 10, &val);
+       if (err)
+               return err;
+
+       mutex_lock(&hwlat_data.lock);
+       if (hwlat_data.sample_width < val)
+               hwlat_data.sample_window = val;
+       else
+               err = -EINVAL;
+       mutex_unlock(&hwlat_data.lock);
+
+       if (err)
+               return err;
+
+       return cnt;
+}
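+
+/*
+ * Note the ordering this implies when reconfiguring: to grow the width
+ * beyond the current window (say from the 500000/1000000 us defaults to
+ * 1500000/3000000 us), the new window must be written first; writing the
+ * larger width first is rejected with -EINVAL.
+ */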
+
+static const struct file_operations width_fops = {
+       .open           = tracing_open_generic,
+       .read           = hwlat_read,
+       .write          = hwlat_width_write,
+};
+
+static const struct file_operations window_fops = {
+       .open           = tracing_open_generic,
+       .read           = hwlat_read,
+       .write          = hwlat_window_write,
+};
+
+/**
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "hwlat_detector".
+ * It creates the hwlat_detector directory in the tracing directory,
+ * and within that directory are the width and window files used to
+ * change and view those values.
+ */
+static int init_tracefs(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *top_dir;
+
+       d_tracer = tracing_init_dentry();
+       if (IS_ERR(d_tracer))
+               return -ENOMEM;
+
+       top_dir = tracefs_create_dir("hwlat_detector", d_tracer);
+       if (!top_dir)
+               return -ENOMEM;
+
+       hwlat_sample_window = tracefs_create_file("window", 0640,
+                                                 top_dir,
+                                                 &hwlat_data.sample_window,
+                                                 &window_fops);
+       if (!hwlat_sample_window)
+               goto err;
+
+       hwlat_sample_width = tracefs_create_file("width", 0644,
+                                                top_dir,
+                                                &hwlat_data.sample_width,
+                                                &width_fops);
+       if (!hwlat_sample_width)
+               goto err;
+
+       return 0;
+
+ err:
+       tracefs_remove_recursive(top_dir);
+       return -ENOMEM;
+}
+
+static void hwlat_tracer_start(struct trace_array *tr)
+{
+       int err;
+
+       err = start_kthread(tr);
+       if (err)
+               pr_err(BANNER "Cannot start hwlat kthread\n");
+}
+
+static void hwlat_tracer_stop(struct trace_array *tr)
+{
+       stop_kthread();
+}
+
+static bool hwlat_busy;
+
+static int hwlat_tracer_init(struct trace_array *tr)
+{
+       /* Only allow one instance to enable this */
+       if (hwlat_busy)
+               return -EBUSY;
+
+       hwlat_trace = tr;
+
+       disable_migrate = false;
+       hwlat_data.count = 0;
+       tr->max_latency = 0;
+       save_tracing_thresh = tracing_thresh;
+
+       /* tracing_thresh is in nsecs, we speak in usecs */
+       if (!tracing_thresh)
+               tracing_thresh = last_tracing_thresh;
+
+       if (tracer_tracing_is_on(tr))
+               hwlat_tracer_start(tr);
+
+       hwlat_busy = true;
+
+       return 0;
+}
+
+static void hwlat_tracer_reset(struct trace_array *tr)
+{
+       stop_kthread();
+
+       /* the tracing threshold is preserved between runs */
+       last_tracing_thresh = tracing_thresh;
+
+       tracing_thresh = save_tracing_thresh;
+       hwlat_busy = false;
+}
+
+static struct tracer hwlat_tracer __read_mostly =
+{
+       .name           = "hwlat",
+       .init           = hwlat_tracer_init,
+       .reset          = hwlat_tracer_reset,
+       .start          = hwlat_tracer_start,
+       .stop           = hwlat_tracer_stop,
+       .allow_instances = true,
+};
+
+__init static int init_hwlat_tracer(void)
+{
+       int ret;
+
+       mutex_init(&hwlat_data.lock);
+
+       ret = register_tracer(&hwlat_tracer);
+       if (ret)
+               return ret;
+
+       init_tracefs();
+
+       return 0;
+}
+late_initcall(init_hwlat_tracer);
index 0bb9cf2..3fc2042 100644 (file)
@@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {
        .funcs          = &trace_user_stack_funcs,
 };
 
+/* TRACE_HWLAT */
+static enum print_line_t
+trace_hwlat_print(struct trace_iterator *iter, int flags,
+                 struct trace_event *event)
+{
+       struct trace_entry *entry = iter->ent;
+       struct trace_seq *s = &iter->seq;
+       struct hwlat_entry *field;
+
+       trace_assign_type(field, entry);
+
+       trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld",
+                        field->seqnum,
+                        field->duration,
+                        field->outer_duration,
+                        field->timestamp.tv_sec,
+                        field->timestamp.tv_nsec);
+
+       if (field->nmi_count) {
+               /*
+                * The generic sched_clock() is not NMI safe, thus
+                * we only record the count and not the time.
+                */
+               if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK))
+                       trace_seq_printf(s, " nmi-total:%llu",
+                                        field->nmi_total_ts);
+               trace_seq_printf(s, " nmi-count:%u",
+                                field->nmi_count);
+       }
+
+       trace_seq_putc(s, '\n');
+
+       return trace_handle_return(s);
+}
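+
+/*
+ * A formatted sample might look roughly like (values purely illustrative):
+ *
+ *   #12    inner/outer(us):   13/22    ts:1475739063.377872257 nmi-count:1
+ */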
+
+
+static enum print_line_t
+trace_hwlat_raw(struct trace_iterator *iter, int flags,
+               struct trace_event *event)
+{
+       struct hwlat_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       trace_seq_printf(s, "%llu %lld %ld %09ld %u\n",
+                        field->duration,
+                        field->outer_duration,
+                        field->timestamp.tv_sec,
+                        field->timestamp.tv_nsec,
+                        field->seqnum);
+
+       return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_hwlat_funcs = {
+       .trace          = trace_hwlat_print,
+       .raw            = trace_hwlat_raw,
+};
+
+static struct trace_event trace_hwlat_event = {
+       .type           = TRACE_HWLAT,
+       .funcs          = &trace_hwlat_funcs,
+};
+
 /* TRACE_BPUTS */
 static enum print_line_t
 trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = {
        &trace_bputs_event,
        &trace_bprint_event,
        &trace_print_event,
+       &trace_hwlat_event,
        NULL
 };
 
index 7a68732..0913693 100644 (file)
@@ -431,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)
                pr_info("Probe point is not specified.\n");
                return -EINVAL;
        }
-       if (isdigit(argv[1][0])) {
-               pr_info("probe point must be have a filename.\n");
-               return -EINVAL;
-       }
        arg = strchr(argv[1], ':');
        if (!arg) {
                ret = -EINVAL;
diff --git a/kernel/ucount.c b/kernel/ucount.c
new file mode 100644 (file)
index 0000000..9d20d5d
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+#define UCOUNTS_HASHTABLE_BITS 10
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+static DEFINE_SPINLOCK(ucounts_lock);
+
+#define ucounts_hashfn(ns, uid)                                                \
+       hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+                 UCOUNTS_HASHTABLE_BITS)
+#define ucounts_hashentry(ns, uid)     \
+       (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+       return &current_user_ns()->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+       return &current_user_ns()->set == set;
+}
+
+static int set_permissions(struct ctl_table_header *head,
+                                 struct ctl_table *table)
+{
+       struct user_namespace *user_ns =
+               container_of(head->set, struct user_namespace, set);
+       int mode;
+
+       /* Allow users with CAP_SYS_RESOURCE unrestrained access */
+       if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+               mode = (table->mode & S_IRWXU) >> 6;
+       else
+       /* Allow all others at most read-only access */
+               mode = table->mode & S_IROTH;
+       /* Replicate the computed bits into the user, group and other slots */
+       return (mode << 6) | (mode << 3) | mode;
+}
+
+static struct ctl_table_root set_root = {
+       .lookup = set_lookup,
+       .permissions = set_permissions,
+};
+
+static int zero = 0;
+static int int_max = INT_MAX;
+#define UCOUNT_ENTRY(name)                             \
+       {                                               \
+               .procname       = name,                 \
+               .maxlen         = sizeof(int),          \
+               .mode           = 0644,                 \
+               .proc_handler   = proc_dointvec_minmax, \
+               .extra1         = &zero,                \
+               .extra2         = &int_max,             \
+       }
+static struct ctl_table user_table[] = {
+       UCOUNT_ENTRY("max_user_namespaces"),
+       UCOUNT_ENTRY("max_pid_namespaces"),
+       UCOUNT_ENTRY("max_uts_namespaces"),
+       UCOUNT_ENTRY("max_ipc_namespaces"),
+       UCOUNT_ENTRY("max_net_namespaces"),
+       UCOUNT_ENTRY("max_mnt_namespaces"),
+       UCOUNT_ENTRY("max_cgroup_namespaces"),
+       { }
+};
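+
+/*
+ * These appear as writable files under /proc/sys/user/, e.g. (assuming
+ * CONFIG_SYSCTL and appropriate privileges):
+ *
+ *   # echo 100 > /proc/sys/user/max_user_namespaces
+ */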
+#endif /* CONFIG_SYSCTL */
+
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+       setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+       tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+       if (tbl) {
+               int i;
+               for (i = 0; i < UCOUNT_COUNTS; i++) {
+                       tbl[i].data = &ns->ucount_max[i];
+               }
+               ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+       }
+       if (!ns->sysctls) {
+               kfree(tbl);
+               retire_sysctl_set(&ns->set);
+               return false;
+       }
+#endif
+       return true;
+}
+
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+
+       tbl = ns->sysctls->ctl_table_arg;
+       unregister_sysctl_table(ns->sysctls);
+       retire_sysctl_set(&ns->set);
+       kfree(tbl);
+#endif
+}
+
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+{
+       struct ucounts *ucounts;
+
+       hlist_for_each_entry(ucounts, hashent, node) {
+               if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
+                       return ucounts;
+       }
+       return NULL;
+}
+
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+       struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+       struct ucounts *ucounts, *new;
+
+       spin_lock(&ucounts_lock);
+       ucounts = find_ucounts(ns, uid, hashent);
+       if (!ucounts) {
+               spin_unlock(&ucounts_lock);
+
+               new = kzalloc(sizeof(*new), GFP_KERNEL);
+               if (!new)
+                       return NULL;
+
+               new->ns = ns;
+               new->uid = uid;
+               atomic_set(&new->count, 0);
+
+               spin_lock(&ucounts_lock);
+               ucounts = find_ucounts(ns, uid, hashent);
+               if (ucounts) {
+                       kfree(new);
+               } else {
+                       hlist_add_head(&new->node, hashent);
+                       ucounts = new;
+               }
+       }
+       if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+               ucounts = NULL;
+       spin_unlock(&ucounts_lock);
+       return ucounts;
+}
+
+static void put_ucounts(struct ucounts *ucounts)
+{
+       if (atomic_dec_and_test(&ucounts->count)) {
+               spin_lock(&ucounts_lock);
+               hlist_del_init(&ucounts->node);
+               spin_unlock(&ucounts_lock);
+
+               kfree(ucounts);
+       }
+}
+
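+/*
+ * Atomically increment @v, unless it has already reached the limit @u;
+ * returns false (leaving @v untouched) once the limit has been hit.
+ */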
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+       int c, old;
+       c = atomic_read(v);
+       for (;;) {
+               if (unlikely(c >= u))
+                       return false;
+               old = atomic_cmpxchg(v, c, c+1);
+               if (likely(old == c))
+                       return true;
+               c = old;
+       }
+}
+
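+/*
+ * Charge one object of @type against @uid in @ns and in each ancestor
+ * user namespace.  If any level is already at its limit, the charges taken
+ * so far are dropped again and NULL is returned.
+ */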
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+                          enum ucount_type type)
+{
+       struct ucounts *ucounts, *iter, *bad;
+       struct user_namespace *tns;
+       ucounts = get_ucounts(ns, uid);
+       for (iter = ucounts; iter; iter = tns->ucounts) {
+               int max;
+               tns = iter->ns;
+               max = READ_ONCE(tns->ucount_max[type]);
+               if (!atomic_inc_below(&iter->ucount[type], max))
+                       goto fail;
+       }
+       return ucounts;
+fail:
+       bad = iter;
+       for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
+               atomic_dec(&iter->ucount[type]);
+
+       put_ucounts(ucounts);
+       return NULL;
+}
+
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+       struct ucounts *iter;
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               int dec = atomic_dec_if_positive(&iter->ucount[type]);
+               WARN_ON_ONCE(dec < 0);
+       }
+       put_ucounts(ucounts);
+}
+
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       static struct ctl_table_header *user_header;
+       static struct ctl_table empty[1];
+       /*
+        * It is necessary to register the user directory in the
+        * default set so that registrations in the child sets work
+        * properly.
+        */
+       user_header = register_sysctl("user", empty);
+       BUG_ON(!user_header);
+       BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+       return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
+
+
index 68f5942..86b7854 100644 (file)
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
 static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+       return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+       return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
+}
 
 static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 {
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
-       int ret;
+       struct ucounts *ucounts;
+       int ret, i;
 
+       ret = -ENOSPC;
        if (parent_ns->level > 32)
-               return -EUSERS;
+               goto fail;
+
+       ucounts = inc_user_namespaces(parent_ns, owner);
+       if (!ucounts)
+               goto fail;
 
        /*
         * Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
+       ret = -EPERM;
        if (current_chrooted())
-               return -EPERM;
+               goto fail_dec;
 
        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
+       ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
-               return -EPERM;
+               goto fail_dec;
 
+       ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
-               return -ENOMEM;
+               goto fail_dec;
 
        ret = ns_alloc_inum(&ns->ns);
-       if (ret) {
-               kmem_cache_free(user_ns_cachep, ns);
-               return ret;
-       }
+       if (ret)
+               goto fail_free;
        ns->ns.ops = &userns_operations;
 
        atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
+       INIT_WORK(&ns->work, free_user_ns);
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               ns->ucount_max[i] = INT_MAX;
+       }
+       ns->ucounts = ucounts;
 
        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);
 
-       set_cred_user_ns(new, ns);
-
 #ifdef CONFIG_PERSISTENT_KEYRINGS
        init_rwsem(&ns->persistent_keyring_register_sem);
 #endif
+       ret = -ENOMEM;
+       if (!setup_userns_sysctls(ns))
+               goto fail_keyring;
+
+       set_cred_user_ns(new, ns);
        return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       key_put(ns->persistent_keyring_register);
+#endif
+       ns_free_inum(&ns->ns);
+fail_free:
+       kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+       dec_user_namespaces(ucounts);
+fail:
+       return ret;
 }
 
 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
        return err;
 }
 
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
 {
-       struct user_namespace *parent;
+       struct user_namespace *parent, *ns =
+               container_of(work, struct user_namespace, work);
 
        do {
+               struct ucounts *ucounts = ns->ucounts;
                parent = ns->parent;
+               retire_userns_sysctls(ns);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
                key_put(ns->persistent_keyring_register);
 #endif
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
+               dec_user_namespaces(ucounts);
                ns = parent;
        } while (atomic_dec_and_test(&parent->count));
 }
-EXPORT_SYMBOL(free_user_ns);
+
+void __put_user_ns(struct user_namespace *ns)
+{
+       schedule_work(&ns->work);
+}
+EXPORT_SYMBOL(__put_user_ns);
 
 static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 {
@@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
        return commit_creds(cred);
 }
 
+struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+       struct user_namespace *my_user_ns = current_user_ns();
+       struct user_namespace *owner, *p;
+
+       /* See if the owner is in the current user namespace */
+       owner = p = ns->ops->owner(ns);
+       for (;;) {
+               if (!p)
+                       return ERR_PTR(-EPERM);
+               if (p == my_user_ns)
+                       break;
+               p = p->parent;
+       }
+
+       return &get_user_ns(owner)->ns;
+}
+
+static struct user_namespace *userns_owner(struct ns_common *ns)
+{
+       return to_user_ns(ns)->parent;
+}
+
 const struct proc_ns_operations userns_operations = {
        .name           = "user",
        .type           = CLONE_NEWUSER,
        .get            = userns_get,
        .put            = userns_put,
        .install        = userns_install,
+       .owner          = userns_owner,
+       .get_parent     = ns_get_owner,
 };
 
 static __init int user_namespaces_init(void)
index 831ea71..6976cd4 100644 (file)
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
+}
+
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
 static struct uts_namespace *create_uts_ns(void)
 {
        struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
                                          struct uts_namespace *old_ns)
 {
        struct uts_namespace *ns;
+       struct ucounts *ucounts;
        int err;
 
+       err = -ENOSPC;
+       ucounts = inc_uts_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
        ns = create_uts_ns();
        if (!ns)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
 
        err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
 
+       ns->ucounts = ucounts;
        ns->ns.ops = &utsns_operations;
 
        down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
        ns->user_ns = get_user_ns(user_ns);
        up_read(&uts_sem);
        return ns;
+
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_uts_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
 }
 
 /*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
        struct uts_namespace *ns;
 
        ns = container_of(kref, struct uts_namespace, kref);
+       dec_uts_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
        return 0;
 }
 
+static struct user_namespace *utsns_owner(struct ns_common *ns)
+{
+       return to_uts_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations utsns_operations = {
        .name           = "uts",
        .type           = CLONE_NEWUTS,
        .get            = utsns_get,
        .put            = utsns_put,
        .install        = utsns_install,
+       .owner          = utsns_owner,
 };
index 4bad32d..68f1813 100644 (file)
@@ -1923,16 +1923,18 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        if (iocb->ki_flags & IOCB_DIRECT) {
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
+               struct iov_iter data = *iter;
                loff_t size;
 
                size = i_size_read(inode);
                retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
                                        iocb->ki_pos + count - 1);
-               if (!retval) {
-                       struct iov_iter data = *iter;
-                       retval = mapping->a_ops->direct_IO(iocb, &data);
-               }
+               if (retval < 0)
+                       goto out;
 
+               file_accessed(file);
+
+               retval = mapping->a_ops->direct_IO(iocb, &data);
                if (retval > 0) {
                        iocb->ki_pos += retval;
                        iov_iter_advance(iter, retval);
@@ -1948,10 +1950,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                 * DAX files, so don't bother trying.
                 */
                if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
-                   IS_DAX(inode)) {
-                       file_accessed(file);
+                   IS_DAX(inode))
                        goto out;
-               }
        }
 
        retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
index 42bdda0..989434f 100644 (file)
@@ -309,6 +309,16 @@ out_undo:
 
 
 #ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
 static struct kmem_cache *net_cachep;
 static struct workqueue_struct *netns_wq;
 
@@ -350,19 +360,27 @@ void net_drop_ns(void *p)
 struct net *copy_net_ns(unsigned long flags,
                        struct user_namespace *user_ns, struct net *old_net)
 {
+       struct ucounts *ucounts;
        struct net *net;
        int rv;
 
        if (!(flags & CLONE_NEWNET))
                return get_net(old_net);
 
+       ucounts = inc_net_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
        net = net_alloc();
-       if (!net)
+       if (!net) {
+               dec_net_namespaces(ucounts);
                return ERR_PTR(-ENOMEM);
+       }
 
        get_user_ns(user_ns);
 
        mutex_lock(&net_mutex);
+       net->ucounts = ucounts;
        rv = setup_net(net, user_ns);
        if (rv == 0) {
                rtnl_lock();
@@ -371,6 +389,7 @@ struct net *copy_net_ns(unsigned long flags,
        }
        mutex_unlock(&net_mutex);
        if (rv < 0) {
+               dec_net_namespaces(ucounts);
                put_user_ns(user_ns);
                net_drop_ns(net);
                return ERR_PTR(rv);
@@ -443,6 +462,7 @@ static void cleanup_net(struct work_struct *work)
        /* Finally it is safe to free my network namespace structure */
        list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                list_del_init(&net->exit_list);
+               dec_net_namespaces(net->ucounts);
                put_user_ns(net->user_ns);
                net_drop_ns(net);
        }
@@ -1004,11 +1024,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
        return 0;
 }
 
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+       return to_net_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations netns_operations = {
        .name           = "net",
        .type           = CLONE_NEWNET,
        .get            = netns_get,
        .put            = netns_put,
        .install        = netns_install,
+       .owner          = netns_owner,
 };
 #endif
index cbd9343..d8983e1 100644 (file)
@@ -5729,6 +5729,7 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
        return ret;
 }
 
+static int minus_one = -1;
 static const int one = 1;
 static const int two_five_five = 255;
 
@@ -5789,7 +5790,8 @@ static const struct ctl_table addrconf_sysctl[] = {
                .data           = &ipv6_devconf.rtr_solicits,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &minus_one,
        },
        {
                .procname       = "router_solicitation_interval",
index 627f898..62bea45 100644 (file)
@@ -1832,7 +1832,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
        /* Record the max length of recvmsg() calls for future allocations */
        nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
        nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
-                                    16384);
+                                    SKB_WITH_OVERHEAD(32768));
 
        copied = data_skb->len;
        if (len < copied) {
@@ -2083,8 +2083,9 @@ static int netlink_dump(struct sock *sk)
 
        if (alloc_min_size < nlk->max_recvmsg_len) {
                alloc_size = nlk->max_recvmsg_len;
-               skb = alloc_skb(alloc_size, GFP_KERNEL |
-                                           __GFP_NOWARN | __GFP_NORETRY);
+               skb = alloc_skb(alloc_size,
+                               (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
+                               __GFP_NOWARN | __GFP_NORETRY);
        }
        if (!skb) {
                alloc_size = alloc_min_size;
index 33a4697..11db0d6 100644 (file)
@@ -3952,6 +3952,7 @@ static int packet_notifier(struct notifier_block *this,
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        packet_cached_dev_reset(po);
+                                       fanout_release(sk);
                                        po->ifindex = -1;
                                        if (po->prot_hook.dev)
                                                dev_put(po->prot_hook.dev);
index 44c9c2b..2d59c9b 100644 (file)
@@ -678,9 +678,9 @@ static int rxrpc_release_sock(struct sock *sk)
        sk->sk_state = RXRPC_CLOSE;
        spin_unlock_bh(&sk->sk_receive_queue.lock);
 
-       if (rx->local && rx->local->service == rx) {
+       if (rx->local && rcu_access_pointer(rx->local->service) == rx) {
                write_lock(&rx->local->services_lock);
-               rx->local->service = NULL;
+               rcu_assign_pointer(rx->local->service, NULL);
                write_unlock(&rx->local->services_lock);
        }
 
index d38dffd..f60e355 100644 (file)
@@ -398,6 +398,7 @@ enum rxrpc_call_flag {
        RXRPC_CALL_EXPOSED,             /* The call was exposed to the world */
        RXRPC_CALL_RX_LAST,             /* Received the last packet (at rxtx_top) */
        RXRPC_CALL_TX_LAST,             /* Last packet in Tx buffer (at rxtx_top) */
+       RXRPC_CALL_SEND_PING,           /* A ping will need to be sent */
        RXRPC_CALL_PINGING,             /* Ping in process */
        RXRPC_CALL_RETRANS_TIMEOUT,     /* Retransmission due to timeout occurred */
 };
@@ -410,6 +411,7 @@ enum rxrpc_call_event {
        RXRPC_CALL_EV_ABORT,            /* need to generate abort */
        RXRPC_CALL_EV_TIMER,            /* Timer expired */
        RXRPC_CALL_EV_RESEND,           /* Tx resend required */
+       RXRPC_CALL_EV_PING,             /* Ping send required */
 };
 
 /*
@@ -466,6 +468,7 @@ struct rxrpc_call {
        struct rxrpc_sock __rcu *socket;        /* socket responsible */
        ktime_t                 ack_at;         /* When deferred ACK needs to happen */
        ktime_t                 resend_at;      /* When next resend needs to happen */
+       ktime_t                 ping_at;        /* When next to send a ping */
        ktime_t                 expire_at;      /* When the call times out */
        struct timer_list       timer;          /* Combined event timer */
        struct work_struct      processor;      /* Event processor */
@@ -558,8 +561,10 @@ struct rxrpc_call {
        rxrpc_seq_t             ackr_prev_seq;  /* previous sequence number received */
        rxrpc_seq_t             ackr_consumed;  /* Highest packet shown consumed */
        rxrpc_seq_t             ackr_seen;      /* Highest packet shown seen */
-       rxrpc_serial_t          ackr_ping;      /* Last ping sent */
-       ktime_t                 ackr_ping_time; /* Time last ping sent */
+
+       /* ping management */
+       rxrpc_serial_t          ping_serial;    /* Last ping sent */
+       ktime_t                 ping_time;      /* Time last ping sent */
 
        /* transmission-phase ACK management */
        ktime_t                 acks_latest_ts; /* Timestamp of latest ACK received */
@@ -728,8 +733,10 @@ extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
 enum rxrpc_timer_trace {
        rxrpc_timer_begin,
        rxrpc_timer_init_for_reply,
+       rxrpc_timer_init_for_send_reply,
        rxrpc_timer_expired,
        rxrpc_timer_set_for_ack,
+       rxrpc_timer_set_for_ping,
        rxrpc_timer_set_for_resend,
        rxrpc_timer_set_for_send,
        rxrpc_timer__nr_trace
@@ -743,6 +750,7 @@ enum rxrpc_propose_ack_trace {
        rxrpc_propose_ack_ping_for_lost_ack,
        rxrpc_propose_ack_ping_for_lost_reply,
        rxrpc_propose_ack_ping_for_params,
+       rxrpc_propose_ack_processing_op,
        rxrpc_propose_ack_respond_to_ack,
        rxrpc_propose_ack_respond_to_ping,
        rxrpc_propose_ack_retry_tx,
@@ -777,7 +785,7 @@ extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10];
 extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9];
 
 extern const char *const rxrpc_pkts[];
-extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
+extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4];
 
 #include <trace/events/rxrpc.h>
 
@@ -805,6 +813,7 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * call_event.c
  */
+void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
 void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
                       enum rxrpc_propose_ack_trace);
@@ -1068,7 +1077,8 @@ extern const s8 rxrpc_ack_priority[];
 /*
  * output.c
  */
-int rxrpc_send_call_packet(struct rxrpc_call *, u8);
+int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
+int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
 void rxrpc_reject_packets(struct rxrpc_local *);
 
index 3cac231..832d854 100644 (file)
@@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 
        /* Get the socket providing the service */
        rx = rcu_dereference(local->service);
-       if (service_id == rx->srx.srx_service)
+       if (rx && service_id == rx->srx.srx_service)
                goto found_service;
 
        trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
@@ -565,7 +565,7 @@ out_discard:
        write_unlock_bh(&call->state_lock);
        write_unlock(&rx->call_lock);
        if (abort) {
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+               rxrpc_send_abort_packet(call);
                rxrpc_release_call(rx, call);
                rxrpc_put_call(call, rxrpc_call_put);
        }
index 4f00476..97a17ad 100644 (file)
 /*
  * Set the timer
  */
-void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-                    ktime_t now)
+void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+                      ktime_t now)
 {
        unsigned long t_j, now_j = jiffies;
        ktime_t t;
        bool queue = false;
 
-       read_lock_bh(&call->state_lock);
-
        if (call->state < RXRPC_CALL_COMPLETE) {
                t = call->expire_at;
-               if (!ktime_after(t, now))
+               if (!ktime_after(t, now)) {
+                       trace_rxrpc_timer(call, why, now, now_j);
+                       queue = true;
                        goto out;
+               }
 
                if (!ktime_after(call->resend_at, now)) {
                        call->resend_at = call->expire_at;
@@ -54,6 +55,14 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
                        t = call->ack_at;
                }
 
+               if (!ktime_after(call->ping_at, now)) {
+                       call->ping_at = call->expire_at;
+                       if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
+                               queue = true;
+               } else if (ktime_before(call->ping_at, t)) {
+                       t = call->ping_at;
+               }
+
                t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
                t_j += jiffies;
 
@@ -68,15 +77,45 @@ void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
                        mod_timer(&call->timer, t_j);
                        trace_rxrpc_timer(call, why, now, now_j);
                }
-
-               if (queue)
-                       rxrpc_queue_call(call);
        }
 
 out:
+       if (queue)
+               rxrpc_queue_call(call);
+}
+
+/*
+ * Set the timer
+ */
+void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+                    ktime_t now)
+{
+       read_lock_bh(&call->state_lock);
+       __rxrpc_set_timer(call, why, now);
        read_unlock_bh(&call->state_lock);
 }
 
+/*
+ * Propose a PING ACK be sent.
+ */
+static void rxrpc_propose_ping(struct rxrpc_call *call,
+                              bool immediate, bool background)
+{
+       if (immediate) {
+               if (background &&
+                   !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
+                       rxrpc_queue_call(call);
+       } else {
+               ktime_t now = ktime_get_real();
+               ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
+
+               if (ktime_before(ping_at, call->ping_at)) {
+                       call->ping_at = ping_at;
+                       rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
+               }
+       }
+}
+
 /*
  * propose an ACK be sent
  */
@@ -90,6 +129,14 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
        ktime_t now, ack_at;
        s8 prior = rxrpc_ack_priority[ack_reason];
 
+       /* Pings are handled specially because we don't want to accidentally
+        * lose a ping response by subsuming it into a ping.
+        */
+       if (ack_reason == RXRPC_ACK_PING) {
+               rxrpc_propose_ping(call, immediate, background);
+               goto trace;
+       }
+
        /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
         * numbers, but we don't alter the timeout.
         */
@@ -125,7 +172,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
                        expiry = rxrpc_soft_ack_delay;
                break;
 
-       case RXRPC_ACK_PING:
        case RXRPC_ACK_IDLE:
                if (rxrpc_idle_ack_delay < expiry)
                        expiry = rxrpc_idle_ack_delay;
@@ -253,7 +299,7 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
                        goto out;
                rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
                                  rxrpc_propose_ack_ping_for_lost_ack);
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+               rxrpc_send_ack_packet(call, true);
                goto out;
        }
 
@@ -328,12 +374,13 @@ void rxrpc_process_call(struct work_struct *work)
 
 recheck_state:
        if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+               rxrpc_send_abort_packet(call);
                goto recheck_state;
        }
 
        if (call->state == RXRPC_CALL_COMPLETE) {
                del_timer_sync(&call->timer);
+               rxrpc_notify_socket(call);
                goto out_put;
        }
 
@@ -345,13 +392,17 @@ recheck_state:
        }
 
        if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
-               call->ack_at = call->expire_at;
                if (call->ackr_reason) {
-                       rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+                       rxrpc_send_ack_packet(call, false);
                        goto recheck_state;
                }
        }
 
+       if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
+               rxrpc_send_ack_packet(call, true);
+               goto recheck_state;
+       }
+
        if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) {
                rxrpc_resend(call, now);
                goto recheck_state;
index 364b42d..4353a29 100644 (file)
@@ -205,6 +205,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
        expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
        call->expire_at = expire_at;
        call->ack_at = expire_at;
+       call->ping_at = expire_at;
        call->resend_at = expire_at;
        call->timer.expires = jiffies + LONG_MAX / 2;
        rxrpc_set_timer(call, rxrpc_timer_begin, now);
@@ -498,7 +499,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
                                  struct rxrpc_call, sock_link);
                rxrpc_get_call(call, rxrpc_call_got);
                rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET);
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+               rxrpc_send_abort_packet(call);
                rxrpc_release_call(rx, call);
                rxrpc_put_call(call, rxrpc_call_put);
        }
index 3ad9f75..44fb8d8 100644 (file)
@@ -625,9 +625,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call,
        rxrpc_serial_t ping_serial;
        ktime_t ping_time;
 
-       ping_time = call->ackr_ping_time;
+       ping_time = call->ping_time;
        smp_rmb();
-       ping_serial = call->ackr_ping;
+       ping_serial = call->ping_serial;
 
        if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
            before(orig_serial, ping_serial))
@@ -847,7 +847,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 
        if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
            RXRPC_TX_ANNO_LAST &&
-           summary.nr_acks == call->tx_top - hard_ack)
+           summary.nr_acks == call->tx_top - hard_ack &&
+           rxrpc_is_client_call(call))
                rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial,
                                  false, true,
                                  rxrpc_propose_ack_ping_for_lost_reply);
@@ -937,6 +938,33 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
        _leave("");
 }
 
+/*
+ * Handle a new call on a channel implicitly completing the preceding call on
+ * that channel.
+ *
+ * TODO: If callNumber > call_id + 1, renegotiate security.
+ */
+static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
+                                         struct rxrpc_call *call)
+{
+       switch (call->state) {
+       case RXRPC_CALL_SERVER_AWAIT_ACK:
+               rxrpc_call_completed(call);
+               break;
+       case RXRPC_CALL_COMPLETE:
+               break;
+       default:
+               if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, ESHUTDOWN)) {
+                       set_bit(RXRPC_CALL_EV_ABORT, &call->events);
+                       rxrpc_queue_call(call);
+               }
+               break;
+       }
+
+       __rxrpc_disconnect_call(conn, call);
+       rxrpc_notify_socket(call);
+}
+
 /*
  * post connection-level events to the connection
  * - this includes challenges, responses, some aborts and call terminal packet
@@ -1145,6 +1173,16 @@ void rxrpc_data_ready(struct sock *udp_sk)
                }
 
                call = rcu_dereference(chan->call);
+
+               if (sp->hdr.callNumber > chan->call_id) {
+                       if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) {
+                               rcu_read_unlock();
+                               goto reject_packet;
+                       }
+                       if (call)
+                               rxrpc_input_implicit_end_call(conn, call);
+                       call = NULL;
+               }
        } else {
                skew = 0;
                call = NULL;
index 9d1c721..6dee55f 100644 (file)
@@ -93,10 +93,9 @@ const s8 rxrpc_ack_priority[] = {
        [RXRPC_ACK_EXCEEDS_WINDOW]      = 6,
        [RXRPC_ACK_NOSPACE]             = 7,
        [RXRPC_ACK_PING_RESPONSE]       = 8,
-       [RXRPC_ACK_PING]                = 9,
 };
 
-const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
+const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
        "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
        "IDL", "-?-"
 };
@@ -196,7 +195,9 @@ const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = {
        [rxrpc_timer_begin]                     = "Begin ",
        [rxrpc_timer_expired]                   = "*EXPR*",
        [rxrpc_timer_init_for_reply]            = "IniRpl",
+       [rxrpc_timer_init_for_send_reply]       = "SndRpl",
        [rxrpc_timer_set_for_ack]               = "SetAck",
+       [rxrpc_timer_set_for_ping]              = "SetPng",
        [rxrpc_timer_set_for_send]              = "SetTx ",
        [rxrpc_timer_set_for_resend]            = "SetRTx",
 };
@@ -207,6 +208,7 @@ const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = {
        [rxrpc_propose_ack_ping_for_lost_ack]   = "LostAck",
        [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl",
        [rxrpc_propose_ack_ping_for_params]     = "Params ",
+       [rxrpc_propose_ack_processing_op]       = "ProcOp ",
        [rxrpc_propose_ack_respond_to_ack]      = "Rsp2Ack",
        [rxrpc_propose_ack_respond_to_ping]     = "Rsp2Png",
        [rxrpc_propose_ack_retry_tx]            = "RetryTx",
index 0d47db8..5dab1ff 100644 (file)
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-struct rxrpc_pkt_buffer {
+struct rxrpc_ack_buffer {
        struct rxrpc_wire_header whdr;
-       union {
-               struct {
-                       struct rxrpc_ackpacket ack;
-                       u8 acks[255];
-                       u8 pad[3];
-               };
-               __be32 abort_code;
-       };
+       struct rxrpc_ackpacket ack;
+       u8 acks[255];
+       u8 pad[3];
        struct rxrpc_ackinfo ackinfo;
 };
 
+struct rxrpc_abort_buffer {
+       struct rxrpc_wire_header whdr;
+       __be32 abort_code;
+};
+
 /*
  * Fill out an ACK packet.
  */
 static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
-                                struct rxrpc_pkt_buffer *pkt,
+                                struct rxrpc_ack_buffer *pkt,
                                 rxrpc_seq_t *_hard_ack,
-                                rxrpc_seq_t *_top)
+                                rxrpc_seq_t *_top,
+                                u8 reason)
 {
        rxrpc_serial_t serial;
        rxrpc_seq_t hard_ack, top, seq;
@@ -58,10 +59,10 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
        pkt->ack.firstPacket    = htonl(hard_ack + 1);
        pkt->ack.previousPacket = htonl(call->ackr_prev_seq);
        pkt->ack.serial         = htonl(serial);
-       pkt->ack.reason         = call->ackr_reason;
+       pkt->ack.reason         = reason;
        pkt->ack.nAcks          = top - hard_ack;
 
-       if (pkt->ack.reason == RXRPC_ACK_PING)
+       if (reason == RXRPC_ACK_PING)
                pkt->whdr.flags |= RXRPC_REQUEST_ACK;
 
        if (after(top, hard_ack)) {
@@ -91,22 +92,19 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
 }
 
 /*
- * Send an ACK or ABORT call packet.
+ * Send an ACK call packet.
  */
-int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
+int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
 {
        struct rxrpc_connection *conn = NULL;
-       struct rxrpc_pkt_buffer *pkt;
+       struct rxrpc_ack_buffer *pkt;
        struct msghdr msg;
        struct kvec iov[2];
        rxrpc_serial_t serial;
        rxrpc_seq_t hard_ack, top;
        size_t len, n;
-       bool ping = false;
-       int ioc, ret;
-       u32 abort_code;
-
-       _enter("%u,%s", call->debug_id, rxrpc_pkts[type]);
+       int ret;
+       u8 reason;
 
        spin_lock_bh(&call->lock);
        if (call->conn)
@@ -131,68 +129,44 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
        pkt->whdr.cid           = htonl(call->cid);
        pkt->whdr.callNumber    = htonl(call->call_id);
        pkt->whdr.seq           = 0;
-       pkt->whdr.type          = type;
-       pkt->whdr.flags         = conn->out_clientflag;
+       pkt->whdr.type          = RXRPC_PACKET_TYPE_ACK;
+       pkt->whdr.flags         = RXRPC_SLOW_START_OK | conn->out_clientflag;
        pkt->whdr.userStatus    = 0;
        pkt->whdr.securityIndex = call->security_ix;
        pkt->whdr._rsvd         = 0;
        pkt->whdr.serviceId     = htons(call->service_id);
 
-       iov[0].iov_base = pkt;
-       iov[0].iov_len  = sizeof(pkt->whdr);
-       len = sizeof(pkt->whdr);
-
-       switch (type) {
-       case RXRPC_PACKET_TYPE_ACK:
-               spin_lock_bh(&call->lock);
+       spin_lock_bh(&call->lock);
+       if (ping) {
+               reason = RXRPC_ACK_PING;
+       } else {
+               reason = call->ackr_reason;
                if (!call->ackr_reason) {
                        spin_unlock_bh(&call->lock);
                        ret = 0;
                        goto out;
                }
-               ping = (call->ackr_reason == RXRPC_ACK_PING);
-               n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top);
                call->ackr_reason = 0;
+       }
+       n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top, reason);
 
-               spin_unlock_bh(&call->lock);
-
-
-               pkt->whdr.flags |= RXRPC_SLOW_START_OK;
-
-               iov[0].iov_len += sizeof(pkt->ack) + n;
-               iov[1].iov_base = &pkt->ackinfo;
-               iov[1].iov_len  = sizeof(pkt->ackinfo);
-               len += sizeof(pkt->ack) + n + sizeof(pkt->ackinfo);
-               ioc = 2;
-               break;
-
-       case RXRPC_PACKET_TYPE_ABORT:
-               abort_code = call->abort_code;
-               pkt->abort_code = htonl(abort_code);
-               iov[0].iov_len += sizeof(pkt->abort_code);
-               len += sizeof(pkt->abort_code);
-               ioc = 1;
-               break;
+       spin_unlock_bh(&call->lock);
 
-       default:
-               BUG();
-               ret = -ENOANO;
-               goto out;
-       }
+       iov[0].iov_base = pkt;
+       iov[0].iov_len  = sizeof(pkt->whdr) + sizeof(pkt->ack) + n;
+       iov[1].iov_base = &pkt->ackinfo;
+       iov[1].iov_len  = sizeof(pkt->ackinfo);
+       len = iov[0].iov_len + iov[1].iov_len;
 
        serial = atomic_inc_return(&conn->serial);
        pkt->whdr.serial = htonl(serial);
-       switch (type) {
-       case RXRPC_PACKET_TYPE_ACK:
-               trace_rxrpc_tx_ack(call, serial,
-                                  ntohl(pkt->ack.firstPacket),
-                                  ntohl(pkt->ack.serial),
-                                  pkt->ack.reason, pkt->ack.nAcks);
-               break;
-       }
+       trace_rxrpc_tx_ack(call, serial,
+                          ntohl(pkt->ack.firstPacket),
+                          ntohl(pkt->ack.serial),
+                          pkt->ack.reason, pkt->ack.nAcks);
 
        if (ping) {
-               call->ackr_ping = serial;
+               call->ping_serial = serial;
                smp_wmb();
                /* We need to stick a time in before we send the packet in case
                 * the reply gets back before kernel_sendmsg() completes - but
@@ -201,19 +175,19 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
                 * the packet transmission is more likely to happen towards the
                 * end of the kernel_sendmsg() call.
                 */
-               call->ackr_ping_time = ktime_get_real();
+               call->ping_time = ktime_get_real();
                set_bit(RXRPC_CALL_PINGING, &call->flags);
                trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial);
        }
-       ret = kernel_sendmsg(conn->params.local->socket,
-                            &msg, iov, ioc, len);
+
+       ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
        if (ping)
-               call->ackr_ping_time = ktime_get_real();
+               call->ping_time = ktime_get_real();
 
-       if (type == RXRPC_PACKET_TYPE_ACK &&
-           call->state < RXRPC_CALL_COMPLETE) {
+       if (call->state < RXRPC_CALL_COMPLETE) {
                if (ret < 0) {
-                       clear_bit(RXRPC_CALL_PINGING, &call->flags);
+                       if (ping)
+                               clear_bit(RXRPC_CALL_PINGING, &call->flags);
                        rxrpc_propose_ACK(call, pkt->ack.reason,
                                          ntohs(pkt->ack.maxSkew),
                                          ntohl(pkt->ack.serial),
@@ -235,6 +209,56 @@ out:
        return ret;
 }
 
+/*
+ * Send an ABORT call packet.
+ */
+int rxrpc_send_abort_packet(struct rxrpc_call *call)
+{
+       struct rxrpc_connection *conn = NULL;
+       struct rxrpc_abort_buffer pkt;
+       struct msghdr msg;
+       struct kvec iov[1];
+       rxrpc_serial_t serial;
+       int ret;
+
+       spin_lock_bh(&call->lock);
+       if (call->conn)
+               conn = rxrpc_get_connection_maybe(call->conn);
+       spin_unlock_bh(&call->lock);
+       if (!conn)
+               return -ECONNRESET;
+
+       msg.msg_name    = &call->peer->srx.transport;
+       msg.msg_namelen = call->peer->srx.transport_len;
+       msg.msg_control = NULL;
+       msg.msg_controllen = 0;
+       msg.msg_flags   = 0;
+
+       pkt.whdr.epoch          = htonl(conn->proto.epoch);
+       pkt.whdr.cid            = htonl(call->cid);
+       pkt.whdr.callNumber     = htonl(call->call_id);
+       pkt.whdr.seq            = 0;
+       pkt.whdr.type           = RXRPC_PACKET_TYPE_ABORT;
+       pkt.whdr.flags          = conn->out_clientflag;
+       pkt.whdr.userStatus     = 0;
+       pkt.whdr.securityIndex  = call->security_ix;
+       pkt.whdr._rsvd          = 0;
+       pkt.whdr.serviceId      = htons(call->service_id);
+       pkt.abort_code          = htonl(call->abort_code);
+
+       iov[0].iov_base = &pkt;
+       iov[0].iov_len  = sizeof(pkt);
+
+       serial = atomic_inc_return(&conn->serial);
+       pkt.whdr.serial = htonl(serial);
+
+       ret = kernel_sendmsg(conn->params.local->socket,
+                            &msg, iov, 1, sizeof(pkt));
+
+       rxrpc_put_connection(conn);
+       return ret;
+}
+
 /*
  * send a packet through the transport endpoint
  */
@@ -283,11 +307,12 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
        /* If our RTT cache needs working on, request an ACK.  Also request
         * ACKs if a DATA packet appears to have been lost.
         */
-       if (retrans ||
-           call->cong_mode == RXRPC_CALL_SLOW_START ||
-           (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
-           ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
-                        ktime_get_real()))
+       if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
+           (retrans ||
+            call->cong_mode == RXRPC_CALL_SLOW_START ||
+            (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
+            ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+                         ktime_get_real())))
                whdr.flags |= RXRPC_REQUEST_ACK;
 
        if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
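[Editor's illustration] The comment in rxrpc_send_ack_packet() above explains why call->ping_time is stamped both before and after kernel_sendmsg(): the PING RESPONSE can race with the send itself. The sketch below shows how the receive side could turn those fields into an RTT sample. It is illustrative only and not part of this patch: ping_time, ping_serial and RXRPC_CALL_PINGING come from the patch, but rxrpc_sketch_ping_rtt() is hypothetical; the actual response handling lives in net/rxrpc/input.c.

/*
 * Illustrative sketch only - not part of this patch. Turn the stored
 * ping_time into an RTT sample once an ACK carrying the matching
 * PING RESPONSE serial arrives.
 */
static void rxrpc_sketch_ping_rtt(struct rxrpc_call *call,
                                  rxrpc_serial_t resp_serial)
{
        ktime_t now = ktime_get_real();

        /* Ignore responses that don't match the outstanding ping. */
        if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
            resp_serial != call->ping_serial)
                return;

        pr_info("ping RTT ~%lld ns\n",
                ktime_to_ns(ktime_sub(now, call->ping_time)));
        clear_bit(RXRPC_CALL_PINGING, &call->flags);
}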
index f05ea0a..c29362d 100644 (file)
@@ -143,7 +143,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
        if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
                rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
                                  rxrpc_propose_ack_terminal_ack);
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+               rxrpc_send_ack_packet(call, false);
        }
 
        write_lock_bh(&call->state_lock);
@@ -151,17 +151,21 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
        switch (call->state) {
        case RXRPC_CALL_CLIENT_RECV_REPLY:
                __rxrpc_call_completed(call);
+               write_unlock_bh(&call->state_lock);
                break;
 
        case RXRPC_CALL_SERVER_RECV_REQUEST:
                call->tx_phase = true;
                call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
+               call->ack_at = call->expire_at;
+               write_unlock_bh(&call->state_lock);
+               rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
+                                 rxrpc_propose_ack_processing_op);
                break;
        default:
+               write_unlock_bh(&call->state_lock);
                break;
        }
-
-       write_unlock_bh(&call->state_lock);
 }
 
 /*
@@ -212,7 +216,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
                                          true, false,
                                          rxrpc_propose_ack_rotate_rx);
                if (call->ackr_reason)
-                       rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+                       rxrpc_send_ack_packet(call, false);
        }
 }
 
@@ -652,7 +656,7 @@ excess_data:
        goto out;
 call_complete:
        *_abort = call->abort_code;
-       ret = call->error;
+       ret = -call->error;
        if (call->completion == RXRPC_CALL_SUCCEEDED) {
                ret = 1;
                if (size > 0)
index 627abed..4374e7b 100644 (file)
@@ -381,7 +381,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
        return 0;
 
 protocol_error:
-       rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+       rxrpc_send_abort_packet(call);
        _leave(" = -EPROTO");
        return -EPROTO;
 
@@ -471,7 +471,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
        return 0;
 
 protocol_error:
-       rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+       rxrpc_send_abort_packet(call);
        _leave(" = -EPROTO");
        return -EPROTO;
 
@@ -523,7 +523,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
 
        if (cksum != expected_cksum) {
                rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO);
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+               rxrpc_send_abort_packet(call);
                _leave(" = -EPROTO [csum failed]");
                return -EPROTO;
        }
index 3322543..b214a4d 100644 (file)
@@ -130,6 +130,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
                        break;
                case RXRPC_CALL_SERVER_ACK_REQUEST:
                        call->state = RXRPC_CALL_SERVER_SEND_REPLY;
+                       call->ack_at = call->expire_at;
+                       if (call->ackr_reason == RXRPC_ACK_DELAY)
+                               call->ackr_reason = 0;
+                       __rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
+                                         ktime_get_real());
                        if (!last)
                                break;
                case RXRPC_CALL_SERVER_SEND_REPLY:
@@ -197,7 +202,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
        do {
                /* Check to see if there's a ping ACK to reply to. */
                if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
-                       rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK);
+                       rxrpc_send_ack_packet(call, false);
 
                if (!skb) {
                        size_t size, chunk, max, space;
@@ -514,8 +519,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
        } else if (cmd == RXRPC_CMD_SEND_ABORT) {
                ret = 0;
                if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED))
-                       ret = rxrpc_send_call_packet(call,
-                                                    RXRPC_PACKET_TYPE_ABORT);
+                       ret = rxrpc_send_abort_packet(call);
        } else if (cmd != RXRPC_CMD_SEND_DATA) {
                ret = -EINVAL;
        } else if (rxrpc_is_client_call(call) &&
@@ -597,7 +601,7 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
        lock_sock(sock->sk);
 
        if (rxrpc_abort_call(why, call, 0, abort_code, error))
-               rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT);
+               rxrpc_send_abort_packet(call);
 
        release_sock(sock->sk);
        _leave("");
index e0c71bd..9199813 100644 (file)
@@ -27,9 +27,9 @@
 #endif
 
 static struct ctl_table_set *
-net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+net_ctl_header_lookup(struct ctl_table_root *root)
 {
-       return &namespaces->net_ns->sysctls;
+       return &current->nsproxy->net_ns->sysctls;
 }
 
 static int is_seen(struct ctl_table_set *set)
diff --git a/scripts/tracing/ftrace-bisect.sh b/scripts/tracing/ftrace-bisect.sh
new file mode 100755 (executable)
index 0000000..9ff8ac5
--- /dev/null
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# Here's how to use this:
+#
+# This script is used to help find functions that are being traced by function
+# tracer or function graph tracing that cause the machine to reboot, hang, or
+# crash. Here are the steps to take.
+#
+# First, determine if function tracing is working with a single function:
+#
+#   (note, if this is a problem with function_graph tracing, then simply
+#    replace "function" with "function_graph" in the following steps).
+#
+#  # cd /sys/kernel/debug/tracing
+#  # echo schedule > set_ftrace_filter
+#  # echo function > current_tracer
+#
+# If this works, then we know that something is being traced that shouldn't be.
+#
+#  # echo nop > current_tracer
+#
+#  # cat available_filter_functions > ~/full-file
+#  # ftrace-bisect ~/full-file ~/test-file ~/non-test-file
+#  # cat ~/test-file > set_ftrace_filter
+#
+# *** Note *** this will take several minutes. Setting multiple functions is
+# an O(n^2) operation, and we are dealing with thousands of functions. So go
+# have coffee, talk with your coworkers, read Facebook. And eventually, this
+# operation will end.
+#
+#  # echo function > current_tracer
+#
+# If it crashes, we know that ~/test-file has a bad function.
+#
+#   Reboot back to test kernel.
+#
+#     # cd /sys/kernel/debug/tracing
+#     # mv ~/test-file ~/full-file
+#
+# If it didn't crash.
+#
+#     # echo nop > current_tracer
+#     # mv ~/non-test-file ~/full-file
+#
+# Get rid of the other test files from the previous run (or save them off somewhere).
+#  # rm -f ~/test-file ~/non-test-file
+#
+# And start again:
+#
+#  # ftrace-bisect ~/full-file ~/test-file ~/non-test-file
+#
+# The good thing is, because this cuts the number of functions in ~/test-file
+# by half, the cat of it into set_ftrace_filter takes half as long each
+# iteration, so don't talk so much at the water cooler the second time.
+#
+# Eventually, if you did this correctly, you will get down to the problem
+# function, and all we need to do is to notrace it.
+#
+# To confirm that the remaining function really is the bad one, just do:
+#
+#  # echo <problem-function> > set_ftrace_notrace
+#  # echo > set_ftrace_filter
+#  # echo function > current_tracer
+#
+# And if it doesn't crash, we are done.
+#
+# If it does crash, do this again (there's more than one problem function)
+# but you need to echo the problem function(s) into set_ftrace_notrace before
+# enabling function tracing in the above steps. Or if you can compile the
+# kernel, annotate the problem functions with "notrace" and start again.
+#
+
+
+if [ $# -ne 3 ]; then
+  echo 'usage: ftrace-bisect full-file test-file  non-test-file'
+  exit
+fi
+
+full=$1
+test=$2
+nontest=$3
+
+x=`cat $full | wc -l`
+if [ $x -eq 1 ]; then
+       echo "There's only one function left, must be the bad one"
+       cat $full
+       exit 0
+fi
+
+let x=$x/2
+let y=$x+1
+
+if [ ! -f $full ]; then
+       echo "$full does not exist"
+       exit 1
+fi
+
+if [ -f $test ]; then
+       echo -n "$test exists, delete it? [y/N]"
+       read a
+       if [ "$a" != "y" -a "$a" != "Y" ]; then
+               exit 1
+       fi
+fi
+
+if [ -f $nontest ]; then
+       echo -n "$nontest exists, delete it? [y/N]"
+       read a
+       if [ "$a" != "y" -a "$a" != "Y" ]; then
+               exit 1
+       fi
+fi
+
+sed -ne "1,${x}p" $full > $test
+sed -ne "$y,\$p" $full > $nontest
index ff9e5f2..f770dba 100644 (file)
@@ -15,6 +15,7 @@ TARGETS += memory-hotplug
 TARGETS += mount
 TARGETS += mqueue
 TARGETS += net
+TARGETS += nsfs
 TARGETS += powerpc
 TARGETS += pstore
 TARGETS += ptrace
diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/nsfs/Makefile
new file mode 100644 (file)
index 0000000..2306054
--- /dev/null
@@ -0,0 +1,12 @@
+TEST_PROGS := owner pidns
+
+CFLAGS := -Wall -Werror
+
+all: owner pidns
+owner: owner.c
+pidns: pidns.c
+
+clean:
+       $(RM) owner pidns
+
+include ../lib.mk
diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/nsfs/owner.c
new file mode 100644 (file)
index 0000000..437205f
--- /dev/null
@@ -0,0 +1,91 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#define NSIO    0xb7
+#define NS_GET_USERNS   _IO(NSIO, 0x1)
+
+#define pr_err(fmt, ...) \
+               ({ \
+                       fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+                               __func__, __LINE__, ##__VA_ARGS__); \
+                       1; \
+               })
+
+int main(int argc, char *argv[])
+{
+       int pfd[2], ns, uns, init_uns;
+       struct stat st1, st2;
+       char path[128];
+       pid_t pid;
+       char c;
+
+       if (pipe(pfd))
+               return 1;
+
+       pid = fork();
+       if (pid < 0)
+               return pr_err("fork");
+       if (pid == 0) {
+               prctl(PR_SET_PDEATHSIG, SIGKILL);
+               if (unshare(CLONE_NEWUTS | CLONE_NEWUSER))
+                       return pr_err("unshare");
+               close(pfd[0]);
+               close(pfd[1]);
+               while (1)
+                       sleep(1);
+               return 0;
+       }
+       close(pfd[1]);
+       if (read(pfd[0], &c, 1) != 0)
+               return pr_err("Unable to read from pipe");
+       close(pfd[0]);
+
+       snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+       ns = open(path, O_RDONLY);
+       if (ns < 0)
+               return pr_err("Unable to open %s", path);
+
+       uns = ioctl(ns, NS_GET_USERNS);
+       if (uns < 0)
+               return pr_err("Unable to get an owning user namespace");
+
+       if (fstat(uns, &st1))
+               return pr_err("fstat");
+
+       snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+       if (stat(path, &st2))
+               return pr_err("stat");
+
+       if (st1.st_ino != st2.st_ino)
+               return pr_err("NS_GET_USERNS returned a wrong namespace");
+
+       init_uns = ioctl(uns, NS_GET_USERNS);
+       if (init_uns < 0)
+               return pr_err("Unable to get an owning user namespace");
+
+       if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+
+       if (unshare(CLONE_NEWUSER))
+               return pr_err("unshare");
+
+       if (ioctl(ns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+       if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+
+       kill(pid, SIGKILL);
+       wait(NULL);
+       return 0;
+}
diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/nsfs/pidns.c
new file mode 100644 (file)
index 0000000..ae3a0d6
--- /dev/null
@@ -0,0 +1,78 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#define pr_err(fmt, ...) \
+               ({ \
+                       fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+                               __func__, __LINE__, ##__VA_ARGS__); \
+                       1; \
+               })
+
+#define NSIO   0xb7
+#define NS_GET_USERNS   _IO(NSIO, 0x1)
+#define NS_GET_PARENT   _IO(NSIO, 0x2)
+
+#define __stack_aligned__      __attribute__((aligned(16)))
+struct cr_clone_arg {
+       char stack[128] __stack_aligned__;
+       char stack_ptr[0];
+};
+
+static int child(void *args)
+{
+       prctl(PR_SET_PDEATHSIG, SIGKILL);
+       while (1)
+               sleep(1);
+       exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+       char *ns_strs[] = {"pid", "user"};
+       char path[] = "/proc/0123456789/ns/pid";
+       struct cr_clone_arg ca;
+       struct stat st1, st2;
+       int ns, pns, i;
+       pid_t pid;
+
+       pid = clone(child, ca.stack_ptr, CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
+       if (pid < 0)
+               return pr_err("clone");
+
+       for (i = 0; i < 2; i++) {
+               snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns_strs[i]);
+               ns = open(path, O_RDONLY);
+               if (ns < 0)
+                       return pr_err("Unable to open %s", path);
+
+               pns = ioctl(ns, NS_GET_PARENT);
+               if (pns < 0)
+                       return pr_err("Unable to get a parent pidns");
+
+               snprintf(path, sizeof(path), "/proc/self/ns/%s", ns_strs[i]);
+               if (stat(path, &st2))
+                       return pr_err("Unable to stat %s", path);
+               if (fstat(pns, &st1))
+                       return pr_err("Unable to stat the parent pidns");
+               if (st1.st_ino != st2.st_ino)
+                       return pr_err("NS_GET_PARENT returned a wrong namespace");
+
+               if (ioctl(pns, NS_GET_PARENT) >= 0 || errno != EPERM)
+                       return pr_err("Don't get EPERM");;
+       }
+
+       kill(pid, SIGKILL);
+       wait(NULL);
+       return 0;
+}
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
new file mode 100644 (file)
index 0000000..528af4b
--- /dev/null
@@ -0,0 +1,152 @@
+/*
+ * (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#ifndef CONFIG_ARM64
+#define COMPAT_PSR_T_BIT       PSR_T_BIT
+#define COMPAT_PSR_IT_MASK     PSR_IT_MASK
+#endif
+
+/*
+ * stolen from arch/arm/kernel/opcodes.c
+ *
+ * condition code lookup table
+ * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
+ *
+ * bit position in short is condition code: NZCV
+ */
+static const unsigned short cc_map[16] = {
+       0xF0F0,                 /* EQ == Z set            */
+       0x0F0F,                 /* NE                     */
+       0xCCCC,                 /* CS == C set            */
+       0x3333,                 /* CC                     */
+       0xFF00,                 /* MI == N set            */
+       0x00FF,                 /* PL                     */
+       0xAAAA,                 /* VS == V set            */
+       0x5555,                 /* VC                     */
+       0x0C0C,                 /* HI == C set && Z clear */
+       0xF3F3,                 /* LS == C clear || Z set */
+       0xAA55,                 /* GE == (N==V)           */
+       0x55AA,                 /* LT == (N!=V)           */
+       0x0A05,                 /* GT == (!Z && (N==V))   */
+       0xF5FA,                 /* LE == (Z || (N!=V))    */
+       0xFFFF,                 /* AL always              */
+       0                       /* NV                     */
+};
+
+/*
+ * Check if a trapped instruction should have been executed or not.
+ */
+bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+{
+       unsigned long cpsr;
+       u32 cpsr_cond;
+       int cond;
+
+       /* Top two bits non-zero?  Unconditional. */
+       if (kvm_vcpu_get_hsr(vcpu) >> 30)
+               return true;
+
+       /* Is condition field valid? */
+       cond = kvm_vcpu_get_condition(vcpu);
+       if (cond == 0xE)
+               return true;
+
+       cpsr = *vcpu_cpsr(vcpu);
+
+       if (cond < 0) {
+               /* This can happen in Thumb mode: examine IT state. */
+               unsigned long it;
+
+               it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
+
+               /* it == 0 => unconditional. */
+               if (it == 0)
+                       return true;
+
+               /* The cond for this insn works out as the top 4 bits. */
+               cond = (it >> 4);
+       }
+
+       cpsr_cond = cpsr >> 28;
+
+       if (!((cc_map[cond] >> cpsr_cond) & 1))
+               return false;
+
+       return true;
+}
+
+/**
+ * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
+ * @vcpu:      The VCPU pointer
+ *
+ * When exceptions occur while instructions are executed in Thumb IF-THEN
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
+ * to do this little bit of work manually. The fields map like this:
+ *
+ * IT[7:0] -> CPSR[26:25],CPSR[15:10]
+ */
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+{
+       unsigned long itbits, cond;
+       unsigned long cpsr = *vcpu_cpsr(vcpu);
+       bool is_arm = !(cpsr & COMPAT_PSR_T_BIT);
+
+       if (is_arm || !(cpsr & COMPAT_PSR_IT_MASK))
+               return;
+
+       cond = (cpsr & 0xe000) >> 13;
+       itbits = (cpsr & 0x1c00) >> (10 - 2);
+       itbits |= (cpsr & (0x3 << 25)) >> 25;
+
+       /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
+       if ((itbits & 0x7) == 0)
+               itbits = cond = 0;
+       else
+               itbits = (itbits << 1) & 0x1f;
+
+       cpsr &= ~COMPAT_PSR_IT_MASK;
+       cpsr |= cond << 13;
+       cpsr |= (itbits & 0x1c) << (10 - 2);
+       cpsr |= (itbits & 0x3) << 25;
+       *vcpu_cpsr(vcpu) = cpsr;
+}
+
+/**
+ * kvm_skip_instr - skip a trapped instruction and proceed to the next
+ * @vcpu: The vcpu pointer
+ */
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+       bool is_thumb;
+
+       is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT);
+       if (is_thumb && !is_wide_instr)
+               *vcpu_pc(vcpu) += 2;
+       else
+               *vcpu_pc(vcpu) += 4;
+       kvm_adjust_itstate(vcpu);
+}
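[Editor's illustration] A worked example of the cc_map lookup used by kvm_condition_valid32() above, not part of the patch: cc_map_demo is a copy of the table, and the program checks whether an EQ-conditional instruction would have executed with CPSR flags N=0 Z=1 C=0 V=0. It is a plain userspace program, nothing in it is kernel code.

#include <stdbool.h>
#include <stdio.h>

static const unsigned short cc_map_demo[16] = {
        0xF0F0, 0x0F0F, 0xCCCC, 0x3333, 0xFF00, 0x00FF, 0xAAAA, 0x5555,
        0x0C0C, 0xF3F3, 0xAA55, 0x55AA, 0x0A05, 0xF5FA, 0xFFFF, 0x0000,
};

int main(void)
{
        unsigned long cpsr = 0x40000000;        /* Z set, N/C/V clear */
        unsigned int cond = 0x0;                /* EQ */
        unsigned int cpsr_cond = cpsr >> 28;    /* NZCV nibble == 0b0100 == 4 */
        bool executes = (cc_map_demo[cond] >> cpsr_cond) & 1;

        /* Bit 4 of 0xF0F0 is set, so this prints "yes". */
        printf("EQ with Z set executes: %s\n", executes ? "yes" : "no");
        return 0;
}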
index 77e6ccf..27a1f63 100644 (file)
@@ -31,7 +31,6 @@
 #include "trace.h"
 
 static struct timecounter *timecounter;
-static struct workqueue_struct *wqueue;
 static unsigned int host_vtimer_irq;
 static u32 host_vtimer_irq_flags;
 
@@ -141,7 +140,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
                return HRTIMER_RESTART;
        }
 
-       queue_work(wqueue, &timer->expired);
+       schedule_work(&timer->expired);
        return HRTIMER_NORESTART;
 }
 
@@ -446,13 +445,7 @@ int kvm_timer_hyp_init(void)
        if (err) {
                kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
                        host_vtimer_irq, err);
-               goto out;
-       }
-
-       wqueue = create_singlethread_workqueue("kvm_arch_timer");
-       if (!wqueue) {
-               err = -ENOMEM;
-               goto out_free;
+               return err;
        }
 
        kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
@@ -460,10 +453,6 @@ int kvm_timer_hyp_init(void)
        cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
                          "AP_KVM_ARM_TIMER_STARTING", kvm_timer_starting_cpu,
                          kvm_timer_dying_cpu);
-       goto out;
-out_free:
-       free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-out:
        return err;
 }
 
@@ -518,7 +507,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
         * VCPUs have the enabled variable set, before entering the guest, if
         * the arch timers are enabled.
         */
-       if (timecounter && wqueue)
+       if (timecounter)
                timer->enabled = 1;
 
        return 0;
index 7cffd93..c8aeb7b 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/irqchip/arm-gic.h>
 #include <linux/kvm_host.h>
 
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
@@ -167,3 +168,59 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
        vcpu->arch.vgic_cpu.live_lrs = live_lrs;
 }
+
+#ifdef CONFIG_ARM64
+/*
+ * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
+ *                                  guest.
+ *
+ * @vcpu: the offending vcpu
+ *
+ * Returns:
+ *  1: GICV access successfully performed
+ *  0: Not a GICV access
+ * -1: Illegal GICV access
+ */
+int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       phys_addr_t fault_ipa;
+       void __iomem *addr;
+       int rd;
+
+       /* Build the full address */
+       fault_ipa  = kvm_vcpu_get_fault_ipa(vcpu);
+       fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+
+       /* If not for GICV, move on */
+       if (fault_ipa <  vgic->vgic_cpu_base ||
+           fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
+               return 0;
+
+       /* Reject anything but a 32bit access */
+       if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
+               return -1;
+
+       /* Not aligned? Don't bother */
+       if (fault_ipa & 3)
+               return -1;
+
+       rd = kvm_vcpu_dabt_get_rd(vcpu);
+       addr  = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va);
+       addr += fault_ipa - vgic->vgic_cpu_base;
+
+       if (kvm_vcpu_dabt_iswrite(vcpu)) {
+               u32 data = vcpu_data_guest_to_host(vcpu,
+                                                  vcpu_get_reg(vcpu, rd),
+                                                  sizeof(u32));
+               writel_relaxed(data, addr);
+       } else {
+               u32 data = readl_relaxed(addr);
+               vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
+                                                              sizeof(u32)));
+       }
+
+       return 1;
+}
+#endif
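[Editor's illustration] A hypothetical caller, only to show how the 1 / 0 / -1 return convention documented for __vgic_v2_perform_cpuif_access() above might be dispatched; handle_gicv_fault_sketch() does not exist in the tree, and the real dispatch is presumably done from the hyp-mode guest-exit path.

/* Hypothetical dispatch on the documented return values - not in the patch. */
static int handle_gicv_fault_sketch(struct kvm_vcpu *vcpu)
{
        int ret = __vgic_v2_perform_cpuif_access(vcpu);

        if (ret == 1)
                return 1;       /* emulated in hyp, resume the guest */
        if (ret == -1)
                return -1;      /* illegal GICV access, caller must abort it */
        return 0;               /* not GICV, fall back to normal MMIO handling */
}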
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
new file mode 100644 (file)
index 0000000..3947095
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+
+#define vtr_to_max_lr_idx(v)           ((v) & 0xf)
+#define vtr_to_nr_pri_bits(v)          (((u32)(v) >> 29) + 1)
+
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               return read_gicreg(ICH_LR0_EL2);
+       case 1:
+               return read_gicreg(ICH_LR1_EL2);
+       case 2:
+               return read_gicreg(ICH_LR2_EL2);
+       case 3:
+               return read_gicreg(ICH_LR3_EL2);
+       case 4:
+               return read_gicreg(ICH_LR4_EL2);
+       case 5:
+               return read_gicreg(ICH_LR5_EL2);
+       case 6:
+               return read_gicreg(ICH_LR6_EL2);
+       case 7:
+               return read_gicreg(ICH_LR7_EL2);
+       case 8:
+               return read_gicreg(ICH_LR8_EL2);
+       case 9:
+               return read_gicreg(ICH_LR9_EL2);
+       case 10:
+               return read_gicreg(ICH_LR10_EL2);
+       case 11:
+               return read_gicreg(ICH_LR11_EL2);
+       case 12:
+               return read_gicreg(ICH_LR12_EL2);
+       case 13:
+               return read_gicreg(ICH_LR13_EL2);
+       case 14:
+               return read_gicreg(ICH_LR14_EL2);
+       case 15:
+               return read_gicreg(ICH_LR15_EL2);
+       }
+
+       unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+       switch (lr & 0xf) {
+       case 0:
+               write_gicreg(val, ICH_LR0_EL2);
+               break;
+       case 1:
+               write_gicreg(val, ICH_LR1_EL2);
+               break;
+       case 2:
+               write_gicreg(val, ICH_LR2_EL2);
+               break;
+       case 3:
+               write_gicreg(val, ICH_LR3_EL2);
+               break;
+       case 4:
+               write_gicreg(val, ICH_LR4_EL2);
+               break;
+       case 5:
+               write_gicreg(val, ICH_LR5_EL2);
+               break;
+       case 6:
+               write_gicreg(val, ICH_LR6_EL2);
+               break;
+       case 7:
+               write_gicreg(val, ICH_LR7_EL2);
+               break;
+       case 8:
+               write_gicreg(val, ICH_LR8_EL2);
+               break;
+       case 9:
+               write_gicreg(val, ICH_LR9_EL2);
+               break;
+       case 10:
+               write_gicreg(val, ICH_LR10_EL2);
+               break;
+       case 11:
+               write_gicreg(val, ICH_LR11_EL2);
+               break;
+       case 12:
+               write_gicreg(val, ICH_LR12_EL2);
+               break;
+       case 13:
+               write_gicreg(val, ICH_LR13_EL2);
+               break;
+       case 14:
+               write_gicreg(val, ICH_LR14_EL2);
+               break;
+       case 15:
+               write_gicreg(val, ICH_LR15_EL2);
+               break;
+       }
+}
+
+static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       int i;
+       bool expect_mi;
+
+       expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE);
+
+       for (i = 0; i < nr_lr; i++) {
+               if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+                               continue;
+
+               expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) &&
+                             (cpu_if->vgic_lr[i] & ICH_LR_EOI));
+       }
+
+       if (expect_mi) {
+               cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
+
+               if (cpu_if->vgic_misr & ICH_MISR_EOI)
+                       cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2);
+               else
+                       cpu_if->vgic_eisr = 0;
+       } else {
+               cpu_if->vgic_misr = 0;
+               cpu_if->vgic_eisr = 0;
+       }
+}
+
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+
+       /*
+        * Make sure stores to the GIC via the memory mapped interface
+        * are now visible to the system register interface.
+        */
+       if (!cpu_if->vgic_sre)
+               dsb(st);
+
+       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
+
+       if (vcpu->arch.vgic_cpu.live_lrs) {
+               int i;
+               u32 max_lr_idx, nr_pri_bits;
+
+               cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
+
+               write_gicreg(0, ICH_HCR_EL2);
+               val = read_gicreg(ICH_VTR_EL2);
+               max_lr_idx = vtr_to_max_lr_idx(val);
+               nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+               save_maint_int_state(vcpu, max_lr_idx + 1);
+
+               for (i = 0; i <= max_lr_idx; i++) {
+                       if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+                               continue;
+
+                       if (cpu_if->vgic_elrsr & (1 << i))
+                               cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+                       else
+                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+
+                       __gic_v3_set_lr(0, i);
+               }
+
+               switch (nr_pri_bits) {
+               case 7:
+                       cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
+                       cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
+               case 6:
+                       cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
+               default:
+                       cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
+               }
+
+               switch (nr_pri_bits) {
+               case 7:
+                       cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
+                       cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
+               case 6:
+                       cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
+               default:
+                       cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+               }
+
+               vcpu->arch.vgic_cpu.live_lrs = 0;
+       } else {
+               cpu_if->vgic_misr  = 0;
+               cpu_if->vgic_eisr  = 0;
+               cpu_if->vgic_elrsr = 0xffff;
+               cpu_if->vgic_ap0r[0] = 0;
+               cpu_if->vgic_ap0r[1] = 0;
+               cpu_if->vgic_ap0r[2] = 0;
+               cpu_if->vgic_ap0r[3] = 0;
+               cpu_if->vgic_ap1r[0] = 0;
+               cpu_if->vgic_ap1r[1] = 0;
+               cpu_if->vgic_ap1r[2] = 0;
+               cpu_if->vgic_ap1r[3] = 0;
+       }
+
+       val = read_gicreg(ICC_SRE_EL2);
+       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+
+       if (!cpu_if->vgic_sre) {
+               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+               isb();
+               write_gicreg(1, ICC_SRE_EL1);
+       }
+}
+
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+       u32 max_lr_idx, nr_pri_bits;
+       u16 live_lrs = 0;
+       int i;
+
+       /*
+        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+        * Group0 interrupt (as generated in GICv2 mode) to be
+        * delivered as a FIQ to the guest, with potentially fatal
+        * consequences. So we must make sure that ICC_SRE_EL1 has
+        * been actually programmed with the value we want before
+        * starting to mess with the rest of the GIC.
+        */
+       if (!cpu_if->vgic_sre) {
+               write_gicreg(0, ICC_SRE_EL1);
+               isb();
+       }
+
+       val = read_gicreg(ICH_VTR_EL2);
+       max_lr_idx = vtr_to_max_lr_idx(val);
+       nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+       for (i = 0; i <= max_lr_idx; i++) {
+               if (cpu_if->vgic_lr[i] & ICH_LR_STATE)
+                       live_lrs |= (1 << i);
+       }
+
+       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+       if (live_lrs) {
+               write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+               switch (nr_pri_bits) {
+               case 7:
+                       write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
+                       write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
+               case 6:
+                       write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
+               default:
+                       write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+               }
+
+               switch (nr_pri_bits) {
+               case 7:
+                       write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
+                       write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+               case 6:
+                       write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+               default:
+                       write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+               }
+
+               for (i = 0; i <= max_lr_idx; i++) {
+                       if (!(live_lrs & (1 << i)))
+                               continue;
+
+                       __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+               }
+       }
+
+       /*
+        * Ensures that the above will have reached the
+        * (re)distributors. This ensures the guest will read the
+        * correct values from the memory-mapped interface.
+        */
+       if (!cpu_if->vgic_sre) {
+               isb();
+               dsb(sy);
+       }
+       vcpu->arch.vgic_cpu.live_lrs = live_lrs;
+
+       /*
+        * Prevent the guest from touching the GIC system registers if
+        * SRE isn't enabled for GICv3 emulation.
+        */
+       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+                    ICC_SRE_EL2);
+}
+
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+       int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+       int i;
+
+       for (i = 0; i <= max_lr_idx; i++)
+               __gic_v3_set_lr(0, i);
+}
+
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
+{
+       return read_gicreg(ICH_VTR_EL2);
+}
index a027569..6e9c40e 100644 (file)
@@ -423,6 +423,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
        if (!kvm_arm_support_pmu_v3())
                return -ENODEV;
 
+       /*
+        * We currently require an in-kernel VGIC to use the PMU emulation,
+        * because we do not support forwarding PMU overflow interrupts to
+        * userspace yet.
+        */
+       if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))
+               return -ENODEV;
+
        if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) ||
            !kvm_arm_pmu_irq_initialized(vcpu))
                return -ENXIO;
index 83777c1..8cebfbc 100644 (file)
@@ -405,6 +405,10 @@ int kvm_vgic_hyp_init(void)
                break;
        case GIC_V3:
                ret = vgic_v3_probe(gic_kvm_info);
+               if (!ret) {
+                       static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+                       kvm_info("GIC system register CPU interface enabled\n");
+               }
                break;
        default:
                ret = -ENODEV;
index b31a51a..d918dcf 100644 (file)
@@ -46,15 +46,9 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
  * @ue: user api routing entry handle
  * return 0 on success, -EINVAL on errors.
  */
-#ifdef KVM_CAP_X2APIC_API
 int kvm_set_routing_entry(struct kvm *kvm,
                          struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
-#else
-/* Remove this version and the ifdefery once merged into 4.8 */
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
-                         const struct kvm_irq_routing_entry *ue)
-#endif
 {
        int r = -EINVAL;
 
index 1813f93..ce1f4ed 100644 (file)
@@ -71,7 +71,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                addr_ptr = &vgic->vgic_cpu_base;
                alignment = SZ_4K;
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_VGIC_V3_ADDR_TYPE_DIST:
                type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
                addr_ptr = &vgic->vgic_dist_base;
@@ -82,7 +81,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                addr_ptr = &vgic->vgic_redist_base;
                alignment = SZ_64K;
                break;
-#endif
        default:
                r = -ENODEV;
                goto out;
@@ -219,52 +217,65 @@ int kvm_register_vgic_device(unsigned long type)
                ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
                                              KVM_DEV_TYPE_ARM_VGIC_V2);
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_DEV_TYPE_ARM_VGIC_V3:
                ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
                                              KVM_DEV_TYPE_ARM_VGIC_V3);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
                if (ret)
                        break;
                ret = kvm_vgic_register_its_device();
-               break;
 #endif
+               break;
        }
 
        return ret;
 }
 
-/** vgic_attr_regs_access: allows user space to read/write VGIC registers
- *
- * @dev: kvm device handle
- * @attr: kvm device attribute
- * @reg: address the value is read or written
- * @is_write: write flag
- *
- */
-static int vgic_attr_regs_access(struct kvm_device *dev,
-                                struct kvm_device_attr *attr,
-                                u32 *reg, bool is_write)
-{
+struct vgic_reg_attr {
+       struct kvm_vcpu *vcpu;
        gpa_t addr;
-       int cpuid, ret, c;
-       struct kvm_vcpu *vcpu, *tmp_vcpu;
-       int vcpu_lock_idx = -1;
+};
+
+static int parse_vgic_v2_attr(struct kvm_device *dev,
+                             struct kvm_device_attr *attr,
+                             struct vgic_reg_attr *reg_attr)
+{
+       int cpuid;
 
        cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
                 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-       addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
 
-       mutex_lock(&dev->kvm->lock);
+       if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+               return -EINVAL;
 
-       ret = vgic_init(dev->kvm);
-       if (ret)
-               goto out;
+       reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+       reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
 
-       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
-               ret = -EINVAL;
-               goto out;
+       return 0;
+}
+
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+       struct kvm_vcpu *tmp_vcpu;
+
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+               mutex_unlock(&tmp_vcpu->mutex);
        }
+}
+
+static void unlock_all_vcpus(struct kvm *kvm)
+{
+       unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+static bool lock_all_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *tmp_vcpu;
+       int c;
 
        /*
         * Any time a vcpu is run, vcpu_load is called which tries to grab the
@@ -272,11 +283,49 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
         * that no other VCPUs are run and fiddle with the vgic state while we
         * access it.
         */
-       ret = -EBUSY;
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
-               if (!mutex_trylock(&tmp_vcpu->mutex))
-                       goto out;
-               vcpu_lock_idx = c;
+       kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+               if (!mutex_trylock(&tmp_vcpu->mutex)) {
+                       unlock_vcpus(kvm, c - 1);
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/**
+ * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      address the value is read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_attr_regs_access_v2(struct kvm_device *dev,
+                                   struct kvm_device_attr *attr,
+                                   u32 *reg, bool is_write)
+{
+       struct vgic_reg_attr reg_attr;
+       gpa_t addr;
+       struct kvm_vcpu *vcpu;
+       int ret;
+
+       ret = parse_vgic_v2_attr(dev, attr, &reg_attr);
+       if (ret)
+               return ret;
+
+       vcpu = reg_attr.vcpu;
+       addr = reg_attr.addr;
+
+       mutex_lock(&dev->kvm->lock);
+
+       ret = vgic_init(dev->kvm);
+       if (ret)
+               goto out;
+
+       if (!lock_all_vcpus(dev->kvm)) {
+               ret = -EBUSY;
+               goto out;
        }
 
        switch (attr->group) {
@@ -291,18 +340,12 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
                break;
        }
 
+       unlock_all_vcpus(dev->kvm);
 out:
-       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-               tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx);
-               mutex_unlock(&tmp_vcpu->mutex);
-       }
-
        mutex_unlock(&dev->kvm->lock);
        return ret;
 }
 
-/* V2 ops */
-
 static int vgic_v2_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
 {
@@ -321,7 +364,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev,
                if (get_user(reg, uaddr))
                        return -EFAULT;
 
-               return vgic_attr_regs_access(dev, attr, &reg, true);
+               return vgic_attr_regs_access_v2(dev, attr, &reg, true);
        }
        }
 
@@ -343,7 +386,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev,
                u32 __user *uaddr = (u32 __user *)(long)attr->addr;
                u32 reg = 0;
 
-               ret = vgic_attr_regs_access(dev, attr, &reg, false);
+               ret = vgic_attr_regs_access_v2(dev, attr, &reg, false);
                if (ret)
                        return ret;
                return put_user(reg, uaddr);
@@ -387,10 +430,6 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = {
        .has_attr = vgic_v2_has_attr,
 };
 
-/* V3 ops */
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-
 static int vgic_v3_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
 {
@@ -433,5 +472,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
        .get_attr = vgic_v3_get_attr,
        .has_attr = vgic_v3_has_attr,
 };
-
-#endif /* CONFIG_KVM_ARM_VGIC_V3 */
index 90d8181..0d3c76a 100644 (file)
@@ -23,7 +23,7 @@
 #include "vgic-mmio.h"
 
 /* extract @num bytes at @offset bytes offset in data */
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
                            unsigned int num)
 {
        return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
@@ -42,6 +42,7 @@ u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
        return reg | ((u64)val << lower);
 }
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
 bool vgic_has_its(struct kvm *kvm)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
@@ -51,6 +52,7 @@ bool vgic_has_its(struct kvm *kvm)
 
        return dist->has_its;
 }
+#endif
 
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
@@ -179,7 +181,7 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
        int target_vcpu_id = vcpu->vcpu_id;
        u64 value;
 
-       value = (mpidr & GENMASK(23, 0)) << 32;
+       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
        value |= ((target_vcpu_id & 0xffff) << 8);
        if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
                value |= GICR_TYPER_LAST;
@@ -609,7 +611,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
        bool broadcast;
 
        sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+       broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
        target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
        mpidr = SGI_AFFINITY_LEVEL(reg, 3);
        mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
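[Editor's illustration] The two changes above work together: vgic_mmio_read_v3r_typer() now casts before shifting so the MPIDR affinity field lands in bits 63:32 of GICR_TYPER, and extract_bytes() takes a u64 so a 32-bit ARM build no longer truncates it. A hypothetical self-check, not part of the patch, makes the point:

/*
 * Hypothetical demo - not in the tree. With the old "unsigned long"
 * prototype a 32-bit build dropped everything above bit 31, so this
 * WARN_ON would fire; with the u64 prototype it never does.
 */
static void vgic_typer_extract_demo(unsigned long mpidr)
{
        u64 typer = (u64)(mpidr & GENMASK(23, 0)) << 32;

        /* Extract 3 bytes at byte offset 4: the stored affinity field. */
        WARN_ON(extract_bytes(typer, 4, 3) != (mpidr & GENMASK(23, 0)));
}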
index 3bad3c5..e18b30d 100644 (file)
@@ -550,11 +550,9 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
        case VGIC_V2:
                len = vgic_v2_init_dist_iodev(io_device);
                break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
        case VGIC_V3:
                len = vgic_v3_init_dist_iodev(io_device);
                break;
-#endif
        default:
                BUG_ON(1);
        }
index 0b3ecf9..4c34d39 100644 (file)
@@ -96,7 +96,7 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
                                unsigned long data);
 
-unsigned long extract_bytes(unsigned long data, unsigned int offset,
+unsigned long extract_bytes(u64 data, unsigned int offset,
                            unsigned int num);
 
 u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
@@ -162,12 +162,10 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 u64 vgic_sanitise_outer_cacheability(u64 reg);
 u64 vgic_sanitise_inner_cacheability(u64 reg);
 u64 vgic_sanitise_shareability(u64 reg);
 u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
                        u64 (*sanitise_fn)(u64));
-#endif
 
 #endif
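
With CONFIG_KVM_ARM_VGIC_V3 gone, the GICv3 register-sanitising helpers are always declared. For readers new to them: vgic_sanitise_field() pulls one field out of a 64-bit register, passes it through a per-field callback (for example one of the cacheability or shareability sanitisers above), and folds the result back in. A plausible implementation consistent with this prototype, shown as a sketch rather than quoted from the tree:

    u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
                            u64 (*sanitise_fn)(u64))
    {
            u64 field = (reg & field_mask) >> field_shift;

            field = sanitise_fn(field) << field_shift;
            return (reg & ~field_mask) | field;
    }

    /* Usage shape (mask/shift names here are illustrative):
     *   reg = vgic_sanitise_field(reg, PROPBASER_SHAREABILITY_MASK,
     *                             PROPBASER_SHAREABILITY_SHIFT,
     *                             vgic_sanitise_shareability);
     */
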
index 0bf6709..0a063af 100644
@@ -278,12 +278,14 @@ int vgic_v2_map_resources(struct kvm *kvm)
                goto out;
        }
 
-       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-                                   kvm_vgic_global_state.vcpu_base,
-                                   KVM_VGIC_V2_CPU_SIZE, true);
-       if (ret) {
-               kvm_err("Unable to remap VGIC CPU to VCPU\n");
-               goto out;
+       if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+               ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+                                           kvm_vgic_global_state.vcpu_base,
+                                           KVM_VGIC_V2_CPU_SIZE, true);
+               if (ret) {
+                       kvm_err("Unable to remap VGIC CPU to VCPU\n");
+                       goto out;
+               }
        }
 
        dist->ready = true;
@@ -294,6 +296,8 @@ out:
        return ret;
 }
 
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
 /**
  * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
  * @node:      pointer to the DT node
@@ -310,45 +314,51 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
                return -ENXIO;
        }
 
-       if (!PAGE_ALIGNED(info->vcpu.start)) {
-               kvm_err("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)info->vcpu.start);
-               return -ENXIO;
-       }
+       if (!PAGE_ALIGNED(info->vcpu.start) ||
+           !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+               kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start,
+                                                            resource_size(&info->vcpu));
+               if (!kvm_vgic_global_state.vcpu_base_va) {
+                       kvm_err("Cannot ioremap GICV\n");
+                       return -ENOMEM;
+               }
 
-       if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
-               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(&info->vcpu),
-                       PAGE_SIZE);
-               return -ENXIO;
+               ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va,
+                                            kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu),
+                                            info->vcpu.start);
+               if (ret) {
+                       kvm_err("Cannot map GICV into hyp\n");
+                       goto out;
+               }
+
+               static_branch_enable(&vgic_v2_cpuif_trap);
        }
 
        kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
                                                   resource_size(&info->vctrl));
        if (!kvm_vgic_global_state.vctrl_base) {
                kvm_err("Cannot ioremap GICH\n");
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
 
        vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
        kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
 
-       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-       if (ret) {
-               kvm_err("Cannot register GICv2 KVM device\n");
-               iounmap(kvm_vgic_global_state.vctrl_base);
-               return ret;
-       }
-
        ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
                                     kvm_vgic_global_state.vctrl_base +
                                         resource_size(&info->vctrl),
                                     info->vctrl.start);
        if (ret) {
                kvm_err("Cannot map VCTRL into hyp\n");
-               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
-               iounmap(kvm_vgic_global_state.vctrl_base);
-               return ret;
+               goto out;
+       }
+
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+       if (ret) {
+               kvm_err("Cannot register GICv2 KVM device\n");
+               goto out;
        }
 
        kvm_vgic_global_state.can_emulate_gicv2 = true;
@@ -359,4 +369,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
        kvm_info("vgic-v2@%llx\n", info->vctrl.start);
 
        return 0;
+out:
+       if (kvm_vgic_global_state.vctrl_base)
+               iounmap(kvm_vgic_global_state.vctrl_base);
+       if (kvm_vgic_global_state.vcpu_base_va)
+               iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+       return ret;
 }
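
The reworked probe path above no longer rejects a GICV region whose base or size is not page aligned. Instead it ioremaps the region for the host, maps it into the hyp address space, and flips the vgic_v2_cpuif_trap static key so that guest accesses to the GICv2 CPU interface are trapped and emulated (slower, but safe); vgic_v2_map_resources() correspondingly skips the direct stage-2 mapping when the key is set, and all probe error paths now funnel through the shared out: label that unmaps whatever was mapped. A stripped-down sketch of the static-branch pattern involved; the helper names are invented for illustration:

    #include <linux/jump_label.h>

    /* False by default, so the fast (non-trapping) path costs nothing;
     * enabling it at probe time patches every branch site to the slow path. */
    DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);

    /* Probe-time decision (illustrative). */
    static void gicv_probe_sketch(bool gicv_region_usable)
    {
            if (!gicv_region_usable)
                    static_branch_enable(&vgic_v2_cpuif_trap);
    }

    /* Per-VM setup (illustrative): only map GICV straight into the guest
     * when we are not trapping; otherwise hyp emulates the accesses. */
    static bool want_direct_gicv_map(void)
    {
            return !static_branch_unlikely(&vgic_v2_cpuif_trap);
    }
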
index e83b7fe..2893d5b 100644
@@ -29,7 +29,7 @@
 #define DEBUG_SPINLOCK_BUG_ON(p)
 #endif
 
-struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
+struct vgic_global __section(.hyp.text) kvm_vgic_global_state = {.gicv3_cpuif = STATIC_KEY_FALSE_INIT,};
 
 /*
  * Locking order is always:
@@ -645,6 +645,9 @@ next:
 /* Sync back the hardware VGIC state into our emulation after a guest's run. */
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
        vgic_process_maintenance_interrupt(vcpu);
        vgic_fold_lr_state(vcpu);
        vgic_prune_ap_list(vcpu);
@@ -653,6 +656,9 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 /* Flush our emulation state into the GIC hardware before entering the guest. */
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 {
+       if (unlikely(!vgic_initialized(vcpu->kvm)))
+               return;
+
        spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
        vgic_flush_lr_state(vcpu);
        spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
index 6c4625c..9d9e014 100644
@@ -72,7 +72,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq)
        kref_get(&irq->refcount);
 }
 
-#ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -84,63 +83,14 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
 int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
 int vgic_register_its_iodevs(struct kvm *kvm);
 bool vgic_has_its(struct kvm *kvm);
 int kvm_vgic_register_its_device(void);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
-static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
-                                      struct vgic_irq *irq, int lr)
-{
-}
-
-static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-}
-
-static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-}
-
-static inline void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline int vgic_v3_probe(const struct gic_kvm_info *info)
-{
-       return -ENODEV;
-}
-
-static inline int vgic_v3_map_resources(struct kvm *kvm)
-{
-       return -ENODEV;
-}
-
-static inline int vgic_register_redist_iodevs(struct kvm *kvm,
-                                             gpa_t dist_base_address)
-{
-       return -ENODEV;
-}
-
 static inline int vgic_register_its_iodevs(struct kvm *kvm)
 {
        return -ENODEV;
index e469b60..f397e9b 100644
@@ -42,7 +42,6 @@
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 
-static struct workqueue_struct *irqfd_cleanup_wq;
 
 static void
 irqfd_inject(struct work_struct *work)
@@ -168,7 +167,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
 
        list_del_init(&irqfd->list);
 
-       queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
+       schedule_work(&irqfd->shutdown);
 }
 
 int __attribute__((weak)) kvm_arch_set_irq_inatomic(
@@ -555,7 +554,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
-       flush_workqueue(irqfd_cleanup_wq);
+       flush_work(&irqfd->shutdown);
 
        return 0;
 }
@@ -592,7 +591,7 @@ kvm_irqfd_release(struct kvm *kvm)
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
-       flush_workqueue(irqfd_cleanup_wq);
+       flush_work(&irqfd->shutdown);
 
 }
 
@@ -622,23 +621,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
        spin_unlock_irq(&kvm->irqfds.lock);
 }
 
-/*
- * create a host-wide workqueue for issuing deferred shutdown requests
- * aggregated from all vm* instances. We need our own isolated single-thread
- * queue to prevent deadlock against flushing the normal work-queue.
- */
-int kvm_irqfd_init(void)
-{
-       irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
-       if (!irqfd_cleanup_wq)
-               return -ENOMEM;
-
-       return 0;
-}
-
 void kvm_irqfd_exit(void)
 {
-       destroy_workqueue(irqfd_cleanup_wq);
 }
 #endif
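
The irqfd change above removes the dedicated kvm-irqfd-cleanup workqueue: shutdown work is now queued on the system workqueue with schedule_work(), and the callers that previously did flush_workqueue() wait for exactly the one item they care about with flush_work(), which also makes kvm_irqfd_init() unnecessary. A minimal sketch of that per-item deferred-teardown pattern; the structure and function names are made up:

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct demo_item {
            struct work_struct shutdown;
            /* resources torn down by the handler ... */
    };

    static void demo_item_shutdown(struct work_struct *work)
    {
            struct demo_item *it = container_of(work, struct demo_item, shutdown);

            /* release its resources here; the struct itself is freed by the
             * caller once flush_work() has returned */
            (void)it;
    }

    static struct demo_item *demo_item_create(void)
    {
            struct demo_item *it = kzalloc(sizeof(*it), GFP_KERNEL);

            if (it)
                    INIT_WORK(&it->shutdown, demo_item_shutdown);
            return it;
    }

    static void demo_item_destroy(struct demo_item *it)
    {
            schedule_work(&it->shutdown);   /* defer the teardown */
            flush_work(&it->shutdown);      /* wait for this item only */
            kfree(it);
    }
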
 
index 1950782..81dfc73 100644
@@ -559,9 +559,11 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 
        debugfs_remove_recursive(kvm->debugfs_dentry);
 
-       for (i = 0; i < kvm_debugfs_num_entries; i++)
-               kfree(kvm->debugfs_stat_data[i]);
-       kfree(kvm->debugfs_stat_data);
+       if (kvm->debugfs_stat_data) {
+               for (i = 0; i < kvm_debugfs_num_entries; i++)
+                       kfree(kvm->debugfs_stat_data[i]);
+               kfree(kvm->debugfs_stat_data);
+       }
 }
 
 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
@@ -2369,6 +2371,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
        struct kvm_vcpu *vcpu = filp->private_data;
 
+       debugfs_remove_recursive(vcpu->debugfs_dentry);
        kvm_put_kvm(vcpu->kvm);
        return 0;
 }
@@ -2391,6 +2394,32 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
+static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       char dir_name[ITOA_MAX_LEN * 2];
+       int ret;
+
+       if (!kvm_arch_has_vcpu_debugfs())
+               return 0;
+
+       if (!debugfs_initialized())
+               return 0;
+
+       snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+       vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
+                                                               vcpu->kvm->debugfs_dentry);
+       if (!vcpu->debugfs_dentry)
+               return -ENOMEM;
+
+       ret = kvm_arch_create_vcpu_debugfs(vcpu);
+       if (ret < 0) {
+               debugfs_remove_recursive(vcpu->debugfs_dentry);
+               return ret;
+       }
+
+       return 0;
+}
+
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
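
The new kvm_create_vcpu_debugfs() above gives every vCPU a vcpu<N> directory under the VM's debugfs entry, lets an architecture hook populate it, and unwinds the directory on failure; kvm_vcpu_release() now removes it when the vCPU file descriptor goes away. A hedged sketch of how such an arch hook might expose a simple counter, using only generic debugfs helpers; the structure, field and file names are invented for illustration:

    #include <linux/debugfs.h>
    #include <linux/errno.h>

    struct demo_vcpu {
            struct dentry *debugfs_dentry;  /* created by the common code */
            u64 exits;                      /* hypothetical per-vCPU counter */
    };

    static int demo_arch_create_vcpu_debugfs(struct demo_vcpu *vcpu)
    {
            struct dentry *d;

            /* Read-only file named "exits" inside the vcpu<N> directory. */
            d = debugfs_create_u64("exits", 0444, vcpu->debugfs_dentry,
                                   &vcpu->exits);
            return d ? 0 : -ENOMEM;
    }
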
@@ -2423,6 +2452,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_destroy;
 
+       r = kvm_create_vcpu_debugfs(vcpu);
+       if (r)
+               goto vcpu_destroy;
+
        mutex_lock(&kvm->lock);
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
@@ -2454,6 +2487,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
 unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
+       debugfs_remove_recursive(vcpu->debugfs_dentry);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
 vcpu_decrement:
@@ -3619,7 +3653,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val)
 {
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
 
-       *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+       *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
 
        return 0;
 }
@@ -3649,7 +3683,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val)
        *val = 0;
 
        kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *val += *(u32 *)((void *)vcpu + stat_data->offset);
+               *val += *(u64 *)((void *)vcpu + stat_data->offset);
 
        return 0;
 }
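
The two stat-reader hunks above widen the debugfs accessors so the cast matches the counters' declared types: the per-VM readers now go through ulong and the per-vCPU readers through u64, where the old u32 cast returned only the low half of a 64-bit counter (and the wrong half on big-endian machines). A compact stand-alone illustration of the offset-plus-cast access pattern these readers use, with made-up names and values:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    struct demo_vcpu_stats {
            uint64_t halt_exits;
    };

    int main(void)
    {
            struct demo_vcpu_stats s = { .halt_exits = 0x100000000ULL };
            size_t off = offsetof(struct demo_vcpu_stats, halt_exits);

            /* Old reader: truncates to 32 bits (0 here on little-endian). */
            uint32_t narrow = *(uint32_t *)((char *)&s + off);

            /* New reader: full width, matching the field's type. */
            uint64_t wide = *(uint64_t *)((char *)&s + off);

            printf("narrow=%u wide=%llu\n", narrow, (unsigned long long)wide);
            return 0;
    }
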
@@ -3807,12 +3841,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
         * kvm_arch_init makes sure there's at most one caller
         * for architectures that support multiple implementations,
         * like intel and amd on x86.
-        * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
-        * conflicts in case kvm is already setup for another implementation.
         */
-       r = kvm_irqfd_init();
-       if (r)
-               goto out_irqfd;
 
        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
@@ -3894,7 +3923,6 @@ out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        kvm_irqfd_exit();
-out_irqfd:
        kvm_arch_exit();
 out_fail:
        return r;