Merge tag 'nfs-for-4.9-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 04:28:20 +0000 (21:28 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 04:28:20 +0000 (21:28 -0700)
Pull NFS client updates from Anna Schumaker:
 "Highlights include:

  Stable bugfixes:
   - sunrpc: fix write space race causing stalls
   - NFS: Fix inode corruption in nfs_prime_dcache()
   - NFSv4: Don't report revoked delegations as valid in nfs_have_delegation()
   - NFSv4: nfs4_copy_delegation_stateid() must fail if the delegation is invalid
   - NFSv4: Open state recovery must account for file permission changes
   - NFSv4.2: Fix a reference leak in nfs42_proc_layoutstats_generic

  Features:
   - Add support for tracking multiple layout types with an ordered list
   - Add support for using multiple backchannel threads on the client
   - Add support for pNFS file layout session trunking
   - Delay xprtrdma use of DMA API (for device driver removal)
   - Add support for xprtrdma remote invalidation
   - Add support for larger xprtrdma inline thresholds
   - Use a scatter/gather list for sending xprtrdma RPC calls
   - Add support for the CB_NOTIFY_LOCK callback
   - Improve hashing sunrpc auth_creds by using both uid and gid

  Bugfixes:
   - Fix xprtrdma use of DMA API
   - Validate filenames before adding to the dcache
   - Fix corruption of xdr->nwords in xdr_copy_to_scratch
   - Fix setting buffer length in xdr_set_next_buffer()
   - Don't deadlock the state manager on the SEQUENCE status flags
   - Various delegation and stateid related fixes
   - Retry operations if an interrupted slot receives EREMOTEIO
   - Make nfs boot time y2038 safe"

* tag 'nfs-for-4.9-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (100 commits)
  NFSv4.2: Fix a reference leak in nfs42_proc_layoutstats_generic
  fs: nfs: Make nfs boot time y2038 safe
  sunrpc: replace generic auth_cred hash with auth-specific function
  sunrpc: add RPCSEC_GSS hash_cred() function
  sunrpc: add auth_unix hash_cred() function
  sunrpc: add generic_auth hash_cred() function
  sunrpc: add hash_cred() function to rpc_authops struct
  Retry operation on EREMOTEIO on an interrupted slot
  pNFS: Fix atime updates on pNFS clients
  sunrpc: queue work on system_power_efficient_wq
  NFSv4.1: Even if the stateid is OK, we may need to recover the open modes
  NFSv4: If recovery failed for a specific open stateid, then don't retry
  NFSv4: Fix retry issues with nfs41_test/free_stateid
  NFSv4: Open state recovery must account for file permission changes
  NFSv4: Mark the lock and open stateids as invalid after freeing them
  NFSv4: Don't test open_stateid unless it is set
  NFSv4: nfs4_do_handle_exception() handle revoke/expiry of a single stateid
  NFS: Always call nfs_inode_find_state_and_recover() when revoking a delegation
  NFSv4: Fix a race when updating an open_stateid
  NFSv4: Fix a race in nfs_inode_reclaim_delegation()
  ...

1  2 
Documentation/kernel-parameters.txt
fs/nfs/dir.c
fs/nfs/file.c
fs/nfs/internal.h
fs/nfs/nfs4proc.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_unix.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/verbs.c

@@@ -33,37 -33,6 +33,37 @@@ can also be entered a
  Double-quotes can be used to protect spaces in values, e.g.:
        param="spaces in here"
  
 +cpu lists:
 +----------
 +
 +Some kernel parameters take a list of CPUs as a value, e.g.  isolcpus,
 +nohz_full, irqaffinity, rcu_nocbs.  The format of this list is:
 +
 +      <cpu number>,...,<cpu number>
 +
 +or
 +
 +      <cpu number>-<cpu number>
 +      (must be a positive range in ascending order)
 +
 +or a mixture
 +
 +<cpu number>,...,<cpu number>-<cpu number>
 +
 +Note that for the special case of a range one can split the range into equal
 +sized groups and for each group use some amount from the beginning of that
 +group:
 +
 +      <cpu number>-<cpu number>:<used size>/<group size>
 +
 +For example one can add to the command line following parameter:
 +
 +      isolcpus=1,2,10-20,100-2000:2/25
 +
 +where the final item represents CPUs 100,101,125,126,150,151,...
 +
 +
 +
  This document may not be entirely up to date and comprehensive. The command
  "modinfo -p ${modulename}" shows a current list of all parameters of a loadable
  module. Loadable modules, after being loaded into the running kernel, also
@@@ -491,15 -460,6 +491,15 @@@ bytes respectively. Such letter suffixe
                        driver will print ACPI tables for AMD IOMMU during
                        IOMMU initialization.
  
 +      amd_iommu_intr= [HW,X86-64]
 +                      Specifies one of the following AMD IOMMU interrupt
 +                      remapping modes:
 +                      legacy     - Use legacy interrupt remapping mode.
 +                      vapic      - Use virtual APIC mode, which allows IOMMU
 +                                   to inject interrupts directly into guest.
 +                                   This mode requires kvm-amd.avic=1.
 +                                   (Default when IOMMU HW support is present.)
 +
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
                        Format: <a>,<b>
                        loops can be debugged more effectively on production
                        systems.
  
 +      clocksource.arm_arch_timer.fsl-a008585=
 +                      [ARM64]
 +                      Format: <bool>
 +                      Enable/disable the workaround of Freescale/NXP
 +                      erratum A-008585.  This can be useful for KVM
 +                      guests, if the guest device tree doesn't show the
 +                      erratum.  If unspecified, the workaround is
 +                      enabled based on the device tree.
 +
        clearcpuid=BITNUM [X86]
                        Disable CPUID feature X for the kernel. See
                        arch/x86/include/asm/cpufeatures.h for the valid bit
                        determined by the stdout-path property in device
                        tree's chosen node.
  
 -              cdns,<addr>
 -                      Start an early, polled-mode console on a cadence serial
 -                      port at the specified address. The cadence serial port
 -                      must already be setup and configured. Options are not
 -                      yet supported.
 +              cdns,<addr>[,options]
 +                      Start an early, polled-mode console on a Cadence
 +                      (xuartps) serial port at the specified address. Only
 +                      supported option is baud rate. If baud rate is not
 +                      specified, the serial port must already be setup and
 +                      configured.
  
                uart[8250],io,<addr>[,options]
                uart[8250],mmio,<addr>[,options]
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
  
 +      gpio-mockup.gpio_mockup_ranges
 +                      [HW] Sets the ranges of the gpiochip for this device.
 +                      Format: <start1>,<end1>,<start2>,<end2>...
 +
        hardlockup_all_cpu_backtrace=
                        [KNL] Should the hard-lockup detector generate
                        backtraces on all cpus.
  
        initrd=         [BOOT] Specify the location of the initial ramdisk
  
 +      init_pkru=      [x86] Specify the default memory protection keys rights
 +                      register contents for all processes.  0x55555554 by
 +                      default (disallow access to all but pkey 0).  Can
 +                      override in debugfs after boot.
 +
        inport.irq=     [HW] Inport (ATI XL and Microsoft) busmouse driver
                        Format: <irq>
  
  
        intel_idle.max_cstate=  [KNL,HW,ACPI,X86]
                        0       disables intel_idle and fall back on acpi_idle.
 -                      1 to 6  specify maximum depth of C-state.
 +                      1 to 9  specify maximum depth of C-state.
  
        intel_pstate=  [X86]
                       disable
                        See Documentation/filesystems/nfs/nfsroot.txt.
  
        irqaffinity=    [SMP] Set the default irq affinity mask
 -                      Format:
 -                      <cpu number>,...,<cpu number>
 -                      or
 -                      <cpu number>-<cpu number>
 -                      (must be a positive range in ascending order)
 -                      or a mixture
 -                      <cpu number>,...,<cpu number>-<cpu number>
 +                      The argument is a cpu list, as described above.
  
        irqfixup        [HW]
                        When an interrupt is not handled search all handlers
                        Format: <RDP>,<reset>,<pci_scan>,<verbosity>
  
        isolcpus=       [KNL,SMP] Isolate CPUs from the general scheduler.
 -                      Format:
 -                      <cpu number>,...,<cpu number>
 -                      or
 -                      <cpu number>-<cpu number>
 -                      (must be a positive range in ascending order)
 -                      or a mixture
 -                      <cpu number>,...,<cpu number>-<cpu number>
 +                      The argument is a cpu list, as described above.
  
                        This option can be used to specify one or more CPUs
                        to isolate from the general SMP balancing and scheduling
                        than or equal to this physical address is ignored.
  
        maxcpus=        [SMP] Maximum number of processors that an SMP kernel
 -                      should make use of.  maxcpus=n : n >= 0 limits the
 -                      kernel to using 'n' processors.  n=0 is a special case,
 -                      it is equivalent to "nosmp", which also disables
 -                      the IO APIC.
 +                      will bring up during bootup.  maxcpus=n : n >= 0 limits
 +                      the kernel to bring up 'n' processors. After bootup,
 +                      you can bring up the remaining plugged CPUs by executing
 +                      "echo 1 > /sys/devices/system/cpu/cpuX/online", so maxcpus
 +                      only takes effect during system bootup.
 +                      While n=0 is a special case, it is equivalent to "nosmp",
 +                      which also disables the IO APIC.
  
        max_loop=       [LOOP] The number of loop block devices that get
        (loop.max_loop) unconditionally pre-created at init time. The default
        nfsrootdebug    [NFS] enable nfsroot debugging messages.
                        See Documentation/filesystems/nfs/nfsroot.txt.
  
+       nfs.callback_nr_threads=
+                       [NFSv4] set the total number of threads that the
+                       NFS client will assign to service NFSv4 callback
+                       requests.
        nfs.callback_tcpport=
                        [NFS] set the TCP port on which the NFSv4 callback
                        channel should listen.
                        of returning the full 64-bit number.
                        The default is to return 64-bit inode numbers.
  
+       nfs.max_session_cb_slots=
+                       [NFSv4.1] Sets the maximum number of session
+                       slots the client will assign to the callback
+                       channel. This determines the maximum number of
+                       callbacks the client will process in parallel for
+                       a particular server.
        nfs.max_session_slots=
                        [NFSv4.1] Sets the maximum number of session slots
                        the client will attempt to negotiate with the server.
  
        nodelayacct     [KNL] Disable per-task delay accounting
  
 -      nodisconnect    [HW,SCSI,M68K] Disables SCSI disconnects.
 -
        nodsp           [SH] Disable hardware DSP at boot time.
  
        noefi           Disable EFI runtime services support.
                        Default: on
  
        nohz_full=      [KNL,BOOT]
 +                      The argument is a cpu list, as described above.
                        In kernels built with CONFIG_NO_HZ_FULL=y, set
                        the specified list of CPUs whose tick will be stopped
                        whenever possible. The boot CPU will be forced outside
  
        nr_cpus=        [SMP] Maximum number of processors that an SMP kernel
                        could support.  nr_cpus=n : n >= 1 limits the kernel to
 -                      supporting 'n' processors. Later in runtime you can not
 -                      use hotplug cpu feature to put more cpu back to online.
 -                      just like you compile the kernel NR_CPUS=n
 +                      support 'n' processors. It may be larger than the
 +                      number of CPUs plugged in at bootup; later at runtime
 +                      you can physically add extra CPUs until the count
 +                      reaches n. Therefore, during boot, some boot-time
 +                      memory for per-cpu variables needs to be pre-allocated
 +                      for later physical CPU hot-plugging.
  
        nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
  
                        See Documentation/blockdev/ramdisk.txt.
  
        rcu_nocbs=      [KNL]
 +                      The argument is a cpu list, as described above.
 +
                        In kernels built with CONFIG_RCU_NOCB_CPU=y, set
                        the specified list of CPUs to be no-callback CPUs.
                        Invocation of these CPUs' RCU callbacks will
                                u = IGNORE_UAS (don't bind to the uas driver);
                                w = NO_WP_DETECT (don't test whether the
                                        medium is write-protected).
 +                              y = ALWAYS_SYNC (issue a SYNCHRONIZE_CACHE
 +                                      even if the device claims no cache)
                        Example: quirks=0419:aaf5:rl,0421:0433:rc
  
        user_debug=     [KNL,ARM]
diff --combined fs/nfs/dir.c
@@@ -435,11 -435,11 +435,11 @@@ int nfs_same_file(struct dentry *dentry
                return 0;
  
        nfsi = NFS_I(inode);
-       if (entry->fattr->fileid == nfsi->fileid)
-               return 1;
-       if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
-               return 1;
-       return 0;
+       if (entry->fattr->fileid != nfsi->fileid)
+               return 0;
+       if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+               return 0;
+       return 1;
  }
  
  static
@@@ -496,6 -496,14 +496,14 @@@ void nfs_prime_dcache(struct dentry *pa
                return;
        if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
                return;
+       if (filename.len == 0)
+               return;
+       /* Validate that the name doesn't contain any illegal '\0' */
+       if (strnlen(filename.name, filename.len) != filename.len)
+               return;
+       /* ...or '/' */
+       if (strnchr(filename.name, filename.len, '/'))
+               return;
        if (filename.name[0] == '.') {
                if (filename.len == 1)
                        return;
@@@ -517,6 -525,8 +525,8 @@@ again
                                        &entry->fattr->fsid))
                        goto out;
                if (nfs_same_file(dentry, entry)) {
+                       if (!entry->fh->size)
+                               goto out;
                        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
                        status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
                        if (!status)
                        goto again;
                }
        }
+       if (!entry->fh->size) {
+               d_lookup_done(dentry);
+               goto out;
+       }
  
        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
        alias = d_splice_alias(inode, dentry);
@@@ -2013,8 -2027,7 +2027,8 @@@ EXPORT_SYMBOL_GPL(nfs_link)
   * the rename.
   */
  int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 -                    struct inode *new_dir, struct dentry *new_dentry)
 +             struct inode *new_dir, struct dentry *new_dentry,
 +             unsigned int flags)
  {
        struct inode *old_inode = d_inode(old_dentry);
        struct inode *new_inode = d_inode(new_dentry);
        struct rpc_task *task;
        int error = -EBUSY;
  
 +      if (flags)
 +              return -EINVAL;
 +
        dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
                 old_dentry, new_dentry,
                 d_count(new_dentry));
diff --combined fs/nfs/file.c
@@@ -182,6 -182,29 +182,6 @@@ nfs_file_read(struct kiocb *iocb, struc
  }
  EXPORT_SYMBOL_GPL(nfs_file_read);
  
 -ssize_t
 -nfs_file_splice_read(struct file *filp, loff_t *ppos,
 -                   struct pipe_inode_info *pipe, size_t count,
 -                   unsigned int flags)
 -{
 -      struct inode *inode = file_inode(filp);
 -      ssize_t res;
 -
 -      dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
 -              filp, (unsigned long) count, (unsigned long long) *ppos);
 -
 -      nfs_start_io_read(inode);
 -      res = nfs_revalidate_mapping(inode, filp->f_mapping);
 -      if (!res) {
 -              res = generic_file_splice_read(filp, ppos, pipe, count, flags);
 -              if (res > 0)
 -                      nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
 -      }
 -      nfs_end_io_read(inode);
 -      return res;
 -}
 -EXPORT_SYMBOL_GPL(nfs_file_splice_read);
 -
  int
  nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
@@@ -520,7 -543,9 +520,9 @@@ const struct address_space_operations n
        .invalidatepage = nfs_invalidate_page,
        .releasepage = nfs_release_page,
        .direct_IO = nfs_direct_IO,
+ #ifdef CONFIG_MIGRATION
        .migratepage = nfs_migrate_page,
+ #endif
        .launder_page = nfs_launder_page,
        .is_dirty_writeback = nfs_check_dirty_writeback,
        .error_remove_page = generic_error_remove_page,
@@@ -685,11 -710,6 +687,6 @@@ out_noconflict
        goto out;
  }
  
- static int do_vfs_lock(struct file *file, struct file_lock *fl)
- {
-       return locks_lock_file_wait(file, fl);
- }
  static int
  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
  {
        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
-               status = do_vfs_lock(filp, fl);
+               status = locks_lock_file_wait(filp, fl);
        return status;
  }
  
@@@ -747,7 -767,7 +744,7 @@@ do_setlk(struct file *filp, int cmd, st
        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
-               status = do_vfs_lock(filp, fl);
+               status = locks_lock_file_wait(filp, fl);
        if (status < 0)
                goto out;
  
@@@ -848,7 -868,7 +845,7 @@@ const struct file_operations nfs_file_o
        .fsync          = nfs_file_fsync,
        .lock           = nfs_lock,
        .flock          = nfs_flock,
 -      .splice_read    = nfs_file_splice_read,
 +      .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .check_flags    = nfs_check_flags,
        .setlease       = simple_nosetlease,
diff --combined fs/nfs/internal.h
@@@ -359,13 -359,14 +359,13 @@@ int nfs_unlink(struct inode *, struct d
  int nfs_symlink(struct inode *, struct dentry *, const char *);
  int nfs_link(struct dentry *, struct inode *, struct dentry *);
  int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 -int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
 +int nfs_rename(struct inode *, struct dentry *,
 +             struct inode *, struct dentry *, unsigned int);
  
  /* file.c */
  int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
  loff_t nfs_file_llseek(struct file *, loff_t, int);
  ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
 -ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
 -                           size_t, unsigned int);
  int nfs_file_mmap(struct file *, struct vm_area_struct *);
  ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
  int nfs_file_release(struct inode *, struct file *);
@@@ -534,12 -535,9 +534,9 @@@ void nfs_clear_pnfs_ds_commit_verifiers
  }
  #endif
  
  #ifdef CONFIG_MIGRATION
  extern int nfs_migrate_page(struct address_space *,
                struct page *, struct page *, enum migrate_mode);
- #else
- #define nfs_migrate_page NULL
  #endif
  
  static inline int
@@@ -562,7 -560,6 +559,6 @@@ void nfs_init_cinfo_from_dreq(struct nf
  extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
  
  /* nfs4proc.c */
- extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
  extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
                            const struct nfs_client_initdata *);
  extern int nfs40_walk_client_list(struct nfs_client *clp,
  extern int nfs41_walk_client_list(struct nfs_client *clp,
                                struct nfs_client **result,
                                struct rpc_cred *cred);
+ extern int nfs4_test_session_trunk(struct rpc_clnt *,
+                               struct rpc_xprt *,
+                               void *);
  
  static inline struct inode *nfs_igrab_and_active(struct inode *inode)
  {
@@@ -680,11 -680,11 +679,11 @@@ unsigned int nfs_page_length(struct pag
        loff_t i_size = i_size_read(page_file_mapping(page)->host);
  
        if (i_size > 0) {
 -              pgoff_t page_index = page_file_index(page);
 +              pgoff_t index = page_index(page);
                pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
 -              if (page_index < end_index)
 +              if (index < end_index)
                        return PAGE_SIZE;
 -              if (page_index == end_index)
 +              if (index == end_index)
                        return ((i_size - 1) & ~PAGE_MASK) + 1;
        }
        return 0;
diff --combined fs/nfs/nfs4proc.c
@@@ -99,8 -99,8 +99,8 @@@ static int nfs4_do_setattr(struct inod
  #ifdef CONFIG_NFS_V4_1
  static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
                struct rpc_cred *);
- static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
-               struct rpc_cred *);
+ static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+               struct rpc_cred *, bool);
  #endif
  
  #ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@@ -328,6 -328,33 +328,33 @@@ static void nfs4_setup_readdir(u64 cook
        kunmap_atomic(start);
  }
  
+ static void nfs4_test_and_free_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+ {
+       const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+       ops->test_and_free_expired(server, stateid, cred);
+ }
+ static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+ {
+       stateid->type = NFS4_REVOKED_STATEID_TYPE;
+       nfs4_test_and_free_stateid(server, stateid, cred);
+ }
+ static void nfs4_free_revoked_stateid(struct nfs_server *server,
+               const nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+ {
+       nfs4_stateid tmp;
+       nfs4_stateid_copy(&tmp, stateid);
+       __nfs4_free_revoked_stateid(server, &tmp, cred);
+ }
  static long nfs4_update_delay(long *timeout)
  {
        long ret;
@@@ -370,13 -397,23 +397,23 @@@ static int nfs4_do_handle_exception(str
        exception->delay = 0;
        exception->recovering = 0;
        exception->retry = 0;
+       if (stateid == NULL && state != NULL)
+               stateid = &state->stateid;
        switch(errorcode) {
                case 0:
                        return 0;
-               case -NFS4ERR_OPENMODE:
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
                case -NFS4ERR_BAD_STATEID:
+                       if (inode != NULL && stateid != NULL) {
+                               nfs_inode_find_state_and_recover(inode,
+                                               stateid);
+                               goto wait_on_recovery;
+                       }
+               case -NFS4ERR_OPENMODE:
                        if (inode) {
                                int err;
  
                        if (ret < 0)
                                break;
                        goto wait_on_recovery;
-               case -NFS4ERR_EXPIRED:
-                       if (state != NULL) {
-                               ret = nfs4_schedule_stateid_recovery(server, state);
-                               if (ret < 0)
-                                       break;
-                       }
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
                        nfs4_schedule_lease_recovery(clp);
@@@ -616,6 -647,7 +647,7 @@@ int nfs40_setup_sequence(struct nfs4_sl
        }
        spin_unlock(&tbl->slot_tbl_lock);
  
+       slot->privileged = args->sa_privileged ? 1 : 0;
        args->sa_slot = slot;
        res->sr_slot = slot;
  
@@@ -723,12 -755,20 +755,20 @@@ static int nfs41_sequence_process(struc
        /* Check the SEQUENCE operation status */
        switch (res->sr_status) {
        case 0:
+               /* If previous op on slot was interrupted and we reused
+                * the seq# and got a reply from the cache, then retry
+                */
+               if (task->tk_status == -EREMOTEIO && interrupted) {
+                       ++slot->seq_nr;
+                       goto retry_nowait;
+               }
                /* Update the slot's sequence and clientid lease timer */
                slot->seq_done = 1;
                clp = session->clp;
                do_renew_lease(clp, res->sr_timestamp);
                /* Check sequence flags */
-               nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+               nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+                               !!slot->privileged);
                nfs41_update_target_slotid(slot->table, slot, res);
                break;
        case 1:
@@@ -875,6 -915,7 +915,7 @@@ int nfs41_setup_sequence(struct nfs4_se
        }
        spin_unlock(&tbl->slot_tbl_lock);
  
+       slot->privileged = args->sa_privileged ? 1 : 0;
        args->sa_slot = slot;
  
        dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
@@@ -1353,6 -1394,19 +1394,19 @@@ static void update_open_stateflags(stru
        nfs4_state_set_mode_locked(state, state->state | fmode);
  }
  
+ #ifdef CONFIG_NFS_V4_1
+ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+ {
+       if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+               return true;
+       if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+               return true;
+       if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+               return true;
+       return false;
+ }
+ #endif /* CONFIG_NFS_V4_1 */
  static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
  {
        struct nfs_client *clp = state->owner->so_server->nfs_client;
  }
  
  static bool nfs_need_update_open_stateid(struct nfs4_state *state,
-               nfs4_stateid *stateid)
+               const nfs4_stateid *stateid, nfs4_stateid *freeme)
  {
        if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
                return true;
        if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+               nfs4_stateid_copy(freeme, &state->open_stateid);
                nfs_test_and_clear_all_open_stateid(state);
                return true;
        }
@@@ -1437,7 -1492,9 +1492,9 @@@ static void nfs_clear_open_stateid(stru
                nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
  }
  
- static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+               const nfs4_stateid *stateid, fmode_t fmode,
+               nfs4_stateid *freeme)
  {
        switch (fmode) {
                case FMODE_READ:
                case FMODE_READ|FMODE_WRITE:
                        set_bit(NFS_O_RDWR_STATE, &state->flags);
        }
-       if (!nfs_need_update_open_stateid(state, stateid))
+       if (!nfs_need_update_open_stateid(state, stateid, freeme))
                return;
        if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
                nfs4_stateid_copy(&state->stateid, stateid);
        nfs4_stateid_copy(&state->open_stateid, stateid);
  }
  
- static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+ static void __update_open_stateid(struct nfs4_state *state,
+               const nfs4_stateid *open_stateid,
+               const nfs4_stateid *deleg_stateid,
+               fmode_t fmode,
+               nfs4_stateid *freeme)
  {
        /*
         * Protect the call to nfs4_state_set_mode_locked and
                set_bit(NFS_DELEGATED_STATE, &state->flags);
        }
        if (open_stateid != NULL)
-               nfs_set_open_stateid_locked(state, open_stateid, fmode);
+               nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
        write_sequnlock(&state->seqlock);
        update_open_stateflags(state, fmode);
        spin_unlock(&state->owner->so_lock);
  }
  
- static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+ static int update_open_stateid(struct nfs4_state *state,
+               const nfs4_stateid *open_stateid,
+               const nfs4_stateid *delegation,
+               fmode_t fmode)
  {
+       struct nfs_server *server = NFS_SERVER(state->inode);
+       struct nfs_client *clp = server->nfs_client;
        struct nfs_inode *nfsi = NFS_I(state->inode);
        struct nfs_delegation *deleg_cur;
+       nfs4_stateid freeme = {0};
        int ret = 0;
  
        fmode &= (FMODE_READ|FMODE_WRITE);
                goto no_delegation_unlock;
  
        nfs_mark_delegation_referenced(deleg_cur);
-       __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+       __update_open_stateid(state, open_stateid, &deleg_cur->stateid,
+                       fmode, &freeme);
        ret = 1;
  no_delegation_unlock:
        spin_unlock(&deleg_cur->lock);
@@@ -1508,11 -1576,14 +1576,14 @@@ no_delegation
        rcu_read_unlock();
  
        if (!ret && open_stateid != NULL) {
-               __update_open_stateid(state, open_stateid, NULL, fmode);
+               __update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
                ret = 1;
        }
        if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
-               nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+               nfs4_schedule_state_manager(clp);
+       if (freeme.type != 0)
+               nfs4_test_and_free_stateid(server, &freeme,
+                               state->owner->so_cred);
  
        return ret;
  }
@@@ -1889,7 -1960,6 +1960,6 @@@ static int nfs4_handle_delegation_recal
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_STALE_STATEID:
                        set_bit(NFS_DELEGATED_STATE, &state->flags);
-               case -NFS4ERR_EXPIRED:
                        /* Don't recall a delegation if it was lost */
                        nfs4_schedule_lease_recovery(server->nfs_client);
                        return -EAGAIN;
                        return -EAGAIN;
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
                case -NFS4ERR_BAD_STATEID:
                case -NFS4ERR_OPENMODE:
                        nfs_inode_find_state_and_recover(state->inode,
@@@ -2382,9 -2453,10 +2453,10 @@@ static int nfs4_open_expired(struct nfs
        return ret;
  }
  
- static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+               const nfs4_stateid *stateid)
  {
-       nfs_remove_bad_delegation(state->inode);
+       nfs_remove_bad_delegation(state->inode, stateid);
        write_seqlock(&state->seqlock);
        nfs4_stateid_copy(&state->stateid, &state->open_stateid);
        write_sequnlock(&state->seqlock);
  static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
  {
        if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
-               nfs_finish_clear_delegation_stateid(state);
+               nfs_finish_clear_delegation_stateid(state, NULL);
  }
  
  static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
        return nfs4_open_expired(sp, state);
  }
  
+ static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+ {
+       return -NFS4ERR_BAD_STATEID;
+ }
  #if defined(CONFIG_NFS_V4_1)
+ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
+ {
+       int status;
+       switch (stateid->type) {
+       default:
+               break;
+       case NFS4_INVALID_STATEID_TYPE:
+       case NFS4_SPECIAL_STATEID_TYPE:
+               return -NFS4ERR_BAD_STATEID;
+       case NFS4_REVOKED_STATEID_TYPE:
+               goto out_free;
+       }
+       status = nfs41_test_stateid(server, stateid, cred);
+       switch (status) {
+       case -NFS4ERR_EXPIRED:
+       case -NFS4ERR_ADMIN_REVOKED:
+       case -NFS4ERR_DELEG_REVOKED:
+               break;
+       default:
+               return status;
+       }
+ out_free:
+       /* Ack the revoked state to the server */
+       nfs41_free_stateid(server, stateid, cred, true);
+       return -NFS4ERR_EXPIRED;
+ }
  static void nfs41_check_delegation_stateid(struct nfs4_state *state)
  {
        struct nfs_server *server = NFS_SERVER(state->inode);
        }
  
        nfs4_stateid_copy(&stateid, &delegation->stateid);
+       if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+               rcu_read_unlock();
+               nfs_finish_clear_delegation_stateid(state, &stateid);
+               return;
+       }
+       if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) {
+               rcu_read_unlock();
+               return;
+       }
        cred = get_rpccred(delegation->cred);
        rcu_read_unlock();
-       status = nfs41_test_stateid(server, &stateid, cred);
+       status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
        trace_nfs4_test_delegation_stateid(state, NULL, status);
-       if (status != NFS_OK) {
-               /* Free the stateid unless the server explicitly
-                * informs us the stateid is unrecognized. */
-               if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, &stateid, cred);
-               nfs_finish_clear_delegation_stateid(state);
-       }
+       if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+               nfs_finish_clear_delegation_stateid(state, &stateid);
  
        put_rpccred(cred);
  }
  
+ /**
+  * nfs41_check_expired_locks - possibly free a lock stateid
+  *
+  * @state: NFSv4 state for an inode
+  *
+  * Returns NFS_OK if recovery for this stateid is now finished.
+  * Otherwise a negative NFS4ERR value is returned.
+  */
+ static int nfs41_check_expired_locks(struct nfs4_state *state)
+ {
+       int status, ret = NFS_OK;
+       struct nfs4_lock_state *lsp;
+       struct nfs_server *server = NFS_SERVER(state->inode);
+       if (!test_bit(LK_STATE_IN_USE, &state->flags))
+               goto out;
+       list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+               if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+                       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+                       status = nfs41_test_and_free_expired_stateid(server,
+                                       &lsp->ls_stateid,
+                                       cred);
+                       trace_nfs4_test_lock_stateid(state, lsp, status);
+                       if (status == -NFS4ERR_EXPIRED ||
+                           status == -NFS4ERR_BAD_STATEID) {
+                               clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+                               lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+                               if (!recover_lost_locks)
+                                       set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+                       } else if (status != NFS_OK) {
+                               ret = status;
+                               break;
+                       }
+               }
+       };
+ out:
+       return ret;
+ }
  /**
   * nfs41_check_open_stateid - possibly free an open stateid
   *
@@@ -2453,26 -2608,28 +2608,28 @@@ static int nfs41_check_open_stateid(str
        struct rpc_cred *cred = state->owner->so_cred;
        int status;
  
-       /* If a state reset has been done, test_stateid is unneeded */
-       if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
-           (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
-           (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+       if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
+               if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  {
+                       if (nfs4_have_delegation(state->inode, state->state))
+                               return NFS_OK;
+                       return -NFS4ERR_OPENMODE;
+               }
                return -NFS4ERR_BAD_STATEID;
-       status = nfs41_test_stateid(server, stateid, cred);
+       }
+       status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
        trace_nfs4_test_open_stateid(state, NULL, status);
-       if (status != NFS_OK) {
-               /* Free the stateid unless the server explicitly
-                * informs us the stateid is unrecognized. */
-               if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, stateid, cred);
+       if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
                clear_bit(NFS_O_RDONLY_STATE, &state->flags);
                clear_bit(NFS_O_WRONLY_STATE, &state->flags);
                clear_bit(NFS_O_RDWR_STATE, &state->flags);
                clear_bit(NFS_OPEN_STATE, &state->flags);
+               stateid->type = NFS4_INVALID_STATEID_TYPE;
        }
-       return status;
+       if (status != NFS_OK)
+               return status;
+       if (nfs_open_stateid_recover_openmode(state))
+               return -NFS4ERR_OPENMODE;
+       return NFS_OK;
  }
  
  static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
        int status;
  
        nfs41_check_delegation_stateid(state);
+       status = nfs41_check_expired_locks(state);
+       if (status != NFS_OK)
+               return status;
        status = nfs41_check_open_stateid(state);
        if (status != NFS_OK)
                status = nfs4_open_expired(sp, state);
@@@ -2537,6 -2697,8 +2697,8 @@@ static int _nfs4_open_and_get_state(str
                goto out;
        if (server->caps & NFS_CAP_POSIX_LOCK)
                set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+       if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+               set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
  
        dentry = opendata->dentry;
        if (d_really_is_negative(dentry)) {
@@@ -2899,9 -3061,12 +3061,12 @@@ static void nfs4_close_done(struct rpc_
                        break;
                case -NFS4ERR_ADMIN_REVOKED:
                case -NFS4ERR_STALE_STATEID:
+               case -NFS4ERR_EXPIRED:
+                       nfs4_free_revoked_stateid(server,
+                                       &calldata->arg.stateid,
+                                       task->tk_msg.rpc_cred);
                case -NFS4ERR_OLD_STATEID:
                case -NFS4ERR_BAD_STATEID:
-               case -NFS4ERR_EXPIRED:
                        if (!nfs4_stateid_match(&calldata->arg.stateid,
                                                &state->open_stateid)) {
                                rpc_restart_call_prepare(task);
@@@ -4312,7 -4477,7 +4477,7 @@@ static int nfs4_proc_fsinfo(struct nfs_
        if (error == 0) {
                /* block layout checks this! */
                server->pnfs_blksize = fsinfo->blksize;
-               set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+               set_pnfs_layoutdriver(server, fhandle, fsinfo);
        }
  
        return error;
@@@ -4399,24 -4564,25 +4564,25 @@@ static bool nfs4_error_stateid_expired(
        return false;
  }
  
- void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
- {
-       nfs_invalidate_atime(hdr->inode);
- }
  static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
  {
        struct nfs_server *server = NFS_SERVER(hdr->inode);
  
        trace_nfs4_read(hdr, task->tk_status);
-       if (nfs4_async_handle_error(task, server,
-                                   hdr->args.context->state,
-                                   NULL) == -EAGAIN) {
-               rpc_restart_call_prepare(task);
-               return -EAGAIN;
+       if (task->tk_status < 0) {
+               struct nfs4_exception exception = {
+                       .inode = hdr->inode,
+                       .state = hdr->args.context->state,
+                       .stateid = &hdr->args.stateid,
+               };
+               task->tk_status = nfs4_async_handle_exception(task,
+                               server, task->tk_status, &exception);
+               if (exception.retry) {
+                       rpc_restart_call_prepare(task);
+                       return -EAGAIN;
+               }
        }
  
-       __nfs4_read_done_cb(hdr);
        if (task->tk_status > 0)
                renew_lease(server, hdr->timestamp);
        return 0;
@@@ -4445,6 -4611,8 +4611,8 @@@ static int nfs4_read_done(struct rpc_ta
                return -EAGAIN;
        if (nfs4_read_stateid_changed(task, &hdr->args))
                return -EAGAIN;
+       if (task->tk_status > 0)
+               nfs_invalidate_atime(hdr->inode);
        return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
                                    nfs4_read_done_cb(task, hdr);
  }
@@@ -4482,11 -4650,19 +4650,19 @@@ static int nfs4_write_done_cb(struct rp
        struct inode *inode = hdr->inode;
  
        trace_nfs4_write(hdr, task->tk_status);
-       if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-                                   hdr->args.context->state,
-                                   NULL) == -EAGAIN) {
-               rpc_restart_call_prepare(task);
-               return -EAGAIN;
+       if (task->tk_status < 0) {
+               struct nfs4_exception exception = {
+                       .inode = hdr->inode,
+                       .state = hdr->args.context->state,
+                       .stateid = &hdr->args.stateid,
+               };
+               task->tk_status = nfs4_async_handle_exception(task,
+                               NFS_SERVER(inode), task->tk_status,
+                               &exception);
+               if (exception.retry) {
+                       rpc_restart_call_prepare(task);
+                       return -EAGAIN;
+               }
        }
        if (task->tk_status >= 0) {
                renew_lease(NFS_SERVER(inode), hdr->timestamp);
@@@ -5123,12 -5299,14 +5299,14 @@@ static void nfs4_init_boot_verifier(con
        if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
                /* An impossible timestamp guarantees this value
                 * will never match a generated boot time. */
-               verf[0] = 0;
-               verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+               verf[0] = cpu_to_be32(U32_MAX);
+               verf[1] = cpu_to_be32(U32_MAX);
        } else {
                struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
-               verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
-               verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+               u64 ns = ktime_to_ns(nn->boot_time);
+               verf[0] = cpu_to_be32(ns >> 32);
+               verf[1] = cpu_to_be32(ns);
        }
        memcpy(bootverf->data, verf, sizeof(bootverf->data));
  }
@@@ -5393,10 -5571,13 +5571,13 @@@ static void nfs4_delegreturn_done(struc
                renew_lease(data->res.server, data->timestamp);
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_DELEG_REVOKED:
+       case -NFS4ERR_EXPIRED:
+               nfs4_free_revoked_stateid(data->res.server,
+                               data->args.stateid,
+                               task->tk_msg.rpc_cred);
        case -NFS4ERR_BAD_STATEID:
        case -NFS4ERR_OLD_STATEID:
        case -NFS4ERR_STALE_STATEID:
-       case -NFS4ERR_EXPIRED:
                task->tk_status = 0;
                if (data->roc)
                        pnfs_roc_set_barrier(data->inode, data->roc_barrier);
@@@ -5528,22 -5709,6 +5709,6 @@@ int nfs4_proc_delegreturn(struct inode 
        return err;
  }
  
- #define NFS4_LOCK_MINTIMEOUT (1 * HZ)
- #define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
- /* 
-  * sleep, with exponential backoff, and retry the LOCK operation. 
-  */
- static unsigned long
- nfs4_set_lock_task_retry(unsigned long timeout)
- {
-       freezable_schedule_timeout_killable_unsafe(timeout);
-       timeout <<= 1;
-       if (timeout > NFS4_LOCK_MAXTIMEOUT)
-               return NFS4_LOCK_MAXTIMEOUT;
-       return timeout;
- }
  static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
  {
        struct inode *inode = state->inode;
@@@ -5600,11 -5765,6 +5765,6 @@@ static int nfs4_proc_getlk(struct nfs4_
        return err;
  }
  
- static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
- {
-       return locks_lock_inode_wait(inode, fl);
- }
  struct nfs4_unlockdata {
        struct nfs_locku_args arg;
        struct nfs_locku_res res;
@@@ -5657,14 -5817,18 +5817,18 @@@ static void nfs4_locku_done(struct rpc_
        switch (task->tk_status) {
                case 0:
                        renew_lease(calldata->server, calldata->timestamp);
-                       do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
+                       locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
                        if (nfs4_update_lock_stateid(calldata->lsp,
                                        &calldata->res.stateid))
                                break;
+               case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_EXPIRED:
+                       nfs4_free_revoked_stateid(calldata->server,
+                                       &calldata->arg.stateid,
+                                       task->tk_msg.rpc_cred);
                case -NFS4ERR_BAD_STATEID:
                case -NFS4ERR_OLD_STATEID:
                case -NFS4ERR_STALE_STATEID:
-               case -NFS4ERR_EXPIRED:
                        if (!nfs4_stateid_match(&calldata->arg.stateid,
                                                &calldata->lsp->ls_stateid))
                                rpc_restart_call_prepare(task);
@@@ -5765,7 -5929,7 +5929,7 @@@ static int nfs4_proc_unlck(struct nfs4_
        mutex_lock(&sp->so_delegreturn_mutex);
        /* Exclude nfs4_reclaim_open_stateid() - note nesting! */
        down_read(&nfsi->rwsem);
-       if (do_vfs_lock(inode, request) == -ENOENT) {
+       if (locks_lock_inode_wait(inode, request) == -ENOENT) {
                up_read(&nfsi->rwsem);
                mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
@@@ -5906,7 -6070,7 +6070,7 @@@ static void nfs4_lock_done(struct rpc_t
                                data->timestamp);
                if (data->arg.new_lock) {
                        data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
-                       if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
+                       if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) {
                                rpc_restart_call_prepare(task);
                                break;
                        }
@@@ -5965,6 -6129,7 +6129,7 @@@ static void nfs4_handle_setlk_error(str
  {
        switch (error) {
        case -NFS4ERR_ADMIN_REVOKED:
+       case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
                if (new_lock_owner != 0 ||
                break;
        case -NFS4ERR_STALE_STATEID:
                lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
-       case -NFS4ERR_EXPIRED:
                nfs4_schedule_lease_recovery(server->nfs_client);
        };
  }
  }
  
  #if defined(CONFIG_NFS_V4_1)
- /**
-  * nfs41_check_expired_locks - possibly free a lock stateid
-  *
-  * @state: NFSv4 state for an inode
-  *
-  * Returns NFS_OK if recovery for this stateid is now finished.
-  * Otherwise a negative NFS4ERR value is returned.
-  */
- static int nfs41_check_expired_locks(struct nfs4_state *state)
- {
-       int status, ret = -NFS4ERR_BAD_STATEID;
-       struct nfs4_lock_state *lsp;
-       struct nfs_server *server = NFS_SERVER(state->inode);
-       list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-               if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-                       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
-                       status = nfs41_test_stateid(server,
-                                       &lsp->ls_stateid,
-                                       cred);
-                       trace_nfs4_test_lock_stateid(state, lsp, status);
-                       if (status != NFS_OK) {
-                               /* Free the stateid unless the server
-                                * informs us the stateid is unrecognized. */
-                               if (status != -NFS4ERR_BAD_STATEID)
-                                       nfs41_free_stateid(server,
-                                                       &lsp->ls_stateid,
-                                                       cred);
-                               clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
-                               ret = status;
-                       }
-               }
-       };
-       return ret;
- }
  static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
  {
-       int status = NFS_OK;
+       struct nfs4_lock_state *lsp;
+       int status;
  
-       if (test_bit(LK_STATE_IN_USE, &state->flags))
-               status = nfs41_check_expired_locks(state);
-       if (status != NFS_OK)
-               status = nfs4_lock_expired(state, request);
+       status = nfs4_set_lock_state(state, request);
+       if (status != 0)
+               return status;
+       lsp = request->fl_u.nfs4_fl.owner;
+       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+           test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+               return 0;
+       status = nfs4_lock_expired(state, request);
        return status;
  }
  #endif
@@@ -6138,17 -6269,10 +6269,10 @@@ static int _nfs4_proc_setlk(struct nfs4
        struct nfs_inode *nfsi = NFS_I(state->inode);
        struct nfs4_state_owner *sp = state->owner;
        unsigned char fl_flags = request->fl_flags;
-       int status = -ENOLCK;
+       int status;
  
-       if ((fl_flags & FL_POSIX) &&
-                       !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
-               goto out;
-       /* Is this a delegated open? */
-       status = nfs4_set_lock_state(state, request);
-       if (status != 0)
-               goto out;
        request->fl_flags |= FL_ACCESS;
-       status = do_vfs_lock(state->inode, request);
+       status = locks_lock_inode_wait(state->inode, request);
        if (status < 0)
                goto out;
        mutex_lock(&sp->so_delegreturn_mutex);
                /* Yes: cache locks! */
                /* ...but avoid races with delegation recall... */
                request->fl_flags = fl_flags & ~FL_SLEEP;
-               status = do_vfs_lock(state->inode, request);
+               status = locks_lock_inode_wait(state->inode, request);
                up_read(&nfsi->rwsem);
                mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
@@@ -6188,12 -6312,124 +6312,124 @@@ static int nfs4_proc_setlk(struct nfs4_
        return err;
  }
  
+ #define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+ #define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+ static int
+ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+                       struct file_lock *request)
+ {
+       int             status = -ERESTARTSYS;
+       unsigned long   timeout = NFS4_LOCK_MINTIMEOUT;
+       while(!signalled()) {
+               status = nfs4_proc_setlk(state, cmd, request);
+               if ((status != -EAGAIN) || IS_SETLK(cmd))
+                       break;
+               freezable_schedule_timeout_interruptible(timeout);
+               timeout *= 2;
+               timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+               status = -ERESTARTSYS;
+       }
+       return status;
+ }
+ #ifdef CONFIG_NFS_V4_1
+ struct nfs4_lock_waiter {
+       struct task_struct      *task;
+       struct inode            *inode;
+       struct nfs_lowner       *owner;
+       bool                    notified;
+ };
+ static int
+ nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+ {
+       int ret;
+       struct cb_notify_lock_args *cbnl = key;
+       struct nfs4_lock_waiter *waiter = wait->private;
+       struct nfs_lowner       *lowner = &cbnl->cbnl_owner,
+                               *wowner = waiter->owner;
+       /* Only wake if the callback was for the same owner */
+       if (lowner->clientid != wowner->clientid ||
+           lowner->id != wowner->id             ||
+           lowner->s_dev != wowner->s_dev)
+               return 0;
+       /* Make sure it's for the right inode */
+       if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+               return 0;
+       waiter->notified = true;
+       /* override "private" so we can use default_wake_function */
+       wait->private = waiter->task;
+       ret = autoremove_wake_function(wait, mode, flags, key);
+       wait->private = waiter;
+       return ret;
+ }
+ static int
+ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+       int status = -ERESTARTSYS;
+       unsigned long flags;
+       struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+       struct nfs_server *server = NFS_SERVER(state->inode);
+       struct nfs_client *clp = server->nfs_client;
+       wait_queue_head_t *q = &clp->cl_lock_waitq;
+       struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+                                   .id = lsp->ls_seqid.owner_id,
+                                   .s_dev = server->s_dev };
+       struct nfs4_lock_waiter waiter = { .task  = current,
+                                          .inode = state->inode,
+                                          .owner = &owner,
+                                          .notified = false };
+       wait_queue_t wait;
+       /* Don't bother with waitqueue if we don't expect a callback */
+       if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+               return nfs4_retry_setlk_simple(state, cmd, request);
+       init_wait(&wait);
+       wait.private = &waiter;
+       wait.func = nfs4_wake_lock_waiter;
+       add_wait_queue(q, &wait);
+       while(!signalled()) {
+               status = nfs4_proc_setlk(state, cmd, request);
+               if ((status != -EAGAIN) || IS_SETLK(cmd))
+                       break;
+               status = -ERESTARTSYS;
+               spin_lock_irqsave(&q->lock, flags);
+               if (waiter.notified) {
+                       spin_unlock_irqrestore(&q->lock, flags);
+                       continue;
+               }
+               set_current_state(TASK_INTERRUPTIBLE);
+               spin_unlock_irqrestore(&q->lock, flags);
+               freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
+       }
+       finish_wait(q, &wait);
+       return status;
+ }
+ #else /* !CONFIG_NFS_V4_1 */
+ static inline int
+ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+       return nfs4_retry_setlk_simple(state, cmd, request);
+ }
+ #endif
  static int
  nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
  {
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
-       unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
        int status;
  
        /* verify open state */
  
        if (state == NULL)
                return -ENOLCK;
+       if ((request->fl_flags & FL_POSIX) &&
+           !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+               return -ENOLCK;
        /*
         * Don't rely on the VFS having checked the file open mode,
         * since it won't do this for flock() locks.
                        return -EBADF;
        }
  
-       do {
-               status = nfs4_proc_setlk(state, cmd, request);
-               if ((status != -EAGAIN) || IS_SETLK(cmd))
-                       break;
-               timeout = nfs4_set_lock_task_retry(timeout);
-               status = -ERESTARTSYS;
-               if (signalled())
-                       break;
-       } while(status < 0);
-       return status;
+       status = nfs4_set_lock_state(state, request);
+       if (status != 0)
+               return status;
+       return nfs4_retry_setlk(state, cmd, request);
  }
  
  int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
@@@ -7104,75 -7340,161 +7340,161 @@@ static int nfs4_sp4_select_mode(struct 
        return 0;
  }
  
+ struct nfs41_exchange_id_data {
+       struct nfs41_exchange_id_res res;
+       struct nfs41_exchange_id_args args;
+       struct rpc_xprt *xprt;
+       int rpc_status;
+ };
+ static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
+ {
+       struct nfs41_exchange_id_data *cdata =
+                                       (struct nfs41_exchange_id_data *)data;
+       struct nfs_client *clp = cdata->args.client;
+       int status = task->tk_status;
+       trace_nfs4_exchange_id(clp, status);
+       if (status == 0)
+               status = nfs4_check_cl_exchange_flags(cdata->res.flags);
+       if (cdata->xprt && status == 0) {
+               status = nfs4_detect_session_trunking(clp, &cdata->res,
+                                                     cdata->xprt);
+               goto out;
+       }
+       if (status  == 0)
+               status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
+       if (status == 0) {
+               clp->cl_clientid = cdata->res.clientid;
+               clp->cl_exchange_flags = cdata->res.flags;
+               /* Client ID is not confirmed */
+               if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+                       clear_bit(NFS4_SESSION_ESTABLISHED,
+                       &clp->cl_session->session_state);
+                       clp->cl_seqid = cdata->res.seqid;
+               }
+               kfree(clp->cl_serverowner);
+               clp->cl_serverowner = cdata->res.server_owner;
+               cdata->res.server_owner = NULL;
+               /* use the most recent implementation id */
+               kfree(clp->cl_implid);
+               clp->cl_implid = cdata->res.impl_id;
+               cdata->res.impl_id = NULL;
+               if (clp->cl_serverscope != NULL &&
+                   !nfs41_same_server_scope(clp->cl_serverscope,
+                                       cdata->res.server_scope)) {
+                       dprintk("%s: server_scope mismatch detected\n",
+                               __func__);
+                       set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+                       kfree(clp->cl_serverscope);
+                       clp->cl_serverscope = NULL;
+               }
+               if (clp->cl_serverscope == NULL) {
+                       clp->cl_serverscope = cdata->res.server_scope;
+                       cdata->res.server_scope = NULL;
+               }
+               /* Save the EXCHANGE_ID verifier for session trunk tests */
+               memcpy(clp->cl_confirm.data, cdata->args.verifier->data,
+                      sizeof(clp->cl_confirm.data));
+       }
+ out:
+       cdata->rpc_status = status;
+       return;
+ }
+ static void nfs4_exchange_id_release(void *data)
+ {
+       struct nfs41_exchange_id_data *cdata =
+                                       (struct nfs41_exchange_id_data *)data;
+       nfs_put_client(cdata->args.client);
+       if (cdata->xprt) {
+               xprt_put(cdata->xprt);
+               rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient);
+       }
+       kfree(cdata->res.impl_id);
+       kfree(cdata->res.server_scope);
+       kfree(cdata->res.server_owner);
+       kfree(cdata);
+ }
+ static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+       .rpc_call_done = nfs4_exchange_id_done,
+       .rpc_release = nfs4_exchange_id_release,
+ };
  /*
   * _nfs4_proc_exchange_id()
   *
   * Wrapper for EXCHANGE_ID operation.
   */
  static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
-       u32 sp4_how)
+                       u32 sp4_how, struct rpc_xprt *xprt)
  {
        nfs4_verifier verifier;
-       struct nfs41_exchange_id_args args = {
-               .verifier = &verifier,
-               .client = clp,
- #ifdef CONFIG_NFS_V4_1_MIGRATION
-               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-                        EXCHGID4_FLAG_BIND_PRINC_STATEID |
-                        EXCHGID4_FLAG_SUPP_MOVED_MIGR,
- #else
-               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-                        EXCHGID4_FLAG_BIND_PRINC_STATEID,
- #endif
-       };
-       struct nfs41_exchange_id_res res = {
-               0
-       };
-       int status;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
-               .rpc_argp = &args,
-               .rpc_resp = &res,
                .rpc_cred = cred,
        };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clp->cl_rpcclient,
+               .callback_ops = &nfs4_exchange_id_call_ops,
+               .rpc_message = &msg,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+       };
+       struct nfs41_exchange_id_data *calldata;
+       struct rpc_task *task;
+       int status = -EIO;
+       if (!atomic_inc_not_zero(&clp->cl_count))
+               goto out;
+       status = -ENOMEM;
+       calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+       if (!calldata)
+               goto out;
  
-       nfs4_init_boot_verifier(clp, &verifier);
+       if (!xprt)
+               nfs4_init_boot_verifier(clp, &verifier);
  
        status = nfs4_init_uniform_client_string(clp);
        if (status)
-               goto out;
+               goto out_calldata;
  
        dprintk("NFS call  exchange_id auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
                clp->cl_owner_id);
  
-       res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
-                                       GFP_NOFS);
-       if (unlikely(res.server_owner == NULL)) {
-               status = -ENOMEM;
-               goto out;
-       }
+       calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+                                               GFP_NOFS);
+       status = -ENOMEM;
+       if (unlikely(calldata->res.server_owner == NULL))
+               goto out_calldata;
  
-       res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+       calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
                                        GFP_NOFS);
-       if (unlikely(res.server_scope == NULL)) {
-               status = -ENOMEM;
+       if (unlikely(calldata->res.server_scope == NULL))
                goto out_server_owner;
-       }
  
-       res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
-       if (unlikely(res.impl_id == NULL)) {
-               status = -ENOMEM;
+       calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+       if (unlikely(calldata->res.impl_id == NULL))
                goto out_server_scope;
-       }
  
        switch (sp4_how) {
        case SP4_NONE:
-               args.state_protect.how = SP4_NONE;
+               calldata->args.state_protect.how = SP4_NONE;
                break;
  
        case SP4_MACH_CRED:
-               args.state_protect = nfs4_sp4_mach_cred_request;
+               calldata->args.state_protect = nfs4_sp4_mach_cred_request;
                break;
  
        default:
                status = -EINVAL;
                goto out_impl_id;
        }
+       if (xprt) {
+               calldata->xprt = xprt;
+               task_setup_data.rpc_xprt = xprt;
+               task_setup_data.flags =
+                               RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC;
+               calldata->args.verifier = &clp->cl_confirm;
+       } else {
+               calldata->args.verifier = &verifier;
+       }
+       calldata->args.client = clp;
+ #ifdef CONFIG_NFS_V4_1_MIGRATION
+       calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+       EXCHGID4_FLAG_BIND_PRINC_STATEID |
+       EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+ #else
+       calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+       EXCHGID4_FLAG_BIND_PRINC_STATEID,
+ #endif
+       msg.rpc_argp = &calldata->args;
+       msg.rpc_resp = &calldata->res;
+       task_setup_data.callback_data = calldata;
  
-       status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-       trace_nfs4_exchange_id(clp, status);
-       if (status == 0)
-               status = nfs4_check_cl_exchange_flags(res.flags);
-       if (status == 0)
-               status = nfs4_sp4_select_mode(clp, &res.state_protect);
-       if (status == 0) {
-               clp->cl_clientid = res.clientid;
-               clp->cl_exchange_flags = res.flags;
-               /* Client ID is not confirmed */
-               if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
-                       clear_bit(NFS4_SESSION_ESTABLISHED,
-                                       &clp->cl_session->session_state);
-                       clp->cl_seqid = res.seqid;
-               }
-               kfree(clp->cl_serverowner);
-               clp->cl_serverowner = res.server_owner;
-               res.server_owner = NULL;
-               /* use the most recent implementation id */
-               kfree(clp->cl_implid);
-               clp->cl_implid = res.impl_id;
-               res.impl_id = NULL;
-               if (clp->cl_serverscope != NULL &&
-                   !nfs41_same_server_scope(clp->cl_serverscope,
-                                            res.server_scope)) {
-                       dprintk("%s: server_scope mismatch detected\n",
-                               __func__);
-                       set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-                       kfree(clp->cl_serverscope);
-                       clp->cl_serverscope = NULL;
-               }
-               if (clp->cl_serverscope == NULL) {
-                       clp->cl_serverscope = res.server_scope;
-                       res.server_scope = NULL;
-               }
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task)) {
+       status = PTR_ERR(task);
+               goto out_impl_id;
        }
  
- out_impl_id:
-       kfree(res.impl_id);
- out_server_scope:
-       kfree(res.server_scope);
- out_server_owner:
-       kfree(res.server_owner);
+       if (!xprt) {
+               status = rpc_wait_for_completion_task(task);
+               if (!status)
+                       status = calldata->rpc_status;
+       } else  /* session trunking test */
+               status = calldata->rpc_status;
+       rpc_put_task(task);
  out:
        if (clp->cl_implid != NULL)
                dprintk("NFS reply exchange_id: Server Implementation ID: "
                        clp->cl_implid->date.nseconds);
        dprintk("NFS reply exchange_id: %d\n", status);
        return status;
+ out_impl_id:
+       kfree(calldata->res.impl_id);
+ out_server_scope:
+       kfree(calldata->res.server_scope);
+ out_server_owner:
+       kfree(calldata->res.server_owner);
+ out_calldata:
+       kfree(calldata);
+       goto out;
  }
  
  /*
@@@ -7262,14 -7580,45 +7580,45 @@@ int nfs4_proc_exchange_id(struct nfs_cl
        /* try SP4_MACH_CRED if krb5i/p */
        if (authflavor == RPC_AUTH_GSS_KRB5I ||
            authflavor == RPC_AUTH_GSS_KRB5P) {
-               status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+               status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
                if (!status)
                        return 0;
        }
  
        /* try SP4_NONE */
-       return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+       return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+ }
+ /**
+  * nfs4_test_session_trunk
+  *
+  * This is an add_xprt_test() test function called from
+  * rpc_clnt_setup_test_and_add_xprt.
+  *
+  * The rpc_xprt_switch is referenced by rpc_clnt_setup_test_and_add_xprt
+  * and is dereferenced in nfs4_exchange_id_release
+  *
+  * Upon success, add the new transport to the rpc_clnt
+  *
+  * @clnt: struct rpc_clnt to get new transport
+  * @xprt: the rpc_xprt to test
+  * @data: call data for _nfs4_proc_exchange_id.
+  */
+ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+                           void *data)
+ {
+       struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+       u32 sp4_how;
+       dprintk("--> %s try %s\n", __func__,
+               xprt->address_strings[RPC_DISPLAY_ADDR]);
+       sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+       /* Test connection for session trunking. Async exchange_id call */
+       return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
  }
+ EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
  
  static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
                struct rpc_cred *cred)
@@@ -7463,7 -7812,7 +7812,7 @@@ static void nfs4_init_channel_attrs(str
        args->bc_attrs.max_resp_sz = max_bc_payload;
        args->bc_attrs.max_resp_sz_cached = 0;
        args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
-       args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
+       args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1);
  
        dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
                "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@@ -7510,10 -7859,9 +7859,9 @@@ static int nfs4_verify_back_channel_att
                return -EINVAL;
        if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
                return -EINVAL;
-       /* These would render the backchannel useless: */
-       if (rcvd->max_ops != sent->max_ops)
+       if (rcvd->max_ops > sent->max_ops)
                return -EINVAL;
-       if (rcvd->max_reqs != sent->max_reqs)
+       if (rcvd->max_reqs > sent->max_reqs)
                return -EINVAL;
  out:
        return 0;
@@@ -7982,6 -8330,8 +8330,8 @@@ nfs4_layoutget_handle_exception(struct 
        case -NFS4ERR_RECALLCONFLICT:
                status = -ERECALLCONFLICT;
                break;
+       case -NFS4ERR_DELEG_REVOKED:
+       case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
                exception->timeout = 0;
                                        &lgp->args.ctx->state->stateid)) {
                        spin_unlock(&inode->i_lock);
                        exception->state = lgp->args.ctx->state;
+                       exception->stateid = &lgp->args.stateid;
                        break;
                }
  
@@@ -8591,6 -8942,24 +8942,24 @@@ static int _nfs41_test_stateid(struct n
        return -res.status;
  }
  
+ static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+               int err, struct nfs4_exception *exception)
+ {
+       exception->retry = 0;
+       switch(err) {
+       case -NFS4ERR_DELAY:
+       case -NFS4ERR_RETRY_UNCACHED_REP:
+               nfs4_handle_exception(server, err, exception);
+               break;
+       case -NFS4ERR_BADSESSION:
+       case -NFS4ERR_BADSLOT:
+       case -NFS4ERR_BAD_HIGH_SLOT:
+       case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+       case -NFS4ERR_DEADSESSION:
+               nfs4_do_handle_exception(server, err, exception);
+       }
+ }
  /**
   * nfs41_test_stateid - perform a TEST_STATEID operation
   *
@@@ -8610,9 -8979,7 +8979,7 @@@ static int nfs41_test_stateid(struct nf
        int err;
        do {
                err = _nfs41_test_stateid(server, stateid, cred);
-               if (err != -NFS4ERR_DELAY)
-                       break;
-               nfs4_handle_exception(server, err, &exception);
+               nfs4_handle_delay_or_session_error(server, err, &exception);
        } while (exception.retry);
        return err;
  }
@@@ -8657,7 -9024,7 +9024,7 @@@ static const struct rpc_call_ops nfs41_
  };
  
  static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
-               nfs4_stateid *stateid,
+               const nfs4_stateid *stateid,
                struct rpc_cred *cred,
                bool privileged)
  {
  
        msg.rpc_argp = &data->args;
        msg.rpc_resp = &data->res;
-       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
        if (privileged)
                nfs4_set_sequence_privileged(&data->args.seq_args);
  
   * @server: server / transport on which to perform the operation
   * @stateid: state ID to release
   * @cred: credential
+  * @is_recovery: set to true if this call needs to be privileged
   *
-  * Returns NFS_OK if the server freed "stateid".  Otherwise a
-  * negative NFS4ERR value is returned.
+  * Note: this function is always asynchronous.
   */
  static int nfs41_free_stateid(struct nfs_server *server,
-               nfs4_stateid *stateid,
-               struct rpc_cred *cred)
+               const nfs4_stateid *stateid,
+               struct rpc_cred *cred,
+               bool is_recovery)
  {
        struct rpc_task *task;
-       int ret;
  
-       task = _nfs41_free_stateid(server, stateid, cred, true);
+       task = _nfs41_free_stateid(server, stateid, cred, is_recovery);
        if (IS_ERR(task))
                return PTR_ERR(task);
-       ret = rpc_wait_for_completion_task(task);
-       if (!ret)
-               ret = task->tk_status;
        rpc_put_task(task);
-       return ret;
+       return 0;
  }
  
  static void
  nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
  {
-       struct rpc_task *task;
        struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
  
-       task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+       nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
        nfs4_free_lock_state(server, lsp);
-       if (IS_ERR(task))
-               return;
-       rpc_put_task(task);
  }
  
  static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@@ -8835,6 -9195,7 +9195,7 @@@ static const struct nfs4_minor_version_
        .match_stateid = nfs4_match_stateid,
        .find_root_sec = nfs4_find_root_sec,
        .free_lock_state = nfs4_release_lockowner,
+       .test_and_free_expired = nfs40_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_seqid,
        .call_sync_ops = &nfs40_call_sync_ops,
        .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
@@@ -8862,7 -9223,9 +9223,9 @@@ static const struct nfs4_minor_version_
        .match_stateid = nfs41_match_stateid,
        .find_root_sec = nfs41_find_root_sec,
        .free_lock_state = nfs41_free_lock_state,
+       .test_and_free_expired = nfs41_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_no_seqid,
+       .session_trunk = nfs4_test_session_trunk,
        .call_sync_ops = &nfs41_call_sync_ops,
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@@ -8891,7 -9254,9 +9254,9 @@@ static const struct nfs4_minor_version_
        .find_root_sec = nfs41_find_root_sec,
        .free_lock_state = nfs41_free_lock_state,
        .call_sync_ops = &nfs41_call_sync_ops,
+       .test_and_free_expired = nfs41_test_and_free_expired_stateid,
        .alloc_seqid = nfs_alloc_no_seqid,
+       .session_trunk = nfs4_test_session_trunk,
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
        .state_renewal_ops = &nfs41_state_renewal_ops,
@@@ -8941,14 -9306,20 +9306,14 @@@ static const struct inode_operations nf
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
 -      .getxattr       = generic_getxattr,
 -      .setxattr       = generic_setxattr,
        .listxattr      = nfs4_listxattr,
 -      .removexattr    = generic_removexattr,
  };
  
  static const struct inode_operations nfs4_file_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
 -      .getxattr       = generic_getxattr,
 -      .setxattr       = generic_setxattr,
        .listxattr      = nfs4_listxattr,
 -      .removexattr    = generic_removexattr,
  };
  
  const struct nfs_rpc_ops nfs_v4_clientops = {
@@@ -78,6 -78,14 +78,14 @@@ static struct rpc_cred *generic_bind_cr
        return auth->au_ops->lookup_cred(auth, acred, lookupflags);
  }
  
+ static int
+ generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+ {
+       return hash_64(from_kgid(&init_user_ns, acred->gid) |
+               ((u64)from_kuid(&init_user_ns, acred->uid) <<
+                       (sizeof(gid_t) * 8)), hashbits);
+ }
  /*
   * Lookup generic creds for current process
   */
@@@ -176,8 -184,8 +184,8 @@@ generic_match(struct auth_cred *acred, 
        if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
                goto out_nomatch;
        for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
 -              if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
 -                              GROUP_AT(acred->group_info, i)))
 +              if (!gid_eq(gcred->acred.group_info->gid[i],
 +                              acred->group_info->gid[i]))
                        goto out_nomatch;
        }
  out_match:
@@@ -258,6 -266,7 +266,7 @@@ generic_key_timeout(struct rpc_auth *au
  static const struct rpc_authops generic_auth_ops = {
        .owner = THIS_MODULE,
        .au_name = "Generic",
+       .hash_cred = generic_hash_cred,
        .lookup_cred = generic_lookup_cred,
        .crcreate = generic_create_cred,
        .key_timeout = generic_key_timeout,
diff --combined net/sunrpc/auth_unix.c
@@@ -46,6 -46,14 +46,14 @@@ unx_destroy(struct rpc_auth *auth
        rpcauth_clear_credcache(auth->au_credcache);
  }
  
+ static int
+ unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+ {
+       return hash_64(from_kgid(&init_user_ns, acred->gid) |
+               ((u64)from_kuid(&init_user_ns, acred->uid) <<
+                       (sizeof(gid_t) * 8)), hashbits);
+ }
  /*
   * Lookup AUTH_UNIX creds for current process
   */
@@@ -79,7 -87,7 +87,7 @@@ unx_create_cred(struct rpc_auth *auth, 
  
        cred->uc_gid = acred->gid;
        for (i = 0; i < groups; i++)
 -              cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
 +              cred->uc_gids[i] = acred->group_info->gid[i];
        if (i < NFS_NGROUPS)
                cred->uc_gids[i] = INVALID_GID;
  
@@@ -127,7 -135,7 +135,7 @@@ unx_match(struct auth_cred *acred, stru
        if (groups > NFS_NGROUPS)
                groups = NFS_NGROUPS;
        for (i = 0; i < groups ; i++)
 -              if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
 +              if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
                        return 0;
        if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
                return 0;
@@@ -220,6 -228,7 +228,7 @@@ const struct rpc_authops authunix_ops 
        .au_name        = "UNIX",
        .create         = unx_create,
        .destroy        = unx_destroy,
+       .hash_cred      = unx_hash_cred,
        .lookup_cred    = unx_lookup_cred,
        .crcreate       = unx_create_cred,
  };
@@@ -129,7 -129,7 +129,7 @@@ static int svc_rdma_bc_sendto(struct sv
                ret = -EIO;
                goto out_unmap;
        }
 -      atomic_inc(&rdma->sc_dma_used);
 +      svc_rdma_count_mappings(rdma, ctxt);
  
        memset(&send_wr, 0, sizeof(send_wr));
        ctxt->cqe.done = svc_rdma_wc_send;
@@@ -159,33 -159,34 +159,34 @@@ out_unmap
  /* Server-side transport endpoint wants a whole page for its send
   * buffer. The client RPC code constructs the RPC header in this
   * buffer before it invokes ->send_request.
-  *
-  * Returns NULL if there was a temporary allocation failure.
   */
- static void *
- xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+ static int
+ xprt_rdma_bc_allocate(struct rpc_task *task)
  {
        struct rpc_rqst *rqst = task->tk_rqstp;
        struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+       size_t size = rqst->rq_callsize;
        struct svcxprt_rdma *rdma;
        struct page *page;
  
        rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
  
-       /* Prevent an infinite loop: try to make this case work */
-       if (size > PAGE_SIZE)
+       if (size > PAGE_SIZE) {
                WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
                          size);
+               return -EINVAL;
+       }
  
        page = alloc_page(RPCRDMA_DEF_GFP);
        if (!page)
-               return NULL;
+               return -ENOMEM;
  
-       return page_address(page);
+       rqst->rq_buffer = page_address(page);
+       return 0;
  }
  
  static void
- xprt_rdma_bc_free(void *buffer)
+ xprt_rdma_bc_free(struct rpc_task *task)
  {
        /* No-op: ctxt and page have already been freed. */
  }
@@@ -129,15 -129,6 +129,6 @@@ rpcrdma_wc_send(struct ib_cq *cq, struc
                       wc->status, wc->vendor_err);
  }
  
- static void
- rpcrdma_receive_worker(struct work_struct *work)
- {
-       struct rpcrdma_rep *rep =
-                       container_of(work, struct rpcrdma_rep, rr_work);
-       rpcrdma_reply_handler(rep);
- }
  /* Perform basic sanity checking to avoid using garbage
   * to update the credit grant value.
   */
@@@ -161,13 -152,13 +152,13 @@@ rpcrdma_update_granted_credits(struct r
  }
  
  /**
-  * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+  * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
   * @cq:       completion queue (ignored)
   * @wc:       completed WR
   *
   */
  static void
- rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
  {
        struct ib_cqe *cqe = wc->wr_cqe;
        struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
                __func__, rep, wc->byte_len);
  
        rep->rr_len = wc->byte_len;
+       rep->rr_wc_flags = wc->wc_flags;
+       rep->rr_inv_rkey = wc->ex.invalidate_rkey;
        ib_dma_sync_single_for_cpu(rep->rr_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rep->rr_len, DMA_FROM_DEVICE);
@@@ -204,6 -198,36 +198,36 @@@ out_fail
        goto out_schedule;
  }
  
+ static void
+ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+                              struct rdma_conn_param *param)
+ {
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       const struct rpcrdma_connect_private *pmsg = param->private_data;
+       unsigned int rsize, wsize;
+       /* Default settings for RPC-over-RDMA Version One */
+       r_xprt->rx_ia.ri_reminv_expected = false;
+       rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+       wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+       if (pmsg &&
+           pmsg->cp_magic == rpcrdma_cmp_magic &&
+           pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+               r_xprt->rx_ia.ri_reminv_expected = true;
+               rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+               wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+       }
+       if (rsize < cdata->inline_rsize)
+               cdata->inline_rsize = rsize;
+       if (wsize < cdata->inline_wsize)
+               cdata->inline_wsize = wsize;
+       pr_info("rpcrdma: max send %u, max recv %u\n",
+               cdata->inline_wsize, cdata->inline_rsize);
+       rpcrdma_set_max_header_sizes(r_xprt);
+ }
  static int
  rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
  {
                        " (%d initiator)\n",
                        __func__, attr->max_dest_rd_atomic,
                        attr->max_rd_atomic);
+               rpcrdma_update_connect_private(xprt, &event->param.conn);
                goto connected;
        case RDMA_CM_EVENT_CONNECT_ERROR:
                connstate = -ENOTCONN;
@@@ -387,7 -412,7 +412,7 @@@ rpcrdma_ia_open(struct rpcrdma_xprt *xp
        }
        ia->ri_device = ia->ri_id->device;
  
 -      ia->ri_pd = ib_alloc_pd(ia->ri_device);
 +      ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
                pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
@@@ -454,11 -479,12 +479,12 @@@ in
  rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
  {
+       struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
        struct ib_cq *sendcq, *recvcq;
        unsigned int max_qp_wr;
        int rc;
  
-       if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+       if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
                dprintk("RPC:       %s: insufficient sge's available\n",
                        __func__);
                return -ENOMEM;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
        ep->rep_attr.cap.max_recv_wr += 1;      /* drain cqe */
-       ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+       ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        /* Initialize cma parameters */
        memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
  
-       /* RPC/RDMA does not use private data */
-       ep->rep_remote_cma.private_data = NULL;
-       ep->rep_remote_cma.private_data_len = 0;
+       /* Prepare RDMA-CM private message */
+       pmsg->cp_magic = rpcrdma_cmp_magic;
+       pmsg->cp_version = RPCRDMA_CMP_VERSION;
+       pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+       pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+       pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+       ep->rep_remote_cma.private_data = pmsg;
+       ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
  
        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
@@@ -849,6 -880,10 +880,10 @@@ rpcrdma_create_req(struct rpcrdma_xprt 
        req->rl_cqe.done = rpcrdma_wc_send;
        req->rl_buffer = &r_xprt->rx_buf;
        INIT_LIST_HEAD(&req->rl_registered);
+       req->rl_send_wr.next = NULL;
+       req->rl_send_wr.wr_cqe = &req->rl_cqe;
+       req->rl_send_wr.sg_list = req->rl_send_sge;
+       req->rl_send_wr.opcode = IB_WR_SEND;
        return req;
  }
  
@@@ -865,17 -900,21 +900,21 @@@ rpcrdma_create_rep(struct rpcrdma_xprt 
        if (rep == NULL)
                goto out;
  
-       rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
-                                              GFP_KERNEL);
+       rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+                                              DMA_FROM_DEVICE, GFP_KERNEL);
        if (IS_ERR(rep->rr_rdmabuf)) {
                rc = PTR_ERR(rep->rr_rdmabuf);
                goto out_free;
        }
  
        rep->rr_device = ia->ri_device;
-       rep->rr_cqe.done = rpcrdma_receive_wc;
+       rep->rr_cqe.done = rpcrdma_wc_receive;
        rep->rr_rxprt = r_xprt;
-       INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+       INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+       rep->rr_recv_wr.next = NULL;
+       rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+       rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+       rep->rr_recv_wr.num_sge = 1;
        return rep;
  
  out_free:
@@@ -966,17 -1005,18 +1005,18 @@@ rpcrdma_buffer_get_rep_locked(struct rp
  }
  
  static void
- rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+ rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
  {
-       rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+       rpcrdma_free_regbuf(rep->rr_rdmabuf);
        kfree(rep);
  }
  
  void
- rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+ rpcrdma_destroy_req(struct rpcrdma_req *req)
  {
-       rpcrdma_free_regbuf(ia, req->rl_sendbuf);
-       rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+       rpcrdma_free_regbuf(req->rl_recvbuf);
+       rpcrdma_free_regbuf(req->rl_sendbuf);
+       rpcrdma_free_regbuf(req->rl_rdmabuf);
        kfree(req);
  }
  
@@@ -1009,15 -1049,13 +1049,13 @@@ rpcrdma_destroy_mrs(struct rpcrdma_buff
  void
  rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
  {
-       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
        cancel_delayed_work_sync(&buf->rb_recovery_worker);
  
        while (!list_empty(&buf->rb_recv_bufs)) {
                struct rpcrdma_rep *rep;
  
                rep = rpcrdma_buffer_get_rep_locked(buf);
-               rpcrdma_destroy_rep(ia, rep);
+               rpcrdma_destroy_rep(rep);
        }
        buf->rb_send_count = 0;
  
                list_del(&req->rl_all);
  
                spin_unlock(&buf->rb_reqslock);
-               rpcrdma_destroy_req(ia, req);
+               rpcrdma_destroy_req(req);
                spin_lock(&buf->rb_reqslock);
        }
        spin_unlock(&buf->rb_reqslock);
@@@ -1129,7 -1167,7 +1167,7 @@@ rpcrdma_buffer_put(struct rpcrdma_req *
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        struct rpcrdma_rep *rep = req->rl_reply;
  
-       req->rl_niovs = 0;
+       req->rl_send_wr.num_sge = 0;
        req->rl_reply = NULL;
  
        spin_lock(&buffers->rb_lock);
@@@ -1171,70 -1209,81 +1209,81 @@@ rpcrdma_recv_buffer_put(struct rpcrdma_
        spin_unlock(&buffers->rb_lock);
  }
  
- /*
-  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
-  */
  /**
-  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
-  * @ia: controlling rpcrdma_ia
+  * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
   * @size: size of buffer to be allocated, in bytes
+  * @direction: direction of data movement
   * @flags: GFP flags
   *
-  * Returns pointer to private header of an area of internally
-  * registered memory, or an ERR_PTR. The registered buffer follows
-  * the end of the private header.
+  * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+  * can be persistently DMA-mapped for I/O.
   *
   * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
-  * receiving the payload of RDMA RECV operations. regbufs are not
-  * used for RDMA READ/WRITE operations, thus are registered only for
-  * LOCAL access.
+  * receiving the payload of RDMA RECV operations. During Long Calls
+  * or Replies they may be registered externally via ro_map.
   */
  struct rpcrdma_regbuf *
- rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+                    gfp_t flags)
  {
        struct rpcrdma_regbuf *rb;
-       struct ib_sge *iov;
  
        rb = kmalloc(sizeof(*rb) + size, flags);
        if (rb == NULL)
-               goto out;
+               return ERR_PTR(-ENOMEM);
  
-       iov = &rb->rg_iov;
-       iov->addr = ib_dma_map_single(ia->ri_device,
-                                     (void *)rb->rg_base, size,
-                                     DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-               goto out_free;
+       rb->rg_device = NULL;
+       rb->rg_direction = direction;
+       rb->rg_iov.length = size;
  
-       iov->length = size;
-       iov->lkey = ia->ri_pd->local_dma_lkey;
-       rb->rg_size = size;
-       rb->rg_owner = NULL;
        return rb;
+ }
  
- out_free:
-       kfree(rb);
- out:
-       return ERR_PTR(-ENOMEM);
+ /**
+  * __rpcrdma_dma_map_regbuf - DMA-map a regbuf
+  * @ia: controlling rpcrdma_ia
+  * @rb: regbuf to be mapped
+  *
+  * Returns true if the buffer is now mapped to @ia's device.
+  */
+ bool
+ __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+ {
+       if (rb->rg_direction == DMA_NONE)
+               return false;
+       rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+                                           (void *)rb->rg_base,
+                                           rdmab_length(rb),
+                                           rb->rg_direction);
+       if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+               return false;
+       rb->rg_device = ia->ri_device;
+       rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+       return true;
+ }
+ 
+ static void
+ rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+ {
+       if (!rpcrdma_regbuf_is_mapped(rb))
+               return;
+       ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+                           rdmab_length(rb), rb->rg_direction);
+       rb->rg_device = NULL;
  }
  
  /**
   * rpcrdma_free_regbuf - deregister and free registered buffer
-  * @ia: controlling rpcrdma_ia
   * @rb: regbuf to be deregistered and freed
   */
  void
- rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+ rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
  {
-       struct ib_sge *iov;
        if (!rb)
                return;
  
-       iov = &rb->rg_iov;
-       ib_dma_unmap_single(ia->ri_device,
-                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
+       rpcrdma_dma_unmap_regbuf(rb);
        kfree(rb);
  }
  
@@@ -1248,39 -1297,28 +1297,28 @@@ rpcrdma_ep_post(struct rpcrdma_ia *ia
                struct rpcrdma_ep *ep,
                struct rpcrdma_req *req)
  {
-       struct ib_device *device = ia->ri_device;
-       struct ib_send_wr send_wr, *send_wr_fail;
-       struct rpcrdma_rep *rep = req->rl_reply;
-       struct ib_sge *iov = req->rl_send_iov;
-       int i, rc;
+       struct ib_send_wr *send_wr = &req->rl_send_wr;
+       struct ib_send_wr *send_wr_fail;
+       int rc;
  
-       if (rep) {
-               rc = rpcrdma_ep_post_recv(ia, ep, rep);
+       if (req->rl_reply) {
+               rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
                if (rc)
                        return rc;
                req->rl_reply = NULL;
        }
  
-       send_wr.next = NULL;
-       send_wr.wr_cqe = &req->rl_cqe;
-       send_wr.sg_list = iov;
-       send_wr.num_sge = req->rl_niovs;
-       send_wr.opcode = IB_WR_SEND;
-       for (i = 0; i < send_wr.num_sge; i++)
-               ib_dma_sync_single_for_device(device, iov[i].addr,
-                                             iov[i].length, DMA_TO_DEVICE);
        dprintk("RPC:       %s: posting %d s/g entries\n",
-               __func__, send_wr.num_sge);
+               __func__, send_wr->num_sge);
  
        if (DECR_CQCOUNT(ep) > 0)
-               send_wr.send_flags = 0;
+               send_wr->send_flags = 0;
        else { /* Provider must take a send completion every now and then */
                INIT_CQCOUNT(ep);
-               send_wr.send_flags = IB_SEND_SIGNALED;
+               send_wr->send_flags = IB_SEND_SIGNALED;
        }
  
-       rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+       rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
        if (rc)
                goto out_postsend_err;
        return 0;
@@@ -1290,32 -1328,24 +1328,24 @@@ out_postsend_err
        return -ENOTCONN;
  }
  
- /*
-  * (Re)post a receive buffer.
-  */
  int
  rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
-                    struct rpcrdma_ep *ep,
                     struct rpcrdma_rep *rep)
  {
-       struct ib_recv_wr recv_wr, *recv_wr_fail;
+       struct ib_recv_wr *recv_wr_fail;
        int rc;
  
-       recv_wr.next = NULL;
-       recv_wr.wr_cqe = &rep->rr_cqe;
-       recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
-       recv_wr.num_sge = 1;
-       ib_dma_sync_single_for_cpu(ia->ri_device,
-                                  rdmab_addr(rep->rr_rdmabuf),
-                                  rdmab_length(rep->rr_rdmabuf),
-                                  DMA_BIDIRECTIONAL);
-       rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+       if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+               goto out_map;
+       rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
        if (rc)
                goto out_postrecv;
        return 0;
  
+ out_map:
+       pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+       return -EIO;
  out_postrecv:
        pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
        return -ENOTCONN;
@@@ -1333,7 -1363,6 +1363,6 @@@ rpcrdma_ep_post_extra_recv(struct rpcrd
  {
        struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
        struct rpcrdma_rep *rep;
        int rc;
  
                rep = rpcrdma_buffer_get_rep_locked(buffers);
                spin_unlock(&buffers->rb_lock);
  
-               rc = rpcrdma_ep_post_recv(ia, ep, rep);
+               rc = rpcrdma_ep_post_recv(ia, rep);
                if (rc)
                        goto out_rc;
        }