Merge branch 'next'
author Trond Myklebust <Trond.Myklebust@netapp.com>
Wed, 15 Oct 2008 19:54:56 +0000 (15:54 -0400)
committer Trond Myklebust <Trond.Myklebust@netapp.com>
Wed, 15 Oct 2008 19:54:56 +0000 (15:54 -0400)
26 files changed:
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/mount_clnt.c
fs/nfs/namespace.c
fs/nfs/nfs3acl.c
fs/nfs/nfs3proc.c
fs/nfs/nfs4namespace.c
fs/nfs/proc.c
fs/nfs/super.c
fs/nfs/unlink.c
fs/nfs/write.c
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_mount.h
include/linux/nfs_xdr.h
include/linux/sunrpc/xprtrdma.h
net/sunrpc/clnt.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h

index 5ee23e7..7547600 100644 (file)
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
        server->nfs_client = clp;
 
        /* Initialise the client representation from the mount data */
-       server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+       server->flags = data->flags;
 
        if (data->rsize)
                server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
        INIT_LIST_HEAD(&server->client_link);
        INIT_LIST_HEAD(&server->master_link);
 
-       init_waitqueue_head(&server->active_wq);
        atomic_set(&server->active, 0);
 
        server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
                goto error;
 
        /* Initialise the client representation from the mount data */
-       server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+       server->flags = data->flags;
        server->caps |= NFS_CAP_ATOMIC_OPEN;
 
        if (data->rsize)
index 74f92b7..2ab70d4 100644 (file)
@@ -156,6 +156,7 @@ typedef struct {
        decode_dirent_t decode;
        int             plus;
        unsigned long   timestamp;
+       unsigned long   gencount;
        int             timestamp_valid;
 } nfs_readdir_descriptor_t;
 
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
        struct file     *file = desc->file;
        struct inode    *inode = file->f_path.dentry->d_inode;
        struct rpc_cred *cred = nfs_file_cred(file);
-       unsigned long   timestamp;
+       unsigned long   timestamp, gencount;
        int             error;
 
        dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 
  again:
        timestamp = jiffies;
+       gencount = nfs_inc_attr_generation_counter();
        error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
                                          NFS_SERVER(inode)->dtsize, desc->plus);
        if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
                goto error;
        }
        desc->timestamp = timestamp;
+       desc->gencount = gencount;
        desc->timestamp_valid = 1;
        SetPageUptodate(page);
        /* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
        if (IS_ERR(p))
                return PTR_ERR(p);
        desc->ptr = p;
-       if (desc->timestamp_valid)
+       if (desc->timestamp_valid) {
                desc->entry->fattr->time_start = desc->timestamp;
-       else
+               desc->entry->fattr->gencount = desc->gencount;
+       } else
                desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
        return 0;
 }
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct rpc_cred *cred = nfs_file_cred(file);
        struct page     *page = NULL;
        int             status;
-       unsigned long   timestamp;
+       unsigned long   timestamp, gencount;
 
        dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                        (unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                goto out;
        }
        timestamp = jiffies;
+       gencount = nfs_inc_attr_generation_counter();
        status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
                                                *desc->dir_cookie, page,
                                                NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
        if (status >= 0) {
                desc->timestamp = timestamp;
+               desc->gencount = gencount;
                desc->timestamp_valid = 1;
                if ((status = dir_decode(desc)) == 0)
                        desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
  */
 void nfs_force_lookup_revalidate(struct inode *dir)
 {
-       NFS_I(dir)->cache_change_attribute = jiffies;
+       NFS_I(dir)->cache_change_attribute++;
 }
 
 /*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
        if (IS_ROOT(dentry))
                return 1;
+       if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+               return 0;
        if (!nfs_verify_change_attribute(dir, dentry->d_time))
                return 0;
        /* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
        /* Don't revalidate a negative dentry if we're creating a new file */
        if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
                return 0;
+       if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+               return 1;
        return !nfs_check_verifier(dir, dentry);
 }
 
index 7846065..d319b49 100644 (file)
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
        /* origin == SEEK_END => we must revalidate the cached file length */
        if (origin == SEEK_END) {
                struct inode *inode = filp->f_mapping->host;
+
                int retval = nfs_revalidate_file_size(inode, filp);
                if (retval < 0)
                        return (loff_t)retval;
-       }
-       lock_kernel();  /* BKL needed? */
-       loff = generic_file_llseek_unlocked(filp, offset, origin);
-       unlock_kernel();
+
+               spin_lock(&inode->i_lock);
+               loff = generic_file_llseek_unlocked(filp, offset, origin);
+               spin_unlock(&inode->i_lock);
+       } else
+               loff = generic_file_llseek_unlocked(filp, offset, origin);
        return loff;
 }
 
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
                        filp->f_path.dentry->d_name.name,
                        fl->fl_type, fl->fl_flags);
 
-       /*
-        * No BSD flocks over NFS allowed.
-        * Note: we could try to fake a POSIX lock request here by
-        * using ((u32) filp | 0x80000000) or some such as the pid.
-        * Not sure whether that would be unique, though, or whether
-        * that would break in other places.
-        */
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
 
index 52daefa..b9195c0 100644 (file)
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                        init_special_inode(inode, inode->i_mode, fattr->rdev);
 
                nfsi->read_cache_jiffies = fattr->time_start;
-               nfsi->last_updated = now;
-               nfsi->cache_change_attribute = now;
+               nfsi->attr_gencount = fattr->gencount;
                inode->i_atime = fattr->atime;
                inode->i_mtime = fattr->mtime;
                inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
 void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 {
        if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+               spin_lock(&inode->i_lock);
                if ((attr->ia_valid & ATTR_MODE) != 0) {
                        int mode = attr->ia_mode & S_IALLUGO;
                        mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
                        inode->i_uid = attr->ia_uid;
                if ((attr->ia_valid & ATTR_GID) != 0)
                        inode->i_gid = attr->ia_gid;
-               spin_lock(&inode->i_lock);
                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
                spin_unlock(&inode->i_lock);
        }
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
        }
 }
 
-static int nfs_wait_schedule(void *word)
-{
-       if (signal_pending(current))
-               return -ERESTARTSYS;
-       schedule();
-       return 0;
-}
-
-/*
- * Wait for the inode to get unlocked.
- */
-static int nfs_wait_on_inode(struct inode *inode)
-{
-       struct nfs_inode *nfsi = NFS_I(inode);
-       int error;
-
-       error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
-                                       nfs_wait_schedule, TASK_KILLABLE);
-
-       return error;
-}
-
-static void nfs_wake_up_inode(struct inode *inode)
-{
-       struct nfs_inode *nfsi = NFS_I(inode);
-
-       clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
-       smp_mb__after_clear_bit();
-       wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
-}
-
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
        struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
                inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 
-       nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
        if (is_bad_inode(inode))
-               goto out_nowait;
+               goto out;
        if (NFS_STALE(inode))
-               goto out_nowait;
-
-       status = nfs_wait_on_inode(inode);
-       if (status < 0)
                goto out;
 
-       status = -ESTALE;
        if (NFS_STALE(inode))
                goto out;
 
+       nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
        status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
        if (status != 0) {
                dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                goto out;
        }
 
-       spin_lock(&inode->i_lock);
-       status = nfs_update_inode(inode, &fattr);
+       status = nfs_refresh_inode(inode, &fattr);
        if (status) {
-               spin_unlock(&inode->i_lock);
                dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
                         inode->i_sb->s_id,
                         (long long)NFS_FILEID(inode), status);
                goto out;
        }
-       spin_unlock(&inode->i_lock);
 
        if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
                nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                (long long)NFS_FILEID(inode));
 
  out:
-       nfs_wake_up_inode(inode);
-
- out_nowait:
        return status;
 }
 
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
                return -EIO;
        }
 
-       /* Do atomic weak cache consistency updates */
-       nfs_wcc_update_inode(inode, fattr);
-
        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
                        nfsi->change_attr != fattr->change_attr)
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
        if (invalid != 0)
                nfsi->cache_validity |= invalid;
-       else
-               nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
-                               | NFS_INO_INVALID_ATIME
-                               | NFS_INO_REVAL_PAGECACHE);
 
        nfsi->read_cache_jiffies = fattr->time_start;
        return 0;
 }
 
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+       return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+       return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
+}
+
+static unsigned long nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+       smp_rmb();
+       return nfs_attr_generation_counter;
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+       unsigned long ret;
+       smp_rmb();
+       ret = ++nfs_attr_generation_counter;
+       smp_wmb();
+       return ret;
+}
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+       fattr->valid = 0;
+       fattr->time_start = jiffies;
+       fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode - pointer to inode
+ * @fattr - attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempts to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+       const struct nfs_inode *nfsi = NFS_I(inode);
+
+       return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
+               nfs_ctime_need_update(inode, fattr) ||
+               nfs_size_need_update(inode, fattr) ||
+               ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+       if (nfs_inode_attrs_need_update(inode, fattr))
+               return nfs_update_inode(inode, fattr);
+       return nfs_check_inode_attributes(inode, fattr);
+}
+
 /**
  * nfs_refresh_inode - try to update the inode attribute cache
  * @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
  */
 int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-       struct nfs_inode *nfsi = NFS_I(inode);
        int status;
 
        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
                return 0;
        spin_lock(&inode->i_lock);
-       if (time_after(fattr->time_start, nfsi->last_updated))
-               status = nfs_update_inode(inode, fattr);
-       else
-               status = nfs_check_inode_attributes(inode, fattr);
-
+       status = nfs_refresh_inode_locked(inode, fattr);
        spin_unlock(&inode->i_lock);
        return status;
 }
 
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+       if (S_ISDIR(inode->i_mode))
+               nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+       if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+               return 0;
+       return nfs_refresh_inode_locked(inode, fattr);
+}
+
 /**
  * nfs_post_op_update_inode - try to update the inode attribute cache
  * @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-       struct nfs_inode *nfsi = NFS_I(inode);
+       int status;
 
        spin_lock(&inode->i_lock);
-       nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-       if (S_ISDIR(inode->i_mode))
-               nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+       status = nfs_post_op_update_inode_locked(inode, fattr);
        spin_unlock(&inode->i_lock);
-       return nfs_refresh_inode(inode, fattr);
+       return status;
 }
 
 /**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
 {
+       int status;
+
+       spin_lock(&inode->i_lock);
+       /* Don't do a WCC update if these attributes are already stale */
+       if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+                       !nfs_inode_attrs_need_update(inode, fattr)) {
+               fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+               goto out_noforce;
+       }
        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
                        (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
                fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
                fattr->pre_size = i_size_read(inode);
                fattr->valid |= NFS_ATTR_WCC;
        }
-       return nfs_post_op_update_inode(inode, fattr);
+out_noforce:
+       status = nfs_post_op_update_inode_locked(inode, fattr);
+       spin_unlock(&inode->i_lock);
+       return status;
 }
 
 /*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                }
                /* If ctime has changed we should definitely clear access+acl caches */
                if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
-                       invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                       invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
        } else if (nfsi->change_attr != fattr->change_attr) {
                dprintk("NFS: change_attr change on server for file %s/%ld\n",
                                inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
            inode->i_gid != fattr->gid)
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 
+       if (inode->i_nlink != fattr->nlink)
+               invalid |= NFS_INO_INVALID_ATTR;
+
        inode->i_mode = fattr->mode;
        inode->i_nlink = fattr->nlink;
        inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
-               nfsi->last_updated = now;
+               nfsi->attr_gencount = nfs_inc_attr_generation_counter();
        } else {
                if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
                        if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
                                nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
                        nfsi->attrtimeo_timestamp = now;
                }
-               /*
-                * Avoid jiffy wraparound issues with nfsi->last_updated
-                */
-               if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
-                       nfsi->last_updated = nfsi->read_cache_jiffies;
        }
        invalid &= ~NFS_INO_INVALID_ATTR;
        /* Don't invalidate the data if we were to blame */
index 24241fc..d212ee4 100644 (file)
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 
 /* super.c */
+void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
 extern struct file_system_type nfs_xdev_fs_type;
 #ifdef CONFIG_NFS_V4
 extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct nfs_server *server);
-extern void nfs_sb_deactive(struct nfs_server *server);
+extern void nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
 extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
                PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
+#define IPV6_SCOPE_DELIMITER   '%'
+
+/*
+ * Set the port number in an address.  Be agnostic about the address
+ * family.
+ */
+static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+       struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+       struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
+
+       switch (sap->sa_family) {
+       case AF_INET:
+               ap->sin_port = htons(port);
+               break;
+       case AF_INET6:
+               ap6->sin6_port = htons(port);
+               break;
+       }
+}
index 779d2eb..086a683 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/nfs_fs.h>
+#include "internal.h"
 
 #ifdef RPC_DEBUG
 # define NFSDBG_FACILITY       NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
 
 out_mnt_err:
        dprintk("NFS: MNT server returned result %d\n", result.status);
-       status = -EACCES;
+       status = nfs_stat_to_errno(result.status);
        goto out;
 }
 
index 66df08d..64a288e 100644 (file)
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 
        dprintk("--> nfs_follow_mountpoint()\n");
 
-       BUG_ON(IS_ROOT(dentry));
+       err = -ESTALE;
+       if (IS_ROOT(dentry))
+               goto out_err;
+
        dprintk("%s: enter\n", __func__);
        dput(nd->path.dentry);
        nd->path.dentry = dget(dentry);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
                                           struct nfs_clone_mount *mountdata)
 {
 #ifdef CONFIG_NFS_V4
-       struct vfsmount *mnt = NULL;
+       struct vfsmount *mnt = ERR_PTR(-EINVAL);
        switch (server->nfs_client->rpc_ops->version) {
                case 2:
                case 3:
index 423842f..cef6255 100644 (file)
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 
        dprintk("NFS call getacl\n");
        msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+       nfs_fattr_init(&fattr);
        status = rpc_call_sync(server->client_acl, &msg, 0);
        dprintk("NFS reply getacl: %d\n", status);
 
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
        dprintk("NFS call setacl\n");
        msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+       nfs_fattr_init(&fattr);
        status = rpc_call_sync(server->client_acl, &msg, 0);
        nfs_access_zap_cache(inode);
        nfs_zap_acl_cache(inode);
index 1e750e4..c55be7a 100644 (file)
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 }
 
 static int
-nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
                 struct nfs_fsinfo *info)
 {
        struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 
        dprintk("NFS call  fsinfo\n");
        nfs_fattr_init(info->fattr);
-       status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+       status = rpc_call_sync(client, &msg, 0);
        dprintk("NFS reply fsinfo: %d\n", status);
        return status;
 }
 
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+                  struct nfs_fsinfo *info)
+{
+       int     status;
+
+       status = do_proc_fsinfo(server->client, fhandle, info);
+       if (status && server->nfs_client->cl_rpcclient != server->client)
+               status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+       return status;
+}
+
 static int
 nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
                   struct nfs_pathconf *info)
index b112857..30befc3 100644 (file)
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
        return 0;
 }
 
-/*
- * Check if the string represents a "valid" IPv4 address
- */
-static inline int valid_ipaddr4(const char *buf)
+static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
+                                    char *page, char *page2,
+                                    const struct nfs4_fs_location *location)
 {
-       int rc, count, in[4];
-
-       rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
-       if (rc != 4)
-               return -EINVAL;
-       for (count = 0; count < 4; count++) {
-               if (in[count] > 255)
-                       return -EINVAL;
+       struct vfsmount *mnt = ERR_PTR(-ENOENT);
+       char *mnt_path;
+       int page2len;
+       unsigned int s;
+
+       mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+       if (IS_ERR(mnt_path))
+               return mnt;
+       mountdata->mnt_path = mnt_path;
+       page2 += strlen(mnt_path) + 1;
+       page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+
+       for (s = 0; s < location->nservers; s++) {
+               const struct nfs4_string *buf = &location->servers[s];
+               struct sockaddr_storage addr;
+
+               if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+                       continue;
+
+               mountdata->addr = (struct sockaddr *)&addr;
+
+               if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+                       continue;
+               nfs_parse_ip_address(buf->data, buf->len,
+                               mountdata->addr, &mountdata->addrlen);
+               if (mountdata->addr->sa_family == AF_UNSPEC)
+                       continue;
+               nfs_set_port(mountdata->addr, NFS_PORT);
+
+               strncpy(page2, buf->data, page2len);
+               page2[page2len] = '\0';
+               mountdata->hostname = page2;
+
+               snprintf(page, PAGE_SIZE, "%s:%s",
+                               mountdata->hostname,
+                               mountdata->mnt_path);
+
+               mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+               if (!IS_ERR(mnt))
+                       break;
        }
-       return 0;
+       return mnt;
 }
 
 /**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
        };
        char *page = NULL, *page2 = NULL;
-       unsigned int s;
        int loc, error;
 
        if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                goto out;
        }
 
-       loc = 0;
-       while (loc < locations->nlocations && IS_ERR(mnt)) {
+       for (loc = 0; loc < locations->nlocations; loc++) {
                const struct nfs4_fs_location *location = &locations->locations[loc];
-               char *mnt_path;
 
                if (location == NULL || location->nservers <= 0 ||
-                   location->rootpath.ncomponents == 0) {
-                       loc++;
+                   location->rootpath.ncomponents == 0)
                        continue;
-               }
 
-               mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
-               if (IS_ERR(mnt_path)) {
-                       loc++;
-                       continue;
-               }
-               mountdata.mnt_path = mnt_path;
-
-               s = 0;
-               while (s < location->nservers) {
-                       struct sockaddr_in addr = {
-                               .sin_family     = AF_INET,
-                               .sin_port       = htons(NFS_PORT),
-                       };
-
-                       if (location->servers[s].len <= 0 ||
-                           valid_ipaddr4(location->servers[s].data) < 0) {
-                               s++;
-                               continue;
-                       }
-
-                       mountdata.hostname = location->servers[s].data;
-                       addr.sin_addr.s_addr = in_aton(mountdata.hostname),
-                       mountdata.addr = (struct sockaddr *)&addr;
-                       mountdata.addrlen = sizeof(addr);
-
-                       snprintf(page, PAGE_SIZE, "%s:%s",
-                                       mountdata.hostname,
-                                       mountdata.mnt_path);
-
-                       mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
-                       if (!IS_ERR(mnt)) {
-                               break;
-                       }
-                       s++;
-               }
-               loc++;
+               mnt = try_location(&mountdata, page, page2, location);
+               if (!IS_ERR(mnt))
+                       break;
        }
 
 out:
index 4dbb84d..1934652 100644 (file)
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 
        dprintk("%s: call getattr\n", __func__);
        nfs_fattr_init(fattr);
-       status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+       status = rpc_call_sync(server->client, &msg, 0);
+       /* Retry with default authentication if different */
+       if (status && server->nfs_client->cl_rpcclient != server->client)
+               status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
        dprintk("%s: reply getattr: %d\n", __func__, status);
        if (status)
                return status;
        dprintk("%s: call statfs\n", __func__);
        msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
        msg.rpc_resp = &fsinfo;
-       status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+       status = rpc_call_sync(server->client, &msg, 0);
+       /* Retry with default authentication if different */
+       if (status && server->nfs_client->cl_rpcclient != server->client)
+               status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
        dprintk("%s: reply statfs: %d\n", __func__, status);
        if (status)
                return status;
index ffb6974..8b28b95 100644 (file)
@@ -91,6 +91,7 @@ enum {
        /* Mount options that take string arguments */
        Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
+       Opt_lookupcache,
 
        /* Special mount options */
        Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = {
        { Opt_mounthost, "mounthost=%s" },
        { Opt_mountaddr, "mountaddr=%s" },
 
+       { Opt_lookupcache, "lookupcache=%s" },
+
        { Opt_err, NULL }
 };
 
@@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = {
        { Opt_sec_err, NULL }
 };
 
+/* Tokens for the argument of the "lookupcache=" mount option */
+enum {
+       Opt_lookupcache_all, Opt_lookupcache_positive,
+       Opt_lookupcache_none,
+       Opt_lookupcache_err             /* argument matched no pattern */
+};
+
+static const match_table_t nfs_lookupcache_tokens = {
+       { Opt_lookupcache_all, "all" },
+       { Opt_lookupcache_positive, "pos" },    /* short form of "positive" */
+       { Opt_lookupcache_positive, "positive" },
+       { Opt_lookupcache_none, "none" },
+
+       { Opt_lookupcache_err, NULL }           /* table terminator */
+};
+
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
-static void nfs_put_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
        .alloc_inode    = nfs_alloc_inode,
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs_write_inode,
-       .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .clear_inode    = nfs_clear_inode,
        .umount_begin   = nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
        unregister_filesystem(&nfs_fs_type);
 }
 
-void nfs_sb_active(struct nfs_server *server)
+void nfs_sb_active(struct super_block *sb)
 {
-       atomic_inc(&server->active);
-}
+       struct nfs_server *server = NFS_SB(sb);
 
-void nfs_sb_deactive(struct nfs_server *server)
-{
-       if (atomic_dec_and_test(&server->active))
-               wake_up(&server->active_wq);
+       if (atomic_inc_return(&server->active) == 1)
+               atomic_inc(&sb->s_active);      /* first user pins the sb */
 }
 
-static void nfs_put_super(struct super_block *sb)
+void nfs_sb_deactive(struct super_block *sb)
 {
        struct nfs_server *server = NFS_SB(sb);
-       /*
-        * Make sure there are no outstanding ops to this server.
-        * If so, wait for them to finish before allowing the
-        * unmount to continue.
-        */
-       wait_event(server->active_wq, atomic_read(&server->active) == 0);
+
+       if (atomic_dec_and_test(&server->active))
+               deactivate_super(sb);   /* last user: drop s_active ref */
 }
 
 /*
@@ -663,25 +674,6 @@ static void nfs_umount_begin(struct super_block *sb)
                rpc_killall_tasks(rpc);
 }
 
-/*
- * Set the port number in an address.  Be agnostic about the address family.
- */
-static void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
-       switch (sap->sa_family) {
-       case AF_INET: {
-               struct sockaddr_in *ap = (struct sockaddr_in *)sap;
-               ap->sin_port = htons(port);
-               break;
-       }
-       case AF_INET6: {
-               struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
-               ap->sin6_port = htons(port);
-               break;
-       }
-       }
-}
-
 /*
  * Sanity-check a server address provided by the mount command.
  *
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
        *addr_len = 0;
 }
 
-#define IPV6_SCOPE_DELIMITER   '%'
-
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
-                                   const char *delim,
-                                   struct sockaddr_in6 *sin6)
+static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+                                  const char *delim,
+                                  struct sockaddr_in6 *sin6)
 {
        char *p;
        size_t len;
 
-       if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
-               return ;
+       if ((string + str_len) == delim)
+               return 1;       /* address ends here: no scope ID to parse */
+
        if (*delim != IPV6_SCOPE_DELIMITER)
-               return;
+               return 0;
+
+       if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+               return 0;
 
        len = (string + str_len) - delim - 1;
        p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
                        scope_id = dev->ifindex;
                        dev_put(dev);
                } else {
-                       /* scope_id is set to zero on error */
-                       strict_strtoul(p, 10, &scope_id);
+                       if (strict_strtoul(p, 10, &scope_id) != 0) {
+                               kfree(p);
+                               return 0;       /* not a decimal number */
+                       }
                }
 
                kfree(p);
+
                sin6->sin6_scope_id = scope_id;
                dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+               return 1;
        }
+
+       return 0;
 }
 
 static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
 
                sin6->sin6_family = AF_INET6;
                *addr_len = sizeof(*sin6);
-               if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
-                       nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
-                       return;
+               if (in6_pton(string, str_len, addr,
+                                       IPV6_SCOPE_DELIMITER, &delim) != 0) {
+                       if (nfs_parse_ipv6_scope_id(string, str_len,
+                                                       delim, sin6) != 0)
+                               return;
                }
        }
 
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
  * If there is a problem constructing the new sockaddr, set the address
  * family to AF_UNSPEC.
  */
-static void nfs_parse_ip_address(char *string, size_t str_len,
+void nfs_parse_ip_address(char *string, size_t str_len,
                                 struct sockaddr *sap, size_t *addr_len)
 {
        unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
                                             &mnt->mount_server.addrlen);
                        kfree(string);
                        break;
+               case Opt_lookupcache:
+                       string = match_strdup(args);
+                       if (string == NULL)
+                               goto out_nomem;
+                       token = match_token(string,
+                                       nfs_lookupcache_tokens, args);
+                       kfree(string);
+                       switch (token) {
+                               case Opt_lookupcache_all:
+                                       mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+                                       break;
+                               case Opt_lookupcache_positive:
+                                       mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+                                       mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+                                       break;
+                               case Opt_lookupcache_none:
+                                       mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+                                       break;
+                               default:
+                                       errors++;
+                                       dfprintk(MOUNT, "NFS:   invalid "
+                                                       "lookupcache argument\n");
+                       };
+                       break;
 
                /*
                 * Special options
@@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
                 * Translate to nfs_parsed_mount_data, which nfs_fill_super
                 * can deal with.
                 */
-               args->flags             = data->flags;
+               args->flags             = data->flags & NFS_MOUNT_FLAGMASK;
                args->rsize             = data->rsize;
                args->wsize             = data->wsize;
                args->timeo             = data->timeo;
index f089e58..ecc2953 100644 (file)
@@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata)
 
        nfs_dec_sillycount(data->dir);
        nfs_free_unlinkdata(data);
-       nfs_sb_deactive(NFS_SB(sb));
+       nfs_sb_deactive(sb);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
@@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
                .rpc_message = &msg,
                .callback_ops = &nfs_unlink_ops,
                .callback_data = data,
+               .workqueue = nfsiod_workqueue,
                .flags = RPC_TASK_ASYNC,
        };
        struct rpc_task *task;
@@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
                nfs_dec_sillycount(dir);
                return 0;
        }
-       nfs_sb_active(NFS_SERVER(dir));
+       nfs_sb_active(dir->i_sb);
        data->args.fh = NFS_FH(dir);
        nfs_fattr_init(&data->res.dir_attr);
 
index 3229e21..9f98458 100644 (file)
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
                .bdi = mapping->backing_dev_info,
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = LONG_MAX,
+               .range_start = 0,
+               .range_end = LLONG_MAX,
                .for_writepages = 1,
-               .range_cyclic = 1,
        };
        int ret;
 
index 78a5922..ac8d023 100644 (file)
@@ -137,7 +137,7 @@ struct nfs_inode {
        unsigned long           attrtimeo_timestamp;
        __u64                   change_attr;            /* v4 only */
 
-       unsigned long           last_updated;
+       unsigned long           attr_gencount;
        /* "Generation counter" for the attribute cache. This is
         * bumped whenever we update the metadata on the
         * server.
@@ -200,11 +200,10 @@ struct nfs_inode {
 /*
  * Bit offsets in flags field
  */
-#define NFS_INO_REVALIDATING   (0)             /* revalidating attrs */
-#define NFS_INO_ADVISE_RDPLUS  (1)             /* advise readdirplus */
-#define NFS_INO_STALE          (2)             /* possible stale inode */
-#define NFS_INO_ACL_LRU_SET    (3)             /* Inode is on the LRU list */
-#define NFS_INO_MOUNTPOINT     (4)             /* inode is remote mountpoint */
+#define NFS_INO_ADVISE_RDPLUS  (0)             /* advise readdirplus */
+#define NFS_INO_STALE          (1)             /* possible stale inode */
+#define NFS_INO_ACL_LRU_SET    (2)             /* Inode is on the LRU list */
+#define NFS_INO_MOUNTPOINT     (3)             /* inode is remote mountpoint */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
@@ -345,15 +344,11 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ct
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
 extern u64 nfs_compat_user_ino64(u64 fileid);
+extern void nfs_fattr_init(struct nfs_fattr *fattr);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
-
-static inline void nfs_fattr_init(struct nfs_fattr *fattr)
-{
-       fattr->valid = 0;
-       fattr->time_start = jiffies;
-}
+extern unsigned long nfs_inc_attr_generation_counter(void);
 
 /*
  * linux/fs/nfs/file.c
index c9beacd..4e477ae 100644 (file)
@@ -119,7 +119,6 @@ struct nfs_server {
        void (*destroy)(struct nfs_server *);
 
        atomic_t active; /* Keep trace of any activity to this server */
-       wait_queue_head_t active_wq;  /* Wait for any activity to stop  */
 
        /* mountd-related mount options */
        struct sockaddr_storage mountd_address;
index df7c6b7..6549a06 100644 (file)
@@ -65,4 +65,8 @@ struct nfs_mount_data {
 #define NFS_MOUNT_UNSHARED     0x8000  /* 5 */
 #define NFS_MOUNT_FLAGMASK     0xFFFF
 
+/* The following are for internal use only */
+#define NFS_MOUNT_LOOKUP_CACHE_NONEG   0x10000
+#define NFS_MOUNT_LOOKUP_CACHE_NONE    0x20000
+
 #endif
index 8c77c11..c1c31ac 100644 (file)
@@ -36,6 +36,7 @@ struct nfs_fattr {
        __u32                   nlink;
        __u32                   uid;
        __u32                   gid;
+       dev_t                   rdev;
        __u64                   size;
        union {
                struct {
@@ -46,7 +47,6 @@ struct nfs_fattr {
                        __u64   used;
                } nfs3;
        } du;
-       dev_t                   rdev;
        struct nfs_fsid         fsid;
        __u64                   fileid;
        struct timespec         atime;
@@ -56,6 +56,7 @@ struct nfs_fattr {
        __u64                   change_attr;    /* NFSv4 change attribute */
        __u64                   pre_change_attr;/* pre-op NFSv4 change attribute */
        unsigned long           time_start;
+       unsigned long           gencount;
 };
 
 #define NFS_ATTR_WCC           0x0001          /* pre-op WCC data    */
@@ -672,16 +673,16 @@ struct nfs4_rename_res {
        struct nfs_fattr *              new_fattr;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN       (56)
+#define NFS4_SETCLIENTID_NAMELEN       (127)
 struct nfs4_setclientid {
        const nfs4_verifier *           sc_verifier;
        unsigned int                    sc_name_len;
-       char                            sc_name[NFS4_SETCLIENTID_NAMELEN];
+       char                            sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
        u32                             sc_prog;
        unsigned int                    sc_netid_len;
-       char                            sc_netid[RPCBIND_MAXNETIDLEN];
+       char                            sc_netid[RPCBIND_MAXNETIDLEN + 1];
        unsigned int                    sc_uaddr_len;
-       char                            sc_uaddr[RPCBIND_MAXUADDRLEN];
+       char                            sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
        u32                             sc_cb_ident;
 };
 
index 4de56b1..54a379c 100644 (file)
@@ -66,9 +66,6 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-#define RDMA_RESOLVE_TIMEOUT   (5*HZ)  /* TBD 5 seconds */
-#define RDMA_CONNECT_RETRY_MAX (2)     /* retries if no listener backlog */
-
 /* memory registration strategies */
 #define RPCRDMA_PERSISTENT_REGISTRATION (1)
 
@@ -78,6 +75,7 @@ enum rpcrdma_memreg {
        RPCRDMA_MEMWINDOWS,
        RPCRDMA_MEMWINDOWS_ASYNC,
        RPCRDMA_MTHCAFMR,
+       RPCRDMA_FRMR,
        RPCRDMA_ALLPHYSICAL,
        RPCRDMA_LAST
 };
index da0789f..4895c34 100644 (file)
@@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
        }
 
        /* save the nodename */
-       clnt->cl_nodelen = strlen(utsname()->nodename);
+       clnt->cl_nodelen = strlen(init_utsname()->nodename);
        if (clnt->cl_nodelen > UNX_MAXNODENAME)
                clnt->cl_nodelen = UNX_MAXNODENAME;
-       memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen);
+       memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen);
        rpc_register_client(clnt);
        return clnt;
 
index 34abc91..41013dd 100644 (file)
@@ -460,6 +460,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
        return rpc_run_task(&task_setup_data);
 }
 
+/*
+ * When rpc clients have been cloned, the rpcbind query must use the
+ * program number/version etc of the client that actually owns the
+ * xprt.  Starting from @clnt, climb the chain of cl_parent pointers
+ * for as long as the parent shares our transport and the client we
+ * are standing on does not have the autobind flag set.  The walk
+ * also ends at the top of the tree, where a client is its own
+ * parent.  Returns the client the walk stopped at.
+ */
+static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
+{
+       struct rpc_clnt *up;
+
+       for (up = clnt->cl_parent; up != clnt; up = up->cl_parent) {
+               /* stop at a different transport or an autobinding client */
+               if (up->cl_xprt != clnt->cl_xprt || clnt->cl_autobind)
+                       break;
+               clnt = up;
+       }
+       return clnt;
+}
+
 /**
  * rpcb_getport_async - obtain the port for a given RPC service on a given host
  * @task: task that is waiting for portmapper request
@@ -469,10 +491,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
  */
 void rpcb_getport_async(struct rpc_task *task)
 {
-       struct rpc_clnt *clnt = task->tk_client;
+       struct rpc_clnt *clnt;
        struct rpc_procinfo *proc;
        u32 bind_version;
-       struct rpc_xprt *xprt = task->tk_xprt;
+       struct rpc_xprt *xprt;
        struct rpc_clnt *rpcb_clnt;
        static struct rpcbind_args *map;
        struct rpc_task *child;
@@ -481,13 +503,13 @@ void rpcb_getport_async(struct rpc_task *task)
        size_t salen;
        int status;
 
+       clnt = rpcb_find_transport_owner(task->tk_client);
+       xprt = clnt->cl_xprt;
+
        dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
                task->tk_pid, __func__,
                clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
 
-       /* Autobind on cloned rpc clients is discouraged */
-       BUG_ON(clnt->cl_parent != clnt);
-
        /* Put self on the wait queue to ensure we get notified if
         * some other task is already attempting to bind the port */
        rpc_sleep_on(&xprt->binding, task, NULL);
@@ -549,7 +571,7 @@ void rpcb_getport_async(struct rpc_task *task)
                status = -ENOMEM;
                dprintk("RPC: %5u %s: no memory available\n",
                        task->tk_pid, __func__);
-               goto bailout_nofree;
+               goto bailout_release_client;
        }
        map->r_prog = clnt->cl_prog;
        map->r_vers = clnt->cl_vers;
@@ -569,11 +591,13 @@ void rpcb_getport_async(struct rpc_task *task)
                        task->tk_pid, __func__);
                return;
        }
-       rpc_put_task(child);
 
-       task->tk_xprt->stat.bind_count++;
+       xprt->stat.bind_count++;
+       rpc_put_task(child);
        return;
 
+bailout_release_client:
+       rpc_release_client(rpcb_clnt);
 bailout_nofree:
        rpcb_wake_rpcbind_waiters(xprt, status);
        task->tk_status = status;
index 99a52aa..29e401b 100644 (file)
@@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport)
                        goto out;
        }
 
-       result = -EINVAL;
-       if (try_module_get(THIS_MODULE)) {
-               list_add_tail(&transport->list, &xprt_list);
-               printk(KERN_INFO "RPC: Registered %s transport module.\n",
-                       transport->name);
-               result = 0;
-       }
+       list_add_tail(&transport->list, &xprt_list);
+       printk(KERN_INFO "RPC: Registered %s transport module.\n",
+              transport->name);
+       result = 0;
 
 out:
        spin_unlock(&xprt_list_lock);
@@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport)
                                "RPC: Unregistered %s transport module.\n",
                                transport->name);
                        list_del_init(&transport->list);
-                       module_put(THIS_MODULE);
                        goto out;
                }
        }
index 5c1954d..14106d2 100644 (file)
@@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        }
 
        if (xdrbuf->tail[0].iov_len) {
+               /* the rpcrdma protocol allows us to omit any trailing
+                * xdr pad bytes, saving the server an RDMA operation. */
+               if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+                       return n;
                if (n == nsegs)
                        return 0;
                seg[n].mr_page = NULL;
@@ -508,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        if (hdrlen == 0)
                return -1;
 
-       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd\n"
-               "                   headerp 0x%p base 0x%p lkey 0x%x\n",
+       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+               " headerp 0x%p base 0x%p lkey 0x%x\n",
                __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
                headerp, base, req->rl_iov.lkey);
 
@@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
  * Scatter inline received data back into provided iov's.
  */
 static void
-rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
        int i, npages, curlen, olen;
        char *destp;
@@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
        } else
                rqst->rq_rcv_buf.tail[0].iov_len = 0;
 
+       if (pad) {
+               /* implicit padding on terminal chunk */
+               unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+               while (pad--)
+                       p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+       }
+
        if (copy_len)
                dprintk("RPC:       %s: %d bytes in"
                        " %d extra segments (%d lost)\n",
@@ -681,12 +692,14 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
        struct rpc_xprt *xprt = ep->rep_xprt;
 
        spin_lock_bh(&xprt->transport_lock);
+       if (++xprt->connect_cookie == 0)        /* maintain a reserved value */
+               ++xprt->connect_cookie;
        if (ep->rep_connected > 0) {
                if (!xprt_test_and_set_connected(xprt))
                        xprt_wake_pending_tasks(xprt, 0);
        } else {
                if (xprt_test_and_clear_connected(xprt))
-                       xprt_wake_pending_tasks(xprt, ep->rep_connected);
+                       xprt_wake_pending_tasks(xprt, -ENOTCONN);
        }
        spin_unlock_bh(&xprt->transport_lock);
 }
@@ -792,14 +805,20 @@ repost:
                            ((unsigned char *)iptr - (unsigned char *)headerp);
                        status = rep->rr_len + rdmalen;
                        r_xprt->rx_stats.total_rdma_reply += rdmalen;
+                       /* special case - last chunk may omit padding */
+                       if (rdmalen &= 3) {
+                               rdmalen = 4 - rdmalen;
+                               status += rdmalen;
+                       }
                } else {
                        /* else ordinary inline */
+                       rdmalen = 0;
                        iptr = (__be32 *)((unsigned char *)headerp + 28);
                        rep->rr_len -= 28; /*sizeof *headerp;*/
                        status = rep->rr_len;
                }
                /* Fix up the rpc results for upper layer */
-               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
                break;
 
        case htonl(RDMA_NOMSG):
index a564c1a..9839c3d 100644 (file)
@@ -70,11 +70,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
-#if !RPCRDMA_PERSISTENT_REGISTRATION
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
-#else
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
-#endif
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;
 
 #ifdef RPC_DEBUG
 
@@ -139,6 +136,14 @@ static ctl_table xr_tunables_table[] = {
                .extra1         = &min_memreg,
                .extra2         = &max_memreg,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "rdma_pad_optimize",
+               .data           = &xprt_rdma_pad_optimize,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        {
                .ctl_name = 0,
        },
@@ -458,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
        dprintk("RPC:       %s: closing\n", __func__);
+       if (r_xprt->rx_ep.rep_connected > 0)
+               xprt->reestablish_timeout = 0;
        xprt_disconnect_done(xprt);
        (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
@@ -485,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task)
                        /* Reconnect */
                        schedule_delayed_work(&r_xprt->rdma_connect,
                                xprt->reestablish_timeout);
+                       xprt->reestablish_timeout <<= 1;
+                       if (xprt->reestablish_timeout > (30 * HZ))
+                               xprt->reestablish_timeout = (30 * HZ);
+                       else if (xprt->reestablish_timeout < (5 * HZ))
+                               xprt->reestablish_timeout = (5 * HZ);
                } else {
                        schedule_delayed_work(&r_xprt->rdma_connect, 0);
                        if (!RPC_IS_ASYNC(task))
@@ -591,6 +603,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
        }
        dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
 out:
+       req->rl_connect_cookie = 0;     /* our reserved value */
        return req->rl_xdr_buf;
 
 outfail:
@@ -694,13 +707,21 @@ xprt_rdma_send_request(struct rpc_task *task)
                req->rl_reply->rr_xprt = xprt;
        }
 
-       if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
-               xprt_disconnect_done(xprt);
-               return -ENOTCONN;       /* implies disconnect */
-       }
+       /* Must suppress retransmit to maintain credits */
+       if (req->rl_connect_cookie == xprt->connect_cookie)
+               goto drop_connection;
+       req->rl_connect_cookie = xprt->connect_cookie;
+
+       if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+               goto drop_connection;
 
+       task->tk_bytes_sent += rqst->rq_snd_buf.len;
        rqst->rq_bytes_sent = 0;
        return 0;
+
+drop_connection:
+       xprt_disconnect_done(xprt);
+       return -ENOTCONN;       /* implies disconnect */
 }
 
 static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
@@ -770,7 +791,7 @@ static void __exit xprt_rdma_cleanup(void)
 {
        int rc;
 
-       dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+       dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
 #ifdef RPC_DEBUG
        if (sunrpc_table_header) {
                unregister_sysctl_table(sunrpc_table_header);
index 8ea283e..a5fef5e 100644 (file)
@@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
+               ia->ri_async_rc = 0;
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ADDR_ERROR:
@@ -338,13 +339,32 @@ connected:
                wake_up_all(&ep->rep_connect_wait);
                break;
        default:
-               ia->ri_async_rc = -EINVAL;
-               dprintk("RPC:       %s: unexpected CM event %X\n",
+               dprintk("RPC:       %s: unexpected CM event %d\n",
                        __func__, event->event);
-               complete(&ia->ri_done);
                break;
        }
 
+#ifdef RPC_DEBUG
+       if (connstate == 1) {
+               int ird = attr.max_dest_rd_atomic;
+               int tird = ep->rep_remote_cma.responder_resources;
+               printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+                       "on %s, memreg %d slots %d ird %d%s\n",
+                       NIPQUAD(addr->sin_addr.s_addr),
+                       ntohs(addr->sin_port),
+                       ia->ri_id->device->name,
+                       ia->ri_memreg_strategy,
+                       xprt->rx_buf.rb_max_requests,
+                       ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+       } else if (connstate < 0) {
+               printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u "
+                       "closed (%d)\n",
+                       NIPQUAD(addr->sin_addr.s_addr),
+                       ntohs(addr->sin_port),
+                       connstate);
+       }
+#endif
+
        return 0;
 }
 
@@ -355,6 +375,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
        struct rdma_cm_id *id;
        int rc;
 
+       init_completion(&ia->ri_done);
+
        id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
        if (IS_ERR(id)) {
                rc = PTR_ERR(id);
@@ -363,26 +385,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
                return id;
        }
 
-       ia->ri_async_rc = 0;
+       ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
                        __func__, rc);
                goto out;
        }
-       wait_for_completion(&ia->ri_done);
+       wait_for_completion_interruptible_timeout(&ia->ri_done,
+                               msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
                goto out;
 
-       ia->ri_async_rc = 0;
+       ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
                        __func__, rc);
                goto out;
        }
-       wait_for_completion(&ia->ri_done);
+       wait_for_completion_interruptible_timeout(&ia->ri_done,
+                               msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
                goto out;
@@ -423,11 +447,10 @@ rpcrdma_clean_cq(struct ib_cq *cq)
 int
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 {
-       int rc;
+       int rc, mem_priv;
+       struct ib_device_attr devattr;
        struct rpcrdma_ia *ia = &xprt->rx_ia;
 
-       init_completion(&ia->ri_done);
-
        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
        if (IS_ERR(ia->ri_id)) {
                rc = PTR_ERR(ia->ri_id);
@@ -442,6 +465,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                goto out2;
        }
 
+       /*
+        * Query the device to determine if the requested memory
+        * registration strategy is supported. If it isn't, set the
+        * strategy to a globally supported model.
+        */
+       rc = ib_query_device(ia->ri_id->device, &devattr);
+       if (rc) {
+               dprintk("RPC:       %s: ib_query_device failed %d\n",
+                       __func__, rc);
+               goto out2;
+       }
+
+       if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+               ia->ri_have_dma_lkey = 1;
+               ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+       }
+
+       switch (memreg) {
+       case RPCRDMA_MEMWINDOWS:
+       case RPCRDMA_MEMWINDOWS_ASYNC:
+               if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+                       dprintk("RPC:       %s: MEMWINDOWS registration "
+                               "specified but not supported by adapter, "
+                               "using slower RPCRDMA_REGISTER\n",
+                               __func__);
+                       memreg = RPCRDMA_REGISTER;
+               }
+               break;
+       case RPCRDMA_MTHCAFMR:
+               if (!ia->ri_id->device->alloc_fmr) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+                       dprintk("RPC:       %s: MTHCAFMR registration "
+                               "specified but not supported by adapter, "
+                               "using riskier RPCRDMA_ALLPHYSICAL\n",
+                               __func__);
+                       memreg = RPCRDMA_ALLPHYSICAL;
+#else
+                       dprintk("RPC:       %s: MTHCAFMR registration "
+                               "specified but not supported by adapter, "
+                               "using slower RPCRDMA_REGISTER\n",
+                               __func__);
+                       memreg = RPCRDMA_REGISTER;
+#endif
+               }
+               break;
+       case RPCRDMA_FRMR:
+               /* Requires both frmr reg and local dma lkey */
+               if ((devattr.device_cap_flags &
+                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+                       dprintk("RPC:       %s: FRMR registration "
+                               "specified but not supported by adapter, "
+                               "using riskier RPCRDMA_ALLPHYSICAL\n",
+                               __func__);
+                       memreg = RPCRDMA_ALLPHYSICAL;
+#else
+                       dprintk("RPC:       %s: FRMR registration "
+                               "specified but not supported by adapter, "
+                               "using slower RPCRDMA_REGISTER\n",
+                               __func__);
+                       memreg = RPCRDMA_REGISTER;
+#endif
+               }
+               break;
+       }
+
        /*
         * Optionally obtain an underlying physical identity mapping in
         * order to do a memory window-based bind. This base registration
@@ -450,22 +540,28 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
         * revoked after the corresponding completion similar to a storage
         * adapter.
         */
-       if (memreg > RPCRDMA_REGISTER) {
-               int mem_priv = IB_ACCESS_LOCAL_WRITE;
-               switch (memreg) {
+       switch (memreg) {
+       case RPCRDMA_BOUNCEBUFFERS:
+       case RPCRDMA_REGISTER:
+       case RPCRDMA_FRMR:
+               break;
 #if RPCRDMA_PERSISTENT_REGISTRATION
-               case RPCRDMA_ALLPHYSICAL:
-                       mem_priv |= IB_ACCESS_REMOTE_WRITE;
-                       mem_priv |= IB_ACCESS_REMOTE_READ;
-                       break;
+       case RPCRDMA_ALLPHYSICAL:
+               mem_priv = IB_ACCESS_LOCAL_WRITE |
+                               IB_ACCESS_REMOTE_WRITE |
+                               IB_ACCESS_REMOTE_READ;
+               goto register_setup;
 #endif
-               case RPCRDMA_MEMWINDOWS_ASYNC:
-               case RPCRDMA_MEMWINDOWS:
-                       mem_priv |= IB_ACCESS_MW_BIND;
-                       break;
-               default:
+       case RPCRDMA_MEMWINDOWS_ASYNC:
+       case RPCRDMA_MEMWINDOWS:
+               mem_priv = IB_ACCESS_LOCAL_WRITE |
+                               IB_ACCESS_MW_BIND;
+               goto register_setup;
+       case RPCRDMA_MTHCAFMR:
+               if (ia->ri_have_dma_lkey)
                        break;
-               }
+               mem_priv = IB_ACCESS_LOCAL_WRITE;
+       register_setup:
                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
                if (IS_ERR(ia->ri_bind_mem)) {
                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -475,7 +571,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                        memreg = RPCRDMA_REGISTER;
                        ia->ri_bind_mem = NULL;
                }
+               break;
+       default:
+               printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+                               __func__, memreg);
+               rc = -EINVAL;
+               goto out2;
        }
+       dprintk("RPC:       %s: memory registration strategy is %d\n",
+               __func__, memreg);
 
        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;
@@ -483,6 +587,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        return 0;
 out2:
        rdma_destroy_id(ia->ri_id);
+       ia->ri_id = NULL;
 out1:
        return rc;
 }
@@ -503,15 +608,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
                        __func__, rc);
        }
-       if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
-               rdma_destroy_qp(ia->ri_id);
+       if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+               if (ia->ri_id->qp)
+                       rdma_destroy_qp(ia->ri_id);
+               rdma_destroy_id(ia->ri_id);
+               ia->ri_id = NULL;
+       }
        if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
                rc = ib_dealloc_pd(ia->ri_pd);
                dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
                        __func__, rc);
        }
-       if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
-               rdma_destroy_id(ia->ri_id);
 }
 
 /*
@@ -541,6 +648,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               /* Add room for frmr register and invalidate WRs */
+               ep->rep_attr.cap.max_send_wr *= 3;
+               if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+                       return -EINVAL;
+               break;
        case RPCRDMA_MEMWINDOWS_ASYNC:
        case RPCRDMA_MEMWINDOWS:
                /* Add room for mw_binds+unbinds - overkill! */
@@ -617,29 +730,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_remote_cma.private_data_len = 0;
 
        /* Client offers RDMA Read but does not initiate */
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_BOUNCEBUFFERS:
+       ep->rep_remote_cma.initiator_depth = 0;
+       if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
                ep->rep_remote_cma.responder_resources = 0;
-               break;
-       case RPCRDMA_MTHCAFMR:
-       case RPCRDMA_REGISTER:
-               ep->rep_remote_cma.responder_resources = cdata->max_requests *
-                               (RPCRDMA_MAX_DATA_SEGS / 8);
-               break;
-       case RPCRDMA_MEMWINDOWS:
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-#if RPCRDMA_PERSISTENT_REGISTRATION
-       case RPCRDMA_ALLPHYSICAL:
-#endif
-               ep->rep_remote_cma.responder_resources = cdata->max_requests *
-                               (RPCRDMA_MAX_DATA_SEGS / 2);
-               break;
-       default:
-               break;
-       }
-       if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
+       else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
+               ep->rep_remote_cma.responder_resources = 32;
+       else
                ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
-       ep->rep_remote_cma.initiator_depth = 0;
 
        ep->rep_remote_cma.retry_count = 7;
        ep->rep_remote_cma.flow_control = 0;
@@ -679,21 +776,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                if (rc)
                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
                                " returned %i\n", __func__, rc);
+               rdma_destroy_qp(ia->ri_id);
+               ia->ri_id->qp = NULL;
        }
 
-       ep->rep_func = NULL;
-
        /* padding - could be done in rpcrdma_buffer_destroy... */
        if (ep->rep_pad_mr) {
                rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
                ep->rep_pad_mr = NULL;
        }
 
-       if (ia->ri_id->qp) {
-               rdma_destroy_qp(ia->ri_id);
-               ia->ri_id->qp = NULL;
-       }
-
        rpcrdma_clean_cq(ep->rep_cq);
        rc = ib_destroy_cq(ep->rep_cq);
        if (rc)
@@ -712,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        struct rdma_cm_id *id;
        int rc = 0;
        int retry_count = 0;
-       int reconnect = (ep->rep_connected != 0);
 
-       if (reconnect) {
+       if (ep->rep_connected != 0) {
                struct rpcrdma_xprt *xprt;
 retry:
                rc = rpcrdma_ep_disconnect(ep, ia);
@@ -745,6 +836,7 @@ retry:
                        goto out;
                }
                /* END TEMP */
+               rdma_destroy_qp(ia->ri_id);
                rdma_destroy_id(ia->ri_id);
                ia->ri_id = id;
        }
@@ -769,14 +861,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
        }
 }
 
-       /* Theoretically a client initiator_depth > 0 is not needed,
-        * but many peers fail to complete the connection unless they
-        * == responder_resources! */
-       if (ep->rep_remote_cma.initiator_depth !=
-                               ep->rep_remote_cma.responder_resources)
-               ep->rep_remote_cma.initiator_depth =
-                       ep->rep_remote_cma.responder_resources;
-
        ep->rep_connected = 0;
 
        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -786,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
                goto out;
        }
 
-       if (reconnect)
-               return 0;
-
        wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
 
        /*
@@ -805,14 +886,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
        if (ep->rep_connected <= 0) {
                /* Sometimes, the only way to reliably connect to remote
                 * CMs is to use same nonzero values for ORD and IRD. */
-               ep->rep_remote_cma.initiator_depth =
-                                       ep->rep_remote_cma.responder_resources;
-               if (ep->rep_remote_cma.initiator_depth == 0)
-                       ++ep->rep_remote_cma.initiator_depth;
-               if (ep->rep_remote_cma.responder_resources == 0)
-                       ++ep->rep_remote_cma.responder_resources;
-               if (retry_count++ == 0)
+               if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+                   (ep->rep_remote_cma.responder_resources == 0 ||
+                    ep->rep_remote_cma.initiator_depth !=
+                               ep->rep_remote_cma.responder_resources)) {
+                       if (ep->rep_remote_cma.responder_resources == 0)
+                               ep->rep_remote_cma.responder_resources = 1;
+                       ep->rep_remote_cma.initiator_depth =
+                               ep->rep_remote_cma.responder_resources;
                        goto retry;
+               }
                rc = ep->rep_connected;
        } else {
                dprintk("RPC:       %s: connected\n", __func__);
@@ -863,6 +946,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        char *p;
        size_t len;
        int i, rc;
+       struct rpcrdma_mw *r;
 
        buf->rb_max_requests = cdata->max_requests;
        spin_lock_init(&buf->rb_lock);
@@ -873,7 +957,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
         *   2.  arrays of struct rpcrdma_req to fill in pointers
         *   3.  array of struct rpcrdma_rep for replies
         *   4.  padding, if any
-        *   5.  mw's, if any
+        *   5.  mw's, fmr's or frmr's, if any
         * Send/recv buffers in req/rep need to be registered
         */
 
@@ -881,6 +965,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
        len += cdata->padding;
        switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
+                               sizeof(struct rpcrdma_mw);
+               break;
        case RPCRDMA_MTHCAFMR:
                /* TBD we are perhaps overallocating here */
                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
@@ -927,15 +1015,37 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
         * and also reduce unbind-to-bind collision.
         */
        INIT_LIST_HEAD(&buf->rb_mws);
+       r = (struct rpcrdma_mw *)p;
        switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+               for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+                       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+                                                        RPCRDMA_MAX_SEGS);
+                       if (IS_ERR(r->r.frmr.fr_mr)) {
+                               rc = PTR_ERR(r->r.frmr.fr_mr);
+                               dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+                                       " failed %i\n", __func__, rc);
+                               goto out;
+                       }
+                       r->r.frmr.fr_pgl =
+                               ib_alloc_fast_reg_page_list(ia->ri_id->device,
+                                                           RPCRDMA_MAX_SEGS);
+                       if (IS_ERR(r->r.frmr.fr_pgl)) {
+                               rc = PTR_ERR(r->r.frmr.fr_pgl);
+                               dprintk("RPC:       %s: "
+                                       "ib_alloc_fast_reg_page_list "
+                                       "failed %i\n", __func__, rc);
+                               goto out;
+                       }
+                       list_add(&r->mw_list, &buf->rb_mws);
+                       ++r;
+               }
+               break;
        case RPCRDMA_MTHCAFMR:
-               {
-               struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
-               struct ib_fmr_attr fa = {
-                       RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
-               };
                /* TBD we are perhaps overallocating here */
                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+                       static struct ib_fmr_attr fa =
+                               { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
                        r->r.fmr = ib_alloc_fmr(ia->ri_pd,
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
                                &fa);
@@ -948,12 +1058,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                        list_add(&r->mw_list, &buf->rb_mws);
                        ++r;
                }
-               }
                break;
        case RPCRDMA_MEMWINDOWS_ASYNC:
        case RPCRDMA_MEMWINDOWS:
-               {
-               struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
                /* Allocate one extra request's worth, for full cycling */
                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
                        r->r.mw = ib_alloc_mw(ia->ri_pd);
@@ -966,7 +1073,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                        list_add(&r->mw_list, &buf->rb_mws);
                        ++r;
                }
-               }
                break;
        default:
                break;
@@ -1046,6 +1152,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
        int rc, i;
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+       struct rpcrdma_mw *r;
 
        /* clean up in reverse order from create
         *   1.  recv mr memory (mr free, then kfree)
@@ -1065,11 +1172,19 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                }
                if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
                        while (!list_empty(&buf->rb_mws)) {
-                               struct rpcrdma_mw *r;
                                r = list_entry(buf->rb_mws.next,
                                        struct rpcrdma_mw, mw_list);
                                list_del(&r->mw_list);
                                switch (ia->ri_memreg_strategy) {
+                               case RPCRDMA_FRMR:
+                                       rc = ib_dereg_mr(r->r.frmr.fr_mr);
+                                       if (rc)
+                                               dprintk("RPC:       %s:"
+                                                       " ib_dereg_mr"
+                                                       " failed %i\n",
+                                                       __func__, rc);
+                                       ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+                                       break;
                                case RPCRDMA_MTHCAFMR:
                                        rc = ib_dealloc_fmr(r->r.fmr);
                                        if (rc)
@@ -1115,6 +1230,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
        struct rpcrdma_req *req;
        unsigned long flags;
+       int i;
+       struct rpcrdma_mw *r;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1135,9 +1252,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
        if (!list_empty(&buffers->rb_mws)) {
-               int i = RPCRDMA_MAX_SEGS - 1;
+               i = RPCRDMA_MAX_SEGS - 1;
                do {
-                       struct rpcrdma_mw *r;
                        r = list_entry(buffers->rb_mws.next,
                                        struct rpcrdma_mw, mw_list);
                        list_del(&r->mw_list);
@@ -1171,6 +1287,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
                req->rl_reply = NULL;
        }
        switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
        case RPCRDMA_MTHCAFMR:
        case RPCRDMA_MEMWINDOWS_ASYNC:
        case RPCRDMA_MEMWINDOWS:
@@ -1252,7 +1369,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
                        va, len, DMA_BIDIRECTIONAL);
        iov->length = len;
 
-       if (ia->ri_bind_mem != NULL) {
+       if (ia->ri_have_dma_lkey) {
+               *mrp = NULL;
+               iov->lkey = ia->ri_dma_lkey;
+               return 0;
+       } else if (ia->ri_bind_mem != NULL) {
                *mrp = NULL;
                iov->lkey = ia->ri_bind_mem->lkey;
                return 0;
@@ -1329,15 +1450,292 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
 }
 
+static int     /* rc of ib_post_send(); nonzero is treated as failure */
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+                       int *nsegs, int writing, struct rpcrdma_ia *ia,
+                       struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg; results land here */
+       struct ib_send_wr frmr_wr, *bad_wr;
+       u8 key;
+       int len, pageoff;
+       int i, rc;
+
+       pageoff = offset_in_page(seg1->mr_offset);
+       seg1->mr_offset -= pageoff;     /* start of page */
+       seg1->mr_len += pageoff;        /* first seg now covers the leading pad */
+       len = -pageoff;                 /* so the running total excludes that pad */
+       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+               *nsegs = RPCRDMA_MAX_DATA_SEGS; /* cap the request */
+       for (i = 0; i < *nsegs;) {      /* i is advanced inside the body */
+               rpcrdma_map_one(ia, seg, writing);
+               seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
+               len += seg->mr_len;
+               ++seg;
+               ++i;
+               /* Check for holes: stop at a seam that is not page-aligned */
+               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+                       break;
+       }
+       dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+               __func__, seg1->mr_chunk.rl_mw, i);
+
+       /* Bump the key: low byte of the current rkey, incremented */
+       key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+       ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+       /* Prepare FRMR WR */
+       memset(&frmr_wr, 0, sizeof frmr_wr);
+       frmr_wr.opcode = IB_WR_FAST_REG_MR;
+       frmr_wr.send_flags = 0;                 /* unsignaled */
+       frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+       frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+       frmr_wr.wr.fast_reg.page_list_len = i;
+       frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;   /* i whole pages */
+       frmr_wr.wr.fast_reg.access_flags = (writing ?
+                               IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+       frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+
+       rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
+
+       if (rc) {
+               dprintk("RPC:       %s: failed ib_post_send for register,"
+                       " status %i\n", __func__, rc);
+               while (i--)     /* undo every mapping made above */
+                       rpcrdma_unmap_one(ia, --seg);
+       } else {
+               seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+               seg1->mr_base = seg1->mr_dma + pageoff;
+               seg1->mr_nsegs = i;
+               seg1->mr_len = len;
+       }
+       *nsegs = i;     /* segments mapped; NOTE(review): -1 after the unwind */
+       return rc;
+}
+
+static int     /* rc of ib_post_send() for the invalidate WR */
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+                       struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg of the chunk */
+       struct ib_send_wr invalidate_wr, *bad_wr;
+       int rc;
+
+       while (seg1->mr_nsegs--)        /* unmap each seg before invalidating */
+               rpcrdma_unmap_one(ia, seg++);
+
+       memset(&invalidate_wr, 0, sizeof invalidate_wr);
+       invalidate_wr.opcode = IB_WR_LOCAL_INV; /* targets the frmr's rkey */
+       invalidate_wr.send_flags = 0;                   /* unsignaled */
+       invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+
+       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       if (rc)         /* logged only; rc is handed back to the caller */
+               dprintk("RPC:       %s: failed ib_post_send for invalidate,"
+                       " status %i\n", __func__, rc);
+       return rc;
+}
+
+static int     /* rc of ib_map_phys_fmr(); nonzero is treated as failure */
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+                       int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg; results land here */
+       u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];   /* one mr_dma per mapped seg */
+       int len, pageoff, i, rc;
+
+       pageoff = offset_in_page(seg1->mr_offset);
+       seg1->mr_offset -= pageoff;     /* start of page */
+       seg1->mr_len += pageoff;        /* first seg now covers the leading pad */
+       len = -pageoff;                 /* so the running total excludes that pad */
+       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+               *nsegs = RPCRDMA_MAX_DATA_SEGS; /* keeps i within physaddrs[] */
+       for (i = 0; i < *nsegs;) {      /* i is advanced inside the body */
+               rpcrdma_map_one(ia, seg, writing);
+               physaddrs[i] = seg->mr_dma;
+               len += seg->mr_len;
+               ++seg;
+               ++i;
+               /* Check for holes: stop at a seam that is not page-aligned */
+               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+                   offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+                       break;
+       }
+       rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+                               physaddrs, i, seg1->mr_dma);
+       if (rc) {
+               dprintk("RPC:       %s: failed ib_map_phys_fmr "
+                       "%u@0x%llx+%i (%d)... status %i\n", __func__,
+                       len, (unsigned long long)seg1->mr_dma,
+                       pageoff, i, rc);
+               while (i--)     /* undo every mapping made above */
+                       rpcrdma_unmap_one(ia, --seg);
+       } else {
+               seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+               seg1->mr_base = seg1->mr_dma + pageoff;
+               seg1->mr_nsegs = i;
+               seg1->mr_len = len;
+       }
+       *nsegs = i;     /* segments mapped; NOTE(review): -1 after the unwind */
+       return rc;
+}
+
+static int     /* rc of ib_unmap_fmr() */
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+                       struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg of the chunk */
+       LIST_HEAD(l);   /* local list holding just this chunk's fmr */
+       int rc;
+
+       list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
+       rc = ib_unmap_fmr(&l);
+       while (seg1->mr_nsegs--)        /* segs are unmapped even if rc != 0 */
+               rpcrdma_unmap_one(ia, seg++);
+       if (rc)         /* logged only; rc is handed back to the caller */
+               dprintk("RPC:       %s: failed ib_unmap_fmr,"
+                       " status %i\n", __func__, rc);
+       return rc;
+}
+
+static int     /* rc of ib_bind_mw(); nonzero is treated as failure */
+rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+                       int *nsegs, int writing, struct rpcrdma_ia *ia,
+                       struct rpcrdma_xprt *r_xprt)
+{
+       int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+                                 IB_ACCESS_REMOTE_READ);
+       struct ib_mw_bind param;
+       int rc;
+
+       *nsegs = 1;     /* exactly one segment is bound per call */
+       rpcrdma_map_one(ia, seg, writing);
+       param.mr = ia->ri_bind_mem;
+       param.wr_id = 0ULL;     /* no send cookie */
+       param.addr = seg->mr_dma;
+       param.length = seg->mr_len;
+       param.send_flags = 0;   /* IB_SEND_SIGNALED is not set */
+       param.mw_access_flags = mem_priv;
+
+       DECR_CQCOUNT(&r_xprt->rx_ep);
+       rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+       if (rc) {
+               dprintk("RPC:       %s: failed ib_bind_mw "
+                       "%u@0x%llx status %i\n",
+                       __func__, seg->mr_len,
+                       (unsigned long long)seg->mr_dma, rc);
+               rpcrdma_unmap_one(ia, seg);     /* undo the mapping made above */
+       } else {
+               seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+               seg->mr_base = param.addr;      /* i.e. seg->mr_dma */
+               seg->mr_nsegs = 1;
+       }
+       return rc;
+}
+
+static int     /* rc of ib_bind_mw(); *r is cleared only when rc == 0 */
+rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+                       struct rpcrdma_ia *ia,
+                       struct rpcrdma_xprt *r_xprt, void **r)
+{
+       struct ib_mw_bind param;
+       LIST_HEAD(l);   /* NOTE(review): never referenced in this function */
+       int rc;
+
+       BUG_ON(seg->mr_nsegs != 1);     /* the memwin register path sets 1 */
+       param.mr = ia->ri_bind_mem;
+       param.addr = 0ULL;      /* unbind */
+       param.length = 0;
+       param.mw_access_flags = 0;
+       if (*r) {       /* cookie supplied: post signaled, wr_id carries *r */
+               param.wr_id = (u64) (unsigned long) *r;
+               param.send_flags = IB_SEND_SIGNALED;
+               INIT_CQCOUNT(&r_xprt->rx_ep);
+       } else {        /* no cookie: post unsignaled */
+               param.wr_id = 0ULL;
+               param.send_flags = 0;
+               DECR_CQCOUNT(&r_xprt->rx_ep);
+       }
+       rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+       rpcrdma_unmap_one(ia, seg);     /* unmapped whether or not rc is 0 */
+       if (rc)
+               dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+                       " status %i\n", __func__, rc);
+       else
+               *r = NULL;      /* will upcall on completion */
+       return rc;
+}
+
+static int     /* 0, or PTR_ERR() of the ib_reg_phys_mr() result */
+rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+                       int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+       int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+                                 IB_ACCESS_REMOTE_READ);
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg; results land here */
+       struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];  /* one entry per seg */
+       int len, i, rc = 0;
+
+       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+               *nsegs = RPCRDMA_MAX_DATA_SEGS; /* keeps i within ipb[] */
+       for (len = 0, i = 0; i < *nsegs;) {     /* i is advanced in the body */
+               rpcrdma_map_one(ia, seg, writing);
+               ipb[i].addr = seg->mr_dma;
+               ipb[i].size = seg->mr_len;
+               len += seg->mr_len;
+               ++seg;
+               ++i;
+               /* Check for holes: stop at a seam that is not page-aligned */
+               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+                   offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+                       break;
+       }
+       seg1->mr_base = seg1->mr_dma;   /* passed by address to the call below */
+       seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+                               ipb, i, mem_priv, &seg1->mr_base);
+       if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+               rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+               dprintk("RPC:       %s: failed ib_reg_phys_mr "
+                       "%u@0x%llx (%d)... status %i\n",
+                       __func__, len,
+                       (unsigned long long)seg1->mr_dma, i, rc);
+               while (i--)     /* undo every mapping made above */
+                       rpcrdma_unmap_one(ia, --seg);
+       } else {
+               seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+               seg1->mr_nsegs = i;
+               seg1->mr_len = len;
+       }
+       *nsegs = i;     /* segments mapped; NOTE(review): -1 after the unwind */
+       return rc;
+}
+
+static int     /* rc of ib_dereg_mr() */
+rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+                       struct rpcrdma_ia *ia)
+{
+       struct rpcrdma_mr_seg *seg1 = seg;      /* first seg of the chunk */
+       int rc;
+
+       rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+       seg1->mr_chunk.rl_mr = NULL;    /* cleared whether or not rc is 0 */
+       while (seg1->mr_nsegs--)        /* segs are unmapped even if rc != 0 */
+               rpcrdma_unmap_one(ia, seg++);
+       if (rc)         /* logged only; rc is handed back to the caller */
+               dprintk("RPC:       %s: failed ib_dereg_mr,"
+                       " status %i\n", __func__, rc);
+       return rc;
+}
+
 int
 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                        int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
-                                 IB_ACCESS_REMOTE_READ);
-       struct rpcrdma_mr_seg *seg1 = seg;
-       int i;
        int rc = 0;
 
        switch (ia->ri_memreg_strategy) {
@@ -1352,114 +1750,25 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                break;
 #endif
 
-       /* Registration using fast memory registration */
+       /* Registration using fast registration memory regions (FRMR) */
+       case RPCRDMA_FRMR:
+               rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+               break;
+
+       /* Registration using fast memory regions (FMR) */
        case RPCRDMA_MTHCAFMR:
-               {
-               u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-               int len, pageoff = offset_in_page(seg->mr_offset);
-               seg1->mr_offset -= pageoff;     /* start of page */
-               seg1->mr_len += pageoff;
-               len = -pageoff;
-               if (nsegs > RPCRDMA_MAX_DATA_SEGS)
-                       nsegs = RPCRDMA_MAX_DATA_SEGS;
-               for (i = 0; i < nsegs;) {
-                       rpcrdma_map_one(ia, seg, writing);
-                       physaddrs[i] = seg->mr_dma;
-                       len += seg->mr_len;
-                       ++seg;
-                       ++i;
-                       /* Check for holes */
-                       if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
-                           offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
-                               break;
-               }
-               nsegs = i;
-               rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
-                                       physaddrs, nsegs, seg1->mr_dma);
-               if (rc) {
-                       dprintk("RPC:       %s: failed ib_map_phys_fmr "
-                               "%u@0x%llx+%i (%d)... status %i\n", __func__,
-                               len, (unsigned long long)seg1->mr_dma,
-                               pageoff, nsegs, rc);
-                       while (nsegs--)
-                               rpcrdma_unmap_one(ia, --seg);
-               } else {
-                       seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
-                       seg1->mr_base = seg1->mr_dma + pageoff;
-                       seg1->mr_nsegs = nsegs;
-                       seg1->mr_len = len;
-               }
-               }
+               rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
                break;
 
        /* Registration using memory windows */
        case RPCRDMA_MEMWINDOWS_ASYNC:
        case RPCRDMA_MEMWINDOWS:
-               {
-               struct ib_mw_bind param;
-               rpcrdma_map_one(ia, seg, writing);
-               param.mr = ia->ri_bind_mem;
-               param.wr_id = 0ULL;     /* no send cookie */
-               param.addr = seg->mr_dma;
-               param.length = seg->mr_len;
-               param.send_flags = 0;
-               param.mw_access_flags = mem_priv;
-
-               DECR_CQCOUNT(&r_xprt->rx_ep);
-               rc = ib_bind_mw(ia->ri_id->qp,
-                                       seg->mr_chunk.rl_mw->r.mw, &param);
-               if (rc) {
-                       dprintk("RPC:       %s: failed ib_bind_mw "
-                               "%u@0x%llx status %i\n",
-                               __func__, seg->mr_len,
-                               (unsigned long long)seg->mr_dma, rc);
-                       rpcrdma_unmap_one(ia, seg);
-               } else {
-                       seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
-                       seg->mr_base = param.addr;
-                       seg->mr_nsegs = 1;
-                       nsegs = 1;
-               }
-               }
+               rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
                break;
 
        /* Default registration each time */
        default:
-               {
-               struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
-               int len = 0;
-               if (nsegs > RPCRDMA_MAX_DATA_SEGS)
-                       nsegs = RPCRDMA_MAX_DATA_SEGS;
-               for (i = 0; i < nsegs;) {
-                       rpcrdma_map_one(ia, seg, writing);
-                       ipb[i].addr = seg->mr_dma;
-                       ipb[i].size = seg->mr_len;
-                       len += seg->mr_len;
-                       ++seg;
-                       ++i;
-                       /* Check for holes */
-                       if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
-                           offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
-                               break;
-               }
-               nsegs = i;
-               seg1->mr_base = seg1->mr_dma;
-               seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
-                                       ipb, nsegs, mem_priv, &seg1->mr_base);
-               if (IS_ERR(seg1->mr_chunk.rl_mr)) {
-                       rc = PTR_ERR(seg1->mr_chunk.rl_mr);
-                       dprintk("RPC:       %s: failed ib_reg_phys_mr "
-                               "%u@0x%llx (%d)... status %i\n",
-                               __func__, len,
-                               (unsigned long long)seg1->mr_dma, nsegs, rc);
-                       while (nsegs--)
-                               rpcrdma_unmap_one(ia, --seg);
-               } else {
-                       seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
-                       seg1->mr_nsegs = nsegs;
-                       seg1->mr_len = len;
-               }
-               }
+               rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
                break;
        }
        if (rc)
@@ -1473,7 +1782,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
                struct rpcrdma_xprt *r_xprt, void *r)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mr_seg *seg1 = seg;
        int nsegs = seg->mr_nsegs, rc;
 
        switch (ia->ri_memreg_strategy) {
@@ -1486,56 +1794,21 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
                break;
 #endif
 
+       case RPCRDMA_FRMR:
+               rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+               break;
+
        case RPCRDMA_MTHCAFMR:
-               {
-               LIST_HEAD(l);
-               list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
-               rc = ib_unmap_fmr(&l);
-               while (seg1->mr_nsegs--)
-                       rpcrdma_unmap_one(ia, seg++);
-               }
-               if (rc)
-                       dprintk("RPC:       %s: failed ib_unmap_fmr,"
-                               " status %i\n", __func__, rc);
+               rc = rpcrdma_deregister_fmr_external(seg, ia);
                break;
 
        case RPCRDMA_MEMWINDOWS_ASYNC:
        case RPCRDMA_MEMWINDOWS:
-               {
-               struct ib_mw_bind param;
-               BUG_ON(nsegs != 1);
-               param.mr = ia->ri_bind_mem;
-               param.addr = 0ULL;      /* unbind */
-               param.length = 0;
-               param.mw_access_flags = 0;
-               if (r) {
-                       param.wr_id = (u64) (unsigned long) r;
-                       param.send_flags = IB_SEND_SIGNALED;
-                       INIT_CQCOUNT(&r_xprt->rx_ep);
-               } else {
-                       param.wr_id = 0ULL;
-                       param.send_flags = 0;
-                       DECR_CQCOUNT(&r_xprt->rx_ep);
-               }
-               rc = ib_bind_mw(ia->ri_id->qp,
-                               seg->mr_chunk.rl_mw->r.mw, &param);
-               rpcrdma_unmap_one(ia, seg);
-               }
-               if (rc)
-                       dprintk("RPC:       %s: failed ib_(un)bind_mw,"
-                               " status %i\n", __func__, rc);
-               else
-                       r = NULL;       /* will upcall on completion */
+               rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
                break;
 
        default:
-               rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
-               seg1->mr_chunk.rl_mr = NULL;
-               while (seg1->mr_nsegs--)
-                       rpcrdma_unmap_one(ia, seg++);
-               if (rc)
-                       dprintk("RPC:       %s: failed ib_dereg_mr,"
-                               " status %i\n", __func__, rc);
+               rc = rpcrdma_deregister_default_external(seg, ia);
                break;
        }
        if (r) {
index 2427822..c7a7eba 100644 (file)
@@ -51,6 +51,9 @@
 #include <linux/sunrpc/rpc_rdma.h>     /* RPC/RDMA protocol */
 #include <linux/sunrpc/xprtrdma.h>     /* xprt parameters */
 
+#define RDMA_RESOLVE_TIMEOUT   (5000)  /* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX (2)     /* retries if no listener backlog */
+
 /*
  * Interface Adapter -- one per transport instance
  */
@@ -58,6 +61,8 @@ struct rpcrdma_ia {
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
        struct ib_mr            *ri_bind_mem;
+       u32                     ri_dma_lkey;
+       int                     ri_have_dma_lkey;
        struct completion       ri_done;
        int                     ri_async_rc;
        enum rpcrdma_memreg     ri_memreg_strategy;
@@ -156,6 +161,10 @@ struct rpcrdma_mr_seg {            /* chunk descriptors */
                        union {
                                struct ib_mw    *mw;
                                struct ib_fmr   *fmr;
+                               struct {
+                                       struct ib_fast_reg_page_list *fr_pgl;
+                                       struct ib_mr *fr_mr;
+                               } frmr;
                        } r;
                        struct list_head mw_list;
                } *rl_mw;
@@ -175,6 +184,7 @@ struct rpcrdma_req {
        size_t          rl_size;        /* actual length of buffer */
        unsigned int    rl_niovs;       /* 0, 2 or 4 */
        unsigned int    rl_nchunks;     /* non-zero if chunks */
+       unsigned int    rl_connect_cookie;      /* retry detection */
        struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -198,7 +208,7 @@ struct rpcrdma_buffer {
        atomic_t        rb_credits;     /* most recent server credits */
        unsigned long   rb_cwndscale;   /* cached framework rpc_cwndscale */
        int             rb_max_requests;/* client max requests */
-       struct list_head rb_mws;        /* optional memory windows/fmrs */
+       struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
        int             rb_send_index;
        struct rpcrdma_req      **rb_send_bufs;
        int             rb_recv_index;
@@ -273,6 +283,11 @@ struct rpcrdma_xprt {
 #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
 
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
 /*
  * Interface Adapter calls - xprtrdma/verbs.c
  */