

---

 linux-2.6.14-pnfs-current-dhildebz/fs/nfs/Makefile            |    3 
 linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.c    |  469 ++++++++++
 linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.h    |   96 ++
 linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayoutdev.c |  406 ++++++++
 4 files changed, 974 insertions(+)

diff -puN /dev/null fs/nfs/nfs4filelayout.c
--- /dev/null	2006-01-09 05:56:56.224752500 -0500
+++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.c	2006-01-18 20:12:38.047788000 -0500
@@ -0,0 +1,469 @@
+/*
+ *  linux/fs/nfs/nfs4filelayout.c
+ *
+ *  Module for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dean Hildebrand <dhildebz@eecs.umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs4_pnfs.h>
+
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@eecs.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+extern void nfs_execute_read(struct nfs_read_data *data);
+extern void nfs_readdata_release(struct rpc_task *task);
+extern int nfs_flush_task_priority(int how);
+extern void nfs_writedata_release(struct rpc_task *task);
+extern void nfs_execute_write(struct nfs_write_data *data);
+extern void nfs_commit_rpcsetup(struct nfs_write_data *data, int sync);
+
+/* Callback operations to the pNFS client */
+struct pnfs_client_operations * pnfs_callback_ops;
+
+/* Initialize a mountpoint by retrieving the list of available devices
+ * for it.  Returns the pnfs_mount_type the pNFS client uses to refer to
+ * this mount point later on, or NULL on any failure; every allocation
+ * made here is released again on the failure paths.
+ */
+struct pnfs_mount_type*
+filelayout_initialize_mountpoint(struct super_block* sb)
+{
+    struct filelayout_mount_type* fl_mt;
+    struct pnfs_mount_type* mt = NULL;
+    struct pnfs_devicelist dlist;
+
+    fl_mt = kmalloc(sizeof(*fl_mt), GFP_KERNEL);
+    if (!fl_mt)
+	return NULL;
+    fl_mt->fl_sb = sb;
+    /* Initialize nfs4 file layout specific device list structure */
+    fl_mt->hlist = kmalloc(sizeof(*fl_mt->hlist), GFP_KERNEL);
+    if (!fl_mt->hlist)
+	goto out_free;
+    if (nfs4_pnfs_devlist_init(fl_mt->hlist))
+	goto out_free;
+
+    mt = kmalloc(sizeof(*mt), GFP_KERNEL);
+    if (!mt)
+	goto out_free;
+    mt->mountid = (void*)fl_mt;
+
+    /* Retrieve the device list from the server, then decode the opaque
+     * list and add each entry to the available devices (data servers).
+     */
+    if (pnfs_callback_ops->nfs_getdevicelist(sb, &dlist) ||
+	decode_and_add_devicelist(fl_mt, &dlist))
+	goto out_destroy;
+    return mt;
+out_destroy:
+    nfs4_pnfs_devlist_destroy(fl_mt->hlist);
+out_free:
+    kfree(mt);
+    kfree(fl_mt->hlist);
+    kfree(fl_mt);
+    return NULL;
+}
+
+/* Uninitialize a mountpoint: destroy its device list and free the mount
+ * structures (a NULL mountid is tolerated). */
+int
+filelayout_uninitialize_mountpoint(struct pnfs_mount_type* mountid)
+{
+    struct filelayout_mount_type* fl_mt;
+    if (!mountid)
+	return 0;
+    fl_mt = (struct filelayout_mount_type*)mountid->mountid;
+    if (fl_mt != NULL) {
+	nfs4_pnfs_devlist_destroy(fl_mt->hlist);
+	kfree(fl_mt->hlist); /* the destroy call frees only the entries */
+	kfree(fl_mt);
+    }
+    kfree(mountid);
+    return 0;
+}
+
+/* Perform sync or async reads.
+ *
+ * As an optimization for the NFS file layout driver, the pNFS client
+ * passes the nfs_read_data it has already created through the last
+ * argument (void*); there are too many NFS specific fields in the
+ * read/write data structs to pass them to layout drivers one by one.
+ *
+ * If no data server can be found for the byte range, 'data' is left
+ * untouched and the read is still issued with whatever client and
+ * filehandle the caller set up.
+ *
+ * TODO:
+ * 1. This is a lot of arguments, create special non-nfs-specific structure?
+ */
+ssize_t filelayout_read_pagelist(
+    struct pnfs_layout_type * layoutid,
+    struct inode * inode,
+    struct page **pages,
+    unsigned int pgbase,
+    unsigned nr_pages,
+    loff_t offset,
+    size_t count,
+    void* read_data)
+{
+    struct nfs_read_data *data = (struct nfs_read_data*)read_data;
+    struct nfs4_filelayout* nfslay = (struct nfs4_filelayout*)layoutid->layoutid;
+    struct nfs4_pnfs_dserver dserver;
+    int status;
+
+    /* Retrieve the correct rpc_client for the byte range */
+    status = nfs4_pnfs_dserver_get(inode,
+				   nfslay,
+				   offset,
+				   count,
+				   &dserver);
+    if(!status) {
+	data->pnfs_client = dserver.dev_item->rpc_clnt;
+	data->args.fh = dserver.fh;
+    }
+
+    /* Perform a synchronous or asynchronous read */
+    if (IS_SYNC(inode))
+    {
+	/* sync */
+	status = NFS_PROTO(inode)->read(data);
+    }
+    else
+    {
+	/* async */
+	NFS_PROTO(inode)->read_setup(data);
+	data->task.tk_cookie = (unsigned long)inode;
+	data->task.tk_calldata = data;
+	/* Release requests */
+	data->task.tk_release = nfs_readdata_release;
+
+	dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+		data->task.tk_pid,
+		inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode),
+		(unsigned int)count, /* %u expects unsigned int, count is size_t */
+		(unsigned long long)data->args.offset);
+	nfs_execute_read(data);
+	status = 0;
+    }
+    return status;
+}
+
+/* Perform sync or async writes.
+ *
+ * TODO: See filelayout_read_pagelist.
+ */
+ssize_t filelayout_write_pagelist(
+    struct pnfs_layout_type * layoutid,
+    struct inode * inode,
+    struct page **pages,
+    unsigned int pgbase,
+    unsigned nr_pages,
+    loff_t offset,
+    size_t count,
+    int sync,
+    void* write_data)
+{
+    struct nfs_write_data *data = (struct nfs_write_data*)write_data;
+    struct nfs4_filelayout* nfslay = (struct nfs4_filelayout*)layoutid->layoutid;
+    struct nfs4_pnfs_dserver dserver;
+    int status;
+
+    /* Retrieve the correct rpc_client for the byte range */
+    status = nfs4_pnfs_dserver_get(inode,
+				   nfslay,
+				   offset,
+				   count,
+				   &dserver);
+    if(!status) {
+	data->pnfs_client = dserver.dev_item->rpc_clnt;
+	data->args.fh = dserver.fh;
+    }
+
+    /* Perform a synchronous or asynchronous write */
+    /* TODO: What is the difference between file and data sync? */
+    if (sync == NFS_FILE_SYNC || sync == NFS_DATA_SYNC)
+    {
+	/* sync */
+	status = NFS_PROTO(inode)->write(data);
+    }
+    else
+    {
+	/* async */
+	NFS_PROTO(inode)->write_setup(data, sync);
+	data->task.tk_priority = nfs_flush_task_priority(sync);
+	data->task.tk_cookie = (unsigned long)inode;
+	data->task.tk_calldata = data;
+	/* Release requests */
+	data->task.tk_release = nfs_writedata_release;
+
+	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+		data->task.tk_pid,
+		inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode),
+		(unsigned int)count, /* %u expects unsigned int, count is size_t */
+		(unsigned long long)data->args.offset);
+	nfs_execute_write(data);
+	status = 0;
+    }
+    return status;
+}
+
+/* Create a filelayout layout structure, or return NULL if allocation fails.
+ * The pNFS client refers to this inode's layout via the returned struct.
+ */
+struct pnfs_layout_type*
+filelayout_alloc_layout(struct pnfs_mount_type * mountid, struct inode * inode)
+{
+    struct pnfs_layout_type* pnfslay;
+    struct nfs4_filelayout* nfslay;
+
+    pnfslay = kmalloc(sizeof(*pnfslay), GFP_KERNEL);
+    if (!pnfslay)
+	return NULL;
+    nfslay = kmalloc(sizeof(*nfslay), GFP_KERNEL);
+    if (!nfslay) {
+	kfree(pnfslay); /* do not leak the wrapper */
+	return NULL;
+    }
+    pnfslay->layoutid = (void*)nfslay;
+    pnfslay->mountid = mountid;
+    return pnfslay;
+}
+
+/* Release a filelayout layout structure together with the
+ * driver-private nfs4_filelayout hanging off it.
+ */
+void
+filelayout_free_layout(struct pnfs_layout_type * layoutid, struct inode * inode)
+{
+    if (layoutid != NULL) {
+	/* kfree() accepts NULL, so the private part needs no extra test */
+	kfree(layoutid->layoutid);
+    }
+    kfree(layoutid);
+}
+
+/* Decode the XDR layout in 'layout' into the nfs4_filelayout attached to
+ * 'layoutid', overwriting any existing layout information for this file.
+ * Returns layoutid, or NULL for missing arguments, a layout class other
+ * than LAYOUT_NFSV4_FILES, or a decoded size that exceeds its buffer. */
+struct pnfs_layout_type*
+filelayout_set_layout(struct pnfs_layout_type* layoutid,
+		      struct inode* inode,
+		      void* layout)
+{
+    struct nfs4_filelayout* fl;
+    unsigned int i;
+    uint32_t *p = (uint32_t*)layout, class;
+
+    dprintk("%s set_layout_map Begin\n", __FUNCTION__);
+
+    if (!layoutid || !layoutid->layoutid)
+	goto nfserr;
+    fl = (struct nfs4_filelayout*)layoutid->layoutid;
+
+    /* Decode the layout here */
+    READ64(fl->offset);
+    READ64(fl->length);
+    READ32(fl->iomode);
+    READ32(class); /* class */
+
+    dprintk("DEBUG: %s: class %d\n", __func__, class);
+    if (class != LAYOUT_NFSV4_FILES)
+	goto nfserr;
+
+    /* layout */
+    READ32(fl->stripe_type);
+    READ64(fl->stripe_unit);
+    READ64(fl->file_size);
+    READ32(fl->num_devs);
+
+    dprintk("DEBUG: %s: stripe_unit %lld file_size %lld devs %d\n",
+	   __func__, fl->stripe_unit, fl->file_size, fl->num_devs);
+    if (fl->num_devs > NFS4_PNFS_MAX_DEVS) /* devs[] has a fixed size */
+	goto nfserr;
+    for (i = 0; i < fl->num_devs; i++) {
+	/* dev_id */
+	READ32(fl->devs[i].dev_id);
+	/* fh: the size comes off the wire, so bound it before copying */
+	memset(&fl->devs[i].fh, 0, sizeof(struct nfs_fh));
+	READ32(fl->devs[i].fh.size);
+	if (fl->devs[i].fh.size > sizeof(fl->devs[i].fh.data))
+	    goto nfserr;
+	COPYMEM(fl->devs[i].fh.data, fl->devs[i].fh.size);
+	dprintk("DEBUG: %s: dev %d len %d\n", __func__,
+		fl->devs[i].dev_id,fl->devs[i].fh.size);
+    }
+    return layoutid;
+nfserr:
+    return NULL;
+}
+
+/* fsync entry point: hand the request to the generic NFS fsync callback,
+ * which flushes the buffers and eventually calls back into
+ * filelayout_write_pagelist and filelayout_commit.  'layoutid' is unused.
+ */
+int
+filelayout_fsync(struct pnfs_layout_type * layoutid, struct file *file,
+		 struct dentry *dentry, int datasync)
+{
+    int status = pnfs_callback_ops->nfs_fsync(file, dentry, datasync);
+
+    return status;
+}
+
+/* Technically a COMMIT op is needed for each data server on which a page
+ * in 'pages' exists; with a cluster file system one commit to the MDS
+ * suffices, so a single commit RPC is set up and executed here. */
+int
+filelayout_commit(struct pnfs_layout_type * layoutid, struct inode* ino,
+		  struct list_head *pages, int sync, void* private)
+{
+    struct nfs_write_data *wdata = private;
+
+    nfs_commit_rpcsetup(wdata, sync);
+    nfs_execute_write(wdata);
+    return 0;
+}
+
+/* Report the stripe unit of the layout attached to 'layoutid'.
+ */
+ssize_t
+filelayout_get_stripesize(struct pnfs_layout_type* layoutid, struct inode* inode)
+{
+    const struct nfs4_filelayout* layout = layoutid->layoutid;
+
+    return (ssize_t)layout->stripe_unit;
+}
+
+/* Policy hook, always returns 0: per the original note, wsize/rsize
+ * chunks are split so they do not span multiple data servers. */
+int
+filelayout_gather_across_stripes(struct pnfs_mount_type* mountid)
+{
+    return 0;
+}
+
+/* Policy hook, always returns 1: use the NFSv4 page cache
+*/
+int
+filelayout_use_pagecache(struct pnfs_layout_type* layoutid, struct inode* inode)
+{
+    return 1;
+}
+
+/* Policy hook, always returns 1: issue a layoutget in the same compound as OPEN
+ */
+int
+filelayout_layoutget_on_open(struct pnfs_mount_type* mountid)
+{
+    return 1;
+}
+
+struct layoutdriver_io_operations filelayout_io_operations = /* I/O entry points, referenced by filelayout_type.ld_io_ops */
+{
+    .fsync                   = filelayout_fsync,
+    .commit                  = filelayout_commit,
+    .read_pagelist           = filelayout_read_pagelist,
+    .write_pagelist          = filelayout_write_pagelist,
+    .set_layout              = filelayout_set_layout,
+    .alloc_layout            = filelayout_alloc_layout,
+    .free_layout             = filelayout_free_layout,
+    .initialize_mountpoint   = filelayout_initialize_mountpoint,
+    .uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
+};
+
+struct layoutdriver_policy_operations filelayout_policy_operations = /* policy hooks, referenced by filelayout_type.ld_policy_ops */
+{
+    .get_stripesize        = filelayout_get_stripesize,
+    .gather_across_stripes = filelayout_gather_across_stripes, /* constant 0 */
+    .use_pagecache         = filelayout_use_pagecache,         /* constant 1 */
+    .layoutget_on_open     = filelayout_layoutget_on_open,     /* constant 1 */
+};
+
+
+struct pnfs_layoutdriver_type filelayout_type = /* descriptor passed to pnfs_register_layoutdriver() at module init */
+{
+    .id = LAYOUT_NFSV4_FILES, /* same constant set_layout checks the decoded class against */
+    .name = "LAYOUT_NFSV4_FILES",
+    .ld_io_ops = &filelayout_io_operations,
+    .ld_policy_ops = &filelayout_policy_operations,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+    printk("%s: NFSv4 File Layout Driver Registering...\n", __FUNCTION__);
+
+    /* Register with the pNFS client so the NFSv4 file layout becomes a
+     * possible pNFS I/O module; the callback ops it hands back are
+     * dereferenced later, so a NULL result must fail the module load.
+     */
+    pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type);
+    return pnfs_callback_ops ? 0 : -ENODEV;
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+    printk("%s: NFSv4 File Layout Driver Unregistering...\n", __FUNCTION__);
+
+    /* Unregister the NFSv4 file layout driver from the pNFS client */
+    pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ * End:
+ *
+ * vim: ts=8 sts=4 sw=4 expandtab
+ */
diff -puN fs/nfs/Makefile~client-nfs4filelayoutdriver fs/nfs/Makefile
--- linux-2.6.14-pnfs-current/fs/nfs/Makefile~client-nfs4filelayoutdriver	2006-01-18 20:11:57.168279000 -0500
+++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/Makefile	2006-01-18 20:11:57.190280000 -0500
@@ -15,3 +15,6 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4x
 			   pnfs.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-objs		:= $(nfs-y)
+
+obj-m += nfslayoutdriver.o
+nfslayoutdriver-objs := nfs4filelayout.o nfs4filelayoutdev.o
diff -puN /dev/null fs/nfs/nfs4filelayout.h
--- /dev/null	2006-01-09 05:56:56.224752500 -0500
+++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.h	2006-01-18 20:11:57.196280000 -0500
@@ -0,0 +1,96 @@
+/*
+ *  nfs4filelayout.h
+ *
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dean Hildebrand   <dhildebz@eecs.umich.edu>
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include <linux/nfs4_pnfs.h>
+
+#define NFS4_PNFS_DEV_HASH_BITS 5
+#define NFS4_PNFS_DEV_HASH (1 << NFS4_PNFS_DEV_HASH_BITS)
+
+#define NFS4_PNFS_MAX_DEVS 16
+
+struct nfs4_pnfs_dev_item {	/* one known data server of a mount point */
+	struct hlist_node hash_node;	/* link in a nfs4_pnfs_dev_hlist bucket */
+	u32 dev_id;			/* lookup key; hashed to pick the bucket */
+	u32 ip_addr;			/* IPv4 address, stored via htonl() */
+	u32 port;			/* TCP port, stored via htons() */
+	atomic_t count;			/* set to 0 at decode; inc/dec users are disabled or unused */
+	struct rpc_clnt *rpc_clnt;	/* rpc client stored by device_create() */
+};
+
+struct nfs4_pnfs_dev_hlist {	/* per-mount hash table of known devices */
+	rwlock_t          dev_lock;	/* taken around lookups and insertions */
+	struct hlist_head dev_list[NFS4_PNFS_DEV_HASH];	/* buckets indexed by hash_long(dev_id) */
+};
+
+struct nfs4_pnfs_devaddr {	/* NOTE(review): not referenced by the code visible in this patch */
+	u32 dev_id;
+	u32 ip;
+	u16 port;		/* u16 here, while nfs4_pnfs_dev_item.port is u32 */
+};
+
+struct nfs4_pnfs_devlist {	/* NOTE(review): not referenced by the code visible in this patch */
+	struct list_head         devlist;	/* presumably links entries into a list -- confirm */
+	struct nfs4_pnfs_devaddr devaddr;
+};
+
+struct nfs4_pnfs_dserver {	/* result filled in by nfs4_pnfs_dserver_get() */
+	struct nfs_fh        *fh;	/* points at the fh stored in the layout's devs[] entry */
+	struct nfs4_pnfs_dev_item *dev_item;	/* device chosen for the byte range */
+};
+
+struct nfs4_filelayout_devs {	/* one per-device entry decoded by filelayout_set_layout() */
+	u32 dev_id;		/* id used to look the device up */
+	struct nfs_fh fh;	/* filehandle decoded for this device */
+};
+
+struct nfs4_filelayout {	/* driver-private layout, filled by filelayout_set_layout() */
+	int uncommitted_write;	/* not written by the code visible in this patch */
+	loff_t last_commit_size;	/* not written by the code visible in this patch */
+	u64 layout_id;		/* not written by the code visible in this patch */
+	u64 offset;		/* decoded first from the layout */
+	u64 length;
+	u32 iomode;
+	u64 file_size;
+	u32 stripe_type;
+	u64 stripe_unit;	/* divisor used to pick the stripe for an offset */
+	unsigned int num_devs;	/* number of devs[] entries decoded */
+	struct nfs4_filelayout_devs devs[NFS4_PNFS_MAX_DEVS];	/* fixed capacity */
+};
+
+struct filelayout_mount_type {	/* driver-private data behind pnfs_mount_type.mountid */
+	struct super_block* fl_sb;	/* superblock passed to initialize_mountpoint */
+	struct nfs4_pnfs_dev_hlist *hlist;	/* device table, kmalloc'd at mount init */
+};
+
+int  nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
+void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
+
+int nfs4_pnfs_dserver_get(struct inode *inode,
+			  struct nfs4_filelayout *layout,
+			  u64 offset,
+			  u32 count,
+			  struct nfs4_pnfs_dserver *dserver);
+int decode_and_add_devicelist(struct filelayout_mount_type *mt, struct pnfs_devicelist* devlist);
+
+#define READ32(x)         (x) = ntohl(*p++) /* all three macros read at the caller's local 'uint32_t *p' and advance it */
+#define READ64(x)         do {			\
+	(x) = (u64)ntohl(*p++) << 32;		\
+	(x) |= ntohl(*p++);			\
+} while (0)
+#define COPYMEM(x,nbytes) do {			\
+	memcpy((x), p, nbytes);			\
+	p += XDR_QUADLEN(nbytes);		\
+} while (0)
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff -puN /dev/null fs/nfs/nfs4filelayoutdev.c
--- /dev/null	2006-01-09 05:56:56.224752500 -0500
+++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayoutdev.c	2006-01-18 20:11:57.203279000 -0500
@@ -0,0 +1,406 @@
+/*
+ *  linux/fs/nfs/nfs4filelayoutdev.c
+ *
+ *  Module for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dean Hildebrand <dhildebz@eecs.umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/config.h>
+#include <linux/completion.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include <asm/div64.h>
+
+#include "nfs4filelayout.h"
+
+extern struct pnfs_client_operations * pnfs_callback_ops;
+
+struct rpc_clnt* create_nfs_rpcclient(struct rpc_xprt *xprt,
+				      char* server_name,
+				      u32 version,
+				      rpc_authflavor_t authflavor,
+				      int *err);
+
+/* Find the device with id 'dev_id' in its hash bucket; NULL when absent.
+ * Assumes lock is held */
+static inline struct nfs4_pnfs_dev_item *
+_device_lookup(struct nfs4_pnfs_dev_hlist *hlist, u32 dev_id)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *pos;
+
+	printk("_device_lookup: dev_id=%u\n", dev_id);
+
+	bucket = &hlist->dev_list[hash_long(dev_id, NFS4_PNFS_DEV_HASH_BITS)];
+	hlist_for_each(pos, bucket) {
+		struct nfs4_pnfs_dev_item *cur;
+
+		cur = hlist_entry(pos, struct nfs4_pnfs_dev_item, hash_node);
+		if (cur->dev_id == dev_id)
+			return cur;
+	}
+	return NULL;
+}
+
+/* Insert 'dev' at the head of its dev_id hash bucket.  Assumes lock is held */
+static inline void
+_device_add(struct nfs4_pnfs_dev_hlist *hlist, struct nfs4_pnfs_dev_item *dev)
+{
+	unsigned long      hash;
+
+	printk("_device_add: dev_id=%u, ip=%x, port=%hu\n", dev->dev_id,
+		   dev->ip_addr, dev->port);
+
+	hash = hash_long(dev->dev_id, NFS4_PNFS_DEV_HASH_BITS);
+	hlist_add_head(&dev->hash_node, &hlist->dev_list[hash]);
+}
+
+/* Create an rpc client to the data server in 'dev'; 0 on success, else an error */
+static int
+device_create(struct rpc_clnt *mds_rpc, struct nfs4_pnfs_dev_item *dev)
+{
+	struct rpc_clnt      *clnt;
+	struct rpc_xprt      *xprt;
+	struct sockaddr_in    sin = { .sin_family = AF_INET }; /* rest zeroed */
+	int err = 0;
+
+	sin.sin_addr.s_addr = dev->ip_addr;
+	sin.sin_port = dev->port;
+
+	printk("device_create: dev_id=%u, ip=%x, port=%hu\n", dev->dev_id, dev->ip_addr, dev->port);
+
+	xprt = xprt_create_proto(IPPROTO_TCP, &sin,
+				 &mds_rpc->cl_xprt->timeout);
+	if (IS_ERR(xprt)) {
+		err = PTR_ERR(xprt);
+		goto out;
+	}
+
+	clnt = create_nfs_rpcclient(xprt, "nfs4_pnfs_dserver", mds_rpc->cl_vers, mds_rpc->cl_auth->au_flavor, &err);
+	if (clnt == NULL) {
+		printk("%s: Can't create nfs rpc client!\n", __FUNCTION__);
+		if (!err)
+			err = -EIO; /* never report success without a client */
+		goto out;
+	}
+	dev->rpc_clnt = clnt;
+
+ out:
+	return err;
+}
+
+static void
+device_destroy(struct nfs4_pnfs_dev_item *dev) /* shuts down dev's rpc client, then frees dev */
+{
+	if (!dev)
+		return;
+/*	BUG_ON(!atomic_sub_and_test(0, &dev->count)); -- disabled: dev->count is not maintained yet */
+	rpc_shutdown_client(dev->rpc_clnt);
+	kfree(dev);
+}
+
+int
+nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
+{
+	struct hlist_head *bucket = hlist->dev_list;
+	struct hlist_head *end = bucket + NFS4_PNFS_DEV_HASH;
+
+	/* Start unlocked, with every hash bucket empty; always returns 0 */
+	hlist->dev_lock = RW_LOCK_UNLOCKED;
+	for (; bucket != end; bucket++)
+		INIT_HLIST_HEAD(bucket);
+
+	return 0;
+}
+
+/* De-alloc all devices for a mount point; a NULL table is a no-op.
+ * The table itself is left for the caller to free.
+ */
+void
+nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
+{
+	int i;
+
+	if(hlist == NULL)
+		return;
+	/* No lock held, as synchronization should occur at upper levels */
+	for (i = 0; i < NFS4_PNFS_DEV_HASH; i++) {
+		struct hlist_node *np, *next;
+
+		hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
+			struct nfs4_pnfs_dev_item *dev;
+			dev = hlist_entry(np, struct nfs4_pnfs_dev_item, hash_node);
+			/* plain hlist_del: the list is rwlock-protected, not RCU */
+			hlist_del(&dev->hash_node);
+			device_destroy(dev);
+		}
+	}
+}
+
+/* Create the rpc client to the data server specified in 'dev', and add
+ * 'dev' to the list of available devices for this mount point.  If the
+ * same dev_id is already listed, 'dev' is destroyed and 0 is returned.
+ * NOTE(review): the caller cannot tell that 'dev' was freed in that case.
+ */
+static int
+nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
+		     struct nfs4_pnfs_dev_item *dev)
+{
+	struct nfs4_pnfs_dev_hlist *table = mt->hlist;
+	struct nfs_server *nfss = NFS_SB(mt->fl_sb);
+	int already_listed;
+	int status;
+
+	printk("nfs4_pnfs_device_add\n");
+
+	/* Create device */
+	status = device_create(nfss->client, dev);
+	if (status != 0)
+		return status;
+
+	/* Write lock, do lookup again, and then add device */
+	write_lock(&table->dev_lock);
+	already_listed = (_device_lookup(table, dev->dev_id) != NULL);
+	if (!already_listed)
+		_device_add(table, dev);
+	write_unlock(&table->dev_lock);
+
+	/* Cleanup, if device was recently added */
+	if (already_listed) {
+		printk(" device found, not adding (after creation)\n");
+		device_destroy(dev);
+	}
+
+	return 0;
+}
+
+/* Decode opaque device data into a new device item (no rpc client yet);
+ * returns NULL on allocation failure or an unparsable address. */
+static struct nfs4_pnfs_dev_item*
+decode_device(struct pnfs_device* dev)
+{
+	int len;
+	int tmp[6];
+	uint32_t *p = (uint32_t*)dev->dev_addr_buf;
+	struct nfs4_pnfs_dev_item* file_dev;
+	char r_addr[32]; /* holds "255.255.255.255.255.255" plus NUL */
+
+	file_dev = kmalloc(sizeof(*file_dev), GFP_KERNEL);
+	if (file_dev == NULL)
+		return NULL;
+
+	/* Initialize dev */
+	INIT_HLIST_NODE(&file_dev->hash_node);
+	atomic_set(&file_dev->count, 0);
+	file_dev->rpc_clnt = NULL;
+	file_dev->dev_id = dev->dev_id;
+
+	/* device addr -- r_netid, r_addr: check and skip r_netid */
+	READ32(len);
+	if (len != 3) /* "tcp" */
+		goto out_err;
+	/* TODO: Don't we read the tcp bytes? */
+	/* Copy r_addr so that sscanf sees a bounded, terminated string */
+	READ32(len);
+	if (len <= 0 || len >= (int)sizeof(r_addr))
+		goto out_err;
+	memcpy(r_addr, p, len);
+	r_addr[len] = '\0';
+	if (sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1],
+		   &tmp[2], &tmp[3], &tmp[4], &tmp[5]) != 6)
+		goto out_err;
+	file_dev->ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) |
+			    (tmp[2]<<8) | (tmp[3]));
+	file_dev->port = htons((tmp[4] << 8) | (tmp[5]));
+	return file_dev;
+out_err:
+	kfree(file_dev); /* was leaked on the error paths before */
+	return NULL;
+}
+
+/* Decode the opaque device 'dev' and add it to this mount point's list of
+ * available devices.  Returns the listed device for that dev_id (to be
+ * released with device_destroy at some point), or NULL on failure. */
+static struct nfs4_pnfs_dev_item*
+decode_and_add_device(struct filelayout_mount_type *mt, struct pnfs_device* dev)
+{
+	struct nfs4_pnfs_dev_item* file_dev = decode_device(dev);
+	u32 dev_id = file_dev ? file_dev->dev_id : 0;
+
+	if (!file_dev) {
+		printk("%s Could not decode device\n", __FUNCTION__);
+		return NULL;
+	}
+	if (nfs4_pnfs_device_add(mt, file_dev)) {
+		kfree(file_dev); /* never listed and no rpc client: plain free */
+		return NULL;
+	}
+	read_lock(&mt->hlist->dev_lock); /* add may have freed a duplicate file_dev */
+	file_dev = _device_lookup(mt->hlist, dev_id);
+	read_unlock(&mt->hlist->dev_lock);
+	return file_dev;
+}
+
+/* Decode every entry of the opaque device list 'devlist' (at most
+ * NFS4_PNFS_DEV_MAXCOUNT of them) and add each to the list of available
+ * devices for this mount point.  Returns 0 on success, 1 as soon as one
+ * entry cannot be decoded and added.
+ */
+int
+decode_and_add_devicelist(struct filelayout_mount_type *mt, struct pnfs_devicelist* devlist)
+{
+	int idx = 0;
+
+	while (idx < devlist->num_devs && idx < NFS4_PNFS_DEV_MAXCOUNT) {
+		if (decode_and_add_device(mt, &devlist->devs[idx]) == NULL)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/* Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it (NULL on any failure).
+ */
+static struct nfs4_pnfs_dev_item *
+get_device_info(struct filelayout_mount_type *mt, u32 dev_id)
+{
+	struct nfs4_pnfs_dev_item *dev_item = NULL;
+	struct pnfs_device *pdev;
+
+	pdev = kmalloc(sizeof(*pdev), GFP_KERNEL);
+	if (pdev == NULL)
+		return NULL;
+
+	/* Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	if (pnfs_callback_ops->nfs_getdeviceinfo(mt->fl_sb, dev_id, pdev) == 0)
+		dev_item = decode_and_add_device(mt, pdev);
+
+	/* decode_device copies what it needs, so pdev is only scratch space */
+	kfree(pdev);
+	return dev_item;
+}
+
+/* Lookup and return the device dev_id.  The mount's device table is
+ * searched first; on a miss the device info is fetched and added.
+ */
+static struct nfs4_pnfs_dev_item *
+nfs4_pnfs_device_get(struct inode *inode, u32 dev_id)
+{
+	struct filelayout_mount_type *mt;
+	struct nfs4_pnfs_dev_hlist *table;
+	struct nfs4_pnfs_dev_item *item;
+
+	mt = (struct filelayout_mount_type*)NFS_SERVER(inode)->pnfs_mountid->mountid;
+	table = mt->hlist;
+
+	/* Taking a reference on a hit (atomic_inc of item->count) is
+	 * currently disabled, so the item is returned without one. */
+	read_lock(&table->dev_lock);
+	item = _device_lookup(table, dev_id);
+	read_unlock(&table->dev_lock);
+
+	if (item != NULL)
+		return item;
+	return get_device_info(mt, dev_id);
+}
+
+/* Retrieve the rpc client for a specified byte range in 'inode' by filling
+ * in 'dserver'.  Returns 0 on success, 1 if the layout is missing or
+ * unusable or the device cannot be obtained. */
+int
+nfs4_pnfs_dserver_get(struct inode *inode,
+		      struct nfs4_filelayout *layout,
+		      u64 offset,
+		      u32 count,
+		      struct nfs4_pnfs_dserver *dserver)
+{
+	u32 dev_id;
+	u64 tmp;
+	u32 stripe_idx, dbg_stripe_idx;
+
+	if(!layout)
+		return 1;
+	/* do_div() needs a non-zero 32-bit divisor; num_devs bounds devs[] */
+	if (layout->stripe_unit == 0 || layout->stripe_unit > 0xffffffffULL ||
+	    layout->num_devs == 0 || layout->num_devs > NFS4_PNFS_MAX_DEVS)
+		return 1;
+
+	tmp = offset;
+	/* Want ((offset / layout->stripe_unit) % layout->num_devs) */
+	do_div(tmp, layout->stripe_unit);
+	stripe_idx = do_div(tmp, layout->num_devs);
+
+	/* For debugging: stripe of the last byte (count 0 means offset itself) */
+	tmp = offset + (count ? count - 1 : 0);
+	do_div(tmp, layout->stripe_unit);
+	dbg_stripe_idx = do_div(tmp, layout->num_devs);
+
+	printk("nfsv4_pnfs_dserver_get: offset=%Lu, count=%u, si=%u, dsi=%u, "
+		   "num_devs=%u, stripe_unit=%Lu\n",
+		   offset, count, stripe_idx, dbg_stripe_idx, layout->num_devs,
+		   layout->stripe_unit);
+	BUG_ON(dbg_stripe_idx != stripe_idx);
+	dev_id = layout->devs[stripe_idx].dev_id;
+	dserver->dev_item = nfs4_pnfs_device_get(inode, dev_id);
+	if (dserver->dev_item == NULL)
+		return 1;
+	dserver->fh = &layout->devs[stripe_idx].fh;
+
+	printk("nfs4_pnfs_dserver_get: dev_id=%u, idx=%u, offset=%Lu, count=%u\n",
+		   dev_id, stripe_idx, offset, count);
+	return 0;
+}
+
+/* Drop one count on 'dev' (atomic_dec of dev->count).  Currently not used.
+ * I have disabled checking the device count until we can think of a good way
+ * to call nfs4_pnfs_device_put in a generic way from the pNFS client.
+ * The only way I can think of is to put the nfs4_pnfs_dev_item directly
+ * in the nfs4_write/read_data structure, which breaks the clear line between
+ * the pNFS client and layout drivers.  If I did do this, then I could call
+ * an ioctl on the NFSv4 file layout driver to decrement the device count.
+ */
+static void
+nfs4_pnfs_device_put(struct nfs4_pnfs_dev_hlist *hlist, struct nfs4_pnfs_dev_item *dev)
+{
+	printk("nfs4_pnfs_dserver_put: dev_id=%u\n", dev->dev_id); /* NOTE(review): message says "dserver_put", function is device_put */
+	atomic_dec(&dev->count);
+}
_
