diff -druN Linux-2.4.16/Documentation/Configure.help linux-2.4.16-odirect/Documentation/Configure.help
--- Linux-2.4.16/Documentation/Configure.help	Tue Dec  4 13:29:35 2001
+++ linux-2.4.16-odirect/Documentation/Configure.help	Fri Dec 14 11:43:25 2001
@@ -14173,6 +14173,30 @@
 
   If unsure, say N.
 
+Allow direct I/O on files in NFS
+CONFIG_NFS_DIRECTIO
+  There are important applications whose performance or correctness
+  depends on uncached access to file data.  Database clusters (multiple
+  copies of the same instance running on separate hosts) implement their
+  own cache coherency protocol that subsumes the NFS cache protocols.
+  Applications that process datasets considerably larger than the client's
+  memory do not always benefit from a local cache.  A streaming video
+  server, for instance, has no need to cache the contents of a file.
+
+  This option enables applications to perform direct I/O on files in NFS
+  file systems using the O_DIRECT open() flag.  When O_DIRECT is set for
+  files, their data is not cached in the system's page cache.  Direct
+  read and write requests must be aligned to block boundaries.  Data is
+  moved to and from user-level application buffers directly.
+
+  Unless your program is designed to use O_DIRECT properly, you are much
+  better off allowing the NFS client to manage caching for you.  Misusing
+  O_DIRECT can cause poor server performance or network storms.  This
+  kernel build option defaults OFF to avoid exposing system administrators
+  unwittingly to a potentially hazardous feature.
+
+  If unsure, say N.
+
 Root file system on NFS
 CONFIG_ROOT_NFS
   If you want your Linux box to mount its whole root file system (the
diff -druN Linux-2.4.16/fs/Config.in linux-2.4.16-odirect/fs/Config.in
--- Linux-2.4.16/fs/Config.in	Tue Dec  4 13:29:37 2001
+++ linux-2.4.16-odirect/fs/Config.in	Fri Dec 14 11:43:25 2001
@@ -95,6 +95,7 @@
    dep_tristate 'InterMezzo file system support (experimental, replicating fs)' CONFIG_INTERMEZZO_FS $CONFIG_INET $CONFIG_EXPERIMENTAL
    dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET
    dep_mbool '  Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS
+   dep_bool '  Allow direct I/O on NFS files' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS
    dep_bool '  Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP
 
    dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET
diff -druN Linux-2.4.16/fs/nfs/Makefile linux-2.4.16-odirect/fs/nfs/Makefile
--- Linux-2.4.16/fs/nfs/Makefile	Tue Dec  4 13:29:37 2001
+++ linux-2.4.16-odirect/fs/nfs/Makefile	Fri Dec 14 11:43:25 2001
@@ -14,6 +14,7 @@
 
 obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o      
 obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
+obj-$(CONFIG_NFS_DIRECTIO) += direct.o
 
 obj-m   := $(O_TARGET)
 
diff -druN Linux-2.4.16/fs/nfs/direct.c linux-2.4.16-odirect/fs/nfs/direct.c
--- Linux-2.4.16/fs/nfs/direct.c	Wed Dec 31 19:00:00 1969
+++ linux-2.4.16-odirect/fs/nfs/direct.c	Fri Dec 14 11:50:22 2001
@@ -0,0 +1,378 @@
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * High-performance direct I/O for the NFS client
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache.  The client does not
+ * correct unaligned requests from applications.  All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.  Applications that manage their own data caching, such
+ * as databases, make very good use of direct I/O on local file systems.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files.  Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Note that I/O to read in executables (e.g. kernel_read) cannot use
+ * direct (kiobuf) reads because there is no vma backing the passed-in
+ * data buffer.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
+ *
+ * Initial implementation:	12/2001 by Chuck Lever <cel@netapp.com>
+ *
+ * TODO:
+ *
+ * 1.  Use concurrent asynchronous network requests rather than
+ *     serialized synchronous network requests for normal (non-sync)
+ *     direct I/O.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/smp_lock.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/iobuf.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)	/* presumably the debug mask behind dfprintk() -- TODO confirm */
+#define VERF_SIZE		(2 * sizeof(__u32))	/* byte count used when copying/comparing write verifiers */
+
+/*
+ * Send one synchronous READ described by @arg and fold any attributes
+ * the server returned into the inode.  Hands back rpc_call_sync()'s
+ * return value unchanged.
+ */
+static /* inline */ int
+nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	struct nfs_fattr attrs;
+	struct nfs_readres reply = { &attrs, arg->count, 0 };
+	struct rpc_message call = { NFSPROC_READ, arg, &reply, NULL };
+	int status;
+
+#ifdef CONFIG_NFS_V3
+	if (NFS_PROTO(ino)->version == 3)
+		call.rpc_proc = NFS3PROC_READ;
+#endif
+	attrs.valid = 0;
+
+	lock_kernel();
+	call.rpc_cred = nfs_file_cred(file);
+	status = rpc_call_sync(NFS_CLIENT(ino), &call, 0);
+	nfs_refresh_inode(ino, &attrs);
+	unlock_kernel();
+	return status;
+}
+
+/*
+ * Send one synchronous WRITE described by @arg; the server's verifier
+ * and commit level land in @verf.  Any attributes in the reply are
+ * applied to the inode.  Returns rpc_call_sync()'s value, except that
+ * a FILE_SYNC request answered with a weaker commit level is -EIO.
+ */
+static /* inline */ int
+nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg,
+	struct nfs_writeverf *verf)
+{
+	struct inode *ino = file->f_dentry->d_inode;
+	struct nfs_fattr attrs;
+	struct nfs_writeres reply = { &attrs, verf, 0 };
+	struct rpc_message call = { NFSPROC_WRITE, arg, &reply, NULL };
+	int status;
+
+#ifdef CONFIG_NFS_V3
+	if (NFS_PROTO(ino)->version == 3)
+		call.rpc_proc = NFS3PROC_WRITE;
+#endif
+	attrs.valid = 0;
+
+	lock_kernel();
+	call.rpc_cred = get_rpccred(nfs_file_cred(file));
+	status = rpc_call_sync(NFS_CLIENT(ino), &call, 0);
+	nfs_write_attributes(ino, &attrs);
+	put_rpccred(call.rpc_cred);
+	unlock_kernel();
+
+	if (status <= 0)
+		return status;
+	if ((verf->committed != NFS_FILE_SYNC) &&
+	    (arg->stable == NFS_FILE_SYNC)) {
+		printk(KERN_ERR __FUNCTION__
+			": server didn't sync stable write request\n");
+		return -EIO;
+	}
+	if (status != arg->count)
+		printk(KERN_INFO __FUNCTION__
+			": short write, count=%u, result=%d\n",
+						arg->count, status);
+	return status;
+}
+
+/*
+ * Send a COMMIT covering @count bytes at @offset of @inode; the reply
+ * lands in @verf.  Builds without NFSv3 get a stub that returns 0.
+ */
+#ifdef CONFIG_NFS_V3
+static /* inline */ int
+nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	struct nfs_fattr attrs;
+	struct nfs_writeargs range = { NFS_FH(inode), offset, count, 0, 0 };
+	struct nfs_writeres reply = { &attrs, verf, 0 };
+	struct rpc_message call = { NFS3PROC_COMMIT, &range, &reply, NULL };
+	int status;
+
+	attrs.valid = 0;
+	lock_kernel();
+	status = rpc_call_sync(NFS_CLIENT(inode), &call, 0);
+	nfs_write_attributes(inode, &attrs);
+	unlock_kernel();
+	return status;
+}
+#else
+static inline int
+nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Read "count" bytes at "offset" into the iobuf's pages, one synchronous
+ * READ per chunk.  A chunk is at most "rsize" bytes and at most as many
+ * page fragments as fit in args.iov[].  Returns the number of bytes read
+ * or a negative errno (-EFAULT: page missing; -EINVAL replaces -EISDIR).
+ */
+static int
+nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
+	size_t count)
+{
+	int total = 0, curpage = 0;
+	int pgoff = iobuf->offset;	/* byte offset into maplist[curpage] */
+	struct inode *inode = file->f_dentry->d_inode;
+	int rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_readargs args = { NFS_FH(inode), 0, 0, 0 };
+	/* NOTE(review): assumes args.iov is an embedded array -- confirm */
+	const int maxiov = sizeof(args.iov) / sizeof(args.iov[0]);
+
+	while (count) {
+		int request, i, mapped = 0, off = pgoff;
+		int result = -EFAULT;	/* outcome if no page can be mapped */
+
+		request = (count > rsize) ? rsize : count;
+		args.offset = offset;
+		args.nriov = 0;		/* each chunk builds a fresh vector */
+
+		/* map only as many pages as this chunk needs */
+		while (mapped < request && args.nriov < maxiov &&
+				curpage + args.nriov < iobuf->nr_pages) {
+			struct page *pg = iobuf->maplist[curpage + args.nriov];
+			struct iovec *iovec = &args.iov[args.nriov];
+
+			if (!pg)
+				goto unmap;
+			iovec->iov_len = PAGE_SIZE - off;
+			if (iovec->iov_len > request - mapped)
+				iovec->iov_len = request - mapped;
+			iovec->iov_base = kmap(pg) + off;
+			mapped += iovec->iov_len;
+			off = 0;	/* zero after the first page */
+			args.nriov++;
+		}
+		if (mapped) {
+			args.count = mapped;
+			result = nfs_direct_read_rpc(file, &args);
+		}
+unmap:		/* drop every kmap() taken above, including the last page */
+		for (i = 0; i < args.nriov; i++) {
+			flush_dcache_page(iobuf->maplist[curpage + i]);
+			kunmap(iobuf->maplist[curpage + i]);
+		}
+		if (result < 0) {
+			total = (result == -EISDIR) ? -EINVAL : result;
+			break;
+		}
+
+		total += result;
+		count -= result;
+		offset += result;
+		if (result < args.count)   /* NFSv2ism */
+			break;
+
+		/* a chunk may stop mid-page; resume exactly where it ended */
+		pgoff += result;
+		curpage += pgoff / PAGE_SIZE;
+		pgoff %= PAGE_SIZE;
+	}
+	return total;
+}
+
+/*
+ * Write "count" bytes at "offset" from the iobuf's pages, one synchronous
+ * WRITE per chunk of at most "wsize" bytes; returns bytes written or a
+ * negative errno.  If only one network write is necessary, or if the
+ * O_SYNC flag or 'sync' mount option are present, or if this is a V2
+ * inode, use FILE_SYNC.  Otherwise, use UNSTABLE and finish with a COMMIT.
+ * If the verifier changes between replies, or the COMMIT does not
+ * succeed, the whole request is replayed with FILE_SYNC.
+ */
+static int
+nfs_direct_write(struct file *file, struct kiobuf *iobuf,
+	loff_t offset, size_t count)
+{
+	int curpage, pgoff, total, need_commit;
+	loff_t save_offset = offset;
+	size_t save_count = count;
+	struct inode *inode = file->f_dentry->d_inode;
+	int wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_writeverf first_verf, ret_verf;
+	struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0 };
+	const int maxiov = sizeof(args.iov) / sizeof(args.iov[0]);
+
+#ifdef CONFIG_NFS_V3
+	if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
+							(!IS_SYNC(inode)))
+		args.stable = NFS_UNSTABLE;
+#endif
+
+retry:
+	total = 0;
+	curpage = 0;
+	pgoff = iobuf->offset;		/* byte offset into maplist[curpage] */
+	need_commit = 0;		/* a replay must not inherit this */
+	while (count) {
+		int request, i, mapped = 0, off = pgoff;
+		int result = -EFAULT;	/* outcome if no page can be mapped */
+
+		request = (count > wsize) ? wsize : count;
+		args.offset = offset;
+		args.nriov = 0;		/* each chunk builds a fresh vector */
+
+		/* map only as many pages as this chunk needs */
+		while (mapped < request && args.nriov < maxiov &&
+				curpage + args.nriov < iobuf->nr_pages) {
+			struct page *pg = iobuf->maplist[curpage + args.nriov];
+			struct iovec *iovec = &args.iov[args.nriov];
+
+			if (!pg)
+				goto unmap;
+			iovec->iov_len = PAGE_SIZE - off;
+			if (iovec->iov_len > request - mapped)
+				iovec->iov_len = request - mapped;
+			iovec->iov_base = kmap(pg) + off;
+			mapped += iovec->iov_len;
+			off = 0;	/* zero after the first page */
+			args.nriov++;
+		}
+		if (mapped) {
+			args.count = mapped;
+			result = nfs_direct_write_rpc(file, &args, &ret_verf);
+		}
+unmap:		/* drop every kmap() taken above, including the last page */
+		for (i = 0; i < args.nriov; i++)
+			kunmap(iobuf->maplist[curpage + i]);
+
+		if (result < 0) {
+			total = result;
+			break;
+		}
+		if (result == 0)	/* no progress; don't spin forever */
+			break;
+		if (!total)
+			memcpy(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE);
+		if (ret_verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+				goto print_retry;
+		}
+		total += result;
+		count -= result;
+		offset += result;
+
+		/* a chunk may stop mid-page; resume exactly where it ended */
+		pgoff += result;
+		curpage += pgoff / PAGE_SIZE;
+		pgoff %= PAGE_SIZE;
+	}
+	/* Commit data written so far, even in the event of an error */
+	if (need_commit) {
+		if (nfs_direct_commit_rpc(inode, save_offset,
+					offset - save_offset, &ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+			goto print_retry;
+	}
+	return total;
+
+print_retry:
+	printk(KERN_INFO __FUNCTION__
+		": detected server restart; retrying with FILE_SYNC\n");
+	args.stable = NFS_FILE_SYNC;
+	offset = save_offset;
+	count = save_count;
+	goto retry;
+}
+
+/*
+ * Read or write data, moving the data directly to/from the
+ * application's buffer without caching in the page cache.
+ * Returns bytes transferred, or a negative errno (-EINVAL if "rw"
+ * is neither READ nor WRITE).  "blocksize" is not used here.
+ *
+ * Rules for direct I/O (verified before we get here)
+ * 1.  block size = 512 bytes or more
+ * 2.  file byte offset is block aligned
+ * 3.  byte count is a multiple of block size
+ * 4.  user buffer need not be aligned (see iobuf->offset)
+ * 5.  user buffer is faulted in and pinned
+ */
+int
+nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf,
+	unsigned long blocknr, int blocksize)
+{
+	int result = -EINVAL;
+	size_t count = iobuf->length;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	/* widen before shifting so offsets past 4GB survive on 32-bit */
+	loff_t offset = (loff_t) blocknr << inode->i_blkbits;
+
+	switch (rw) {
+	case READ:
+		dfprintk(VFS,
+			"NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n",
+			dentry->d_parent->d_name.name, dentry->d_name.name,
+			(unsigned long long) offset, (int) count);
+
+		result = nfs_direct_read(file, iobuf, offset, count);
+		break;
+	case WRITE:
+		dfprintk(VFS,
+			"NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n",
+			dentry->d_parent->d_name.name, dentry->d_name.name,
+			(unsigned long long) offset, (int) count);
+
+		result = nfs_direct_write(file, iobuf, offset, count);
+		break;
+	default:
+		break;
+	}
+
+	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+	return result;
+}
diff -druN Linux-2.4.16/fs/nfs/file.c linux-2.4.16-odirect/fs/nfs/file.c
--- Linux-2.4.16/fs/nfs/file.c	Tue Dec  4 13:29:24 2001
+++ linux-2.4.16-odirect/fs/nfs/file.c	Fri Dec 14 11:43:25 2001
@@ -204,7 +204,10 @@
 	sync_page: nfs_sync_page,
 	writepage: nfs_writepage,
 	prepare_write: nfs_prepare_write,
-	commit_write: nfs_commit_write
+	commit_write: nfs_commit_write,
+#ifdef CONFIG_NFS_DIRECTIO
+	direct_IO: nfs_direct_IO,
+#endif
 };
 
 /* 
diff -druN Linux-2.4.16/fs/nfs/write.c linux-2.4.16-odirect/fs/nfs/write.c
--- Linux-2.4.16/fs/nfs/write.c	Tue Dec  4 13:29:37 2001
+++ linux-2.4.16-odirect/fs/nfs/write.c	Fri Dec 14 11:43:25 2001
@@ -121,23 +121,6 @@
 }
 
 /*
- * This function will be used to simulate weak cache consistency
- * under NFSv2 when the NFSv3 attribute patch is included.
- * For the moment, we just call nfs_refresh_inode().
- */
-static __inline__ int
-nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr)
-{
-	if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) {
-		fattr->pre_size  = NFS_CACHE_ISIZE(inode);
-		fattr->pre_mtime = NFS_CACHE_MTIME(inode);
-		fattr->pre_ctime = NFS_CACHE_CTIME(inode);
-		fattr->valid |= NFS_ATTR_WCC;
-	}
-	return nfs_refresh_inode(inode, fattr);
-}
-
-/*
  * Write a page synchronously.
  * Offset is the data offset within the page.
  */
diff -druN Linux-2.4.16/include/linux/nfs_fs.h linux-2.4.16-odirect/include/linux/nfs_fs.h
--- Linux-2.4.16/include/linux/nfs_fs.h	Tue Dec  4 13:29:37 2001
+++ linux-2.4.16-odirect/include/linux/nfs_fs.h	Fri Dec 14 11:47:15 2001
@@ -266,6 +266,11 @@
 extern int  nfs_scan_lru_read_timeout(struct nfs_server *, struct list_head *);
 
 /*
+ * linux/fs/nfs/direct.c
+ */
+extern int  nfs_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int);
+
+/*
  * linux/fs/mount_clnt.c
  * (Used only by nfsroot module)
  */
@@ -289,6 +294,23 @@
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
 		return 0;
 	return __nfs_refresh_inode(inode,fattr);
+}
+
+/*
+ * Hand the attributes returned by a write-type call to the inode.  When
+ * the reply carries attributes but no weak cache consistency data, the
+ * cached size/mtime/ctime are filled in as the "pre-op" values first.
+ */
+static __inline__ int
+nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+	if (!(fattr->valid & NFS_ATTR_WCC) && (fattr->valid & NFS_ATTR_FATTR)) {
+		fattr->pre_ctime = NFS_CACHE_CTIME(inode);
+		fattr->pre_mtime = NFS_CACHE_MTIME(inode);
+		fattr->pre_size  = NFS_CACHE_ISIZE(inode);
+		fattr->valid |= NFS_ATTR_WCC;
+	}
+	return nfs_refresh_inode(inode, fattr);
+ }
 
 static inline loff_t
