From: Suparna Bhattacharya <suparna@in.ibm.com>

OK, I have been playing with a patch to retry-enable the osync speedup
code.  This should get us O_SYNC support for aio (for ext2 at least).



 25-akpm/fs/aio.c                  |   44 +++++++++++++++++++++++++++++++-------
 25-akpm/include/linux/pagemap.h   |    8 ++++++
 25-akpm/include/linux/writeback.h |    2 -
 25-akpm/mm/filemap.c              |    4 +--
 25-akpm/mm/page-writeback.c       |   23 ++++++++++++-------
 5 files changed, 62 insertions(+), 19 deletions(-)

diff -puN fs/aio.c~aio-09-o_sync fs/aio.c
--- 25/fs/aio.c~aio-09-o_sync	Tue Jul  8 16:07:18 2003
+++ 25-akpm/fs/aio.c	Tue Jul  8 16:07:26 2003
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
+#include <linux/writeback.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -1245,16 +1246,21 @@ static ssize_t aio_pread(struct kiocb *i
 static ssize_t aio_pwrite(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret = 0;
 
+	if (!iocb->ki_buf) {
+		ret = iocb->ki_left;
+		goto retry_osync;
+	}
+
 	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
+				iocb->ki_left, iocb->ki_pos);
 
 	/*
-	 * TBD: Even if iocb->ki_left = 0, could we need to
-	 * wait for data to be sync'd ? Or can we assume
-	 * that aio_fdsync/aio_fsync would be called explicitly
-	 * as required.
+	 * Even if iocb->ki_left = 0, we may need to wait
+	 * for a balance_dirty_pages to complete
 	 */
 	if (ret > 0) {
 		iocb->ki_buf += ret;
@@ -1264,10 +1270,34 @@ static ssize_t aio_pwrite(struct kiocb *
 	}
 
 	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if (ret == 0)
+	/* No need to retry anymore unless we need to osync data */
+	if (ret == 0) {
 		ret = iocb->ki_nbytes - iocb->ki_left;
+		/* Set things up for potential O_SYNC */
+		iocb->ki_buf = NULL;
+		iocb->ki_pos -= ret; /* back up fpos */
+		iocb->ki_left = ret; /* sync only what we have written out */
+		iocb->ki_nbytes = ret;
+	}
+
 
+retry_osync:
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+
+		err = sync_page_range(inode, mapping, iocb->ki_pos, ret);
+		if (err < 0)
+			ret = err;
+		else {
+			printk("synced %d bytes\n", err);
+			iocb->ki_pos += err;
+			iocb->ki_left -= err;
+			if ((iocb->ki_left) && (err != 0))
+				ret = -EIOCBRETRY;
+			else
+				ret = iocb->ki_nbytes;
+		}
+	}
 	return ret;
 }
 
diff -puN include/linux/pagemap.h~aio-09-o_sync include/linux/pagemap.h
--- 25/include/linux/pagemap.h~aio-09-o_sync	Tue Jul  8 16:07:18 2003
+++ 25-akpm/include/linux/pagemap.h	Tue Jul  8 16:07:18 2003
@@ -183,6 +183,14 @@ static inline void wait_on_page_writebac
 		wait_on_page_bit(page, PG_writeback);
 }
 
+static inline int wait_on_page_writeback_wq(struct page *page,
+						wait_queue_t *wait)
+{
+	if (PageWriteback(page))
+		return wait_on_page_bit_wq(page, PG_writeback, wait);
+	return 0;
+}
+
 extern void end_page_writeback(struct page *page);
 
 /*
diff -puN include/linux/writeback.h~aio-09-o_sync include/linux/writeback.h
--- 25/include/linux/writeback.h~aio-09-o_sync	Tue Jul  8 16:07:18 2003
+++ 25-akpm/include/linux/writeback.h	Tue Jul  8 16:07:18 2003
@@ -88,7 +88,7 @@ int balance_dirty_pages(struct address_s
 int balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count);
 
 /* pdflush.c */
diff -puN mm/filemap.c~aio-09-o_sync mm/filemap.c
--- 25/mm/filemap.c~aio-09-o_sync	Tue Jul  8 16:07:18 2003
+++ 25-akpm/mm/filemap.c	Tue Jul  8 16:07:24 2003
@@ -1876,7 +1876,7 @@ generic_file_aio_write_nolock(struct kio
 	 */
 	if (likely(status >= 0)) {
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+			if (!a_ops->writepage)
 				status = generic_osync_inode(inode,
 						OSYNC_METADATA|OSYNC_DATA);
 		}
@@ -1980,7 +1980,7 @@ ssize_t generic_file_writev(struct file 
 	up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err;
+		ssize_t err;
 
 		err = sync_page_range(inode, inode->i_mapping,
 					*ppos - ret, ret);
diff -puN mm/page-writeback.c~aio-09-o_sync mm/page-writeback.c
--- 25/mm/page-writeback.c~aio-09-o_sync	Tue Jul  8 16:07:18 2003
+++ 25-akpm/mm/page-writeback.c	Tue Jul  8 16:07:26 2003
@@ -570,7 +570,7 @@ int test_clear_page_dirty(struct page *p
 EXPORT_SYMBOL(test_clear_page_dirty);
 
 
-static int operate_on_page_range(struct address_space *mapping,
+static ssize_t operate_on_page_range(struct address_space *mapping,
 		loff_t pos, size_t count, int (*operator)(struct page *))
 {
 	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
@@ -594,6 +594,10 @@ static int operate_on_page_range(struct 
 			}
 			next = page->index + 1;
 			ret = (*operator)(page);
+			if (ret == -EIOCBRETRY) {
+				next--;
+				break;
+			}
 			if (PageError(page)) {
 				if (!ret)
 					ret = -EIO;
@@ -602,20 +606,21 @@ static int operate_on_page_range(struct 
 				break;
 		}
 		pagevec_release(&pvec);
-		if (next > last)
+		if ((next > last) || (ret == -EIOCBRETRY))
 			break;
 	}
+	if (!ret || (ret == -EIOCBRETRY))
+		ret = (next << PAGE_CACHE_SHIFT) - pos;
 	return ret;
 }
 
 static int page_waiter(struct page *page)
 {
 	unlock_page(page);
-	wait_on_page_writeback(page);
-	return 0;
+	return wait_on_page_writeback_wq(page, current->io_wait);
 }
 
-static int
+static size_t
 wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_waiter);
@@ -632,7 +637,7 @@ static int page_writer(struct page *page
 	return page->mapping->a_ops->writepage(page, &wbc);
 }
 
-static int
+static ssize_t
 write_out_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_writer);
@@ -646,7 +651,7 @@ write_out_page_range(struct address_spac
  * We need to re-take i_sem during the generic_osync_inode list walk because
  * it is otherwise livelockable.
  */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count)
 {
 	int ret;
@@ -656,12 +661,12 @@ int sync_page_range(struct inode *inode,
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 	ret = write_out_page_range(mapping, pos, count);
-	if (ret == 0) {
+	if (ret >= 0) {
 		down(&inode->i_sem);
 		ret = generic_osync_inode(inode, OSYNC_METADATA);
 		up(&inode->i_sem);
 	}
-	if (ret == 0)
+	if (ret >= 0)
 		ret = wait_on_page_range(mapping, pos, count);
 	return ret;
 }

_