From: Suparna Bhattacharya <suparna@in.ibm.com>

The current implementation of O_SYNC support for AIO calls sync_page_range
explicitly in the high level aio write handlers after invoking the fops
aio_write method.  This a duplication of logic across sync i/o and aio (not
good from a maintenance perspective, and more error prone, e.g.  forgets to
check for O_DIRECT).

More importantly, it doesn't take into account the possibility that different
filesystems may have their own O_SYNC implementations in their corresponding
aio methods.  This is a problem for example in the case of ext3 which may
commit the journal after data and meta-data are written out.

This patch fixes the above problems by moving the call to sync_page_range out
of the high level aio_pwrite handler, and back into generic_file_aio_write
for both aio and sync i/o.  It takes care to make sure that sync_page_range
is called only after all the data is copied in the AIO case, rather than on
every retry.



 25-akpm/fs/aio.c     |   36 ++++++++++--------------------------
 25-akpm/mm/filemap.c |   16 ++++++++++++++++
 2 files changed, 26 insertions(+), 26 deletions(-)

diff -puN fs/aio.c~aio-O_SYNC-fix fs/aio.c
--- 25/fs/aio.c~aio-O_SYNC-fix	Tue Jul  8 16:07:39 2003
+++ 25-akpm/fs/aio.c	Tue Jul  8 16:07:41 2003
@@ -1289,11 +1289,6 @@ static ssize_t aio_pwrite(struct kiocb *
 	struct inode *inode = mapping->host;
 	ssize_t ret = 0;
 
-	if (!iocb->ki_buf) {
-		ret = iocb->ki_left;
-		goto retry_osync;
-	}
-
 	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
 				iocb->ki_left, iocb->ki_pos);
 
@@ -1302,7 +1297,7 @@ static ssize_t aio_pwrite(struct kiocb *
 	 * for a balance_dirty_pages to complete
 	 */
 	if (ret > 0) {
-		iocb->ki_buf += ret;
+		iocb->ki_buf += iocb->ki_buf ? ret : 0;
 		iocb->ki_left -= ret;
 
 		ret = -EIOCBRETRY;
@@ -1312,30 +1307,19 @@ static ssize_t aio_pwrite(struct kiocb *
 	/* No need to retry anymore unless we need to osync data */
 	if (ret == 0) {
 		ret = iocb->ki_nbytes - iocb->ki_left;
-		/* Set things up for potential O_SYNC */
-		iocb->ki_buf = NULL;
-		iocb->ki_pos -= ret; /* back up fpos */
-		iocb->ki_left = ret; /* sync only what we have written out */
-		iocb->ki_nbytes = ret;
-	}
+		if (!iocb->ki_buf)
+			return ret;
 
-
-retry_osync:
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range(inode, mapping, iocb->ki_pos, ret);
-		if (err <= 0) {
-			ret = err;
-		} else {
-			BUG_ON(err > iocb->ki_left);
-			iocb->ki_pos += err;
-			iocb->ki_left -= err;
+		/* Set things up for potential O_SYNC */
+		if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			iocb->ki_buf = NULL;
+			iocb->ki_pos -= ret; /* back up fpos */
+			iocb->ki_left = ret; /* sync what we have written out */
+			iocb->ki_nbytes = ret;
 			ret = -EIOCBRETRY;
 		}
 	}
-	if (ret == 0)
-		ret = iocb->ki_nbytes - iocb->ki_left;
+
 	return ret;
 }
 
diff -puN mm/filemap.c~aio-O_SYNC-fix mm/filemap.c
--- 25/mm/filemap.c~aio-O_SYNC-fix	Tue Jul  8 16:07:39 2003
+++ 25-akpm/mm/filemap.c	Tue Jul  8 16:07:41 2003
@@ -1911,17 +1911,33 @@ ssize_t generic_file_aio_write(struct ki
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	if (!buf && !is_sync_kiocb(iocb)) {
+		/* nothing to transfer, may just need to sync data */
+		ret = count;
+		goto osync;
+	}
+
 	down(&inode->i_sem);
 	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
 	up(&inode->i_sem);
 
+	/*
+	 * Avoid doing a sync in parts for aio - its more efficient to
+	 * call in again after all the data has been copied
+	 */
+	if (!is_sync_kiocb(iocb))
+		return ret;
+
+osync:
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		ssize_t err;
 
 		err = sync_page_range(inode, mapping, pos, ret);
 		if (err < 0)
 			ret = err;
+		else
+			iocb->ki_pos = pos + err;
 	}
 	return ret;
 }

_