From: Nick Piggin <piggin@cyberone.com.au>

Performance on my tests with mm7 is good.  tiobench sequential reads still
aren't really good (single processor).  I think this is because of the
seek_mean logic.  I think tiobench files aren't laid out in a very
real-world manner, are they?

Anyway, the following patch changes the whole notion of READ/WRITE to
SYNC/ASYNC.  This is the "simple" way because it does not also keep a
separate list in READ/WRITE order for merging.  I think that is alright
because there probably isn't much wasted merging opportunity.

The generic block layer shouldn't mind that we might offer a READ request
as a candidate to merge with a WRITE: it checks for this and will simply
disallow the merge.
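
In case it helps review, the classification boils down to something like
the sketch below.  It is just a hand-expanded version of what
as_add_request does in the patch; the helper name is mine, not something
the patch actually adds.

/*
 * Sketch: how a request gets its direction index under this patch.
 * Reads are always sync; a write only counts as sync when the
 * submitting task has flagged itself with PF_SYNCWRITE (set around
 * fsync/fdatasync and generic_osync_inode elsewhere in this patch).
 */
static int as_request_is_sync(struct request *rq)
{
	if (rq_data_dir(rq) == READ)
		return REQ_SYNC;
	if (current->flags & PF_SYNCWRITE)
		return REQ_SYNC;
	return REQ_ASYNC;
}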

Anyway, OraSim does not change much and pgbench gains about 15%.  The
biggest difference shows up in the artificial benchmark:

Bench 7 - 2 threads, 1 reading, 1 doing write+fsync
2.5.69-mm7: IO Rate: 51.21 MB/s, Sync writes per read: 0.0005
2.5.69-mm7+this: IO Rate: 36.19 MB/s Sync writes per read: 0.8674

This is because we now anticipate on reads.  It will probably help more
with the ext3 + kjournald stuff.  WimMark would be interesting.



 drivers/block/as-iosched.c |  144 ++++++++++++++++++++++++---------------------
 fs/buffer.c                |    4 +
 fs/fs-writeback.c          |    2 
 include/linux/sched.h      |    1 
 4 files changed, 85 insertions(+), 66 deletions(-)

diff -puN drivers/block/as-iosched.c~as-sync-async drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-sync-async	2003-06-04 20:30:30.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-06-04 20:30:30.000000000 -0700
@@ -22,6 +22,9 @@
 #include <linux/rbtree.h>
 #include <linux/interrupt.h>
 
+#define REQ_SYNC	1
+#define REQ_ASYNC	0
+
 /*
  * See Documentation/as-iosched.txt
  */
@@ -115,13 +118,13 @@ struct as_data {
 	struct list_head fifo_list[2];
 	
 	struct as_rq *next_arq[2];	/* next in sort order */
-	sector_t last_sector[2];	/* last READ and WRITE sectors */
+	sector_t last_sector[2];	/* last REQ_SYNC and REQ_ASYNC sectors */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
 	unsigned long hash_valid_count;	/* barrier hash count */
 	unsigned long current_batch_expires;
 	unsigned long last_check_fifo[2];
-	int batch_data_dir;		/* current/last batch READ or WRITE */
+	int batch_data_dir;		/* current/last batch REQ_SYNC or REQ_ASYNC */
 	mempool_t *arq_pool;
 
 	int antic_status;
@@ -184,6 +187,7 @@ struct as_rq {
 	struct list_head fifo;
 	unsigned long expires;
 
+	int is_sync;
 	enum arq_state state; /* debug only */
 };
 
@@ -256,9 +260,9 @@ static struct as_io_context *get_as_io_c
 			ret->ttime_total = 0;
 			ret->ttime_samples = 0;
 			ret->ttime_mean = 0;
-			ret->dir_after_read[READ] = 0;
-			ret->dir_after_read[WRITE] = 0;
-			ret->mean_dir_after_read = READ;
+			ret->dir_after_read[REQ_SYNC] = 0;
+			ret->dir_after_read[REQ_ASYNC] = 0;
+			ret->mean_dir_after_read = REQ_SYNC;
 			ret->seek_total = 0;
 			ret->seek_samples = 0;
 			ret->seek_mean = 0;
@@ -379,7 +383,7 @@ static struct request *as_find_arq_hash(
 #define ON_RB(node)	((node)->rb_color != RB_NONE)
 #define RB_CLEAR(node)	((node)->rb_color = RB_NONE)
 #define rb_entry_arq(node)	rb_entry((node), struct as_rq, rb_node)
-#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[rq_data_dir((arq)->request)])
+#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[(arq)->is_sync])
 #define rq_rb_key(rq)		(rq)->sector
 
 /*
@@ -499,13 +503,13 @@ as_choose_req(struct as_data *ad, struct
 	if (arq2 == NULL)
 		return arq1;
 
-	data_dir = rq_data_dir(arq1->request);
+	data_dir = arq1->is_sync;
 
 	last = ad->last_sector[data_dir];
 	s1 = arq1->request->sector;
 	s2 = arq2->request->sector;
 
-	BUG_ON(data_dir != rq_data_dir(arq2->request));
+	BUG_ON(data_dir != arq2->is_sync);
 
 	/*
 	 * Strict one way elevator _except_ in the case where we allow
@@ -563,7 +567,7 @@ as_choose_req(struct as_data *ad, struct
  */
 static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
 {
-	const int data_dir = rq_data_dir(last->request);
+	const int data_dir = last->is_sync;
 	struct as_rq *ret;
 	struct rb_node *rbnext = rb_next(&last->rb_node);
 	struct rb_node *rbprev = rb_prev(&last->rb_node);
@@ -724,7 +728,7 @@ static int as_can_break_anticipation(str
 {
 	struct as_io_context *aic;
 	
-	if (arq && rq_data_dir(arq->request) == READ && as_close_req(ad, arq)) {
+	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
 		/* close request */
 		return 1;
 	}
@@ -766,17 +770,17 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (aic->mean_dir_after_read != READ) {
+	if (aic->mean_dir_after_read != REQ_SYNC) {
 		/* next request from this process will probably be a write */
 		return 1;
 	}
 
 	if (arq && aic->seek_samples) {
 		sector_t s;
-		if (ad->last_sector[READ] < arq->request->sector)
-			s = arq->request->sector - ad->last_sector[READ];
+		if (ad->last_sector[REQ_SYNC] < arq->request->sector)
+			s = arq->request->sector - ad->last_sector[REQ_SYNC];
 		else
-			s = ad->last_sector[READ] - arq->request->sector;
+			s = ad->last_sector[REQ_SYNC] - arq->request->sector;
 		if (aic->seek_mean > s)
 			/* this request is better than what we're expecting */
 			return 1;
@@ -828,14 +832,15 @@ static int as_can_anticipate(struct as_d
  */
 static void as_update_iohist(struct as_io_context *aic, struct request *rq)
 {
-	int data_dir = rq_data_dir(rq);
+	struct as_rq *arq = RQ_DATA(rq);
+	int data_dir = arq->is_sync;
 	unsigned long thinktime;
 	sector_t seek_dist;
 
 	if (aic == NULL)
 		return;
 
-	if (data_dir == READ) {
+	if (data_dir == REQ_SYNC) {
 		if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
 			/* Calculate read -> read thinktime */
 			thinktime = jiffies - aic->last_end_request;
@@ -876,19 +881,19 @@ static void as_update_iohist(struct as_i
 	}
 
 	/* Calculate read/write pattern */
-	if (aic->last_data_dir == READ) {
+	if (aic->last_data_dir == REQ_SYNC) {
 		unsigned long rprob, wprob;
 		aic->dir_after_read[data_dir] += 256;
-		rprob = aic->dir_after_read[READ];
-		wprob = aic->dir_after_read[WRITE];
+		rprob = aic->dir_after_read[REQ_SYNC];
+		wprob = aic->dir_after_read[REQ_ASYNC];
 
 		if (rprob*4 >= wprob*5)
-			aic->mean_dir_after_read = READ;
+			aic->mean_dir_after_read = REQ_SYNC;
 		else
-			aic->mean_dir_after_read = WRITE;
+			aic->mean_dir_after_read = REQ_ASYNC;
 
-		aic->dir_after_read[READ] = (rprob>>1) + (rprob>>2);
-		aic->dir_after_read[WRITE] = (wprob>>1) + (wprob>>2);
+		aic->dir_after_read[REQ_SYNC] = (rprob>>1) + (rprob>>2);
+		aic->dir_after_read[REQ_ASYNC] = (wprob>>1) + (wprob>>2);
 	}
 	aic->last_data_dir = data_dir;
 }
@@ -900,7 +905,7 @@ static void as_update_iohist(struct as_i
  */
 static void as_update_arq(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	const int data_dir = arq->is_sync;
 
 	/* keep the next_arq cache up to date */
 	ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
@@ -937,7 +942,7 @@ static void as_completed_request(request
 	if (!aic)
 		return;
 
-	if (rq_data_dir(arq->request) == READ) {
+	if (arq->is_sync == REQ_SYNC) {
 		set_bit(AS_TASK_IORUNNING, &aic->state);
 		aic->last_end_request = jiffies;
 	}
@@ -970,7 +975,7 @@ static void as_remove_queued_request(req
 	if (!arq)
 		BUG();
 	else {
-		const int data_dir = rq_data_dir(arq->request);
+		const int data_dir = arq->is_sync;
 		struct as_data *ad = q->elevator.elevator_data;
 
 		WARN_ON(arq->state != AS_RQ_QUEUED);
@@ -1070,8 +1075,8 @@ static int as_fifo_expired(struct as_dat
 static inline int as_batch_expired(struct as_data *ad)
 {
 	return time_after(jiffies, ad->current_batch_expires) &&
-		(ad->batch_data_dir == WRITE ||
-		 time_after(jiffies, ad->fifo_expire[READ]));
+		(ad->batch_data_dir == REQ_ASYNC ||
+		 time_after(jiffies, ad->fifo_expire[REQ_SYNC]));
 }
 
 /*
@@ -1079,7 +1084,7 @@ static inline int as_batch_expired(struc
  */
 static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	const int data_dir = arq->is_sync;
 	
 	BUG_ON(!ON_RB(&arq->rb_node));
 
@@ -1093,7 +1098,7 @@ static void as_move_to_dispatch(struct a
 	ad->last_sector[data_dir] = arq->request->sector
 					+ arq->request->nr_sectors;
 
-	if (data_dir == READ) {
+	if (data_dir == REQ_SYNC) {
 		/* In case we have to anticipate after this */
 		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
 	} else
@@ -1122,8 +1127,8 @@ static void as_move_to_dispatch(struct a
 static int as_dispatch_request(struct as_data *ad)
 {
 	struct as_rq *arq;
-	const int reads = !list_empty(&ad->fifo_list[READ]);
-	const int writes = !list_empty(&ad->fifo_list[WRITE]);
+	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
+	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
 
 	if (!(reads || writes))
 		return 0;
@@ -1134,8 +1139,8 @@ static int as_dispatch_request(struct as
 		 */
 		arq = ad->next_arq[ad->batch_data_dir];
 
-		if (ad->batch_data_dir == READ && ad->antic_expire) {
-			if (as_fifo_expired(ad, READ))
+		if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
+			if (as_fifo_expired(ad, REQ_SYNC))
 				goto fifo_expired;
 
 			if (as_can_anticipate(ad, arq)) {
@@ -1148,7 +1153,7 @@ static int as_dispatch_request(struct as
 			/* we have a "next request" */
 			if (reads && !writes)
 				ad->current_batch_expires =
-					jiffies + ad->batch_expire[READ];
+					jiffies + ad->batch_expire[REQ_SYNC];
 			goto dispatch_request;
 		}
 	}
@@ -1159,15 +1164,15 @@ static int as_dispatch_request(struct as
 	 */
 
 	if (reads) {
-		BUG_ON(RB_EMPTY(&ad->sort_list[READ]));
+		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));
 
-		if (writes && ad->batch_data_dir == READ)
+		if (writes && ad->batch_data_dir == REQ_SYNC)
 			/*
 			 * Last batch was a read, switch to writes
 			 */
 			goto dispatch_writes;
 
-		ad->batch_data_dir = READ;
+		ad->batch_data_dir = REQ_SYNC;
 		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
 		ad->current_batch_expires = jiffies +
 			ad->batch_expire[ad->batch_data_dir];
@@ -1181,9 +1186,9 @@ static int as_dispatch_request(struct as
 
 	if (writes) {
 dispatch_writes:
-		BUG_ON(RB_EMPTY(&ad->sort_list[WRITE]));
+		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
 
-		ad->batch_data_dir = WRITE;
+		ad->batch_data_dir = REQ_ASYNC;
 		arq = ad->next_arq[ad->batch_data_dir];
 		ad->current_batch_expires = jiffies +
 			ad->batch_expire[ad->batch_data_dir];
@@ -1231,7 +1236,14 @@ static struct request *as_next_request(r
  */
 static void as_add_request(struct as_data *ad, struct as_rq *arq)
 {
-	const int data_dir = rq_data_dir(arq->request);
+	int data_dir;
+
+	if (rq_data_dir(arq->request) == READ
+			|| current->flags&PF_SYNCWRITE)
+		arq->is_sync = 1;
+	else
+		arq->is_sync = 0;
+	data_dir = arq->is_sync;
 
 	arq->as_io_context = get_as_io_context();
 
@@ -1262,11 +1274,11 @@ as_insert_request(request_queue_t *q, st
 		AS_INVALIDATE_HASH(ad);
 		q->last_merge = NULL;
 
-		while (ad->next_arq[READ])
-			as_move_to_dispatch(ad, ad->next_arq[READ]);
+		while (ad->next_arq[REQ_SYNC])
+			as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
 
-		while (ad->next_arq[WRITE])
-			as_move_to_dispatch(ad, ad->next_arq[WRITE]);
+		while (ad->next_arq[REQ_ASYNC])
+			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
 
 		list_add_tail(&rq->queuelist, ad->dispatch);
 
@@ -1314,8 +1326,8 @@ static int as_queue_empty(request_queue_
 {
 	struct as_data *ad = q->elevator.elevator_data;
 
-	if (!list_empty(&ad->fifo_list[WRITE])
-		|| !list_empty(&ad->fifo_list[READ])
+	if (!list_empty(&ad->fifo_list[REQ_ASYNC])
+		|| !list_empty(&ad->fifo_list[REQ_SYNC])
 		|| !list_empty(ad->dispatch))
 			return 0;
 
@@ -1535,8 +1547,8 @@ static void as_exit(request_queue_t *q, 
 	del_timer_sync(&ad->antic_timer);
 	kblockd_flush();
 
-	BUG_ON(!list_empty(&ad->fifo_list[READ]));
-	BUG_ON(!list_empty(&ad->fifo_list[WRITE]));
+	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
+	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
 
 	mempool_destroy(ad->arq_pool);
 	put_as_io_context(&ad->as_io_context);
@@ -1585,20 +1597,20 @@ static int as_init(request_queue_t *q, e
 	for (i = 0; i < AS_HASH_ENTRIES; i++)
 		INIT_LIST_HEAD(&ad->hash[i]);
 
-	INIT_LIST_HEAD(&ad->fifo_list[READ]);
-	INIT_LIST_HEAD(&ad->fifo_list[WRITE]);
-	ad->sort_list[READ] = RB_ROOT;
-	ad->sort_list[WRITE] = RB_ROOT;
+	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
+	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
+	ad->sort_list[REQ_SYNC] = RB_ROOT;
+	ad->sort_list[REQ_ASYNC] = RB_ROOT;
 	ad->dispatch = &q->queue_head;
-	ad->fifo_expire[READ] = read_expire;
-	ad->fifo_expire[WRITE] = write_expire;
+	ad->fifo_expire[REQ_SYNC] = read_expire;
+	ad->fifo_expire[REQ_ASYNC] = write_expire;
 	ad->hash_valid_count = 1;
 	ad->antic_expire = antic_expire;
-	ad->batch_expire[READ] = read_batch_expire;
-	ad->batch_expire[WRITE] = write_batch_expire;
+	ad->batch_expire[REQ_SYNC] = read_batch_expire;
+	ad->batch_expire[REQ_ASYNC] = write_batch_expire;
 	e->elevator_data = ad;
 
-	ad->current_batch_expires = jiffies + ad->batch_expire[READ];
+	ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
 	return 0;
 }
 
@@ -1631,11 +1643,11 @@ static ssize_t __FUNC(struct as_data *ad
 {									\
 	return as_var_show(__VAR, (page));			\
 }
-SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[READ]);
-SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[WRITE]);
+SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
 SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[READ]);
-SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[WRITE]);
+SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
@@ -1648,13 +1660,13 @@ static ssize_t __FUNC(struct as_data *ad
 		*(__PTR) = (MAX);					\
 	return ret;							\
 }
-STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[READ], 0, INT_MAX);
-STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[WRITE], 0, INT_MAX);
+STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
 STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
 STORE_FUNCTION(as_read_batchexpire_store,
-			&ad->batch_expire[READ], 0, INT_MAX);
+			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
 STORE_FUNCTION(as_write_batchexpire_store,
-			&ad->batch_expire[WRITE], 0, INT_MAX);
+			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
 static struct as_fs_entry as_readexpire_entry = {
diff -puN fs/buffer.c~as-sync-async fs/buffer.c
--- 25/fs/buffer.c~as-sync-async	2003-06-04 20:30:30.000000000 -0700
+++ 25-akpm/fs/buffer.c	2003-06-04 20:30:30.000000000 -0700
@@ -318,6 +318,7 @@ asmlinkage long sys_fsync(unsigned int f
 
 	/* We need to protect against concurrent writers.. */
 	down(&inode->i_sem);
+	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(inode->i_mapping);
 	err = file->f_op->fsync(file, dentry, 0);
 	if (!ret)
@@ -325,6 +326,7 @@ asmlinkage long sys_fsync(unsigned int f
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!ret)
 		ret = err;
+	current->flags &= ~PF_SYNCWRITE;
 	up(&inode->i_sem);
 
 out_putf:
@@ -353,6 +355,7 @@ asmlinkage long sys_fdatasync(unsigned i
 		goto out_putf;
 
 	down(&inode->i_sem);
+	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(inode->i_mapping);
 	err = file->f_op->fsync(file, dentry, 1);
 	if (!ret)
@@ -360,6 +363,7 @@ asmlinkage long sys_fdatasync(unsigned i
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!ret)
 		ret = err;
+	current->flags &= ~PF_SYNCWRITE;
 	up(&inode->i_sem);
 
 out_putf:
diff -puN fs/fs-writeback.c~as-sync-async fs/fs-writeback.c
--- 25/fs/fs-writeback.c~as-sync-async	2003-06-04 20:30:30.000000000 -0700
+++ 25-akpm/fs/fs-writeback.c	2003-06-04 20:30:30.000000000 -0700
@@ -498,6 +498,7 @@ int generic_osync_inode(struct inode *in
 	int need_write_inode_now = 0;
 	int err2;
 
+	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(inode->i_mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -510,6 +511,7 @@ int generic_osync_inode(struct inode *in
 		if (!err)
 			err = err2;
 	}
+	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff -puN include/linux/sched.h~as-sync-async include/linux/sched.h
--- 25/include/linux/sched.h~as-sync-async	2003-06-04 20:30:30.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-06-04 20:30:30.000000000 -0700
@@ -479,6 +479,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
+#define PF_SYNCWRITE	0x00100000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
 extern void set_cpus_allowed(task_t *p, unsigned long new_mask);

_