From: Nick Piggin <piggin@cyberone.com.au>

It seems to go alright in the basic tests.  It really helps the "1 reader
and 1 writer" case on my SCSI drive, but I'd like you to do your run of
TCQ tests on it if you could.

Quite a few big changes were needed in order to get SCSI TCQ working
properly.  I'll give you a short summary:

* Measure write batches by number of requests rather than by time.  Time
  has little meaning here, as we can fill a lot of disk cache in a small
  amount of time.  (This option should eventually be moved back to a
  time-based tunable which adjusts itself based on how long the last run
  took; at the moment it probably won't write out enough on devices with
  small request sizes.)  There is a small stand-alone sketch of this after
  the list.

* When changing batch direction, don't submit any requests in the new
  direction until all outstanding requests have been completed (see the
  second sketch after this list).

* When starting a new batch direction, make the first request a hard
  barrier (I don't know if this helps).

* Don't start timing a read (actually "SYNC") batch until its first
  request has been completed.  This should help with writeback caches
  (also covered by the second sketch below).
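
To make the first point concrete, here is a tiny user-space model of a
write batch measured in requests.  It is only a sketch: struct batch,
WRITE_BATCH_EXPIRE and the function names are made up for illustration;
in the patch itself the counter lives in ad->current_batch_expires and
is decremented in as_move_to_dispatch().

#include <stdio.h>

#define WRITE_BATCH_EXPIRE 5            /* requests allowed per write batch */

struct batch {
        int budget;                     /* write requests left in this batch */
};

static void start_write_batch(struct batch *b)
{
        b->budget = WRITE_BATCH_EXPIRE;
}

static int write_batch_expired(const struct batch *b)
{
        return b->budget == 0;
}

static void dispatch_write(struct batch *b)
{
        if (b->budget)
                b->budget--;
}

int main(void)
{
        struct batch b;
        int i;

        start_write_batch(&b);
        for (i = 0; i < 8; i++) {
                if (write_batch_expired(&b)) {
                        printf("write %d: batch used up, look at reads\n", i);
                        break;
                }
                dispatch_write(&b);
        }
        return 0;
}

Counting requests rather than jiffies means a drive with a big write
cache can't swallow an entire time-based batch before any of it hits
the platter.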
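
And here is a similarly simplified model of the direction change and the
deferred read-batch timing (the second and fourth points above).  Again
the names (struct sched, may_dispatch() and so on) are invented; the real
logic is spread across as_dispatch_request() and as_completed_request()
in the patch, driven by ad->changed_batch and ad->nr_dispatched.

#include <stdio.h>

enum dir { DIR_SYNC, DIR_ASYNC };       /* reads vs writes */

struct sched {
        enum dir batch_dir;     /* direction of the current batch */
        int changed_batch;      /* switched direction, old IO still draining */
        int nr_dispatched;      /* requests at the driver, not yet completed */
        int read_clock_started; /* stands in for setting current_batch_expires */
};

/* may a request in the current batch direction go to the driver? */
static int may_dispatch(const struct sched *s)
{
        return !(s->changed_batch && s->nr_dispatched);
}

static void switch_batch(struct sched *s, enum dir new_dir)
{
        if (s->batch_dir != new_dir) {
                s->batch_dir = new_dir;
                s->changed_batch = 1;
                s->read_clock_started = 0;
        }
}

static void complete_one(struct sched *s, enum dir req_dir)
{
        s->nr_dispatched--;
        /* only start timing the read batch once its first request has
         * actually completed (helps writeback caches / big TCQ windows) */
        if (s->changed_batch && s->batch_dir == DIR_SYNC && req_dir == DIR_SYNC) {
                s->read_clock_started = 1;
                s->changed_batch = 0;
        }
}

int main(void)
{
        struct sched s = { DIR_ASYNC, 0, 3, 0 };        /* three writes in flight */
        int i;

        switch_batch(&s, DIR_SYNC);
        printf("dispatch reads while draining? %d\n", may_dispatch(&s));
        for (i = 0; i < 3; i++)
                complete_one(&s, DIR_ASYNC);            /* old writes finish */
        printf("dispatch reads now? %d\n", may_dispatch(&s));
        s.nr_dispatched++;                              /* first read goes out */
        complete_one(&s, DIR_SYNC);                     /* ...and completes */
        printf("read batch clock started? %d\n", s.read_clock_started);
        return 0;
}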

It also includes some fixes and miscellaneous things done along the way:

* Clean up / move some enums around.

* Now that we have sync vs async, we shouldn't need to keep track of
  "data dir".  Get rid of it.

* Thinktime is only accounted when the process has no outstanding
  requests, which seems pretty sane when you think about it (sketched
  after this list).

* Limit the rate at which seek_distance can grow.  A process that
  generally seeks nicely shouldn't suffer _too_ much if it has to seek to
  a fragment, or page in some code, a library, swap, etc. (also sketched
  after the list).

* Properly set arq->state = AS_RQ_NEW.  Now that arqs are dynamically
  initialised, the place where this was previously done is no longer of
  any use.

* Allow REQ_HARDBARRIER requests to be dispatched normally.  No reason
  why they shouldn't.
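
As an illustration of the thinktime change, here is a toy user-space
check (not the patch itself; in the real code the test lives in
as_update_iohist() and looks at aic->nr_queued and aic->nr_dispatched):

#include <stdio.h>

struct proc_io {
        int io_running;         /* process has completed some IO before */
        int nr_queued;          /* its requests still queued in the scheduler */
        int nr_dispatched;      /* its requests currently at the driver */
};

/* only count the gap since the last completion as thinktime if the
 * process was genuinely idle, with nothing queued or in flight */
static int sample_thinktime(const struct proc_io *p)
{
        return p->io_running && !p->nr_queued && !p->nr_dispatched;
}

int main(void)
{
        struct proc_io busy = { 1, 0, 2 }, idle = { 1, 0, 0 };

        printf("busy: %d  idle: %d\n",
               sample_thinktime(&busy), sample_thinktime(&idle));
        return 0;
}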
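
And a sketch of the seek-distance clamping, using the same constants as
the patch.  clamp_seek() and min_sect() are invented names; the
arithmetic is otherwise what as_update_iohist() now does.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

static sector_t min_sect(sector_t a, sector_t b)
{
        return a < b ? a : b;
}

/* clamp a new seek sample so one huge seek (fragment, pagein, swap)
 * can't blow the running mean out */
static sector_t clamp_seek(long seek_samples, sector_t seek_mean,
                           sector_t seek_dist)
{
        if (!seek_samples)
                return 0;                       /* no history yet */
        if (seek_samples < 400)                 /* only a couple of samples */
                return min_sect(seek_dist, seek_mean * 4 + 2*1024*1024);
        return min_sect(seek_dist, seek_mean * 4 + 2*1024*64);
}

int main(void)
{
        /* a 50M-sector seek from a process whose mean seek is 100 sectors */
        printf("%llu\n",
               (unsigned long long)clamp_seek(1024, 100, 50000000));
        return 0;
}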



 drivers/block/as-iosched.c |  175 ++++++++++++++++++++++++---------------------
 1 files changed, 96 insertions(+), 79 deletions(-)

diff -puN drivers/block/as-iosched.c~as-jumbo-patch-for-scsi drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-jumbo-patch-for-scsi	2003-05-28 03:10:51.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-05-28 03:10:51.000000000 -0700
@@ -47,10 +47,10 @@ static unsigned long write_expire = HZ /
 static unsigned long read_batch_expire = HZ / 5;
 
 /*
- * write_batch_expire describes how long we will allow a stream of writes to
- * persist before looking to see whether it is time to switch over to reads.
+ * write_batch_expire describes how many write request we allow before looking
+ * to see whether it is time to switch over to reads.
  */
-static unsigned long write_batch_expire = HZ / 20;
+static unsigned long write_batch_expire = 5;
 
 /*
  * max time we may wait to anticipate a read (default around 6ms)
@@ -74,6 +74,12 @@ static unsigned long antic_expire = ((HZ
  */
 #define MAX_THINKTIME (HZ/50UL)
 
+/* Bits in as_io_context.state */
+enum as_io_states {
+	AS_TASK_RUNNING=0,	/* Process has not exitted */
+	AS_TASK_IORUNNING,	/* Process has completed some IO */
+};
+
 struct as_io_context {
 	atomic_t refcount;
 	pid_t pid;
@@ -87,10 +93,6 @@ struct as_io_context {
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
-	/* Read / write pattern */
-	int last_data_dir;
-	unsigned long dir_after_read[2];
-	int mean_dir_after_read;
 	/* Layout pattern */
 	long seek_samples;
 	sector_t last_request_pos;
@@ -98,10 +100,13 @@ struct as_io_context {
 	sector_t seek_mean;
 };
 
-/* Bits in as_io_context.state */
-enum as_io_states {
-	AS_TASK_RUNNING=0,	/* Process has not exitted */
-	AS_TASK_IORUNNING,	/* Process has completed some IO */
+enum anticipation_status {
+	ANTIC_OFF=0,		/* Not anticipating (normal operation)	*/
+	ANTIC_WAIT_REQ,		/* The last read has not yet completed  */
+	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
+				   last read (which has completed) */
+	ANTIC_FINISHED,		/* Anticipating but have found a candidate
+				 * or timed out */
 };
 
 struct as_data {
@@ -118,21 +123,23 @@ struct as_data {
 	struct list_head fifo_list[2];
 	
 	struct as_rq *next_arq[2];	/* next in sort order */
-	sector_t last_sector[2];	/* last REQ_SYNC and REQ_ASYNC sectors */
+	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
 	unsigned long hash_valid_count;	/* barrier hash count */
 	unsigned long current_batch_expires;
 	unsigned long last_check_fifo[2];
-	int batch_data_dir;		/* current/last batch REQ_SYNC or REQ_ASYNC */
+	int changed_batch;
+	int batch_data_dir;		/* current batch REQ_SYNC / REQ_ASYNC */
 	mempool_t *arq_pool;
 
-	int antic_status;
+	enum anticipation_status antic_status;
 	unsigned long antic_start;	/* jiffies: when it started */
 	struct timer_list antic_timer;	/* anticipatory scheduling timer */
 	struct work_struct antic_work;	/* Deferred unplugging */
 	struct as_io_context *as_io_context;/* Identify the expected process */
 	int aic_finished; /* IO associated with as_io_context finished */
+	int nr_dispatched;
 
 	/*
 	 * settings that change how the i/o scheduler behaves
@@ -144,15 +151,6 @@ struct as_data {
 
 #define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)
 
-enum anticipation_states {
-	ANTIC_OFF=0,		/* Not anticipating (normal operation)	*/
-	ANTIC_WAIT_REQ,		/* The last read has not yet completed  */
-	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
-				   last read (which has completed) */
-	ANTIC_FINISHED,		/* Anticipating but have found a candidate
-				 * or timed out */
-};
-
 /*
  * per-request data.
  */
@@ -260,9 +258,6 @@ static struct as_io_context *get_as_io_c
 			ret->ttime_total = 0;
 			ret->ttime_samples = 0;
 			ret->ttime_mean = 0;
-			ret->dir_after_read[REQ_SYNC] = 0;
-			ret->dir_after_read[REQ_ASYNC] = 0;
-			ret->mean_dir_after_read = REQ_SYNC;
 			ret->seek_total = 0;
 			ret->seek_samples = 0;
 			ret->seek_mean = 0;
@@ -750,13 +745,12 @@ static int as_can_break_anticipation(str
 	}
 
 	aic = ad->as_io_context;
+	BUG_ON(!aic);
+
 	if (arq && aic == arq->as_io_context) {
 		/* request from same process */
 		return 1;
 	}
-
-	if (!aic)
-		return 0;
 	
 	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
 		/* process anticipated on has exitted */
@@ -778,20 +772,16 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (aic->mean_dir_after_read != REQ_SYNC) {
-		/* next request from this process will probably be a write */
-		return 1;
-	}
-
 	if (arq && aic->seek_samples) {
 		sector_t s;
 		if (ad->last_sector[REQ_SYNC] < arq->request->sector)
 			s = arq->request->sector - ad->last_sector[REQ_SYNC];
 		else
 			s = ad->last_sector[REQ_SYNC] - arq->request->sector;
-		if (aic->seek_mean > s)
+		if (aic->seek_mean > s) {
 			/* this request is better than what we're expecting */
 			return 1;
+		}
 	}
 	
 	return 0;
@@ -849,7 +839,9 @@ static void as_update_iohist(struct as_i
 		return;
 
 	if (data_dir == REQ_SYNC) {
-		if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
+		if (test_bit(AS_TASK_IORUNNING, &aic->state)
+				&& !atomic_read(&aic->nr_queued)
+				&& !atomic_read(&aic->nr_dispatched)) {
 			/* Calculate read -> read thinktime */
 			thinktime = jiffies - aic->last_end_request;
 			thinktime = min(thinktime, MAX_THINKTIME-1);
@@ -873,8 +865,16 @@ static void as_update_iohist(struct as_i
 			seek_dist = aic->last_request_pos - rq->sector;
 		aic->last_request_pos = rq->sector + rq->nr_sectors;
 
+		/*
+		 * Don't allow the seek distance to get too large from the
+		 * odd fragment, pagein, etc
+		 */
 		if (!aic->seek_samples)
 			seek_dist = 0;
+		else if (aic->seek_samples < 400) /* second&third seek */
+			seek_dist = min(seek_dist, (aic->seek_mean * 4) + 2*1024*1024);
+		else
+			seek_dist = min(seek_dist, (aic->seek_mean * 4) + 2*1024*64);
 
 		aic->seek_samples += 256;
 		aic->seek_total += 256*seek_dist;
@@ -887,23 +887,6 @@ static void as_update_iohist(struct as_i
 		aic->seek_total = (aic->seek_total>>1)
 					+ (aic->seek_total>>2);
 	}
-
-	/* Calculate read/write pattern */
-	if (aic->last_data_dir == REQ_SYNC) {
-		unsigned long rprob, wprob;
-		aic->dir_after_read[data_dir] += 256;
-		rprob = aic->dir_after_read[REQ_SYNC];
-		wprob = aic->dir_after_read[REQ_ASYNC];
-
-		if (rprob*4 >= wprob*5)
-			aic->mean_dir_after_read = REQ_SYNC;
-		else
-			aic->mean_dir_after_read = REQ_ASYNC;
-
-		aic->dir_after_read[REQ_SYNC] = (rprob>>1) + (rprob>>2);
-		aic->dir_after_read[REQ_ASYNC] = (wprob>>1) + (wprob>>2);
-	}
-	aic->last_data_dir = data_dir;
 }
 
 /*
@@ -940,13 +923,35 @@ static void as_completed_request(request
 	struct as_rq *arq = RQ_DATA(rq);
 	struct as_io_context *aic = arq->as_io_context;
 
-	arq->state = AS_RQ_NEW;
-
-	if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER)) {
+	if (unlikely(!blk_fs_request(rq))) {
 		WARN_ON(aic);
 		return;
 	}
 
+	if (blk_fs_request(rq) && arq->state == AS_RQ_NEW)
+		printk(KERN_INFO "warning: as_completed_request got bad request\n");
+				
+	if (arq->state != AS_RQ_DISPATCHED)
+		return;
+	
+	if (ad->changed_batch && ad->nr_dispatched == 1) {
+		kblockd_schedule_work(&ad->antic_work);
+		ad->changed_batch = 2;
+	}
+	ad->nr_dispatched--;
+
+	/*
+	 * Start counting the batch from when a request of that direction is
+	 * actually serviced. This should help devices with big TCQ windows
+	 * and writeback caches
+	 */
+	if (ad->batch_data_dir == REQ_SYNC && ad->changed_batch
+			&& ad->batch_data_dir == arq->is_sync) {
+		ad->current_batch_expires = jiffies +
+				ad->batch_expire[REQ_SYNC];
+		ad->changed_batch = 0;
+	}
+	
 	if (!aic)
 		return;
 
@@ -1036,7 +1041,7 @@ static void as_remove_request(request_qu
 {
 	struct as_rq *arq = RQ_DATA(rq);
 
-	if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER))
+	if (unlikely(!blk_fs_request(rq)))
 		return;
 
 	if (arq) {
@@ -1082,9 +1087,14 @@ static int as_fifo_expired(struct as_dat
  */
 static inline int as_batch_expired(struct as_data *ad)
 {
-	return time_after(jiffies, ad->current_batch_expires) &&
-		(ad->batch_data_dir == REQ_ASYNC ||
-		 time_after(jiffies, ad->fifo_expire[REQ_SYNC]));
+	if (ad->changed_batch)
+		return 0;
+
+	if (ad->batch_data_dir == REQ_SYNC)
+		return time_after(jiffies, ad->current_batch_expires) &&
+		 	time_after(jiffies, ad->fifo_expire[REQ_SYNC]);
+
+	return !ad->current_batch_expires;
 }
 
 /*
@@ -1106,11 +1116,16 @@ static void as_move_to_dispatch(struct a
 	ad->last_sector[data_dir] = arq->request->sector
 					+ arq->request->nr_sectors;
 
+	ad->nr_dispatched++;
+
 	if (data_dir == REQ_SYNC) {
 		/* In case we have to anticipate after this */
 		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
-	} else
+	} else {
 		put_as_io_context(&ad->as_io_context);
+		if (ad->current_batch_expires)
+			ad->current_batch_expires--;
+	}
 	ad->aic_finished = 0;
 
 	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
@@ -1138,7 +1153,10 @@ static int as_dispatch_request(struct as
 	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
 	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
 
-	if (!(reads || writes))
+	if (!(reads || writes)
+		|| ad->antic_status == ANTIC_WAIT_REQ
+		|| ad->antic_status == ANTIC_WAIT_NEXT
+		|| ad->changed_batch == 1)
 		return 0;
 
 	if (!(reads && writes && as_batch_expired(ad)) ) {
@@ -1180,10 +1198,10 @@ static int as_dispatch_request(struct as
 			 */
 			goto dispatch_writes;
 
+ 		if (ad->batch_data_dir == REQ_ASYNC)
+ 			ad->changed_batch = 1;
 		ad->batch_data_dir = REQ_SYNC;
 		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
-		ad->current_batch_expires = jiffies +
-			ad->batch_expire[ad->batch_data_dir];
 		ad->last_check_fifo[ad->batch_data_dir] = jiffies;
 		goto dispatch_request;
 	}
@@ -1197,9 +1215,10 @@ dispatch_writes:
 		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
 
 		ad->batch_data_dir = REQ_ASYNC;
+ 		if (ad->batch_data_dir == REQ_SYNC)
+ 			ad->changed_batch = 1;
+		ad->current_batch_expires = ad->batch_expire[REQ_ASYNC];
 		arq = ad->next_arq[ad->batch_data_dir];
-		ad->current_batch_expires = jiffies +
-			ad->batch_expire[ad->batch_data_dir];
 		goto dispatch_request;
 	}
 
@@ -1217,6 +1236,16 @@ fifo_expired:
 		BUG_ON(arq == NULL);
 	}
 
+	if (ad->changed_batch) {
+		if (ad->changed_batch == 1 && ad->nr_dispatched)
+			return 0;
+		if (ad->changed_batch == 2 && ad->batch_data_dir == REQ_ASYNC)
+			ad->changed_batch = 0;
+		else
+			ad->changed_batch = 2;
+		arq->request->flags |= REQ_HARDBARRIER;
+	}
+	
 	/*
 	 * arq is the selected appropriate request.
 	 */
@@ -1287,16 +1316,6 @@ as_insert_request(request_queue_t *q, st
 
 		while (ad->next_arq[REQ_ASYNC])
 			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
-
-		list_add_tail(&rq->queuelist, ad->dispatch);
-
-		/* Stop anticipating - let this request get through */
-		if (!list_empty(ad->dispatch)
-				&& (ad->antic_status == ANTIC_WAIT_REQ
-				|| ad->antic_status == ANTIC_WAIT_NEXT))
-			as_antic_stop(ad);
-
-		return;
 	}
 
 	if (unlikely(!blk_fs_request(rq))) {
@@ -1534,13 +1553,11 @@ static int as_set_request(request_queue_
 	if (arq) {
 		RB_CLEAR(&arq->rb_node);
 		arq->request = rq;
-
+		arq->state = AS_RQ_NEW;
 		arq->as_io_context = NULL;
 		INIT_LIST_HEAD(&arq->hash);
 		arq->hash_valid_count = 0;
-
 		INIT_LIST_HEAD(&arq->fifo);
-
 		rq->elevator_private = arq;
 		return 0;
 	}

_