Patch from Nick Piggin <piggin@cyberone.com.au>

AFAICS only one forward declaration now remains, and that one really can't go.
Better arrangement of the functions, and more comments.  The last BUG_ON fix is
included in this, but other than that there should be no functional changes.
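
For reference, the request-selection heuristic this patch moves down into the
"IO Scheduler proper" block (as_choose_req) picks between two requests by
comparing their cost relative to the last serviced sector: a forward seek costs
its distance, a short backward seek costs twice the backward distance and is
only allowed for reads from the same process within MAXBACK, and anything
further behind the head is treated as wrapped and loses to any non-wrapped
request.  A minimal standalone sketch of that cost calculation (the helper
name and simplified signature are illustrative only, not part of the patch):

	/*
	 * Sketch only: the per-request cost as_choose_req assigns relative
	 * to the last serviced sector (assumes kernel types, e.g. sector_t).
	 */
	#define MAXBACK (512 * 1024)

	static sector_t as_seek_cost(sector_t last, sector_t s,
				     int may_go_back, int *wrap)
	{
		*wrap = 0;
		if (s >= last)
			return s - last;	/* strict one-way elevator */
		if (may_go_back && s + MAXBACK >= last)
			return (last - s) * 2;	/* short backward seek, 2x cost */
		*wrap = 1;			/* behind the head: least preferred */
		return 0;
	}

	/* may_go_back stands for: data_dir == READ && same as_io_context */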



 drivers/block/as-iosched.c | 1083 ++++++++++++++++++++++-----------------------
 1 files changed, 546 insertions(+), 537 deletions(-)

diff -puN drivers/block/as-iosched.c~as-cleanup-2 drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-cleanup-2	2003-03-16 19:49:24.000000000 -0800
+++ 25-akpm/drivers/block/as-iosched.c	2003-03-16 19:49:24.000000000 -0800
@@ -81,15 +81,6 @@ static unsigned long write_batch_expire 
  */
 static unsigned long antic_expire = HZ / 100;
 
-enum anticipation_states {
-	ANTIC_OFF=0,		/* Not anticipating (normal operation)	*/
-	ANTIC_WAIT_REQ,		/* The last read has not yet completed  */
-	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
-				   last read (which has completed) */
-	ANTIC_FINISHED,		/* Anticipating but have found a candidate
-				   or timed out	*/
-};
-
 /*
  * This is the per-process anticipatory I/O scheduler state.  It is refcounted
  * and kmalloc'ed.
@@ -158,12 +149,15 @@ struct as_data {
 	unsigned long antic_expire;
 };
 
-enum arq_states {
-	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
-	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
-				   scheduler */
-	AS_RQ_DISPATCHED,	/* On the dispatch list. It belongs to the
-				   driver now */
+#define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)
+
+enum anticipation_states {
+	ANTIC_OFF=0,		/* Not anticipating (normal operation)	*/
+	ANTIC_WAIT_REQ,		/* The last read has not yet completed  */
+	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
+				   last read (which has completed) */
+	ANTIC_FINISHED,		/* Anticipating but have found a candidate
+				   or timed out	*/
 };
 
 /*
@@ -192,7 +186,15 @@ struct as_rq {
 	struct list_head fifo;
 	unsigned long expires;
 
-	unsigned long state;
+	unsigned long state; /* debug only */
+};
+
+enum arq_states {
+	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
+	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
+				   scheduler */
+	AS_RQ_DISPATCHED,	/* On the dispatch list. It belongs to the
+				   driver now */
 };
 
 #define RQ_DATA(rq)	((struct as_rq *) (rq)->elevator_private)
@@ -389,40 +391,6 @@ static struct as_rq *as_find_first_arq(s
 	}
 }
 
-static struct as_rq *
-as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2);
-
-/*
- * as_find_next_arq finds the next request after @prev in elevator order.
- */
-static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
-{
-	const int data_dir = rq_data_dir(last->request);
-	struct as_rq *ret;
-	struct rb_node *rbnext = rb_next(&last->rb_node);
-	struct rb_node *rbprev = rb_prev(&last->rb_node);
-	struct as_rq *arq_next, *arq_prev;
-
-	BUG_ON(!ON_RB(&last->rb_node));
-
-	if (rbprev)
-		arq_prev = rb_entry_arq(rbprev);
-	else
-		arq_prev = NULL;
-
-	if (rbnext)
-		arq_next = rb_entry_arq(rbnext);
-	else {
-		arq_next = as_find_first_arq(ad, data_dir);
-		if (arq_next == last)
-			arq_next = NULL;
-	}
-
-	ret = as_choose_req(ad,	arq_next, arq_prev);
-	
-	return ret;
-}
-
 static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
@@ -446,7 +414,6 @@ static struct as_rq *__as_add_arq_rb(str
 }
 
 static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
-
 /*
  * Aad the request to the rb tree if it is unique.  If there is an alias (an
  * existing request against the same sector), which can happen when using
@@ -460,6 +427,7 @@ static void as_add_arq_rb(struct as_data
 	 
 	arq->rb_key = rq_rb_key(rq);
 
+	/* This can be caused by direct IO */
 	while ((alias = __as_add_arq_rb(ad, arq)))
 		as_move_to_dispatch(ad, alias);
 	
@@ -494,395 +462,173 @@ as_find_arq_rb(struct as_data *ad, secto
 	return NULL;
 }
 
-static void as_antic_waitnext(struct as_data *ad);
+/*
+ * IO Scheduler proper
+ */
+
+#define MAXBACK (512 * 1024)	/* Maximum distance a process can seek backward
+				   from a previous request it has made. No
+				   seeking backward between processes allowed */
 
 /*
- * as_update_iohist keeps a decaying histogram of IO thinktimes, and
- * updates @aic->mean_thinktime based on that. It is called when a new
- * request is queued.
+ * as_choose_req selects the preferred one of two requests of the same data_dir
+ * ignoring time - eg. timeouts, which is the job of as_dispatch_request
  */
-static void as_update_iohist(struct as_io_context *aic)
+static struct as_rq *
+as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
 {
-	unsigned i;
-	unsigned long thinktime;
-	unsigned long total = 0;
-	unsigned long num = 0;
+	int data_dir;
+	sector_t last, s1, s2, d1, d2;
+	int r1_wrap=0, r2_wrap=0;	/* requests are behind the disk head */
+	const sector_t maxback = MAXBACK;
 
-	if (aic == NULL)
-		return;
+	if (arq1 == NULL || arq1 == arq2)
+		return arq2;
+	if (arq2 == NULL)
+		return arq1;
 
-	if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
-		thinktime = jiffies - aic->last_end_request;
-		thinktime = min(thinktime, MAX_THINKTIME-1);
-		aic->thinktime[thinktime] += 256; /* fixed point: 1.0 == 1<<8 */
+	data_dir = rq_data_dir(arq1->request);
 
-		for (i = 0; i < MAX_THINKTIME; i++) {
-			unsigned long tt = aic->thinktime[i];
-			total += i*tt;
-			num += tt;
+	last = ad->last_sector[data_dir];
+	s1 = arq1->request->sector;
+	s2 = arq2->request->sector;
 
-			aic->thinktime[i] = (tt>>1) + (tt>>2); /* 75% decay */
-		}
-		/* fixed point factor is cancelled here */
-		if (num)
-			aic->mean_thinktime = total / num;
+	BUG_ON(data_dir != rq_data_dir(arq2->request));
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek. Only for reads and only between reads
+	 * from the same process!
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (data_dir == READ
+			&& ad->as_io_context == arq1->as_io_context
+			&& s1+maxback >= last)
+				d1 = (last - s1)*2;
+	else {
+		r1_wrap = 1;
+		d1 = 0; /* shut up, gcc */
+	}
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (data_dir == READ
+			&& ad->as_io_context == arq2->as_io_context
+			&& s2+maxback >= last)
+				d2 = (last - s2)*2;
+	else {
+		r2_wrap = 1;
+		d2 = 0;
+	}
+
+	/* Found required data */
+	if (!r1_wrap && r2_wrap)
+		return arq1;
+	else if (!r2_wrap && r1_wrap)
+		return arq2;
+	else if (r1_wrap && r2_wrap) {
+		/* both behind the head */
+		if (s1 <= s2)
+			return arq1;
+		else
+			return arq2;
+	}
+	
+	/* Both requests in front of the head */
+	if (d1 < d2) 
+		return arq1;
+	else if (d2 < d1)
+		return arq2;
+	else {
+		if (s1 >= s2)
+			return arq1;
+		else
+			return arq2;
 	}
 }
 
 /*
- * Look Ma, no comment!
+ * as_find_next_arq finds the next request after @prev in elevator order.
+ * This, together with as_choose_req, forms the basis for how the scheduler
+ * chooses what request to process next.  Anticipation works on top of this.
  */
-
-static void as_complete_arq(struct as_data *ad, struct as_rq *arq)
+static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
 {
-	if (!arq->as_io_context)
-		return;
+	const int data_dir = rq_data_dir(last->request);
+	struct as_rq *ret;
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct as_rq *arq_next, *arq_prev;
 
-	if (rq_data_dir(arq->request) == READ) {
-		set_bit(AS_TASK_IORUNNING, &arq->as_io_context->state);
-		arq->as_io_context->last_end_request = jiffies;
-	}
+	BUG_ON(!ON_RB(&last->rb_node));
 
-	if (ad->as_io_context == arq->as_io_context) {
-		ad->antic_start = jiffies;
-		ad->aic_finished = 1;
-		if (ad->antic_status == ANTIC_WAIT_REQ) {
-			/*
-			 * We were waiting on this request, now anticipate
-			 * the next one
-			 */
-			as_antic_waitnext(ad);
-		}
+	if (rbprev)
+		arq_prev = rb_entry_arq(rbprev);
+	else
+		arq_prev = NULL;
+
+	if (rbnext)
+		arq_next = rb_entry_arq(rbnext);
+	else {
+		arq_next = as_find_first_arq(ad, data_dir);
+		if (arq_next == last)
+			arq_next = NULL;
 	}
-	put_as_io_context(&arq->as_io_context);
-}
 
-static void as_update_arq(struct as_data *ad, struct as_rq *arq);
+	ret = as_choose_req(ad,	arq_next, arq_prev);
+	
+	return ret;
+}
 
 /*
- * add arq to rbtree and fifo
+ * anticipatory scheduling functions follow
  */
-static void as_add_request(struct as_data *ad, struct as_rq *arq)
-{
-	const int data_dir = rq_data_dir(arq->request);
-
-	arq->as_io_context = get_as_io_context();
-	if (arq->as_io_context) {
-		atomic_inc(&arq->as_io_context->nr_queued);
 
-		if (data_dir == READ)
-			as_update_iohist(arq->as_io_context);
-	}
+/* 
+ * as_antic_expired tells us when we have anticipated too long.
+ * The funny "absolute difference" math on the elapsed time is to handle
+ * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
+ */
+static int as_antic_expired(struct as_data *ad)
+{
+	long delta_jif;
 
-	as_add_arq_rb(ad, arq);
+	delta_jif = jiffies - ad->antic_start;
+	if (unlikely(delta_jif < 0))
+		delta_jif = -delta_jif;
+	if (delta_jif < ad->antic_expire)
+		return 0;
 
-	/*
-	 * set expire time (only used for reads) and add to fifo list
-	 */
-	arq->expires = jiffies + ad->fifo_expire[data_dir];
-	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
-	arq->state = AS_RQ_QUEUED;
-	as_update_arq(ad, arq); /* keep state machine up to date */
+	return 1;
 }
 
 /*
- * as_remove_queued_request removes a request from the pre dispatch queue
- * without updating refcounts. It is expected the caller will drop the
- * reference unless it replaces the request at somepart of the elevator
- * (ie. the dispatch queue)
+ * as_antic_waitnext starts anticipating that a nice request will soon be
+ * submitted. See also as_antic_waitreq
  */
-static void as_remove_queued_request(request_queue_t *q, struct request *rq)
+static void as_antic_waitnext(struct as_data *ad)
 {
-	struct as_rq *arq = RQ_DATA(rq);
-
-	if (!arq)
-		BUG();
-	else {
-		const int data_dir = rq_data_dir(arq->request);
-		struct as_data *ad = q->elevator.elevator_data;
+	unsigned long timeout;
 
-		BUG_ON(arq->state != AS_RQ_QUEUED);
+	BUG_ON(ad->antic_status != ANTIC_OFF
+			&& ad->antic_status != ANTIC_WAIT_REQ);
 
-		if (arq->as_io_context) {
-			BUG_ON(!atomic_read(&arq->as_io_context->nr_queued));
-			atomic_dec(&arq->as_io_context->nr_queued);
-		}
-
-		/*
-		 * Update the "next_arq" cache if we are about to remove its
-		 * entry
-		 */
-		if (ad->next_arq[data_dir] == arq)
-			ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
-		
-		list_del_init(&arq->fifo);
-		as_del_arq_hash(arq);
-		as_del_arq_rb(ad, arq);
-
-		if (q->last_merge == &rq->queuelist)
-			q->last_merge = NULL;
-
-		list_del_init(&rq->queuelist);
-	}
-
-}
-
-/*
- * as_remove_dispatched_request is called when a driver has completed the
- * request (or it has caused an error), and is finished with it. It assumes
- * the request is on the dispatch queue.
- */
-static void as_remove_request(request_queue_t *q, struct request *rq)
-{
-	struct as_rq *arq = RQ_DATA(rq);
-	struct as_data *ad = q->elevator.elevator_data;
-
-	if (q->last_merge == &rq->queuelist)
-		q->last_merge = NULL;
-
-	list_del_init(&rq->queuelist);
-
-	if (arq) {
-		list_del_init(&arq->fifo);
-		as_del_arq_hash(arq);
-		as_del_arq_rb(ad, arq);
-		if (arq->as_io_context) {
-			WARN_ON(!atomic_read(&arq->as_io_context->nr_dispatched));
-			atomic_dec(&arq->as_io_context->nr_dispatched);
-		}
-		as_complete_arq(ad, arq);
-	}
-}
-
-static int
-as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
-{
-	struct as_data *ad = q->elevator.elevator_data;
-	struct request *__rq;
-	int ret;
-
-	/*
-	 * try last_merge to avoid going to hash
-	 */
-	ret = elv_try_last_merge(q, bio);
-	if (ret != ELEVATOR_NO_MERGE) {
-		__rq = list_entry_rq(q->last_merge);
-		goto out_insert;
-	}
-
-	/*
-	 * see if the merge hash can satisfy a back merge
-	 */
-	__rq = as_find_arq_hash(ad, bio->bi_sector);
-	if (__rq) {
-		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_BACK_MERGE;
-			goto out;
-		}
-	}
-
-	/*
-	 * check for front merge
-	 */
-	if (ad->front_merges) {
-		sector_t rb_key = bio->bi_sector + bio_sectors(bio);
-
-		__rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
-		if (__rq) {
-			BUG_ON(rb_key != rq_rb_key(__rq));
-
-			if (elv_rq_merge_ok(__rq, bio)) {
-				ret = ELEVATOR_FRONT_MERGE;
-				goto out;
-			}
-		}
-	}
-
-	return ELEVATOR_NO_MERGE;
-out:
-	q->last_merge = &__rq->queuelist;
-out_insert:
-	*insert = &__rq->queuelist;
-	return ret;
-}
-
-static void as_merged_request(request_queue_t *q, struct request *req)
-{
-	struct as_data *ad = q->elevator.elevator_data;
-	struct as_rq *arq = RQ_DATA(req);
-
-	/*
-	 * hash always needs to be repositioned, key is end sector
-	 */
-	as_del_arq_hash(arq);
-	as_add_arq_hash(ad, arq);
-
-	/*
-	 * if the merge was a front merge, we need to reposition request
-	 */
-	if (rq_rb_key(req) != arq->rb_key) {
-		as_del_arq_rb(ad, arq);
-		as_add_arq_rb(ad, arq);
-		/*
-		 * Note! At this stage of this and the next function, our next
-		 * request may not be optimal - eg the request may have "grown"
-		 * behind the disk head. We currently don't bother adjusting.
-		 */
-	}
-
-	q->last_merge = &req->queuelist;
-}
-
-static void
-as_merged_requests(request_queue_t *q, struct request *req,
-			 struct request *next)
-{
-	struct as_data *ad = q->elevator.elevator_data;
-	struct as_rq *arq = RQ_DATA(req);
-	struct as_rq *anext = RQ_DATA(next);
-
-	BUG_ON(!arq);
-	BUG_ON(!anext);
-
-	/*
-	 * reposition arq (this is the merged request) in hash, and in rbtree
-	 * in case of a front merge
-	 */
-	as_del_arq_hash(arq);
-	as_add_arq_hash(ad, arq);
-
-	if (rq_rb_key(req) != arq->rb_key) {
-		as_del_arq_rb(ad, arq);
-		as_add_arq_rb(ad, arq);
-	}
-
-	/*
-	 * if anext expires before arq, assign its expire time to arq
-	 * and move into anext position (anext will be deleted) in fifo
-	 */
-	if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
-		if (time_before(anext->expires, arq->expires)) {
-			list_move(&arq->fifo, &anext->fifo);
-			arq->expires = anext->expires;
-			/*
-			 * Don't copy here but swap, because when anext is
-			 * removed below, it must contain the unused context
-			 */
-			swap_as_io_context(&arq->as_io_context,
-					&anext->as_io_context);
-		}
-	}
-
-	/*
-	 * kill knowledge of next, this one is a goner
-	 */
-	as_remove_queued_request(q, next);
-	put_as_io_context(&anext->as_io_context);
-}
-
-static void as_antic_stop(struct as_data *ad);
-
-/*
- * move an entry to dispatch queue
- */
-static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
-{
-	const int data_dir = rq_data_dir(arq->request);
-	
-	BUG_ON(!ON_RB(&arq->rb_node));
-
-	as_antic_stop(ad);
-	ad->antic_status = ANTIC_OFF;
-
-	/*
-	 * This has to be set in order to be correctly updated by
-	 * as_find_next_arq
-	 */
-	ad->last_sector[data_dir] = arq->request->sector
-					+ arq->request->nr_sectors;
-
-	if (data_dir == READ) {
-		/* In case we have to anticipate after this */
-		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
-		ad->aic_finished = 0;
-	} else
-		put_as_io_context(&ad->as_io_context);
-
-	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
-
-	/*
-	 * take it off the sort and fifo list, add to dispatch queue
-	 */
-	as_remove_queued_request(ad->q, arq->request);
-	list_add_tail(&arq->request->queuelist, ad->dispatch);
-	if (arq->as_io_context) 
-		atomic_inc(&arq->as_io_context->nr_dispatched);
-
-	BUG_ON(arq->state != AS_RQ_QUEUED);
-	arq->state = AS_RQ_DISPATCHED;
-}
-
-#define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)
-
-/*
- * as_fifo_expired returns 0 if there are no expired reads on the fifo,
- * 1 otherwise.  It is ratelimited so that we only perform the check once per
- * `fifo_expire' interval.  Otherwise a large number of expired requests
- * would create a hopeless seekstorm.
- *
- * The funny "absolute difference" math on the elapsed time is to handle
- * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
- */ 
-static int as_fifo_expired(struct as_data *ad, int adir)
-{
-	struct as_rq *arq;
-	long delta_jif;
-
-	delta_jif = jiffies - ad->last_check_fifo[adir];
-	if (unlikely(delta_jif < 0))
-		delta_jif = -delta_jif;
-	if (delta_jif < ad->fifo_expire[adir])
-		return 0;
-
-	ad->last_check_fifo[adir] = jiffies;
-
-	if (list_empty(&ad->fifo_list[adir]))
-		return 0;
-
-	arq = list_entry_fifo(ad->fifo_list[adir].next);
-	
-	return time_after(jiffies, arq->expires);
-}
-
-static int as_antic_expired(struct as_data *ad)
-{
-	long delta_jif;
-
-	delta_jif = jiffies - ad->antic_start;
-	if (unlikely(delta_jif < 0))
-		delta_jif = -delta_jif;
-	if (delta_jif < ad->antic_expire)
-		return 0;
-
-	return 1;
-}
-
-/*
- * as_batch_expired returns true if the current batch has expired.
- */
-static inline int as_batch_expired(struct as_data *ad)
-{
-	return time_after(jiffies, ad->current_batch_expires);
+	timeout = ad->antic_start + ad->antic_expire;
+#if 0 /* TODO: unify me.  This should be fixed. */
+	timeout = min(timeout,	ad->current_batch_expires);
+#endif
+	mod_timer(&ad->antic_timer, timeout);
+				
+	ad->antic_status = ANTIC_WAIT_NEXT;
 }
 
 /*
- * anticipatory scheduling functions follow
+ * as_antic_waitreq starts anticipating. We don't start timing the anticipation
+ * until the request that we're anticipating on has finished. This means we
+ * are, hopefully, timing from when the candidate process wakes up.
  */
-
-static int as_queue_notready(request_queue_t *q);
-
 static void as_antic_waitreq(struct as_data *ad)
 {
 	BUG_ON(ad->antic_status == ANTIC_FINISHED);
@@ -896,46 +642,6 @@ static void as_antic_waitreq(struct as_d
 	}
 }
 
-static void as_antic_waitnext(struct as_data *ad)
-{
-	unsigned long timeout;
-
-	BUG_ON(ad->antic_status != ANTIC_OFF
-			&& ad->antic_status != ANTIC_WAIT_REQ);
-
-	timeout = ad->antic_start + ad->antic_expire;
-#if 0
-	/* FIX THIS!!! */
-	timeout = min(timeout,	ad->current_batch_expires);
-#endif
-	mod_timer(&ad->antic_timer, timeout);
-				
-	ad->antic_status = ANTIC_WAIT_NEXT;
-}
-
-/*
- * This is executed in a "deferred" process context, by kblockd. It calls the
- * driver's request_fn so the driver can submit that request.
- *
- * IMPORTANT! Thisguy will reenter the elevator, so set up all queue global
- * state before calling, and don't rely on any state over calls.
- *
- * FIXME! dispatch queue is not a queue at all!
- * Andrew! as_queue_notready does not _try_ to move a request to dispatch
- * list, in fact it tries not to! Unfortunately it sometimes must in order
- * to guarantee elv_next_request will return !NULL after a ready indication.
- */
-static void as_work_handler(void *data)
-{
-	struct request_queue *q = data;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!as_queue_notready(q))
-		q->request_fn(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
 /*
  * This is called directly by the functions in this file to stop anticipation.
  * We kill the timer and schedule a call to the request_fn asap.
@@ -948,6 +654,7 @@ static void as_antic_stop(struct as_data
 		if (status == ANTIC_WAIT_NEXT)
 			del_timer(&ad->antic_timer);
 		ad->antic_status = ANTIC_FINISHED;
+		/* see as_work_handler */
 		kblockd_schedule_work(&ad->antic_work);
 	}
 }
@@ -1005,7 +712,7 @@ static int as_close_req(struct as_data *
  *
  * If the task which has submitted the request has exitted, break anticipation.
  *
- * If this task has queued some other reads, do not enter enticipation.
+ * If this task has queued some other IO, do not enter anticipation.
  */
 static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
 {
@@ -1051,6 +758,75 @@ static int as_can_break_anticipation(str
 }
 
 /*
+ * as_can_anticipate indicates whether we should either run arq
+ * or keep anticipating a better request.
+ */
+static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
+{
+	if (!ad->as_io_context)
+		/*
+		 * Last request submitted was a write
+		 */
+		return 0;
+
+	if (ad->antic_status == ANTIC_FINISHED)
+		/*
+		 * Don't restart if we have just finished. Run the next request
+		 */
+		return 0;
+
+	if (arq && as_can_break_anticipation(ad, arq))
+		/*
+		 * This request is a good candidate. Don't keep anticipating,
+		 * run it.
+		 */
+		return 0;
+
+	/*
+	 * OK from here, we haven't finished, and don't have a decent request!
+	 * Status is either ANTIC_OFF so start waiting,
+	 * ANTIC_WAIT_REQ so continue waiting for request to finish
+	 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
+	 * 
+	 */
+
+	return 1;
+}
+
+/*
+ * as_update_iohist keeps a decaying histogram of IO thinktimes, and
+ * updates @aic->mean_thinktime based on that. It is called when a new
+ * request is queued.
+ */
+static void as_update_iohist(struct as_io_context *aic)
+{
+	unsigned i;
+	unsigned long thinktime;
+	unsigned long total = 0;
+	unsigned long num = 0;
+
+	if (aic == NULL)
+		return;
+
+	if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
+		thinktime = jiffies - aic->last_end_request;
+		thinktime = min(thinktime, MAX_THINKTIME-1);
+		aic->thinktime[thinktime] += 256; /* fixed point: 1.0 == 1<<8 */
+
+		for (i = 0; i < MAX_THINKTIME; i++) {
+			unsigned long tt = aic->thinktime[i];
+			total += i*tt;
+			num += tt;
+
+			aic->thinktime[i] = (tt>>1) + (tt>>2); /* 75% decay */
+		}
+		/* fixed point factor is cancelled here */
+		if (num)
+			aic->mean_thinktime = total / num;
+	}
+}
+
+/*
  * as_update_arq must be called whenever a request (arq) is added to
  * the sort_list. This function keeps caches up to date, and checks if the
  * request might be one we are "anticipating"
@@ -1110,120 +886,178 @@ static void as_update_arq(struct as_data
 }
 
 /*
- * as_can_anticipate indicates weather we should either run arq
- * or keep anticipating a better request.
+ * as_complete_arq is to be called when a request has completed and returned
+ * something to the requesting process, be it an error or data.
  */
-static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
+static void as_complete_arq(struct as_data *ad, struct as_rq *arq)
 {
-	BUG_ON(ad->antic_status == ANTIC_WAIT_REQ ||
-		ad->antic_status == ANTIC_WAIT_NEXT);
+	if (!arq->as_io_context)
+		return;
 
-	if (!ad->as_io_context)
-		/*
-		 * Last request submitted was a write
-		 */
-		return 0;
+	if (rq_data_dir(arq->request) == READ) {
+		set_bit(AS_TASK_IORUNNING, &arq->as_io_context->state);
+		arq->as_io_context->last_end_request = jiffies;
+	}
 
-	if (ad->antic_status == ANTIC_FINISHED)
-		/*
-		 * Don't restart if we have just finished. Run the next request
-		 */
-		return 0;
+	if (ad->as_io_context == arq->as_io_context) {
+		ad->antic_start = jiffies;
+		ad->aic_finished = 1;
+		if (ad->antic_status == ANTIC_WAIT_REQ) {
+			/*
+			 * We were waiting on this request, now anticipate
+			 * the next one
+			 */
+			as_antic_waitnext(ad);
+		}
+	}
+	put_as_io_context(&arq->as_io_context);
+}
+
+/*
+ * as_remove_queued_request removes a request from the pre dispatch queue
+ * without updating refcounts. It is expected the caller will drop the
+ * reference unless it replaces the request at some part of the elevator
+ * (i.e. the dispatch queue)
+ */
+static void as_remove_queued_request(request_queue_t *q, struct request *rq)
+{
+	struct as_rq *arq = RQ_DATA(rq);
+
+	if (!arq)
+		BUG();
+	else {
+		const int data_dir = rq_data_dir(arq->request);
+		struct as_data *ad = q->elevator.elevator_data;
+
+		BUG_ON(arq->state != AS_RQ_QUEUED);
+
+		if (arq->as_io_context) {
+			BUG_ON(!atomic_read(&arq->as_io_context->nr_queued));
+			atomic_dec(&arq->as_io_context->nr_queued);
+		}
 
-	if (arq && as_can_break_anticipation(ad, arq))
 		/*
-		 * This request is a good candidate. Don't keep anticipating,
-		 * run it.
+		 * Update the "next_arq" cache if we are about to remove its
+		 * entry
 		 */
+		if (ad->next_arq[data_dir] == arq)
+			ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
+		
+		list_del_init(&arq->fifo);
+		as_del_arq_hash(arq);
+		as_del_arq_rb(ad, arq);
+
+		if (q->last_merge == &rq->queuelist)
+			q->last_merge = NULL;
+
+		list_del_init(&rq->queuelist);
+	}
+
+}
+
+/*
+ * as_remove_request is called when a driver has completed the request
+ * (or it has caused an error), and is finished with it. It assumes
+ * the request is on the dispatch queue.
+ */
+static void as_remove_request(request_queue_t *q, struct request *rq)
+{
+	struct as_rq *arq = RQ_DATA(rq);
+	struct as_data *ad = q->elevator.elevator_data;
+
+	if (q->last_merge == &rq->queuelist)
+		q->last_merge = NULL;
+
+	list_del_init(&rq->queuelist);
+
+	if (arq) {
+		list_del_init(&arq->fifo);
+		as_del_arq_hash(arq);
+		as_del_arq_rb(ad, arq);
+		if (arq->as_io_context) {
+			WARN_ON(!atomic_read(&arq->as_io_context->nr_dispatched));
+			atomic_dec(&arq->as_io_context->nr_dispatched);
+		}
+		as_complete_arq(ad, arq);
+	}
+}
+
+/*
+ * as_fifo_expired returns 0 if there are no expired reads on the fifo,
+ * 1 otherwise.  It is ratelimited so that we only perform the check once per
+ * `fifo_expire' interval.  Otherwise a large number of expired requests
+ * would create a hopeless seekstorm.
+ *
+ * See as_antic_expired comment.
+ */ 
+static int as_fifo_expired(struct as_data *ad, int adir)
+{
+	struct as_rq *arq;
+	long delta_jif;
+
+	delta_jif = jiffies - ad->last_check_fifo[adir];
+	if (unlikely(delta_jif < 0))
+		delta_jif = -delta_jif;
+	if (delta_jif < ad->fifo_expire[adir])
 		return 0;
 
-	/*
-	 * OK from here, we haven't finished, and don't have a decent request!
-	 * Status is ANTIC_OFF so start waiting.
-	 */
+	ad->last_check_fifo[adir] = jiffies;
 
-	return 1;
+	if (list_empty(&ad->fifo_list[adir]))
+		return 0;
+
+	arq = list_entry_fifo(ad->fifo_list[adir].next);
+	
+	return time_after(jiffies, arq->expires);
 }
 
-#define MAXBACK (512 * 1024)
+/*
+ * as_batch_expired returns true if the current batch has expired. A batch
+ * is a set of reads or a set of writes.
+ */
+static inline int as_batch_expired(struct as_data *ad)
+{
+	return time_after(jiffies, ad->current_batch_expires);
+}
 
 /*
- * as_choose_req selects the preferred one of two requests of the same data_dir
- * ignoring time - eg. timeouts, which is the job of as_dispatch_request
+ * move an entry to dispatch queue
  */
-static struct as_rq *
-as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
+static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 {
-	int data_dir;
-	sector_t last, s1, s2, d1, d2;
-	int r1_wrap=0, r2_wrap=0;	/* requests are behind the disk head */
-	const sector_t maxback = MAXBACK;
+	const int data_dir = rq_data_dir(arq->request);
+	
+	BUG_ON(!ON_RB(&arq->rb_node));
 
-	if (arq1 == NULL || arq1 == arq2)
-		return arq2;
-	if (arq2 == NULL)
-		return arq1;
+	as_antic_stop(ad);
+	ad->antic_status = ANTIC_OFF;
 
-	data_dir = rq_data_dir(arq1->request);
+	/*
+	 * This has to be set in order to be correctly updated by
+	 * as_find_next_arq
+	 */
+	ad->last_sector[data_dir] = arq->request->sector
+					+ arq->request->nr_sectors;
 
-	last = ad->last_sector[data_dir];
-	s1 = arq1->request->sector;
-	s2 = arq2->request->sector;
+	if (data_dir == READ) {
+		/* In case we have to anticipate after this */
+		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
+		ad->aic_finished = 0;
+	} else
+		put_as_io_context(&ad->as_io_context);
 
-	BUG_ON(data_dir != rq_data_dir(arq2->request));
+	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
 
 	/*
-	 * Strict one way elevator _except_ in the case where we allow
-	 * short backward seeks which are biased as twice the cost of a
-	 * similar forward seek. Only for reads and only between reads
-	 * from the same process!
+	 * take it off the sort and fifo list, add to dispatch queue
 	 */
-	if (s1 >= last)
-		d1 = s1 - last;
-	else if (data_dir == READ
-			&& ad->as_io_context == arq1->as_io_context
-			&& s1+maxback >= last)
-				d1 = (last - s1)*2;
-	else {
-		r1_wrap = 1;
-		d1 = 0; /* shut up, gcc */
-	}
-
-	if (s2 >= last)
-		d2 = s2 - last;
-	else if (data_dir == READ
-			&& ad->as_io_context == arq2->as_io_context
-			&& s2+maxback >= last)
-				d2 = (last - s2)*2;
-	else {
-		r2_wrap = 1;
-		d2 = 0;
-	}
+	as_remove_queued_request(ad->q, arq->request);
+	list_add_tail(&arq->request->queuelist, ad->dispatch);
+	if (arq->as_io_context) 
+		atomic_inc(&arq->as_io_context->nr_dispatched);
 
-	/* Found required data */
-	if (!r1_wrap && r2_wrap)
-		return arq1;
-	else if (!r2_wrap && r1_wrap)
-		return arq2;
-	else if (r1_wrap && r2_wrap) {
-		/* both behind the head */
-		if (s1 <= s2)
-			return arq1;
-		else
-			return arq2;
-	}
-	
-	/* Both requests in front of the head */
-	if (d1 < d2) 
-		return arq1;
-	else if (d2 < d1)
-		return arq2;
-	else {
-		if (s1 >= s2)
-			return arq1;
-		else
-			return arq2;
-	}
+	BUG_ON(arq->state != AS_RQ_QUEUED);
+	arq->state = AS_RQ_DISPATCHED;
 }
 
 /*
@@ -1312,7 +1146,6 @@ dispatch_writes:
 	return 0;
 
 dispatch_request:
-
 	/*
 	 * If a request has expired, service it.
 	 */
@@ -1349,6 +1182,32 @@ static struct request *as_next_request(r
 	return rq;
 }
 
+/*
+ * add arq to rbtree and fifo
+ */
+static void as_add_request(struct as_data *ad, struct as_rq *arq)
+{
+	const int data_dir = rq_data_dir(arq->request);
+
+	arq->as_io_context = get_as_io_context();
+	if (arq->as_io_context) {
+		atomic_inc(&arq->as_io_context->nr_queued);
+
+		if (data_dir == READ)
+			as_update_iohist(arq->as_io_context);
+	}
+
+	as_add_arq_rb(ad, arq);
+
+	/*
+	 * set expire time (only used for reads) and add to fifo list
+	 */
+	arq->expires = jiffies + ad->fifo_expire[data_dir];
+	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
+	arq->state = AS_RQ_QUEUED;
+	as_update_arq(ad, arq); /* keep state machine up to date */
+}
+
 static void
 as_insert_request(request_queue_t *q, struct request *rq,
 			struct list_head *insert_here)
@@ -1444,6 +1303,154 @@ as_latter_request(request_queue_t *q, st
 	return ret;
 }
 
+static int
+as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
+{
+	struct as_data *ad = q->elevator.elevator_data;
+	struct request *__rq;
+	int ret;
+
+	/*
+	 * try last_merge to avoid going to hash
+	 */
+	ret = elv_try_last_merge(q, bio);
+	if (ret != ELEVATOR_NO_MERGE) {
+		__rq = list_entry_rq(q->last_merge);
+		goto out_insert;
+	}
+
+	/*
+	 * see if the merge hash can satisfy a back merge
+	 */
+	__rq = as_find_arq_hash(ad, bio->bi_sector);
+	if (__rq) {
+		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
+
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_BACK_MERGE;
+			goto out;
+		}
+	}
+
+	/*
+	 * check for front merge
+	 */
+	if (ad->front_merges) {
+		sector_t rb_key = bio->bi_sector + bio_sectors(bio);
+
+		__rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
+		if (__rq) {
+			BUG_ON(rb_key != rq_rb_key(__rq));
+
+			if (elv_rq_merge_ok(__rq, bio)) {
+				ret = ELEVATOR_FRONT_MERGE;
+				goto out;
+			}
+		}
+	}
+
+	return ELEVATOR_NO_MERGE;
+out:
+	q->last_merge = &__rq->queuelist;
+out_insert:
+	*insert = &__rq->queuelist;
+	return ret;
+}
+
+static void as_merged_request(request_queue_t *q, struct request *req)
+{
+	struct as_data *ad = q->elevator.elevator_data;
+	struct as_rq *arq = RQ_DATA(req);
+
+	/*
+	 * hash always needs to be repositioned, key is end sector
+	 */
+	as_del_arq_hash(arq);
+	as_add_arq_hash(ad, arq);
+
+	/*
+	 * if the merge was a front merge, we need to reposition request
+	 */
+	if (rq_rb_key(req) != arq->rb_key) {
+		as_del_arq_rb(ad, arq);
+		as_add_arq_rb(ad, arq);
+		/*
+		 * Note! At this stage of this and the next function, our next
+		 * request may not be optimal - eg the request may have "grown"
+		 * behind the disk head. We currently don't bother adjusting.
+		 */
+	}
+
+	q->last_merge = &req->queuelist;
+}
+
+static void
+as_merged_requests(request_queue_t *q, struct request *req,
+			 struct request *next)
+{
+	struct as_data *ad = q->elevator.elevator_data;
+	struct as_rq *arq = RQ_DATA(req);
+	struct as_rq *anext = RQ_DATA(next);
+
+	BUG_ON(!arq);
+	BUG_ON(!anext);
+
+	/*
+	 * reposition arq (this is the merged request) in hash, and in rbtree
+	 * in case of a front merge
+	 */
+	as_del_arq_hash(arq);
+	as_add_arq_hash(ad, arq);
+
+	if (rq_rb_key(req) != arq->rb_key) {
+		as_del_arq_rb(ad, arq);
+		as_add_arq_rb(ad, arq);
+	}
+
+	/*
+	 * if anext expires before arq, assign its expire time to arq
+	 * and move into anext position (anext will be deleted) in fifo
+	 */
+	if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
+		if (time_before(anext->expires, arq->expires)) {
+			list_move(&arq->fifo, &anext->fifo);
+			arq->expires = anext->expires;
+			/*
+			 * Don't copy here but swap, because when anext is
+			 * removed below, it must contain the unused context
+			 */
+			swap_as_io_context(&arq->as_io_context,
+					&anext->as_io_context);
+		}
+	}
+
+	/*
+	 * kill knowledge of next, this one is a goner
+	 */
+	as_remove_queued_request(q, next);
+	put_as_io_context(&anext->as_io_context);
+}
+
+/*
+ * This is executed in a "deferred" process context, by kblockd. It calls the
+ * driver's request_fn so the driver can submit that request.
+ *
+ * IMPORTANT! This guy will reenter the elevator, so set up all queue global
+ * state before calling, and don't rely on any state over calls.
+ *
+ * FIXME! dispatch queue is not a queue at all!
+ */
+static void as_work_handler(void *data)
+{
+	struct request_queue *q = data;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!as_queue_notready(q))
+		q->request_fn(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
 static void as_exit(request_queue_t *q, elevator_t *e)
 {
 	struct as_data *ad = e->elevator_data;
@@ -1527,6 +1534,8 @@ static int as_init(request_queue_t *q, e
 	ad->batch_expire[WRITE] = write_batch_expire;
 	e->elevator_data = ad;
 
+	ad->current_batch_expires = jiffies + ad->batch_expire[READ];
+
 	for (i = READ; i <= WRITE; i++) {
 		struct request_list *rl = &q->rq[i];
 		struct list_head *entry;

_