From: Alex Tomas <bzzz@tmi.comex.ru>

We have a race in which the block allocator can observe that
journal_head.b_committed_data is non-NULL and then go on to use it, while
kjournald concurrently frees it and sets the pointer to NULL.  The allocator
then oopses on a freed or NULL pointer.
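
Schematically, the race is a check-then-use on a pointer that kjournald frees
underneath the checker (illustrative sketch only; the real check is in
ext3_test_allocatable() and the real free is in the commit path, both patched
below):

	/* CPU0: ext3 block allocator */
	if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
		/* <-- window: kjournald may free b_committed_data here */
		return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);

	/* CPU1: kjournald, end of commit */
	kfree(jh->b_committed_data);
	jh->b_committed_data = NULL;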

We introduce per-buffer_head "spinlocking" based on a new BH_State bit in
b_state.  To do this we abstract the pte_chain_lock() implementation out into
bit_spin_lock()/bit_spin_unlock() in <linux/spinlock.h> and reuse it for both
users.
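
With the patch, both sides serialise on the new lock around exactly those
accesses; the pattern the hunks below add is roughly (sketch only, `busy' is
just an illustrative local):

	/* ext3 block allocator */
	jbd_lock_bh_state(bitmap_bh);
	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
		busy = ext3_test_bit(nr, bh2jh(bitmap_bh)->b_committed_data);
	jbd_unlock_bh_state(bitmap_bh);

	/* kjournald commit path */
	jbd_lock_bh_state(jh2bh(jh));
	if (jh->b_committed_data) {
		kfree(jh->b_committed_data);
		jh->b_committed_data = NULL;
	}
	jbd_unlock_bh_state(jh2bh(jh));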

Bit-based spinlocking is fairly inefficient CPU-wise (hence the "don't use
this unless you really need to" warning added in spinlock.h), and we may move
this over to a hashed spinlock later.
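
For reference, a hashed-lock replacement could look something like the sketch
below.  This is untested; the names, the table size and the use of hash_ptr()
from <linux/hash.h> are only assumptions for illustration:

	#include <linux/spinlock.h>
	#include <linux/hash.h>
	#include <linux/buffer_head.h>

	#define BH_STATE_HASH_BITS	6
	#define BH_STATE_HASH_SIZE	(1 << BH_STATE_HASH_BITS)

	/* one shared table of real spinlocks, indexed by buffer_head address */
	static spinlock_t bh_state_locks[BH_STATE_HASH_SIZE] = {
		[0 ... BH_STATE_HASH_SIZE - 1] = SPIN_LOCK_UNLOCKED
	};

	static inline spinlock_t *bh_state_lock(struct buffer_head *bh)
	{
		return &bh_state_locks[hash_ptr(bh, BH_STATE_HASH_BITS)];
	}

	/* jbd_lock_bh_state(bh) would then become spin_lock(bh_state_lock(bh)) */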



 fs/ext3/balloc.c             |   54 +++++++++++++++++++++------------
 fs/jbd/commit.c              |    2 +
 include/linux/jbd.h          |   16 +++++++++
 include/linux/rmap-locking.h |   28 +----------------
 include/linux/spinlock.h     |   70 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 125 insertions(+), 45 deletions(-)

diff -puN fs/ext3/balloc.c~jbd-010-b_committed_data-race-fix fs/ext3/balloc.c
--- 25/fs/ext3/balloc.c~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/fs/ext3/balloc.c	2003-06-07 14:29:32.000000000 -0700
@@ -185,11 +185,14 @@ do_more:
 	if (err)
 		goto error_return;
 
+	jbd_lock_bh_state(bitmap_bh);
+	
 	for (i = 0; i < count; i++) {
 		/*
 		 * An HJ special.  This is expensive...
 		 */
 #ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh_state(bitmap_bh);
 		{
 			struct buffer_head *debug_bh;
 			debug_bh = sb_find_get_block(sb, block + i);
@@ -202,6 +205,7 @@ do_more:
 				__brelse(debug_bh);
 			}
 		}
+		jbd_lock_bh_state(bitmap_bh);
 #endif
 		/* @@@ This prevents newly-allocated data from being
 		 * freed and then reallocated within the same
@@ -243,6 +247,7 @@ do_more:
 			dquot_freed_blocks++;
 		}
 	}
+	jbd_unlock_bh_state(bitmap_bh);
 
 	spin_lock(sb_bgl_lock(sbi, block_group));
 	gdp->bg_free_blocks_count =
@@ -289,11 +294,12 @@ error_return:
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static inline int ext3_test_allocatable(int nr, struct buffer_head *bh,
+					int have_access)
 {
 	if (ext3_test_bit(nr, bh->b_data))
 		return 0;
-	if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+	if (!have_access || !buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
 		return 1;
 	return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
 }
@@ -305,8 +311,8 @@ static int ext3_test_allocatable(int nr,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int find_next_usable_block(int start,
-			struct buffer_head *bh, int maxblocks)
+static int find_next_usable_block(int start, struct buffer_head *bh,
+				int maxblocks, int have_access)
 {
 	int here, next;
 	char *p, *r;
@@ -322,7 +328,8 @@ static int find_next_usable_block(int st
 		 */
 		int end_goal = (start + 63) & ~63;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext3_test_allocatable(here, bh))
+		if (here < end_goal &&
+			ext3_test_allocatable(here, bh, have_access))
 			return here;
 		
 		ext3_debug ("Bit not found near goal\n");
@@ -345,7 +352,7 @@ static int find_next_usable_block(int st
 	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
 	next = (r - ((char *) bh->b_data)) << 3;
 	
-	if (next < maxblocks && ext3_test_allocatable(next, bh))
+	if (next < maxblocks && ext3_test_allocatable(next, bh, have_access))
 		return next;
 	
 	/* The bitmap search --- search forward alternately
@@ -357,13 +364,13 @@ static int find_next_usable_block(int st
 						 maxblocks, here);
 		if (next >= maxblocks)
 			return -1;
-		if (ext3_test_allocatable(next, bh))
+		if (ext3_test_allocatable(next, bh, have_access))
 			return next;
 
-		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
-		here = ext3_find_next_zero_bit
-			((unsigned long *) bh2jh(bh)->b_committed_data, 
-			 maxblocks, next);
+		if (have_access)
+			here = ext3_find_next_zero_bit
+				((unsigned long *) bh2jh(bh)->b_committed_data, 
+			 	maxblocks, next);
 	}
 	return -1;
 }
@@ -402,17 +409,18 @@ ext3_try_to_allocate(struct super_block 
 
 	*errp = 0;
 
-	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh, 0))
 		goto got;
 
 repeat:
 	goal = find_next_usable_block(goal, bitmap_bh,
-				EXT3_BLOCKS_PER_GROUP(sb));
+				EXT3_BLOCKS_PER_GROUP(sb), have_access);
 	if (goal < 0)
 		goto fail;
 
 	for (i = 0;
-		i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+		i < 7 && goal > 0 && 
+			ext3_test_allocatable(goal - 1, bitmap_bh, have_access);
 		i++, goal--);
 
 got:
@@ -429,6 +437,7 @@ got:
 			*errp = fatal;
 			goto fail;
 		}
+		jbd_lock_bh_state(bitmap_bh);
 		have_access = 1;
 	}
 
@@ -444,6 +453,7 @@ got:
 	}
 
 	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	jbd_unlock_bh_state(bitmap_bh);
 	fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (fatal) {
 		*errp = fatal;
@@ -454,6 +464,7 @@ got:
 fail:
 	if (have_access) {
 		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		jbd_unlock_bh_state(bitmap_bh);
 		ext3_journal_release_buffer(handle, bitmap_bh);
 	}
 	return -1;
@@ -611,14 +622,19 @@ allocated:
 			brelse(debug_bh);
 		}
 	}
-#endif
+	jbd_lock_bh_state(bitmap_bh);
 	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
-		J_ASSERT_BH(bitmap_bh,
-			!ext3_test_bit(ret_block,
-					bh2jh(bitmap_bh)->b_committed_data));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		if (ext3_test_bit(ret_block,
+				bh2jh(bitmap_bh)->b_committed_data)) {
+			printk("%s: block was unexpectedly set in "
+				"b_committed_data\n", __FUNCTION__);
+		}
+	}
 	ext3_debug("found bit %d\n", ret_block);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
 
 	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
 	ret_block = target_block;
diff -puN fs/jbd/commit.c~jbd-010-b_committed_data-race-fix fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/fs/jbd/commit.c	2003-06-07 14:29:32.000000000 -0700
@@ -640,6 +640,7 @@ skip_commit: /* The journal should be un
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
+		jbd_lock_bh_state(jh2bh(jh));
 		if (jh->b_committed_data) {
 			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
@@ -651,6 +652,7 @@ skip_commit: /* The journal should be un
 			kfree(jh->b_frozen_data);
 			jh->b_frozen_data = NULL;
 		}
+		jbd_unlock_bh_state(jh2bh(jh));
 
 		spin_lock(&journal_datalist_lock);
 		cp_transaction = jh->b_cp_transaction;
diff -puN include/linux/jbd.h~jbd-010-b_committed_data-race-fix include/linux/jbd.h
--- 25/include/linux/jbd.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/jbd.h	2003-06-07 14:29:34.000000000 -0700
@@ -292,6 +292,7 @@ enum jbd_state_bits {
 	BH_Revoked,		/* Has been revoked from the log */
 	BH_RevokeValid,		/* Revoked flag is valid */
 	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
 };
 
 BUFFER_FNS(JBD, jbd)
@@ -309,6 +310,21 @@ static inline struct journal_head *bh2jh
 	return bh->b_private;
 }
 
+static inline void jbd_lock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_lock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_trylock_bh_state(struct buffer_head *bh)
+{
+	return bit_spin_trylock(BH_State, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_unlock(BH_State, &bh->b_state);
+}
+
 #define HAVE_JOURNAL_CALLBACK_STATUS
 /**
  *   struct journal_callback - Base structure for callback information.
diff -puN include/linux/rmap-locking.h~jbd-010-b_committed_data-race-fix include/linux/rmap-locking.h
--- 25/include/linux/rmap-locking.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/rmap-locking.h	2003-06-07 14:29:32.000000000 -0700
@@ -10,32 +10,8 @@
 struct pte_chain;
 extern kmem_cache_t *pte_chain_cache;
 
-static inline void pte_chain_lock(struct page *page)
-{
-	/*
-	 * Assuming the lock is uncontended, this never enters
-	 * the body of the outer loop. If it is contended, then
-	 * within the inner loop a non-atomic test is used to
-	 * busywait with less bus contention for a good time to
-	 * attempt to acquire the lock bit.
-	 */
-	preempt_disable();
-#ifdef CONFIG_SMP
-	while (test_and_set_bit(PG_chainlock, &page->flags)) {
-		while (test_bit(PG_chainlock, &page->flags))
-			cpu_relax();
-	}
-#endif
-}
-
-static inline void pte_chain_unlock(struct page *page)
-{
-#ifdef CONFIG_SMP
-	smp_mb__before_clear_bit();
-	clear_bit(PG_chainlock, &page->flags);
-#endif
-	preempt_enable();
-}
+#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, &page->flags)
 
 struct pte_chain *pte_chain_alloc(int gfp_flags);
 void __pte_chain_free(struct pte_chain *pte_chain);
diff -puN include/linux/spinlock.h~jbd-010-b_committed_data-race-fix include/linux/spinlock.h
--- 25/include/linux/spinlock.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/spinlock.h	2003-06-07 14:29:32.000000000 -0700
@@ -541,4 +541,74 @@ do { \
 extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
 #endif
 
+/*
+ *  bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
+ */
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	preempt_disable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	while (test_and_set_bit(bitnum, addr)) {
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+	}
+#endif
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	int ret;
+
+	preempt_disable();
+	ret = !test_and_set_bit(bitnum, addr);
+	if (!ret)
+		preempt_enable();
+	return ret;
+#else
+	preempt_disable();
+	return 1;
+#endif
+}
+
+/*
+ *  bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	preempt_enable();
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	return test_bit(bitnum, addr);
+#elif defined CONFIG_PREEMPT
+	return preempt_count();
+#else
+	return 1;
+#endif
+}
+
 #endif /* __LINUX_SPINLOCK_H */

_