From: Alex Tomas <bzzz@tmi.comex.ru>

Here is a port of your percpu_counters + group locks onto ext3.

Also, I think percpu_counter_read_positive() should not return 0, because
ext2/ext3 have no counters that may legitimately be 0. Moreover, if
percpu_counter_read_positive() returned 0 for the dirs counter, we would get a
'divide by zero' oops in find_group_orlov(). This patch fixes that up.




 fs/ext3/balloc.c           |   53 ++++++++++++++++++++++-----------------------
 fs/ext3/ialloc.c           |   43 ++++++++++++++++++++++++------------
 fs/ext3/super.c            |   47 +++++++++++++++------------------------
 include/linux/ext3_fs_sb.h |   15 +++++-------
 4 files changed, 81 insertions(+), 77 deletions(-)

diff -puN fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/balloc.c
--- 25/fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed	2003-04-17 19:36:42.000000000 -0700
+++ 25-akpm/fs/ext3/balloc.c	2003-04-17 19:36:42.000000000 -0700
@@ -110,6 +110,7 @@ void ext3_free_blocks (handle_t *handle,
 	struct super_block * sb;
 	struct ext3_group_desc * gdp;
 	struct ext3_super_block * es;
+	struct ext3_sb_info *sbi;
 	int err = 0, ret;
 	int dquot_freed_blocks = 0;
 
@@ -118,6 +119,7 @@ void ext3_free_blocks (handle_t *handle,
 		printk ("ext3_free_blocks: nonexistent device");
 		return;
 	}
+	sbi = EXT3_SB(sb);
 	es = EXT3_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
@@ -242,11 +244,12 @@ do_more:
 		}
 	}
 
-	spin_lock(bg_lock(sb, block_group));
+	spin_lock(sb_bgl_lock(sbi, block_group));
 	gdp->bg_free_blocks_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) +
 			dquot_freed_blocks);
-	spin_unlock(bg_lock(sb, block_group));
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -429,7 +432,7 @@ got:
 		have_access = 1;
 	}
 
-	if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
@@ -477,11 +480,11 @@ ext3_new_block(handle_t *handle, struct 
 	int target_block;			/* tmp */
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free;
-	int use_reserve = 0;
+	int free_blocks, root_blocks;
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
+	struct ext3_sb_info *sbi;
 #ifdef EXT3FS_DEBUG
 	static int goal_hits = 0, goal_attempts = 0;
 #endif
@@ -500,9 +503,19 @@ ext3_new_block(handle_t *handle, struct 
 		return 0;
 	}
 
+	sbi = EXT3_SB(sb);
 	es = EXT3_SB(sb)->s_es;
 	ext3_debug("goal=%lu.\n", goal);
 
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+		*errp = -ENOSPC;
+		return 0;
+	}
+
 	/*
 	 * First, test whether the goal block is free.
 	 */
@@ -515,9 +528,8 @@ ext3_new_block(handle_t *handle, struct 
 	if (!gdp)
 		goto io_error;
 
-	free = le16_to_cpu(gdp->bg_free_blocks_count);
-	free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
-	if (free > 0) {
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	if (free_blocks > 0) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
@@ -535,7 +547,6 @@ ext3_new_block(handle_t *handle, struct 
 	 * Now search the rest of the groups.  We assume that 
 	 * i and gdp correctly point to the last group visited.
 	 */
-repeat:
 	for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) {
 		group_no++;
 		if (group_no >= EXT3_SB(sb)->s_groups_count)
@@ -545,10 +556,8 @@ repeat:
 			*errp = -EIO;
 			goto out;
 		}
-		free = le16_to_cpu(gdp->bg_free_blocks_count);
-		if (!use_reserve) 
-			free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
-		if (free <= 0)
+		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (free_blocks <= 0)
 			continue;
 
 		brelse(bitmap_bh);
@@ -563,15 +572,6 @@ repeat:
 			goto allocated;
 	}
 
-	if (!use_reserve &&
-		(EXT3_SB(sb)->s_resuid == current->fsuid ||
-		  (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) ||
-		  capable(CAP_SYS_RESOURCE))) {
-		use_reserve = 1;
-		group_no = 0;
-		goto repeat;
-	}
-
 	/* No space left on the device */
 	*errp = -ENOSPC;
 	goto out;
@@ -612,13 +612,13 @@ allocated:
 		}
 	}
 #endif
-	spin_lock(bg_lock(sb, group_no));
+	spin_lock(sb_bgl_lock(sbi, group_no));
 	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
 		J_ASSERT_BH(bitmap_bh,
 			!ext3_test_bit(ret_block,
 					bh2jh(bitmap_bh)->b_committed_data));
 	ext3_debug("found bit %d\n", ret_block);
-	spin_unlock(bg_lock(sb, group_no));
+	spin_unlock(sb_bgl_lock(sbi, group_no));
 
 	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
 	ret_block = target_block;
@@ -639,10 +639,11 @@ allocated:
 	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
-	spin_lock(bg_lock(sb, group_no));
+	spin_lock(sb_bgl_lock(sbi, group_no));
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
-	spin_unlock(bg_lock(sb, group_no));
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
diff -puN fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/ialloc.c
--- 25/fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed	2003-04-17 19:36:42.000000000 -0700
+++ 25-akpm/fs/ext3/ialloc.c	2003-04-17 19:36:42.000000000 -0700
@@ -97,6 +97,7 @@ void ext3_free_inode (handle_t *handle, 
 	unsigned long bit;
 	struct ext3_group_desc * gdp;
 	struct ext3_super_block * es;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	int fatal = 0, err;
 
 	if (atomic_read(&inode->i_count) > 1) {
@@ -161,13 +162,17 @@ void ext3_free_inode (handle_t *handle, 
 		if (fatal) goto error_return;
 
 		if (gdp) {
-			spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
+			spin_lock(sb_bgl_lock(sbi, block_group));
 			gdp->bg_free_inodes_count = cpu_to_le16(
 				le16_to_cpu(gdp->bg_free_inodes_count) + 1);
 			if (is_directory)
 				gdp->bg_used_dirs_count = cpu_to_le16(
 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
-			spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
+			spin_unlock(sb_bgl_lock(sbi, block_group));
+			percpu_counter_inc(&sbi->s_freeinodes_counter);
+			if (is_directory)
+				percpu_counter_dec(&sbi->s_dirs_counter);
+
 		}
 		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
 		err = ext3_journal_dirty_metadata(handle, bh2);
@@ -196,11 +201,14 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent)
 {
 	int ngroups = EXT3_SB(sb)->s_groups_count;
-	int avefreei = ext3_count_free_inodes(sb) / ngroups;
+	int freei, avefreei;
 	struct ext3_group_desc *desc, *best_desc = NULL;
 	struct buffer_head *bh;
 	int group, best_group = -1;
 
+	freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+
 	for (group = 0; group < ngroups; group++) {
 		desc = ext3_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
@@ -252,17 +260,20 @@ static int find_group_orlov(struct super
 	struct ext3_super_block *es = sbi->s_es;
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
-	int freei = ext3_count_free_inodes(sb);
-	int avefreei = freei / ngroups;
-	int freeb = ext3_count_free_blocks(sb);
-	int avefreeb = freeb / ngroups;
-	int blocks_per_dir;
-	int ndirs = ext3_count_dirs(sb);
+	int freei, avefreei;
+	int freeb, avefreeb;
+	int blocks_per_dir, ndirs;
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
 
+	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	avefreeb = freeb / ngroups;
+	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
 	if ((parent == sb->s_root->d_inode) ||
 	    (parent->i_flags & EXT3_TOPDIR_FL)) {
 		int best_ndir = inodes_per_group;
@@ -289,8 +300,7 @@ static int find_group_orlov(struct super
 		goto fallback;
 	}
 
-	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
-			  le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
 
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
 	min_inodes = avefreei - inodes_per_group / 4;
@@ -309,7 +319,7 @@ static int find_group_orlov(struct super
 		desc = ext3_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
-		if (sbi->s_bgi[group].bg_debts >= max_debt)
+		if (sbi->s_debts[group] >= max_debt)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
@@ -421,6 +431,7 @@ struct inode *ext3_new_inode(handle_t *h
 	struct ext3_group_desc * gdp;
 	struct ext3_super_block * es;
 	struct ext3_inode_info *ei;
+	struct ext3_sb_info *sbi;
 	int err = 0;
 	struct inode *ret;
 
@@ -435,6 +446,7 @@ struct inode *ext3_new_inode(handle_t *h
 	ei = EXT3_I(inode);
 
 	es = EXT3_SB(sb)->s_es;
+	sbi = EXT3_SB(sb);
 repeat:
 	if (S_ISDIR(mode)) {
 		if (test_opt (sb, OLDALLOC))
@@ -500,18 +512,21 @@ repeat:
 	BUFFER_TRACE(bh2, "get_write_access");
 	err = ext3_journal_get_write_access(handle, bh2);
 	if (err) goto fail;
-	spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
+	spin_lock(sb_bgl_lock(sbi, group));
 	gdp->bg_free_inodes_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
 	if (S_ISDIR(mode)) {
 		gdp->bg_used_dirs_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
 	}
-	spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
+	spin_unlock(sb_bgl_lock(sbi, group));
 	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
 	err = ext3_journal_dirty_metadata(handle, bh2);
 	if (err) goto fail;
 	
+	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
 	sb->s_dirt = 1;
 
 	inode->i_uid = current->fsuid;
diff -puN fs/ext3/super.c~ext3-concurrent-block-allocation-hashed fs/ext3/super.c
--- 25/fs/ext3/super.c~ext3-concurrent-block-allocation-hashed	2003-04-17 19:36:42.000000000 -0700
+++ 25-akpm/fs/ext3/super.c	2003-04-17 19:36:42.000000000 -0700
@@ -464,7 +464,7 @@ void ext3_put_super (struct super_block 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_bgi);
+	kfree(sbi->s_debts);
 	brelse(sbi->s_sbh);
 
 	/* Debugging code just in case the in-memory inode orphan list
@@ -906,7 +906,6 @@ static int ext3_check_descriptors (struc
 	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	struct ext3_group_desc * gdp = NULL;
 	unsigned long total_free;
-	unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	int desc_block = 0;
 	int i;
 
@@ -962,25 +961,6 @@ static int ext3_check_descriptors (struc
 		EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free);
 	}
 
-	/* distribute reserved blocks over groups -bzzz */
-	for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) {
-		int free;
-
-		gdp = ext3_get_group_desc (sb, i, NULL);
-		if (!gdp) {
-			ext3_error (sb, "ext3_check_descriptors",
-					"cant get descriptor for group %d", i);
-			return 0;
-		}
-
-		free = le16_to_cpu(gdp->bg_free_blocks_count);
-		if (free > reserved)
-			free = reserved;
-		sbi->s_bgi[i].bg_reserved = free;
-		reserved -= free;
-		total_free -= free;
-	}
-
 	total_free = ext3_count_free_inodes(sb);
 	if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) {
 		printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n",
@@ -1350,17 +1330,19 @@ static int ext3_fill_super (struct super
 		printk (KERN_ERR "EXT3-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info),
+	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(u8),
 			GFP_KERNEL);
-	if (!sbi->s_bgi) {
+	if (!sbi->s_debts) {
 		printk("EXT3-fs: not enough memory to allocate s_bgi\n");
 		goto failed_mount2;
 	}
-	memset(sbi->s_bgi, 0,  sbi->s_groups_count * sizeof(struct ext3_bg_info));
-	for (i = 0; i < sbi->s_groups_count; i++) {
-		spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock);
-		spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock);
-	}
+	memset(sbi->s_debts, 0,  sbi->s_groups_count * sizeof(u8));
+
+	percpu_counter_init(&sbi->s_freeblocks_counter);
+	percpu_counter_init(&sbi->s_freeinodes_counter);
+	percpu_counter_init(&sbi->s_dirs_counter);
+	bgl_lock_init(&sbi->s_blockgroup_lock);
+
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -1473,12 +1455,19 @@ static int ext3_fill_super (struct super
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
+	percpu_counter_mod(&sbi->s_freeblocks_counter,
+		ext3_count_free_blocks(sb));
+	percpu_counter_mod(&sbi->s_freeinodes_counter,
+		ext3_count_free_inodes(sb));
+	percpu_counter_mod(&sbi->s_dirs_counter,
+		ext3_count_dirs(sb));
+
 	return 0;
 
 failed_mount3:
 	journal_destroy(sbi->s_journal);
 failed_mount2:
-	kfree(sbi->s_bgi);
+	kfree(sbi->s_debts);
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
diff -puN include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed include/linux/ext3_fs_sb.h
--- 25/include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed	2003-04-17 19:36:42.000000000 -0700
+++ 25-akpm/include/linux/ext3_fs_sb.h	2003-04-17 19:36:42.000000000 -0700
@@ -19,15 +19,10 @@
 #ifdef __KERNEL__
 #include <linux/timer.h>
 #include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
 #endif
 
-struct ext3_bg_info {
-	u8 bg_debts;
-	spinlock_t bg_balloc_lock;
-	spinlock_t bg_ialloc_lock;
-	unsigned long bg_reserved;
-} ____cacheline_aligned_in_smp;
-
 /*
  * third extended-fs super-block data in memory
  */
@@ -57,7 +52,11 @@ struct ext3_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
-	struct ext3_bg_info *s_bgi;
+	u8 *s_debts;
+	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct blockgroup_lock s_blockgroup_lock;
 
 	/* Journaling */
 	struct inode * s_journal_inode;

_