Patch from Stephen Hemminger <shemminger@osdl.org>

Fixed (at least for me), the problem with netfilter/iptables.

Here is a replacement for the brlock-removal-1 patch.  The changes to
fix the bug are all in the netfilter.c part.



 25-akpm/include/linux/netdevice.h |    2 
 25-akpm/net/core/dev.c            |  118 +++++++++++++---------
 25-akpm/net/core/netfilter.c      |  197 ++++++++++++++++++--------------------
 3 files changed, 167 insertions(+), 150 deletions(-)

diff -puN include/linux/netdevice.h~brlock-1b include/linux/netdevice.h
--- 25/include/linux/netdevice.h~brlock-1b	Tue Mar 18 14:55:36 2003
+++ 25-akpm/include/linux/netdevice.h	Tue Mar 18 14:55:36 2003
@@ -452,7 +452,7 @@ struct packet_type 
 	int			(*func) (struct sk_buff *, struct net_device *,
 					 struct packet_type *);
 	void			*data;	/* Private to the packet type		*/
-	struct packet_type	*next;
+	struct list_head	packet_list;
 };
 
 
diff -puN net/core/dev.c~brlock-1b net/core/dev.c
--- 25/net/core/dev.c~brlock-1b	Tue Mar 18 14:55:36 2003
+++ 25-akpm/net/core/dev.c	Tue Mar 18 14:55:36 2003
@@ -90,7 +90,6 @@
 #include <linux/etherdevice.h>
 #include <linux/notifier.h>
 #include <linux/skbuff.h>
-#include <linux/brlock.h>
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/proc_fs.h>
@@ -117,6 +116,7 @@ extern int plip_init(void);
 
 #include <asm/current.h>
 
+
 /* This define, if set, will randomly drop a packet when congestion
  * is more than moderate.  It helps fairness in the multi-interface
  * case when one of them is a hog, but it kills performance for the
@@ -170,8 +170,11 @@ const char *if_port_text[] = {
  *		86DD	IPv6
  */
 
-static struct packet_type *ptype_base[16];	/* 16 way hashed list */
-static struct packet_type *ptype_all;		/* Taps */
+static spinlock_t ptype_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head ptype_base[16];	/* 16 way hashed list */
+static struct list_head ptype_all;	/* Taps */
+
+static spinlock_t master_lock = SPIN_LOCK_UNLOCKED;
 
 #ifdef OFFLINE_SAMPLE
 static void sample_queue(unsigned long dummy);
@@ -203,7 +206,6 @@ int netdev_fastroute_obstacles;
 
 static struct subsystem net_subsys;
 
-
 /*******************************************************************************
 
 		Protocol management and registration routines
@@ -245,8 +247,7 @@ void dev_add_pack(struct packet_type *pt
 {
 	int hash;
 
-	br_write_lock_bh(BR_NETPROTO_LOCK);
-
+	spin_lock_bh(&ptype_lock);
 #ifdef CONFIG_NET_FASTROUTE
 	/* Hack to detect packet socket */
 	if (pt->data && (long)(pt->data) != 1) {
@@ -256,14 +257,12 @@ void dev_add_pack(struct packet_type *pt
 #endif
 	if (pt->type == htons(ETH_P_ALL)) {
 		netdev_nit++;
-		pt->next  = ptype_all;
-		ptype_all = pt;
+		list_add_rcu(&pt->packet_list, &ptype_all);
 	} else {
 		hash = ntohs(pt->type) & 15;
-		pt->next = ptype_base[hash];
-		ptype_base[hash] = pt;
+		list_add_rcu(&pt->packet_list, &ptype_base[hash]);
 	}
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	spin_unlock_bh(&ptype_lock);
 }
 
 extern void linkwatch_run_queue(void);
@@ -279,29 +278,30 @@ extern void linkwatch_run_queue(void);
  */
 void dev_remove_pack(struct packet_type *pt)
 {
-	struct packet_type **pt1;
-
-	br_write_lock_bh(BR_NETPROTO_LOCK);
+	struct list_head *head, *pelem;
 
-	if (pt->type == htons(ETH_P_ALL)) {
-		netdev_nit--;
-		pt1 = &ptype_all;
-	} else
-		pt1 = &ptype_base[ntohs(pt->type) & 15];
-
-	for (; *pt1; pt1 = &((*pt1)->next)) {
-		if (pt == *pt1) {
-			*pt1 = pt->next;
+	if (pt->type == htons(ETH_P_ALL))
+		head = &ptype_all;
+	else 
+		head =  &ptype_base[ntohs(pt->type) & 15];
+	
+	spin_lock_bh(&ptype_lock);
+	list_for_each(pelem, head) {
+		if (list_entry(pelem, struct packet_type, packet_list) == pt) {
+			list_del(pelem);
 #ifdef CONFIG_NET_FASTROUTE
 			if (pt->data)
 				netdev_fastroute_obstacles--;
 #endif
+			if (pt->type == htons(ETH_P_ALL)) 
+				netdev_nit--;
 			goto out;
 		}
 	}
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
-out:
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+ out:
+	spin_unlock_bh(&ptype_lock);
 }
 
 /******************************************************************************
@@ -896,11 +896,13 @@ int call_netdevice_notifiers(unsigned lo
 
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct packet_type *ptype;
+	struct list_head *plist;
 	do_gettimeofday(&skb->stamp);
 
-	br_read_lock(BR_NETPROTO_LOCK);
-	for (ptype = ptype_all; ptype; ptype = ptype->next) {
+	rcu_read_lock();
+	list_for_each_rcu(plist, &ptype_all) {
+		struct packet_type *ptype
+			= list_entry(plist, struct packet_type, packet_list);
 		/* Never send packets back to the socket
 		 * they originated from - MvS (miquels@drinkel.ow.org)
 		 */
@@ -930,7 +932,7 @@ void dev_queue_xmit_nit(struct sk_buff *
 			ptype->func(skb2, skb->dev, ptype);
 		}
 	}
-	br_read_unlock(BR_NETPROTO_LOCK);
+	rcu_read_unlock();
 }
 
 /* Calculate csum in the case, when packet is misrouted.
@@ -1423,6 +1425,7 @@ static inline int handle_diverter(struct
 
 int netif_receive_skb(struct sk_buff *skb)
 {
+	struct list_head *pcur;
 	struct packet_type *ptype, *pt_prev;
 	int ret = NET_RX_DROP;
 	unsigned short type = skb->protocol;
@@ -1443,8 +1446,10 @@ int netif_receive_skb(struct sk_buff *sk
 
 	skb->h.raw = skb->nh.raw = skb->data;
 
+	rcu_read_lock();
 	pt_prev = NULL;
-	for (ptype = ptype_all; ptype; ptype = ptype->next) {
+	list_for_each_rcu(pcur, &ptype_all) {
+		ptype = list_entry(pcur, struct packet_type, packet_list);
 		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev) {
 				if (!pt_prev->data) {
@@ -1466,17 +1471,23 @@ int netif_receive_skb(struct sk_buff *sk
 #endif /* CONFIG_NET_DIVERT */
 
 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
-	if (skb->dev->br_port && br_handle_frame_hook) {
-		int ret;
+	if (skb->dev->br_port) {
+		int (*curhook)(struct sk_buff *) = br_handle_frame_hook;
 
-		ret = handle_bridge(skb, pt_prev);
-		if (br_handle_frame_hook(skb) == 0)
-			return ret;
-		pt_prev = NULL;
+		barrier();	/* prevent compiler RCU problems */
+		if (curhook) {
+			int ret;
+			ret = handle_bridge(skb, pt_prev);
+			if (curhook(skb) == 0)
+				return ret;
+			pt_prev = NULL;
+		}
 	}
 #endif
 
-	for (ptype = ptype_base[ntohs(type) & 15]; ptype; ptype = ptype->next) {
+	list_for_each_rcu(pcur, &ptype_base[ntohs(type) & 15]) {
+		ptype = list_entry(pcur, struct packet_type, packet_list);
+
 		if (ptype->type == type &&
 		    (!ptype->dev || ptype->dev == skb->dev)) {
 			if (pt_prev) {
@@ -1506,6 +1517,7 @@ int netif_receive_skb(struct sk_buff *sk
 		 */
 		ret = NET_RX_DROP;
 	}
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -1580,7 +1592,7 @@ static void net_rx_action(struct softirq
 	unsigned long start_time = jiffies;
 	int budget = netdev_max_backlog;
 
-	br_read_lock(BR_NETPROTO_LOCK);
+	preempt_disable();
 	local_irq_disable();
 
 	while (!list_empty(&queue->poll_list)) {
@@ -1609,7 +1621,7 @@ static void net_rx_action(struct softirq
 	}
 out:
 	local_irq_enable();
-	br_read_unlock(BR_NETPROTO_LOCK);
+	preempt_enable();
 	return;
 
 softnet_break:
@@ -1925,7 +1937,6 @@ out_dev:
 #define dev_proc_init() 0
 #endif	/* CONFIG_PROC_FS */
 
-
 /**
  *	netdev_set_master	-	set up master/slave pair
  *	@slave: slave device
@@ -1943,18 +1954,23 @@ int netdev_set_master(struct net_device 
 
 	ASSERT_RTNL();
 
+	spin_lock_bh(&master_lock);
 	if (master) {
-		if (old)
+		if (old) {
+			spin_unlock_bh(&master_lock);
 			return -EBUSY;
+		}
 		dev_hold(master);
 	}
 
-	br_write_lock_bh(BR_NETPROTO_LOCK);
 	slave->master = master;
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	spin_unlock_bh(&master_lock);
 
-	if (old)
+	if (old) {
+		/* wait for all BH */
+		synchronize_kernel();
 		dev_put(old);
+	}
 
 	if (master)
 		slave->flags |= IFF_SLAVE;
@@ -1962,6 +1978,7 @@ int netdev_set_master(struct net_device 
 		slave->flags &= ~IFF_SLAVE;
 
 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
+
 	return 0;
 }
 
@@ -2623,6 +2640,7 @@ int unregister_netdevice(struct net_devi
 	struct net_device *d, **dp;
 
 	BUG_ON(dev_boot_phase);
+	might_sleep();
 
 	/* If device is running, close it first. */
 	if (dev->flags & IFF_UP)
@@ -2646,10 +2664,8 @@ int unregister_netdevice(struct net_devi
 		return -ENODEV;
 	}
 
-	/* Synchronize to net_rx_action. */
-	br_write_lock_bh(BR_NETPROTO_LOCK);
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
-
+	/* wait for RCU */
+	synchronize_kernel();
 
 #ifdef CONFIG_NET_FASTROUTE
 	dev_clear_fastroute(dev);
@@ -2755,7 +2771,6 @@ out:
 	return 0;
 }
 
-
 /*
  *	Initialize the DEV module. At boot time this walks the device list and
  *	unhooks any devices that fail to initialise (normally hardware not
@@ -2786,6 +2801,11 @@ static int __init net_dev_init(void)
 	if (dev_proc_init())
 		goto out;
 
+	/* Initialize packet type chains */
+	INIT_LIST_HEAD(&ptype_all);
+	for (i = 0; i < ARRAY_SIZE(ptype_base); i++) 
+	        INIT_LIST_HEAD(&ptype_base[i]);
+
 	subsystem_register(&net_subsys);
 
 #ifdef CONFIG_NET_DIVERT
diff -puN net/core/netfilter.c~brlock-1b net/core/netfilter.c
--- 25/net/core/netfilter.c~brlock-1b	Tue Mar 18 14:55:36 2003
+++ 25-akpm/net/core/netfilter.c	Tue Mar 18 14:55:36 2003
@@ -19,7 +19,6 @@
 #include <linux/interrupt.h>
 #include <linux/if.h>
 #include <linux/netdevice.h>
-#include <linux/brlock.h>
 #include <linux/inetdevice.h>
 #include <net/sock.h>
 #include <net/route.h>
@@ -44,8 +43,12 @@
    sleep. */
 static DECLARE_MUTEX(nf_sockopt_mutex);
 
+/*
+ * nf_hooks are protected by using read-copy-update
+ */
 struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
 static LIST_HEAD(nf_sockopts);
+static spinlock_t nf_hook_lock = SPIN_LOCK_UNLOCKED;
 
 /* 
  * A queue handler may be registered for each protocol.  Each is protected by
@@ -56,28 +59,34 @@ static struct nf_queue_handler_t {
 	nf_queue_outfn_t outfn;
 	void *data;
 } queue_handler[NPROTO];
+static rwlock_t queue_handler_lock = RW_LOCK_UNLOCKED;
 
 int nf_register_hook(struct nf_hook_ops *reg)
 {
 	struct list_head *i;
 
-	br_write_lock_bh(BR_NETPROTO_LOCK);
+	spin_lock_bh(&nf_hook_lock);
 	for (i = nf_hooks[reg->pf][reg->hooknum].next; 
 	     i != &nf_hooks[reg->pf][reg->hooknum]; 
 	     i = i->next) {
 		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
 			break;
 	}
-	list_add(&reg->list, i->prev);
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	list_add_rcu(&reg->list, i->prev);
+	spin_unlock_bh(&nf_hook_lock);
+
+	synchronize_kernel();
 	return 0;
 }
 
 void nf_unregister_hook(struct nf_hook_ops *reg)
 {
-	br_write_lock_bh(BR_NETPROTO_LOCK);
-	list_del(&reg->list);
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	spin_lock_bh(&nf_hook_lock);
+	list_del_rcu(&reg->list);
+	spin_unlock_bh(&nf_hook_lock);
+
+	/* wait for all receivers that may still be using hook */
+	synchronize_kernel();
 }
 
 /* Do exclusive ranges overlap? */
@@ -335,72 +344,30 @@ int nf_getsockopt(struct sock *sk, int p
 	return nf_sockopt(sk, pf, val, opt, len, 1);
 }
 
-static unsigned int nf_iterate(struct list_head *head,
-			       struct sk_buff **skb,
-			       int hook,
-			       const struct net_device *indev,
-			       const struct net_device *outdev,
-			       struct list_head **i,
-			       int (*okfn)(struct sk_buff *),
-			       int hook_thresh)
-{
-	for (*i = (*i)->next; *i != head; *i = (*i)->next) {
-		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
-		if (hook_thresh > elem->priority)
-			continue;
-
-		switch (elem->hook(hook, skb, indev, outdev, okfn)) {
-		case NF_QUEUE:
-			return NF_QUEUE;
-
-		case NF_STOLEN:
-			return NF_STOLEN;
-
-		case NF_DROP:
-			return NF_DROP;
-
-		case NF_REPEAT:
-			*i = (*i)->prev;
-			break;
-
-#ifdef CONFIG_NETFILTER_DEBUG
-		case NF_ACCEPT:
-			break;
-
-		default:
-			NFDEBUG("Evil return from %p(%u).\n", 
-				elem->hook, hook);
-#endif
-		}
-	}
-	return NF_ACCEPT;
-}
-
 int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
 {      
 	int ret;
 
-	br_write_lock_bh(BR_NETPROTO_LOCK);
+	write_lock_bh(&queue_handler_lock);
 	if (queue_handler[pf].outfn)
 		ret = -EBUSY;
 	else {
-		queue_handler[pf].outfn = outfn;
 		queue_handler[pf].data = data;
+		queue_handler[pf].outfn = outfn;
 		ret = 0;
 	}
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
-
+	write_unlock_bh(&queue_handler_lock);
 	return ret;
 }
 
 /* The caller must flush their queue before this */
 int nf_unregister_queue_handler(int pf)
 {
-	br_write_lock_bh(BR_NETPROTO_LOCK);
+	write_lock_bh(&queue_handler_lock);
 	queue_handler[pf].outfn = NULL;
 	queue_handler[pf].data = NULL;
-	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	write_unlock_bh(&queue_handler_lock);
+
 	return 0;
 }
 
@@ -422,9 +389,10 @@ static void nf_queue(struct sk_buff *skb
 	struct net_device *physoutdev = NULL;
 #endif
 
+	read_lock(&queue_handler_lock);
 	if (!queue_handler[pf].outfn) {
 		kfree_skb(skb);
-		return;
+		goto out;
 	}
 
 	info = kmalloc(sizeof(*info), GFP_ATOMIC);
@@ -433,7 +401,7 @@ static void nf_queue(struct sk_buff *skb
 			printk(KERN_ERR "OOM queueing packet %p\n",
 			       skb);
 		kfree_skb(skb);
-		return;
+		goto out;
 	}
 
 	*info = (struct nf_info) { 
@@ -463,8 +431,9 @@ static void nf_queue(struct sk_buff *skb
 #endif
 		kfree(info);
 		kfree_skb(skb);
-		return;
 	}
+ out:
+	read_unlock(&queue_handler_lock);
 }
 
 int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
@@ -474,6 +443,7 @@ int nf_hook_slow(int pf, unsigned int ho
 		 int hook_thresh)
 {
 	struct list_head *elem;
+	const struct nf_hook_ops *ops = NULL;
 	unsigned int verdict;
 	int ret = 0;
 
@@ -490,9 +460,7 @@ int nf_hook_slow(int pf, unsigned int ho
 		}
 	}
 
-	/* We may already have this, but read-locks nest anyway */
-	br_read_lock_bh(BR_NETPROTO_LOCK);
-
+	rcu_read_lock();
 #ifdef CONFIG_NETFILTER_DEBUG
 	if (skb->nf_debug & (1 << hook)) {
 		printk("nf_hook: hook %i already set.\n", hook);
@@ -501,15 +469,26 @@ int nf_hook_slow(int pf, unsigned int ho
 	skb->nf_debug |= (1 << hook);
 #endif
 
-	elem = &nf_hooks[pf][hook];
-	verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev,
-			     outdev, &elem, okfn, hook_thresh);
-	if (verdict == NF_QUEUE) {
-		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
-		nf_queue(skb, elem, pf, hook, indev, outdev, okfn);
+	verdict = NF_ACCEPT;
+	list_for_each_rcu(elem, &nf_hooks[pf][hook]) {
+		ops = list_entry(elem, const struct nf_hook_ops, list);
+
+		if (hook_thresh > ops->priority)
+			continue;
+
+		verdict = ops->hook(hook, &skb, indev, outdev, okfn);
+		if (verdict != NF_ACCEPT)
+			goto out;
+
 	}
 
+ out:;
 	switch (verdict) {
+	case NF_QUEUE:
+		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
+		nf_queue(skb, elem, pf, hook, indev, outdev, okfn);
+		break;
+
 	case NF_ACCEPT:
 		ret = okfn(skb);
 		break;
@@ -518,9 +497,14 @@ int nf_hook_slow(int pf, unsigned int ho
 		kfree_skb(skb);
 		ret = -EPERM;
 		break;
+#ifdef CONFIG_NETFILTER_DEBUG
+	default:
+		NFDEBUG("Evil return %d from %p(%u).\n", verdict,
+			ops->hook, hook);
+#endif
 	}
 
-	br_read_unlock_bh(BR_NETPROTO_LOCK);
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -528,48 +512,61 @@ void nf_reinject(struct sk_buff *skb, st
 		 unsigned int verdict)
 {
 	struct list_head *elem = &info->elem->list;
-	struct list_head *i;
+	const struct list_head *head = &nf_hooks[info->pf][info->hook];
+	const struct list_head *i;
 
-	/* We don't have BR_NETPROTO_LOCK here */
-	br_read_lock_bh(BR_NETPROTO_LOCK);
-	for (i = nf_hooks[info->pf][info->hook].next; i != elem; i = i->next) {
-		if (i == &nf_hooks[info->pf][info->hook]) {
-			/* The module which sent it to userspace is gone. */
-			NFDEBUG("%s: module disappeared, dropping packet.\n",
-			         __FUNCTION__);
-			verdict = NF_DROP;
-			break;
+	rcu_read_lock();
+	list_for_each_rcu(i, head) {
+		/* traverse loop until we find elem */
+		if (elem) {
+			if (i != elem) 
+				continue;
+
+			elem = NULL;
+			/* Continue and process next element */
+			if (verdict == NF_ACCEPT) 
+				continue;
+
+			/* Process this element again */
+			if (verdict == NF_REPEAT) 
+				verdict = NF_ACCEPT;
 		}
-	}
 
-	/* Continue traversal iff userspace said ok... */
-	if (verdict == NF_REPEAT) {
-		elem = elem->prev;
-		verdict = NF_ACCEPT;
-	}
+		if (verdict == NF_ACCEPT) {
+			const struct nf_hook_ops *ops
+				= list_entry(elem, typeof(*ops), list);
+
+			verdict = ops->hook(info->hook, &skb, 
+					    info->indev, info->outdev, 
+					    info->okfn);
+		}
 
-	if (verdict == NF_ACCEPT) {
-		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
-				     &skb, info->hook, 
-				     info->indev, info->outdev, &elem,
-				     info->okfn, INT_MIN);
-	}
+		switch (verdict) {
+		case NF_ACCEPT:
+			info->okfn(skb);
+			break;
 
-	switch (verdict) {
-	case NF_ACCEPT:
-		info->okfn(skb);
-		break;
+		case NF_QUEUE:
+			nf_queue(skb, elem, info->pf, info->hook, 
+				 info->indev, info->outdev, info->okfn);
+			break;
 
-	case NF_QUEUE:
-		nf_queue(skb, elem, info->pf, info->hook, 
-			 info->indev, info->outdev, info->okfn);
+		case NF_DROP:
+			kfree_skb(skb);
+			break;
+		}
 		break;
+	}
 
-	case NF_DROP:
+	if (elem) {
+		/* The module which sent it to userspace is gone. */
+		NFDEBUG("%s: module disappeared, dropping packet.\n",
+			__FUNCTION__);
 		kfree_skb(skb);
-		break;
 	}
-	br_read_unlock_bh(BR_NETPROTO_LOCK);
+
+
+	rcu_read_unlock();
 
 	/* Release those devices we held, or Alexey will kill me. */
 	if (info->indev) dev_put(info->indev);

_