From: Manfred Spraul <manfred@colorfullife.com>

Manfred's latest page unmapping debug patch.

It adds CONFIG_DEBUG_PAGEALLOC (i386, depends on DEBUG_SLAB): pages are
unmapped from the kernel linear mapping when they are freed and mapped back
in when they are allocated again, so a stray access to freed memory oopses
immediately instead of silently corrupting whatever the page is reused for.

- add kernel_map_pages(), built on change_page_attr().  PSE is disabled at
  boot because large pages cannot be unmapped a page at a time.
- convert change_page_attr()/global_flush_tlb() from init_mm.mmap_sem to an
  irq-safe spinlock plus a list_head based deferred-free list, so that
  kernel_map_pages() can be called from the page allocator's free paths.
- hook kernel_map_pages() into __free_pages_ok(), free_hot_cold_page() and
  the __alloc_pages() success path.
- under DEBUG_PAGEALLOC the slab debug code stores a stack trace in freed
  objects (store_stackinfo()) and unmaps page-sized off-slab objects instead
  of poison-checking them.
- add ptrinfo(), called from the i386 oops path, to dump the page and
  slab/object state for the faulting address; for now the fault handler then
  spins so the output is not lost.

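For illustration only - this snippet is hypothetical, not part of the patch,
and assumes an ordinary kernel context with <linux/mm.h> available - the kind
of bug the unmapping turns into an immediate, diagnosable oops:

	static void debug_pagealloc_demo(void)
	{
		unsigned long addr = __get_free_page(GFP_KERNEL);

		if (!addr)
			return;
		free_page(addr);
		/*
		 * With CONFIG_DEBUG_PAGEALLOC, free_hot_cold_page() has
		 * already called kernel_map_pages(page, 1, 0) and removed
		 * the page from the kernel linear mapping, so this write
		 * faults at once and ptrinfo() dumps the page state from
		 * the i386 fault handler.
		 */
		*(volatile char *)addr = 0;
	}

Slab objects are unmapped the same way only when the object size works out to
a multiple of PAGE_SIZE and the slab management lives off-slab; that is why
FORCED_DEBUG pads objects between 128 bytes and a page up to
PAGE_SIZE-3*BYTES_PER_WORD (a full page once the red zone and last-user word
are added back), while smaller objects still rely on redzoning and the
store_stackinfo() trace.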

 arch/i386/Kconfig             |    8 ++
 arch/i386/kernel/cpu/common.c |    8 ++
 arch/i386/mm/fault.c          |    3 
 arch/i386/mm/pageattr.c       |   80 ++++++++++++-----------
 include/asm-i386/cacheflush.h |    7 ++
 include/linux/slab.h          |    2 
 mm/page_alloc.c               |   15 +++-
 mm/slab.c                     |  145 +++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 225 insertions(+), 43 deletions(-)

diff -puN arch/i386/Kconfig~unmap-page-debugging-2 arch/i386/Kconfig
--- 25/arch/i386/Kconfig~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/arch/i386/Kconfig	2003-06-09 18:16:08.000000000 -0700
@@ -1559,6 +1559,14 @@ config SPINLINE
 	  itself (as ".text.lock.filename"). This can be helpful for finding
 	  the callers of locks.
 
+config DEBUG_PAGEALLOC
+	bool "Page alloc debugging"
+	depends on DEBUG_SLAB
+	help
+	  Unmap pages from the kernel linear mapping after free_pages().
+	  This results in a large slowdown, but helps to find certain types
+	  of memory corruptions.
+
 config DEBUG_HIGHMEM
 	bool "Highmem debugging"
 	depends on DEBUG_KERNEL && HIGHMEM
diff -puN arch/i386/kernel/cpu/common.c~unmap-page-debugging-2 arch/i386/kernel/cpu/common.c
--- 25/arch/i386/kernel/cpu/common.c~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/arch/i386/kernel/cpu/common.c	2003-06-09 18:16:08.000000000 -0700
@@ -430,6 +430,14 @@ void __init early_cpu_init(void)
 	rise_init_cpu();
 	nexgen_init_cpu();
 	umc_init_cpu();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/* pse is not compatible with on-the-fly unmapping,
+	 * disable it even if the cpus claim to support it.
+	 */
+	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+	disable_pse = 1;
+#endif
 }
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
diff -puN arch/i386/mm/fault.c~unmap-page-debugging-2 arch/i386/mm/fault.c
--- 25/arch/i386/mm/fault.c~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/arch/i386/mm/fault.c	2003-06-09 18:16:08.000000000 -0700
@@ -13,6 +13,7 @@
 #include <linux/ptrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/interrupt.h>
@@ -270,7 +271,9 @@ no_context:
 	}
 #endif
 	die("Oops", regs, error_code);
+	ptrinfo(address);
 	bust_spinlocks(0);
+	for (;;); /* debug hack: spin here so the oops and ptrinfo() output are not lost */
 	do_exit(SIGKILL);
 
 /*
diff -puN arch/i386/mm/pageattr.c~unmap-page-debugging-2 arch/i386/mm/pageattr.c
--- 25/arch/i386/mm/pageattr.c~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/arch/i386/mm/pageattr.c	2003-06-09 18:16:08.000000000 -0700
@@ -13,6 +13,10 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 
+static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head df_list = LIST_HEAD_INIT(df_list);
+
+
 static inline pte_t *lookup_address(unsigned long address) 
 { 
 	pgd_t *pgd = pgd_offset_k(address); 
@@ -31,10 +35,15 @@ static struct page *split_large_page(uns
 { 
 	int i; 
 	unsigned long addr;
-	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	struct page *base;
 	pte_t *pbase;
+
+	spin_unlock_irq(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL, 0);
+	spin_lock_irq(&cpa_lock);
 	if (!base) 
 		return NULL;
+
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
@@ -90,7 +99,7 @@ static inline void revert_page(struct pa
 }
 
 static int
-__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage) 
+__change_page_attr(struct page *page, pgprot_t prot) 
 { 
 	pte_t *kpte; 
 	unsigned long address;
@@ -126,7 +135,7 @@ __change_page_attr(struct page *page, pg
 	}
 
 	if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { 
-		*oldpage = kpte_page;
+		list_add(&kpte_page->list, &df_list);
 		revert_page(kpte_page, address);
 	} 
 	return 0;
@@ -137,12 +146,6 @@ static inline void flush_map(void)
 	on_each_cpu(flush_kernel_map, NULL, 1, 1);
 }
 
-struct deferred_page { 
-	struct deferred_page *next; 
-	struct page *fpage;
-}; 
-static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
-
 /*
  * Change the page attributes of an page in the linear mapping.
  *
@@ -159,47 +162,52 @@ static struct deferred_page *df_list; /*
 int change_page_attr(struct page *page, int numpages, pgprot_t prot)
 {
 	int err = 0; 
-	struct page *fpage; 
 	int i; 
+	unsigned long flags;
 
-	down_write(&init_mm.mmap_sem);
+	spin_lock_irqsave(&cpa_lock, flags);
 	for (i = 0; i < numpages; i++, page++) { 
-		fpage = NULL;
-		err = __change_page_attr(page, prot, &fpage); 
+		err = __change_page_attr(page, prot); 
 		if (err) 
 			break; 
-		if (fpage) { 
-			struct deferred_page *df;
-			df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 
-			if (!df) {
-				flush_map();
-				__free_page(fpage);
-			} else { 
-				df->next = df_list;
-				df->fpage = fpage;				
-				df_list = df;
-			} 			
-		} 
 	} 	
-	up_write(&init_mm.mmap_sem); 
+	spin_unlock_irqrestore(&cpa_lock, flags);
 	return err;
 }
 
 void global_flush_tlb(void)
 { 
-	struct deferred_page *df, *next_df;
+	LIST_HEAD(l);
+	struct list_head* n;
 
-	down_read(&init_mm.mmap_sem);
-	df = xchg(&df_list, NULL);
-	up_read(&init_mm.mmap_sem);
+	BUG_ON(irqs_disabled());
+
+	spin_lock_irq(&cpa_lock);
+	list_splice_init(&df_list, &l);
+	spin_unlock_irq(&cpa_lock);
 	flush_map();
-	for (; df; df = next_df) { 
-		next_df = df->next;
-		if (df->fpage) 
-			__free_page(df->fpage);
-		kfree(df);
-	} 
+	n = l.next;
+	while (n != &l) {
+		struct page *pg = list_entry(n, struct page, list);
+		n = n->next;
+		__free_page(pg);
+	}
 } 
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	/* The return value is ignored - the calls cannot fail because
+	 * large pages are disabled at boot time.
+	 */
+	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+	/* we should perform an IPI and flush all tlbs,
+	 * but that can deadlock, so only flush the current cpu.
+	 */
+	__flush_tlb_all();
+}
+EXPORT_SYMBOL(kernel_map_pages);
+#endif
+
 EXPORT_SYMBOL(change_page_attr);
 EXPORT_SYMBOL(global_flush_tlb);
diff -puN include/asm-i386/cacheflush.h~unmap-page-debugging-2 include/asm-i386/cacheflush.h
--- 25/include/asm-i386/cacheflush.h~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/include/asm-i386/cacheflush.h	2003-06-09 18:16:08.000000000 -0700
@@ -17,4 +17,11 @@
 void global_flush_tlb(void); 
 int change_page_attr(struct page *page, int numpages, pgprot_t prot);
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+/* internal debugging function */
+void kernel_map_pages(struct page *page, int numpages, int enable);
+#else
+static inline void kernel_map_pages(struct page *page, int numpages, int enable) { }
+#endif
+
 #endif /* _I386_CACHEFLUSH_H */
diff -puN include/linux/slab.h~unmap-page-debugging-2 include/linux/slab.h
--- 25/include/linux/slab.h~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/include/linux/slab.h	2003-06-09 18:16:08.000000000 -0700
@@ -125,6 +125,8 @@ extern kmem_cache_t	*signal_cachep;
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
+void ptrinfo(unsigned long addr);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_SLAB_H */
diff -puN mm/page_alloc.c~unmap-page-debugging-2 mm/page_alloc.c
--- 25/mm/page_alloc.c~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-06-09 18:16:08.000000000 -0700
@@ -32,6 +32,8 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 
+#include <asm/tlbflush.h>
+
 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
 DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
 struct pglist_data *pgdat_list;
@@ -265,6 +267,7 @@ void __free_pages_ok(struct page *page, 
 	mod_page_state(pgfree, 1 << order);
 	free_pages_check(__FUNCTION__, page);
 	list_add(&page->list, &list);
+	kernel_map_pages(page, 1<<order, 0);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 }
 
@@ -440,6 +443,7 @@ static void free_hot_cold_page(struct pa
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 
+	kernel_map_pages(page, 1, 0);
 	inc_page_state(pgfree);
 	free_pages_check(__FUNCTION__, page);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
@@ -556,7 +560,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -579,7 +583,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += local_min * sysctl_lower_zone_protection;
 	}
@@ -594,7 +598,7 @@ rebalance:
 
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -622,7 +626,7 @@ rebalance:
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -653,6 +657,9 @@ nopage:
 			current->comm, order, gfp_mask);
 	}
 	return NULL;
+got_pg:
+	kernel_map_pages(page, 1 << order, 1);
+	return page;
 }
 
 /*
diff -puN mm/slab.c~unmap-page-debugging-2 mm/slab.c
--- 25/mm/slab.c~unmap-page-debugging-2	2003-06-09 18:16:08.000000000 -0700
+++ 25-akpm/mm/slab.c	2003-06-09 18:16:08.000000000 -0700
@@ -89,7 +89,11 @@
 #include	<linux/notifier.h>
 #include	<linux/kallsyms.h>
 #include	<linux/cpu.h>
+#include	<linux/sysctl.h>
+
 #include	<asm/uaccess.h>
+#include	<asm/cacheflush.h>
+#include	<asm/tlbflush.h>
 
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
@@ -769,6 +773,44 @@ static inline void kmem_freepages (kmem_
 }
 
 #if DEBUG
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller)
+{
+	int size = cachep->objsize;
+	if (cachep->flags & SLAB_RED_ZONE) {
+		addr++;
+		size -= 2*BYTES_PER_WORD;
+	}
+	if (cachep->flags & SLAB_STORE_USER) {
+		size -= BYTES_PER_WORD;
+	}
+	if (size < 5*sizeof(unsigned long))
+		return;
+
+	*addr++=0x12345678;
+	*addr++=caller;
+	*addr++=smp_processor_id();
+	size -= 3*sizeof(unsigned long);
+	{
+		unsigned long *sptr = &caller;
+		unsigned long svalue;
+
+		while (((long) sptr & (THREAD_SIZE-1)) != 0) {
+			svalue = *sptr++;
+			if (kernel_text_address(svalue)) {
+				*addr++=svalue;
+				size -= sizeof(unsigned long);
+				if (size <= sizeof(unsigned long))
+					break;
+			}
+		}
+			
+	}
+	*addr++=0x87654321;
+}
+#endif
+
 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
 {
 	int size = cachep->objsize;
@@ -783,6 +825,8 @@ static void poison_obj(kmem_cache_t *cac
 	*(unsigned char *)(addr+size-1) = POISON_END;
 }
 
+#ifndef CONFIG_DEBUG_PAGEALLOC
+
 static void *scan_poisoned_obj(unsigned char* addr, unsigned int size)
 {
 	unsigned char *end;
@@ -849,6 +893,7 @@ static void check_poison_obj(kmem_cache_
 	}
 }
 #endif
+#endif
 
 /* Destroy all the objs in a slab, and release the mem back to the system.
  * Before calling the slab must have been unlinked from the cache.
@@ -862,8 +907,14 @@ static void slab_destroy (kmem_cache_t *
 		void *objp = slabp->s_mem + cachep->objsize * i;
 		int objlen = cachep->objsize;
 
-		if (cachep->flags & SLAB_POISON)
+		if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+			if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+#else
 			check_poison_obj(cachep, objp);
+#endif
+		}
 		if (cachep->flags & SLAB_STORE_USER)
 			objlen -= BYTES_PER_WORD;
 
@@ -956,6 +1007,10 @@ kmem_cache_create (const char *name, siz
 	}
 
 #if FORCED_DEBUG
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	if (size < PAGE_SIZE-3*BYTES_PER_WORD && size > 128)
+		size = PAGE_SIZE-3*BYTES_PER_WORD;
+#endif
 	/*
 	 * Enable redzoning and last user accounting, except
 	 * - for caches with forced alignment: redzoning would violate the
@@ -1407,6 +1462,8 @@ static void cache_init_objs (kmem_cache_
 				slab_error(cachep, "constructor overwrote the"
 							" start of an object");
 		}
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
 			cachep->ctor(objp, cachep, ctor_flags);
@@ -1603,9 +1660,16 @@ static inline void *cache_free_debugchec
 		else
 			cachep->dtor(objp, cachep, 0);
 	}
-	if (cachep->flags & SLAB_POISON)
+	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		store_stackinfo(cachep, objp, POISON_AFTER);
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+#else
 		poison_obj(cachep, objp, POISON_AFTER);
 #endif
+	}
+#endif
 	return objp;
 }
 
@@ -1620,6 +1684,7 @@ static inline void check_slabp(kmem_cach
 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
 		entries++;
 		BUG_ON(entries > cachep->num);
+		BUG_ON(i < 0 || i >= cachep->num);
 	}
 	BUG_ON(entries != cachep->num - slabp->inuse);
 #endif
@@ -1749,8 +1814,15 @@ cache_alloc_debugcheck_after(kmem_cache_
 
 	if (!objp)	
 		return objp;
-	if (cachep->flags & SLAB_POISON)
+	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+		poison_obj(cachep, objp, POISON_AFTER);
+#else
 		check_poison_obj(cachep, objp);
+#endif
+	}
 	if (cachep->flags & SLAB_STORE_USER) {
 		objlen -= BYTES_PER_WORD;
 		*((void **)(objp+objlen)) = caller;
@@ -2627,3 +2699,70 @@ unsigned int ksize(const void *objp)
 	return size;
 }
 
+void ptrinfo(unsigned long addr)
+{
+	struct page *page;
+
+	printk("Dumping data about address %p.\n", (void*)addr);
+	if (!virt_addr_valid((void*)addr)) {
+		printk("virt addr invalid.\n");
+		return;
+	}
+	do {
+		pgd_t *pgd = pgd_offset_k(addr); 
+		pmd_t *pmd;
+		if (pgd_none(*pgd)) {
+			printk("No pgd.\n");
+			break;
+		}
+		pmd = pmd_offset(pgd, addr); 	       
+		if (pmd_none(*pmd)) {
+			printk("No pmd.\n");
+			break;
+		}
+#ifdef CONFIG_X86
+		if (pmd_large(*pmd)) {
+			printk("Large page.\n");
+			break;
+		}
+#endif
+		printk("normal page, pte_val 0x%llx\n",
+		  (unsigned long long)pte_val(*pte_offset_kernel(pmd, addr)));
+	} while(0);
+
+	page = virt_to_page((void*)addr);
+	printk("struct page at %p, flags %lxh.\n", page, page->flags);
+	if (PageSlab(page)) {
+		kmem_cache_t *c;
+		struct slab *s;
+		unsigned long flags;
+		int objnr;
+		void *objp;
+
+		c = GET_PAGE_CACHE(page);
+		printk("belongs to cache %s.\n",c->name);
+
+		spin_lock_irqsave(&c->spinlock, flags);
+		s = GET_PAGE_SLAB(page);
+		printk("slabp %p with %d inuse objects (from %d).\n",
+			s, s->inuse, c->num);
+		check_slabp(c,s);
+
+		objnr = (addr-(unsigned long)s->s_mem)/c->objsize;
+		objp = s->s_mem+c->objsize*objnr;
+		printk("points into object no %d, starting at %p, len %d.\n",
+			objnr, objp, c->objsize);
+		if (objnr >= c->num) {
+			printk("Bad obj number.\n");
+		} else {
+			kernel_map_pages(virt_to_page(objp), c->objsize/PAGE_SIZE, 1);
+
+			printk("redzone: %lxh/%lxh/%lxh.\n",
+				((unsigned long*)objp)[0],
+				((unsigned long*)(objp+c->objsize))[-2],
+				((unsigned long*)(objp+c->objsize))[-1]);
+		}
+		spin_unlock_irqrestore(&c->spinlock, flags);
+		
+	}
+}

_