From: Manfred Spraul Manfred's latest page unmapping debug patch. The patch adds support for a special debug mode to both the page and the slab allocator: Unused pages are removed from the kernel linear mapping. This means that now any access to freed memory will cause an immediate exception. Right now, read accesses remain totally unnoticed and write accesses may be catched by the slab poisoning, but usually far too late for a meaningfull bug report. The implementation is based on a new arch dependant function, kernel_map_pages(), that removes the pages from the linear mapping. It's right now only implemented for i386. Changelog: - Add kernel_map_pages() for i386, based on change_page_attr. If DEBUG_PAGEALLOC is not set, then the function is an empty stub. The stub is in , i.e. it exists for all archs. - Make change_page_attr irq safe. Note that it's not fully irq safe due to the lack of the tlb flush ipi, but it's good enough for kernel_map_pages(). Another problem is that kernel_map_pages is not permitted to fail, thus PSE is disabled if DEBUG_PAGEALLOC is enabled - use kernel_map pages for the page allocator. - use kernel_map_pages for the slab allocator. I couldn't resist and added additional debugging support into mm/slab.c: * at kfree time, the complete backtrace of the kfree caller is stored in the freed object. * a ptrinfo() function that dumps all known data about a kernel virtual address: the pte value, if it belongs to a slab cache the cache name and additional info. * merging of common code: new helper function obj_dbglen and obj_dbghdr for the conversion between the user visible object pointers/len and the actual, internal addresses and len values. arch/i386/Kconfig | 8 + arch/i386/kernel/cpu/common.c | 8 + arch/i386/mm/pageattr.c | 82 ++++++++------- include/asm-i386/cacheflush.h | 5 include/linux/mm.h | 8 + include/linux/slab.h | 2 mm/page_alloc.c | 15 ++ mm/slab.c | 218 ++++++++++++++++++++++++++++++++++-------- 8 files changed, 267 insertions(+), 79 deletions(-) diff -puN arch/i386/Kconfig~3-unmap-page-debugging arch/i386/Kconfig --- 25/arch/i386/Kconfig~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/arch/i386/Kconfig 2003-06-29 11:33:06.000000000 -0700 @@ -1390,6 +1390,14 @@ config SPINLINE itself (as ".text.lock.filename"). This can be helpful for finding the callers of locks. +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM diff -puN arch/i386/kernel/cpu/common.c~3-unmap-page-debugging arch/i386/kernel/cpu/common.c --- 25/arch/i386/kernel/cpu/common.c~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/common.c 2003-06-29 00:05:22.000000000 -0700 @@ -430,6 +430,14 @@ void __init early_cpu_init(void) rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif } /* * cpu_init() initializes state that is per-CPU. Some data is already diff -puN arch/i386/mm/pageattr.c~3-unmap-page-debugging arch/i386/mm/pageattr.c --- 25/arch/i386/mm/pageattr.c~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/arch/i386/mm/pageattr.c 2003-06-29 11:35:48.000000000 -0700 @@ -13,6 +13,10 @@ #include #include +static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED; +static struct list_head df_list = LIST_HEAD_INIT(df_list); + + static inline pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); @@ -31,10 +35,15 @@ static struct page *split_large_page(uns { int i; unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); + struct page *base; pte_t *pbase; + + spin_unlock_irq(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + spin_lock_irq(&cpa_lock); if (!base) return NULL; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -87,7 +96,7 @@ static inline void revert_page(struct pa } static int -__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage) +__change_page_attr(struct page *page, pgprot_t prot) { pte_t *kpte; unsigned long address; @@ -123,7 +132,7 @@ __change_page_attr(struct page *page, pg } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - *oldpage = kpte_page; + list_add(&kpte_page->list, &df_list); revert_page(kpte_page, address); } return 0; @@ -134,12 +143,6 @@ static inline void flush_map(void) on_each_cpu(flush_kernel_map, NULL, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ - /* * Change the page attributes of an page in the linear mapping. * @@ -156,47 +159,54 @@ static struct deferred_page *df_list; /* int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int err = 0; - struct page *fpage; int i; + unsigned long flags; - down_write(&init_mm.mmap_sem); + spin_lock_irqsave(&cpa_lock, flags); for (i = 0; i < numpages; i++, page++) { - fpage = NULL; - err = __change_page_attr(page, prot, &fpage); + err = __change_page_attr(page, prot); if (err) break; - if (fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df_list = df; - } - } } - up_write(&init_mm.mmap_sem); + spin_unlock_irqrestore(&cpa_lock, flags); return err; } void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + LIST_HEAD(l); + struct list_head* n; - down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); - up_read(&init_mm.mmap_sem); + BUG_ON(irqs_disabled()); + + spin_lock_irq(&cpa_lock); + list_splice_init(&df_list, &l); + spin_unlock_irq(&cpa_lock); flush_map(); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); - } + n = l.next; + while (n != &l) { + struct page *pg = list_entry(n, struct page, list); + n = n->next; + __free_page(pg); + } } +#ifdef CONFIG_DEBUG_PAGEALLOC +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (page >= highmem_start_page) + return; + /* the return value is ignored - the calls cannot fail, + * large pages are disabled at boot time. + */ + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + /* we should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. + */ + __flush_tlb_all(); +} +EXPORT_SYMBOL(kernel_map_pages); +#endif + EXPORT_SYMBOL(change_page_attr); EXPORT_SYMBOL(global_flush_tlb); diff -puN include/asm-i386/cacheflush.h~3-unmap-page-debugging include/asm-i386/cacheflush.h --- 25/include/asm-i386/cacheflush.h~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/include/asm-i386/cacheflush.h 2003-06-29 00:05:22.000000000 -0700 @@ -17,4 +17,9 @@ void global_flush_tlb(void); int change_page_attr(struct page *page, int numpages, pgprot_t prot); +#ifdef CONFIG_DEBUG_PAGEALLOC +/* internal debugging function */ +void kernel_map_pages(struct page *page, int numpages, int enable); +#endif + #endif /* _I386_CACHEFLUSH_H */ diff -puN include/linux/mm.h~3-unmap-page-debugging include/linux/mm.h --- 25/include/linux/mm.h~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/include/linux/mm.h 2003-06-29 11:33:05.000000000 -0700 @@ -609,5 +609,13 @@ extern struct page * follow_page(struct int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -puN include/linux/slab.h~3-unmap-page-debugging include/linux/slab.h --- 25/include/linux/slab.h~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/include/linux/slab.h 2003-06-29 11:32:59.000000000 -0700 @@ -114,6 +114,8 @@ extern kmem_cache_t *signal_cachep; extern kmem_cache_t *sighand_cachep; extern kmem_cache_t *bio_cachep; +void ptrinfo(unsigned long addr); + #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ diff -puN mm/page_alloc.c~3-unmap-page-debugging mm/page_alloc.c --- 25/mm/page_alloc.c~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/mm/page_alloc.c 2003-06-29 11:33:15.000000000 -0700 @@ -32,6 +32,8 @@ #include #include +#include + DECLARE_BITMAP(node_online_map, MAX_NUMNODES); DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS); struct pglist_data *pgdat_list; @@ -265,6 +267,7 @@ void __free_pages_ok(struct page *page, mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); list_add(&page->list, &list); + kernel_map_pages(page, 1<pageset[get_cpu()].pcp[cold]; @@ -556,7 +560,7 @@ __alloc_pages(unsigned int gfp_mask, uns (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += z->pages_low * sysctl_lower_zone_protection; } @@ -579,7 +583,7 @@ __alloc_pages(unsigned int gfp_mask, uns (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += local_min * sysctl_lower_zone_protection; } @@ -594,7 +598,7 @@ rebalance: page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } goto nopage; } @@ -622,7 +626,7 @@ rebalance: (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += z->pages_low * sysctl_lower_zone_protection; } @@ -653,6 +657,9 @@ nopage: current->comm, order, gfp_mask); } return NULL; +got_pg: + kernel_map_pages(page, 1 << order, 1); + return page; } /* diff -puN mm/slab.c~3-unmap-page-debugging mm/slab.c --- 25/mm/slab.c~3-unmap-page-debugging 2003-06-29 00:05:22.000000000 -0700 +++ 25-akpm/mm/slab.c 2003-06-29 11:32:59.000000000 -0700 @@ -89,7 +89,11 @@ #include #include #include +#include + #include +#include +#include /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, @@ -351,6 +355,34 @@ struct kmem_cache_s { #define POISON_AFTER 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ +static inline int obj_dbghead(kmem_cache_t *cachep) +{ + if (cachep->flags & SLAB_RED_ZONE) + return BYTES_PER_WORD; + return 0; +} + +static inline int obj_dbglen(kmem_cache_t *cachep) +{ + int len = 0; + + if (cachep->flags & SLAB_RED_ZONE) { + len += 2*BYTES_PER_WORD; + } + if (cachep->flags & SLAB_STORE_USER) { + len += BYTES_PER_WORD; + } + return len; +} +#else +static inline int obj_dbghead(kmem_cache_t *cachep) +{ + return 0; +} +static inline int obj_dbglen(kmem_cache_t *cachep) +{ + return 0; +} #endif /* @@ -765,16 +797,45 @@ static inline void kmem_freepages (kmem_ } #if DEBUG -static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller) { - int size = cachep->objsize; - if (cachep->flags & SLAB_RED_ZONE) { - addr += BYTES_PER_WORD; - size -= 2*BYTES_PER_WORD; - } - if (cachep->flags & SLAB_STORE_USER) { - size -= BYTES_PER_WORD; + int size = cachep->objsize-obj_dbglen(cachep); + + addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; + + if (size < 5*sizeof(unsigned long)) + return; + + *addr++=0x12345678; + *addr++=caller; + *addr++=smp_processor_id(); + size -= 3*sizeof(unsigned long); + { + unsigned long *sptr = &caller; + unsigned long svalue; + + while (((long) sptr & (THREAD_SIZE-1)) != 0) { + svalue = *sptr++; + if (kernel_text_address(svalue)) { + *addr++=svalue; + size -= sizeof(unsigned long); + if (size <= sizeof(unsigned long)) + break; + } + } + } + *addr++=0x87654321; +} +#endif + +static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) +{ + int size = cachep->objsize-obj_dbglen(cachep); + addr = &((char*)addr)[obj_dbghead(cachep)]; + memset(addr, val, size); *(unsigned char *)(addr+size-1) = POISON_END; } @@ -796,15 +857,11 @@ static void *scan_poisoned_obj(unsigned static void check_poison_obj(kmem_cache_t *cachep, void *addr) { - int size = cachep->objsize; void *end; - if (cachep->flags & SLAB_RED_ZONE) { - addr += BYTES_PER_WORD; - size -= 2*BYTES_PER_WORD; - } - if (cachep->flags & SLAB_STORE_USER) { - size -= BYTES_PER_WORD; - } + int size = cachep->objsize-obj_dbglen(cachep); + + addr = &((char*)addr)[obj_dbghead(cachep)]; + end = scan_poisoned_obj(addr, size); if (end) { int s; @@ -858,8 +915,16 @@ static void slab_destroy (kmem_cache_t * void *objp = slabp->s_mem + cachep->objsize * i; int objlen = cachep->objsize; - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); + else + check_poison_obj(cachep, objp); +#else check_poison_obj(cachep, objp); +#endif + } if (cachep->flags & SLAB_STORE_USER) objlen -= BYTES_PER_WORD; @@ -952,6 +1017,10 @@ kmem_cache_create (const char *name, siz } #if FORCED_DEBUG +#ifdef CONFIG_DEBUG_PAGEALLOC + if (size < PAGE_SIZE-3*BYTES_PER_WORD && size > 128) + size = PAGE_SIZE-3*BYTES_PER_WORD; +#endif /* * Enable redzoning and last user accounting, except * - for caches with forced alignment: redzoning would violate the @@ -1404,6 +1473,8 @@ static void cache_init_objs (kmem_cache_ slab_error(cachep, "constructor overwrote the" " start of an object"); } + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp, cachep, ctor_flags); @@ -1584,25 +1655,28 @@ static inline void *cache_free_debugchec * caller can perform a verify of its state (debugging). * Called without the cache-lock held. */ - if (cachep->flags & SLAB_RED_ZONE) { - cachep->ctor(objp+BYTES_PER_WORD, + cachep->ctor(objp+obj_dbghead(cachep), cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); - } else { - cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); - } } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback */ - if (cachep->flags & SLAB_RED_ZONE) - cachep->dtor(objp+BYTES_PER_WORD, cachep, 0); - else - cachep->dtor(objp, cachep, 0); + cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); } - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + store_stackinfo(cachep, objp, POISON_AFTER); + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + } else { + poison_obj(cachep, objp, POISON_AFTER); + } +#else poison_obj(cachep, objp, POISON_AFTER); #endif + } +#endif return objp; } @@ -1617,6 +1691,7 @@ static inline void check_slabp(kmem_cach for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { entries++; BUG_ON(entries > cachep->num); + BUG_ON(i < 0 || i >= cachep->num); } BUG_ON(entries != cachep->num - slabp->inuse); #endif @@ -1746,9 +1821,16 @@ cache_alloc_debugcheck_after(kmem_cache_ if (!objp) return objp; - if (cachep->flags & SLAB_POISON) { + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else check_poison_obj(cachep, objp); - poison_obj(cachep, objp, POISON_BEFORE); +#endif + poison_obj(cachep, objp, POISON_BEFORE); } if (cachep->flags & SLAB_STORE_USER) { objlen -= BYTES_PER_WORD; @@ -2085,16 +2167,7 @@ free_percpu(const void *objp) unsigned int kmem_cache_size(kmem_cache_t *cachep) { - unsigned int objlen = cachep->objsize; - -#if DEBUG - if (cachep->flags & SLAB_RED_ZONE) - objlen -= 2*BYTES_PER_WORD; - if (cachep->flags & SLAB_STORE_USER) - objlen -= BYTES_PER_WORD; -#endif - - return objlen; + return cachep->objsize-obj_dbglen(cachep); } kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) @@ -2626,3 +2699,70 @@ unsigned int ksize(const void *objp) return size; } +void ptrinfo(unsigned long addr) +{ + struct page *page; + + printk("Dumping data about address %p.\n", (void*)addr); + if (!virt_addr_valid((void*)addr)) { + printk("virt addr invalid.\n"); + return; + } + do { + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd; + if (pgd_none(*pgd)) { + printk("No pgd.\n"); + break; + } + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) { + printk("No pmd.\n"); + break; + } +#ifdef CONFIG_X86 + if (pmd_large(*pmd)) { + printk("Large page.\n"); + break; + } +#endif + printk("normal page, pte_val 0x%llx\n", + (unsigned long long)pte_val(*pte_offset_kernel(pmd, addr))); + } while(0); + + page = virt_to_page((void*)addr); + printk("struct page at %p, flags %lxh.\n", page, page->flags); + if (PageSlab(page)) { + kmem_cache_t *c; + struct slab *s; + unsigned long flags; + int objnr; + void *objp; + + c = GET_PAGE_CACHE(page); + printk("belongs to cache %s.\n",c->name); + + spin_lock_irqsave(&c->spinlock, flags); + s = GET_PAGE_SLAB(page); + printk("slabp %p with %d inuse objects (from %d).\n", + s, s->inuse, c->num); + check_slabp(c,s); + + objnr = (addr-(unsigned long)s->s_mem)/c->objsize; + objp = s->s_mem+c->objsize*objnr; + printk("points into object no %d, starting at %p, len %d.\n", + objnr, objp, c->objsize); + if (objnr >= c->num) { + printk("Bad obj number.\n"); + } else { + kernel_map_pages(virt_to_page(objp), c->objsize/PAGE_SIZE, 1); + + printk("redzone: %lxh/%lxh/%lxh.\n", + ((unsigned long*)objp)[0], + ((unsigned long*)(objp+c->objsize))[-2], + ((unsigned long*)(objp+c->objsize))[-1]); + } + spin_unlock_irqrestore(&c->spinlock, flags); + + } +} _