From: Dave McCracken <dmccr@us.ibm.com>

This patch solves the race between truncate and page in which can cause stray
anon pages to appear in the truncated region.

The race occurs when a process is sleeping in pagein IO during the truncate:
there's a window after checking i_size in which the paging-in process decides
that the page was an OK one.

This leaves an anon page in the pagetables, and if the file is subsequently
extended we have an anon page floating about inside a file-backed mmap - user
modifications will not be written out.

Apparently this is also needed for the implementation of POSIX semantics for
distributed filesystems.

We use a generation counter in the address_space so the paging-in process can
determine whether there was a truncate which might have shot the new page
down.

It's a bit grubby to be playing with files and inodes in do_no_page(), but we
do need the page_table_lock coverage for this, and rearranging things to
provide that coverage to filemap_nopage wasn't very nice either.


 drivers/mtd/devices/blkmtd.c |    1 +
 fs/inode.c                   |    1 +
 include/linux/fs.h           |    1 +
 mm/memory.c                  |   17 +++++++++++++++++
 mm/swap_state.c              |    1 +
 5 files changed, 21 insertions(+)

diff -puN drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix drivers/mtd/devices/blkmtd.c
--- 25/drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix	2003-06-17 10:08:11.000000000 -0700
+++ 25-akpm/drivers/mtd/devices/blkmtd.c	2003-06-17 10:08:11.000000000 -0700
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
   INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
   mtd_rawdevice->as.host = NULL;
   init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+  atomic_set(&(mtd_rawdevice->as.truncate_count), 0);
 
   mtd_rawdevice->as.a_ops = &blkmtd_aops;
   INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
diff -puN fs/inode.c~truncate-pagefault-race-fix fs/inode.c
--- 25/fs/inode.c~truncate-pagefault-race-fix	2003-06-17 10:08:11.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-06-17 10:08:11.000000000 -0700
@@ -184,6 +184,7 @@ void inode_init_once(struct inode *inode
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
diff -puN include/linux/fs.h~truncate-pagefault-race-fix include/linux/fs.h
--- 25/include/linux/fs.h~truncate-pagefault-race-fix	2003-06-17 10:08:11.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-06-17 10:08:11.000000000 -0700
@@ -323,6 +323,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN mm/memory.c~truncate-pagefault-race-fix mm/memory.c
--- 25/mm/memory.c~truncate-pagefault-race-fix	2003-06-17 10:08:11.000000000 -0700
+++ 25-akpm/mm/memory.c	2003-06-17 10:08:11.000000000 -0700
@@ -1138,6 +1138,8 @@ void invalidate_mmap_range(struct addres
 			hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1390,8 +1392,10 @@ do_no_page(struct mm_struct *mm, struct 
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	int sequence;
 	int ret;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1400,6 +1404,9 @@ do_no_page(struct mm_struct *mm, struct 
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+retry:
+	sequence = atomic_read(&mapping->truncate_count);
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
 	/* no page was available -- either SIGBUS or OOM */
@@ -1428,6 +1435,16 @@ do_no_page(struct mm_struct *mm, struct 
 	}
 
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If invalidate_mmap_range got called,
+	 * retry getting the page.
+	 */
+	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
 	page_table = pte_offset_map(pmd, address);
 
 	/*
diff -puN mm/swap_state.c~truncate-pagefault-race-fix mm/swap_state.c
--- 25/mm/swap_state.c~truncate-pagefault-race-fix	2003-06-17 10:08:11.000000000 -0700
+++ 25-akpm/mm/swap_state.c	2003-06-17 10:08:11.000000000 -0700
@@ -44,6 +44,7 @@ struct address_space swapper_space = {
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_count  = ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };

_