Diffstat (limited to 'mm/filemap.c')
-rw-r--r--   mm/filemap.c   577
1 files changed, 570 insertions, 7 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index b5febc2e5..3fb7d011c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -25,6 +25,7 @@
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
+#include <asm/mman.h>
 
 #include <linux/highmem.h>
@@ -220,15 +221,18 @@ int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 	struct list_head * page_lru, * dispose;
 	struct page * page;
 
+	if (!zone)
+		BUG();
+
 	count = nr_lru_pages / (priority+1);
 
 	spin_lock(&pagemap_lru_lock);
-	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+	while (count > 0 && (page_lru = zone->lru_cache.prev) != &zone->lru_cache) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
-		dispose = &lru_cache;
+		dispose = &zone->lru_cache;
 		if (test_and_clear_bit(PG_referenced, &page->flags))
 			/* Roll the page at the top of the lru list,
 			 * we could also be more aggressive putting
@@ -355,8 +359,8 @@ made_buffer_progress:
 		nr_lru_pages--;
 
 out:
-	list_splice(&young, &lru_cache);
-	list_splice(&old, lru_cache.prev);
+	list_splice(&young, &zone->lru_cache);
+	list_splice(&old, zone->lru_cache.prev);
 
 	spin_unlock(&pagemap_lru_lock);
@@ -1294,6 +1298,61 @@ out:
 }
 
 /*
+ * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
+ * sure this is sequential access, we don't need a flexible read-ahead
+ * window size -- we can always use a large fixed size window.
+ */
+static void nopage_sequential_readahead(struct vm_area_struct * vma,
+	unsigned long pgoff, unsigned long filesize)
+{
+	unsigned long ra_window;
+
+	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
+	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
+
+	/* vm_raend is zero if we haven't read ahead in this area yet. */
+	if (vma->vm_raend == 0)
+		vma->vm_raend = vma->vm_pgoff + ra_window;
+
+	/*
+	 * If we've just faulted the page half-way through our window,
+	 * then schedule reads for the next window, and release the
+	 * pages in the previous window.
+	 */
+	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
+		unsigned long start = vma->vm_pgoff + vma->vm_raend;
+		unsigned long end = start + ra_window;
+
+		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
+			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
+		if (start > end)
+			return;
+
+		while ((start < end) && (start < filesize)) {
+			if (read_cluster_nonblocking(vma->vm_file,
+							start, filesize) < 0)
+				break;
+			start += CLUSTER_PAGES;
+		}
+		run_task_queue(&tq_disk);
+
+		/* if we're far enough past the beginning of this area,
+		   recycle pages that are in the previous window. */
+		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
+			unsigned long window = ra_window << PAGE_SHIFT;
+
+			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
+			end -= window + window;
+			filemap_sync(vma, end - window, window, MS_INVALIDATE);
+		}
+
+		vma->vm_raend += ra_window;
+	}
+
+	return;
+}
+
+/*
  * filemap_nopage() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
@@ -1339,6 +1398,12 @@ retry_find:
 		goto page_not_uptodate;
 
 success:
+	/*
+	 * Try read-ahead for sequential areas.
+	 */
+	if (VM_SequentialReadHint(area))
+		nopage_sequential_readahead(area, pgoff, size);
+
 	/*
 	 * Found the page and have a reference on it, need to check sharing
 	 * and possibly copy it over to another page..
@@ -1355,7 +1420,7 @@ success:
 		page_cache_release(page);
 		return new_page;
 	}
-
+
 	flush_page_to_ram(old_page);
 	return old_page;
 
@@ -1367,7 +1432,7 @@ no_cached_page:
 	 * Otherwise, we're off the end of a privately mapped file,
 	 * so we need to map a zero page.
 	 */
-	if (pgoff < size)
+	if ((pgoff < size) && !VM_RandomReadHint(area))
 		error = read_cluster_nonblocking(file, pgoff, size);
 	else
 		error = page_cache_read(file, pgoff);
@@ -1646,7 +1711,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 	return 0;
 }
 
-
 /*
  * The msync() system call.
  */
@@ -1727,6 +1791,505 @@ out:
 	return error;
 }
 
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+	int behavior)
+{
+	VM_ClearReadHint(vma);
+	switch(behavior) {
+		case MADV_SEQUENTIAL:
+			vma->vm_flags |= VM_SEQ_READ;
+			break;
+		case MADV_RANDOM:
+			vma->vm_flags |= VM_RAND_READ;
+			break;
+		default:
+			break;
+	}
+	return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+	unsigned long end, int behavior)
+{
+	struct vm_area_struct * n;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_end = end;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_start = end;
+	insert_vm_struct(current->mm, n);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+	unsigned long start, int behavior)
+{
+	struct vm_area_struct * n;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_start = start;
+	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_end = start;
+	insert_vm_struct(current->mm, n);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	struct vm_area_struct * left, * right;
+
+	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!left)
+		return -EAGAIN;
+	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!right) {
+		kmem_cache_free(vm_area_cachep, left);
+		return -EAGAIN;
+	}
+	*left = *vma;
+	*right = *vma;
+	left->vm_end = start;
+	right->vm_start = end;
+	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
+	atomic_add(2, &vma->vm_file->f_count);
+
+	if (vma->vm_ops && vma->vm_ops->open) {
+		vma->vm_ops->open(left);
+		vma->vm_ops->open(right);
+	}
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_start = start;
+	vma->vm_end = end;
+	setup_read_behavior(vma, behavior);
+	vma->vm_raend = 0;
+	insert_vm_struct(current->mm, left);
+	insert_vm_struct(current->mm, right);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	int error = 0;
+
+	/* This caps the number of vma's this process can own */
+	if (vma->vm_mm->map_count > MAX_MAP_COUNT)
+		return -ENOMEM;
+
+	if (start == vma->vm_start) {
+		if (end == vma->vm_end) {
+			setup_read_behavior(vma, behavior);
+			vma->vm_raend = 0;
+		} else
+			error = madvise_fixup_start(vma, end, behavior);
+	} else {
+		if (end == vma->vm_end)
+			error = madvise_fixup_end(vma, start, behavior);
+		else
+			error = madvise_fixup_middle(vma, start, end, behavior);
+	}
+
+	return error;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started.  Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	long error = -EBADF;
+	struct file * file;
+	unsigned long size, rlim_rss;
+
+	/* Doesn't work if there's no mapped file. */
+	if (!vma->vm_file)
+		return error;
+	file = vma->vm_file;
+	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	/* Make sure this doesn't exceed the process's max rss. */
+	error = -EIO;
+	rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+				LONG_MAX; /* default: see resource.h */
+	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+		return error;
+
+	/* round to cluster boundaries if this isn't a "random" area. */
+	if (!VM_RandomReadHint(vma)) {
+		start = CLUSTER_OFFSET(start);
+		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
+
+		while ((start < end) && (start < size)) {
+			error = read_cluster_nonblocking(file, start, size);
+			start += CLUSTER_PAGES;
+			if (error < 0)
+				break;
+		}
+	} else {
+		while ((start < end) && (start < size)) {
+			error = page_cache_read(file, start);
+			start++;
+			if (error < 0)
+				break;
+		}
+	}
+
+	/* Don't wait for someone else to push these requests. */
+	run_task_queue(&tq_disk);
+
+	return error;
+}
+
+/*
+ * Application no longer needs these pages.  If the pages are dirty,
+ * it's OK to just throw them away.  The app will be more careful about
+ * data it wants to keep.  Be sure to free swap resources too.  The
+ * zap_page_range call sets things up for shrink_mmap to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * shrink_mmap to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do.  This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them.  There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+
+	lock_kernel();	/* is this really necessary? */
+
+	flush_cache_range(vma->vm_mm, start, end);
+	zap_page_range(vma->vm_mm, start, end - start);
+	flush_tlb_range(vma->vm_mm, start, end);
+
+	unlock_kernel();
+	return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+	unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	down(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up(&current->mm->mmap_sem);
+	return error;
+}
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec" */
+	remaining = (end - start);
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up(&current->mm->mmap_sem);
+	return error;
+}
+
 struct page *read_cache_page(struct address_space *mapping,
 				unsigned long index,
 				int (*filler)(void *,struct page*),
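Note (not part of the patch): the hunks above only add the kernel side of madvise(2). As an illustration of how the new read hints are meant to be driven from user space, here is a minimal sketch that maps a file, declares a sequential access pattern, and asks for an initial prefetch. It assumes a C library that exposes a madvise() wrapper and the MADV_* constants via <sys/mman.h>; on a libc contemporary with this patch the call might have to be issued through syscall() with the architecture's madvise syscall number instead. File name, sizes, and error handling are purely illustrative.

	/* Illustrative user-space sketch: sequential scan with madvise hints. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct stat st;
		char *map;
		long sum = 0;
		size_t i, prefetch;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || fstat(fd, &st) < 0) {
			perror("open/fstat");
			return 1;
		}
		map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
		if (map == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/* One front-to-back scan: ask for aggressive read-ahead and
		 * early reclaim of the pages behind us (MADV_SEQUENTIAL). */
		if (madvise(map, st.st_size, MADV_SEQUENTIAL) < 0)
			perror("madvise(MADV_SEQUENTIAL)"); /* advice only; safe to ignore */

		/* Start paging in the first chunk right away (MADV_WILLNEED). */
		prefetch = st.st_size < (1 << 20) ? (size_t) st.st_size : (1 << 20);
		if (madvise(map, prefetch, MADV_WILLNEED) < 0)
			perror("madvise(MADV_WILLNEED)");

		for (i = 0; i < (size_t) st.st_size; i++)
			sum += map[i];
		printf("checksum: %ld\n", sum);

		munmap(map, st.st_size);
		close(fd);
		return 0;
	}

With MADV_SEQUENTIAL set, the fault path above (filemap_nopage -> nopage_sequential_readahead) keeps a fixed-size window ahead of the faulting page and invalidates the window already left behind, which is exactly the behavior this sketch is requesting.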
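A similar user-space sketch for the new mincore(2) call, again not part of the patch: it reports how many pages of a mapped file are currently resident, using one status byte per page with bit 0 set for resident pages, as described in the sys_mincore comment. It assumes a mincore() wrapper with the Linux prototype (unsigned char *vec); the program structure and names are illustrative only.

	/* Illustrative user-space sketch: residency report via mincore(). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct stat st;
		long page = sysconf(_SC_PAGESIZE);
		size_t pages, i, resident = 0;
		unsigned char *vec;
		char *map;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0 || fstat(fd, &st) < 0) {
			perror("open/fstat");
			return 1;
		}
		map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
		if (map == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		pages = (st.st_size + page - 1) / page;
		vec = malloc(pages);		/* one byte per page */
		if (!vec)
			return 1;

		if (mincore(map, st.st_size, vec) < 0) {
			perror("mincore");
			return 1;
		}
		for (i = 0; i < pages; i++)
			if (vec[i] & 1)
				resident++;
		printf("%zu of %zu pages resident\n", resident, pages);

		free(vec);
		munmap(map, st.st_size);
		close(fd);
		return 0;
	}

As the patch comment notes, the result can be stale by the time it reaches the caller; only locked pages are guaranteed to stay resident.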