Diffstat (limited to 'mm/filemap.c')
 mm/filemap.c | 577 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 570 insertions(+), 7 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index b5febc2e5..3fb7d011c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -25,6 +25,7 @@
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <asm/mman.h>
#include <linux/highmem.h>
@@ -220,15 +221,18 @@ int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
struct list_head * page_lru, * dispose;
struct page * page;
+ if (!zone)
+ BUG();
+
count = nr_lru_pages / (priority+1);
spin_lock(&pagemap_lru_lock);
- while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+ while (count > 0 && (page_lru = zone->lru_cache.prev) != &zone->lru_cache) {
page = list_entry(page_lru, struct page, lru);
list_del(page_lru);
- dispose = &lru_cache;
+ dispose = &zone->lru_cache;
if (test_and_clear_bit(PG_referenced, &page->flags))
/* Roll the page at the top of the lru list,
* we could also be more aggressive putting
@@ -355,8 +359,8 @@ made_buffer_progress:
nr_lru_pages--;
out:
- list_splice(&young, &lru_cache);
- list_splice(&old, lru_cache.prev);
+ list_splice(&young, &zone->lru_cache);
+ list_splice(&old, zone->lru_cache.prev);
spin_unlock(&pagemap_lru_lock);
@@ -1294,6 +1298,61 @@ out:
}
/*
+ * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
+ * sure this is sequential access, we don't need a flexible read-ahead
+ * window size -- we can always use a large fixed size window.
+ */
+static void nopage_sequential_readahead(struct vm_area_struct * vma,
+ unsigned long pgoff, unsigned long filesize)
+{
+ unsigned long ra_window;
+
+ ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
+ ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
+
+ /* vm_raend is zero if we haven't read ahead in this area yet. */
+ if (vma->vm_raend == 0)
+ vma->vm_raend = vma->vm_pgoff + ra_window;
+
+ /*
+ * If we've just faulted the page half-way through our window,
+ * then schedule reads for the next window, and release the
+ * pages in the previous window.
+ */
+ if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
+ unsigned long start = vma->vm_pgoff + vma->vm_raend;
+ unsigned long end = start + ra_window;
+
+ if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
+ end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (start > end)
+ return;
+
+ while ((start < end) && (start < filesize)) {
+ if (read_cluster_nonblocking(vma->vm_file,
+ start, filesize) < 0)
+ break;
+ start += CLUSTER_PAGES;
+ }
+ run_task_queue(&tq_disk);
+
+ /* if we're far enough past the beginning of this area,
+ recycle pages that are in the previous window. */
+ if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
+ unsigned long window = ra_window << PAGE_SHIFT;
+
+ end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
+ end -= window + window;
+ filemap_sync(vma, end - window, window, MS_INVALIDATE);
+ }
+
+ vma->vm_raend += ra_window;
+ }
+
+ return;
+}
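
To make the trigger arithmetic concrete, here is a minimal userspace sketch of the access pattern this path serves: map a file, declare it MADV_SEQUENTIAL, and touch each page once in order. Every major fault landing half-way through the current window should schedule the next window's reads and recycle the window behind it. The file path is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/bigfile", O_RDONLY);	/* hypothetical file */
	struct stat st;
	long pg = sysconf(_SC_PAGESIZE), i, sum = 0;
	char *buf;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (buf == MAP_FAILED)
		return 1;
	if (madvise(buf, st.st_size, MADV_SEQUENTIAL) < 0)
		perror("madvise");		/* advisory only; safe to continue */
	for (i = 0; i < st.st_size; i += pg)	/* touch each page once, in order */
		sum += buf[i];
	munmap(buf, st.st_size);
	close(fd);
	return sum & 1;				/* keep the reads live */
}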
+
+/*
* filemap_nopage() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
@@ -1339,6 +1398,12 @@ retry_find:
goto page_not_uptodate;
success:
+ /*
+ * Try read-ahead for sequential areas.
+ */
+ if (VM_SequentialReadHint(area))
+ nopage_sequential_readahead(area, pgoff, size);
+
/*
* Found the page and have a reference on it, need to check sharing
* and possibly copy it over to another page..
@@ -1355,7 +1420,7 @@ success:
page_cache_release(page);
return new_page;
}
-
+
flush_page_to_ram(old_page);
return old_page;
@@ -1367,7 +1432,7 @@ no_cached_page:
* Otherwise, we're off the end of a privately mapped file,
* so we need to map a zero page.
*/
- if (pgoff < size)
+ if ((pgoff < size) && !VM_RandomReadHint(area))
error = read_cluster_nonblocking(file, pgoff, size);
else
error = page_cache_read(file, pgoff);
@@ -1646,7 +1711,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
return 0;
}
-
/*
* The msync() system call.
*/
@@ -1727,6 +1791,505 @@ out:
return error;
}
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+ int behavior)
+{
+ VM_ClearReadHint(vma);
+ switch(behavior) {
+ case MADV_SEQUENTIAL:
+ vma->vm_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ vma->vm_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+ unsigned long end, int behavior)
+{
+ struct vm_area_struct * n;
+
+ n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ n->vm_end = end;
+ setup_read_behavior(n, behavior);
+ n->vm_raend = 0;
+ get_file(n->vm_file);
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ vmlist_modify_lock(vma->vm_mm);
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+ vma->vm_start = end;
+ insert_vm_struct(current->mm, n);
+ vmlist_modify_unlock(vma->vm_mm);
+ return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+ unsigned long start, int behavior)
+{
+ struct vm_area_struct * n;
+
+ n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ n->vm_start = start;
+ n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+ setup_read_behavior(n, behavior);
+ n->vm_raend = 0;
+ get_file(n->vm_file);
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ vmlist_modify_lock(vma->vm_mm);
+ vma->vm_end = start;
+ insert_vm_struct(current->mm, n);
+ vmlist_modify_unlock(vma->vm_mm);
+ return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int behavior)
+{
+ struct vm_area_struct * left, * right;
+
+ left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!left)
+ return -EAGAIN;
+ right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!right) {
+ kmem_cache_free(vm_area_cachep, left);
+ return -EAGAIN;
+ }
+ *left = *vma;
+ *right = *vma;
+ left->vm_end = start;
+ right->vm_start = end;
+ right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+ left->vm_raend = 0;
+ right->vm_raend = 0;
+ atomic_add(2, &vma->vm_file->f_count);
+
+ if (vma->vm_ops && vma->vm_ops->open) {
+ vma->vm_ops->open(left);
+ vma->vm_ops->open(right);
+ }
+ vmlist_modify_lock(vma->vm_mm);
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+ vma->vm_start = start;
+ vma->vm_end = end;
+ setup_read_behavior(vma, behavior);
+ vma->vm_raend = 0;
+ insert_vm_struct(current->mm, left);
+ insert_vm_struct(current->mm, right);
+ vmlist_modify_unlock(vma->vm_mm);
+ return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int behavior)
+{
+ int error = 0;
+
+ /* This caps the number of vma's this process can own */
+ if (vma->vm_mm->map_count > MAX_MAP_COUNT)
+ return -ENOMEM;
+
+ if (start == vma->vm_start) {
+ if (end == vma->vm_end) {
+ setup_read_behavior(vma, behavior);
+ vma->vm_raend = 0;
+ } else
+ error = madvise_fixup_start(vma, end, behavior);
+ } else {
+ if (end == vma->vm_end)
+ error = madvise_fixup_end(vma, start, behavior);
+ else
+ error = madvise_fixup_middle(vma, start, end, behavior);
+ }
+
+ return error;
+}
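
The splitting above is visible from userspace. As a rough check (whether adjacent VMAs stay separate depends on kernel merging behavior, so treat this as illustrative only), advising only the middle page of a three-page anonymous mapping should leave three entries in /proc/self/maps:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p + pg, pg, MADV_RANDOM) < 0)	/* hits madvise_fixup_middle() */
		perror("madvise");
	system("cat /proc/self/maps");			/* region now spans three VMAs */
	return 0;
}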
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started. Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ long error = -EBADF;
+ struct file * file;
+ unsigned long size, rlim_rss;
+
+ /* Doesn't work if there's no mapped file. */
+ if (!vma->vm_file)
+ return error;
+ file = vma->vm_file;
+ size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ /* Make sure this doesn't exceed the process's max rss. */
+ error = -EIO;
+ rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+ LONG_MAX; /* default: see resource.h */
+ if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+ return error;
+
+ /* round to cluster boundaries if this isn't a "random" area. */
+ if (!VM_RandomReadHint(vma)) {
+ start = CLUSTER_OFFSET(start);
+ end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
+
+ while ((start < end) && (start < size)) {
+ error = read_cluster_nonblocking(file, start, size);
+ start += CLUSTER_PAGES;
+ if (error < 0)
+ break;
+ }
+ } else {
+ while ((start < end) && (start < size)) {
+ error = page_cache_read(file, start);
+ start++;
+ if (error < 0)
+ break;
+ }
+ }
+
+ /* Don't wait for someone else to push these requests. */
+ run_task_queue(&tq_disk);
+
+ return error;
+}
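
A sketch of how an application might use this: MADV_WILLNEED schedules the reads and returns without waiting, so other work can overlap the I/O. The file path is hypothetical.

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/var/data/index.db", O_RDONLY);	/* hypothetical */
	struct stat st;
	char *p;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, st.st_size, MADV_WILLNEED);	/* queue the reads, don't block */
	/* ... overlap other work here while the disk queue drains ... */
	return p[0] & 1;			/* first access, hopefully no fault */
}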
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for shrink_mmap to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * shrink_mmap to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_flags & VM_LOCKED)
+ return -EINVAL;
+
+ lock_kernel(); /* is this really necessary? */
+
+ flush_cache_range(vma->vm_mm, start, end);
+ zap_page_range(vma->vm_mm, start, end - start);
+ flush_tlb_range(vma->vm_mm, start, end);
+
+ unlock_kernel();
+ return 0;
+}
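
The discard semantics documented above can be demonstrated directly: a dirty anonymous page is thrown away rather than written to swap, so the next access faults in a fresh zero page. A minimal sketch:

#include <assert.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 42;			/* dirty the page */
	madvise(p, pg, MADV_DONTNEED);	/* discard; nothing goes to swap */
	assert(p[0] == 0);		/* old contents are gone */
	return 0;
}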
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ long error = -EBADF;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ error = madvise_behavior(vma, start, end, behavior);
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ down(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = madvise_vma(vma, start, end,
+ behavior);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = madvise_vma(vma, start, vma->vm_end, behavior);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up(&current->mm->mmap_sem);
+ return error;
+}
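
The hole handling above means the advice is applied to every mapped page in the range, with -ENOMEM reported afterwards if the range crossed an unmapped gap. A quick illustration:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	munmap(p + pg, pg);			/* punch a one-page hole */
	if (madvise(p, 3 * pg, MADV_RANDOM) < 0)
		perror("madvise");		/* expect ENOMEM: range had a gap */
	return 0;
}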
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+ unsigned long pgoff)
+{
+ unsigned char present = 0;
+ struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
+ struct page * page, ** hash = page_hash(as, pgoff);
+
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(as, pgoff, *hash);
+ if ((page) && (Page_Uptodate(page)))
+ present = 1;
+ spin_unlock(&pagecache_lock);
+
+ return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, unsigned char * vec)
+{
+ long error, i, remaining;
+ unsigned char * tmp;
+
+ error = -ENOMEM;
+ if (!vma->vm_file)
+ return error;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ error = -EAGAIN;
+ tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+ if (!tmp)
+ return error;
+
+ /* (end - start) is # of pages, and also # of bytes in "vec" */
+ remaining = (end - start);
+
+ error = 0;
+ for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+ int j = 0;
+ long thispiece = (remaining < PAGE_SIZE) ?
+ remaining : PAGE_SIZE;
+
+ while (j < thispiece)
+ tmp[j++] = mincore_page(vma, start++);
+
+ if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+ error = -EFAULT;
+ break;
+ }
+ }
+
+ free_page((unsigned long) tmp);
+ return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes. The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information. Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - vec points to an illegal address
+ * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ * or len has a nonpositive value
+ * -ENOMEM - Addresses in the range [addr, addr + len] are
+ * invalid for the address space of this process, or
+ * specify one or more pages which are not currently
+ * mapped
+ * -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+ unsigned char * vec)
+{
+ int index = 0;
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ long error = -EINVAL;
+
+ down(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = mincore_vma(vma, start, end,
+ &vec[index]);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+ if (error)
+ goto out;
+ index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up(&current->mm->mmap_sem);
+ return error;
+}
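
Typical userspace use of the call: map a file and count how many of its pages are currently resident. Only bit 0 of each vector byte is meaningful; the file path is hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/bigfile", O_RDONLY);	/* hypothetical */
	struct stat st;
	long pg = sysconf(_SC_PAGESIZE), pages, resident = 0, i;
	unsigned char *vec;
	char *p;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	pages = (st.st_size + pg - 1) / pg;
	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	vec = malloc(pages);
	if (p == MAP_FAILED || !vec)
		return 1;
	if (mincore(p, st.st_size, vec) < 0)
		return 1;
	for (i = 0; i < pages; i++)
		resident += vec[i] & 1;
	printf("%ld of %ld pages resident\n", resident, pages);
	return 0;
}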
+
struct page *read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),