author     Ralf Baechle <ralf@linux-mips.org>   1997-01-07 02:33:00 +0000
committer  <ralf@linux-mips.org>                1997-01-07 02:33:00 +0000
commit     beb116954b9b7f3bb56412b2494b562f02b864b1 (patch)
tree       120e997879884e1b9d93b265221b939d2ef1ade1 /mm
parent     908d4681a1dc3792ecafbe64265783a86c4cccb6 (diff)
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile        27
-rw-r--r--  mm/filemap.c     1218
-rw-r--r--  mm/kmalloc.c      663
-rw-r--r--  mm/memory.c       920
-rw-r--r--  mm/mlock.c        272
-rw-r--r--  mm/mmap.c         360
-rw-r--r--  mm/mprotect.c      22
-rw-r--r--  mm/mremap.c       224
-rw-r--r--  mm/page_alloc.c   339
-rw-r--r--  mm/page_io.c      193
-rw-r--r--  mm/swap.c        1263
-rw-r--r--  mm/swap_state.c   111
-rw-r--r--  mm/swapfile.c     577
-rw-r--r--  mm/vmalloc.c       91
-rw-r--r--  mm/vmscan.c       453
15 files changed, 4351 insertions, 2382 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 35f51d45f..19552c98f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,26 +7,9 @@
#
# Note 2! The CFLAGS definition is now in the main makefile...
-.c.o:
- $(CC) $(CFLAGS) -c $<
-.s.o:
- $(AS) -o $*.o $<
-.c.s:
- $(CC) $(CFLAGS) -S $<
+O_TARGET := mm.o
+O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
+ kmalloc.o vmalloc.o \
+ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o
-OBJS = memory.o swap.o mmap.o filemap.o mprotect.o kmalloc.o vmalloc.o
-
-mm.o: $(OBJS)
- $(LD) -r -o mm.o $(OBJS)
-
-modules:
-
-dep:
- $(CPP) -M *.c > .depend
-
-#
-# include a dependency file if one exists
-#
-ifeq (.depend,$(wildcard .depend))
-include .depend
-endif
+include $(TOPDIR)/Rules.make
diff --git a/mm/filemap.c b/mm/filemap.c
index 5a1e99142..c0ce486df 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,7 +1,7 @@
/*
- * linux/mm/filemmap.c
+ * linux/mm/filemap.c
*
- * Copyright (C) 1994 Linus Torvalds
+ * Copyright (C) 1994, 1995 Linus Torvalds
*/
/*
@@ -18,57 +18,921 @@
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
+#include <linux/fs.h>
+#include <linux/locks.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
-#include <asm/segment.h>
#include <asm/system.h>
#include <asm/pgtable.h>
+#include <asm/uaccess.h>
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
+ *
+ * Shared mappings now work. 15.8.1995 Bruno.
+ */
+
+unsigned long page_cache_size = 0;
+struct page * page_hash_table[PAGE_HASH_SIZE];
+
+/*
+ * Simple routines for both non-shared and shared mappings.
*/
-static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address,
- unsigned long page, int no_share)
+/*
+ * This is a special fast page-free routine that _only_ works
+ * on page-cache pages that we are currently using. We can
+ * just decrement the page count, because we know that the page
+ * has a count > 1 (the page cache itself counts as one, and
+ * our current use of it counts as another one). So we don't need
+ * the full free_page() stuff..
+ */
+static inline void release_page(struct page * page)
{
- struct inode * inode = area->vm_inode;
- unsigned int block;
- int nr[8];
- int i, *p;
-
- address &= PAGE_MASK;
- block = address - area->vm_start + area->vm_offset;
- block >>= inode->i_sb->s_blocksize_bits;
- i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
- p = nr;
+ atomic_dec(&page->count);
+}
+
+/*
+ * Invalidate the pages of an inode, removing all pages that aren't
+ * locked down (those are sure to be up-to-date anyway, so we shouldn't
+ * invalidate them).
+ */
+void invalidate_inode_pages(struct inode * inode)
+{
+ struct page ** p;
+ struct page * page;
+
+ p = &inode->i_pages;
+ while ((page = *p) != NULL) {
+ if (PageLocked(page)) {
+ p = &page->next;
+ continue;
+ }
+ inode->i_nrpages--;
+ if ((*p = page->next) != NULL)
+ (*p)->prev = page->prev;
+ page->dirty = 0;
+ page->next = NULL;
+ page->prev = NULL;
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ __free_page(page);
+ continue;
+ }
+}
+
+/*
+ * Truncate the page cache at a set offset, removing the pages
+ * that are beyond that offset (and zeroing out partial pages).
+ */
+void truncate_inode_pages(struct inode * inode, unsigned long start)
+{
+ struct page ** p;
+ struct page * page;
+
+repeat:
+ p = &inode->i_pages;
+ while ((page = *p) != NULL) {
+ unsigned long offset = page->offset;
+
+ /* page wholly truncated - free it */
+ if (offset >= start) {
+ if (PageLocked(page)) {
+ wait_on_page(page);
+ goto repeat;
+ }
+ inode->i_nrpages--;
+ if ((*p = page->next) != NULL)
+ (*p)->prev = page->prev;
+ page->dirty = 0;
+ page->next = NULL;
+ page->prev = NULL;
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ __free_page(page);
+ continue;
+ }
+ p = &page->next;
+ offset = start - offset;
+ /* partial truncate, clear end of page */
+ if (offset < PAGE_SIZE) {
+ unsigned long address = page_address(page);
+ memset((void *) (offset + address), 0, PAGE_SIZE - offset);
+ flush_page_to_ram(address);
+ }
+ }
+}
+
+int shrink_mmap(int priority, int dma)
+{
+ static unsigned long clock = 0;
+ struct page * page;
+ unsigned long limit = max_mapnr;
+ struct buffer_head *tmp, *bh;
+ int count_max, count_min;
+
+ count_max = (limit<<1) >> (priority>>1);
+ count_min = (limit<<1) >> (priority);
+
+ page = mem_map + clock;
+ do {
+ count_max--;
+ if (page->inode || page->buffers)
+ count_min--;
+
+ if (PageLocked(page))
+ goto next;
+ if (dma && !PageDMA(page))
+ goto next;
+ /* First of all, regenerate the page's referenced bit
+ from any buffers in the page */
+ bh = page->buffers;
+ if (bh) {
+ tmp = bh;
+ do {
+ if (buffer_touched(tmp)) {
+ clear_bit(BH_Touched, &tmp->b_state);
+ set_bit(PG_referenced, &page->flags);
+ }
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
+ }
+
+ /* We can't throw away shared pages, but we do mark
+ them as referenced. This relies on the fact that
+ no page is currently in both the page cache and the
+ buffer cache; we'd have to modify the following
+ test to allow for that case. */
+
+ switch (page->count) {
+ case 1:
+ /* If it has been referenced recently, don't free it */
+ if (clear_bit(PG_referenced, &page->flags))
+ break;
+
+ /* is it a page cache page? */
+ if (page->inode) {
+ remove_page_from_hash_queue(page);
+ remove_page_from_inode_queue(page);
+ __free_page(page);
+ return 1;
+ }
+
+ /* is it a buffer cache page? */
+ if (bh && try_to_free_buffer(bh, &bh, 6))
+ return 1;
+ break;
+
+ default:
+ /* more than one user: we can't throw it away */
+ set_bit(PG_referenced, &page->flags);
+ /* fall through */
+ case 0:
+ /* nothing */
+ }
+next:
+ page++;
+ clock++;
+ if (clock >= limit) {
+ clock = 0;
+ page = mem_map;
+ }
+ } while (count_max > 0 && count_min > 0);
+ return 0;
+}
+
+/*
+ * This is called from try_to_swap_out() when we try to get rid of some
+ * pages.. If we're unmapping the last occurrence of this page, we also
+ * free it from the page hash-queues etc, as we don't want to keep it
+ * in-core unnecessarily.
+ */
+unsigned long page_unuse(unsigned long page)
+{
+ struct page * p = mem_map + MAP_NR(page);
+ int count = p->count;
+
+ if (count != 2)
+ return count;
+ if (!p->inode)
+ return count;
+ remove_page_from_hash_queue(p);
+ remove_page_from_inode_queue(p);
+ free_page(page);
+ return 1;
+}
+
+/*
+ * Update a page cache copy, when we're doing a "write()" system call
+ * See also "update_vm_cache()".
+ */
+void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
+{
+ unsigned long offset, len;
+
+ offset = (pos & ~PAGE_MASK);
+ pos = pos & PAGE_MASK;
+ len = PAGE_SIZE - offset;
do {
- *p = bmap(inode,block);
- i--;
- block++;
- p++;
- } while (i > 0);
- return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share);
+ struct page * page;
+
+ if (len > count)
+ len = count;
+ page = find_page(inode, pos);
+ if (page) {
+ wait_on_page(page);
+ memcpy((void *) (offset + page_address(page)), buf, len);
+ release_page(page);
+ }
+ count -= len;
+ buf += len;
+ len = PAGE_SIZE;
+ offset = 0;
+ pos += PAGE_SIZE;
+ } while (count);
+}
+
+static inline void add_to_page_cache(struct page * page,
+ struct inode * inode, unsigned long offset,
+ struct page **hash)
+{
+ page->count++;
+ page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
+ page->offset = offset;
+ add_page_to_inode_queue(inode, page);
+ __add_page_to_hash_queue(page, hash);
}
/*
- * NOTE! mmap sync doesn't really work yet. This is mainly a stub for it,
- * which only works if the buffers and the page were already sharing the
- * same physical page (that's actually pretty common, especially if the
- * file has been mmap'ed before being read the normal way).
+ * Try to read ahead in the file. "page_cache" is a potentially free page
+ * that we could use for the cache (if it is 0 we can try to create one,
+ * this is all overlapped with the IO on the previous page finishing anyway)
+ */
+static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
+{
+ struct page * page;
+ struct page ** hash;
+
+ offset &= PAGE_MASK;
+ switch (page_cache) {
+ case 0:
+ page_cache = __get_free_page(GFP_KERNEL);
+ if (!page_cache)
+ break;
+ default:
+ if (offset >= inode->i_size)
+ break;
+ hash = page_hash(inode, offset);
+ page = __find_page(inode, offset, *hash);
+ if (!page) {
+ /*
+ * Ok, add the new page to the hash-queues...
+ */
+ page = mem_map + MAP_NR(page_cache);
+ add_to_page_cache(page, inode, offset, hash);
+ inode->i_op->readpage(inode, page);
+ page_cache = 0;
+ }
+ release_page(page);
+ }
+ return page_cache;
+}
+
+/*
+ * Wait for IO to complete on a locked page.
*
- * Todo:
- * - non-shared pages also need to be synced with the buffers.
- * - the "swapout()" function needs to swap out the page to
- * the shared file instead of using the swap device.
+ * This must be called with the caller "holding" the page,
+ * ie with increased "page->count" so that the page won't
+ * go away during the wait..
*/
-static void filemap_sync_page(struct vm_area_struct * vma,
+void __wait_on_page(struct page *page)
+{
+ struct wait_queue wait = { current, NULL };
+
+ add_wait_queue(&page->wait, &wait);
+repeat:
+ run_task_queue(&tq_disk);
+ current->state = TASK_UNINTERRUPTIBLE;
+ if (PageLocked(page)) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(&page->wait, &wait);
+ current->state = TASK_RUNNING;
+}
+
+#if 0
+#define PROFILE_READAHEAD
+#define DEBUG_READAHEAD
+#endif
+
+/*
+ * Read-ahead profiling information
+ * --------------------------------
+ * Every PROFILE_MAXREADCOUNT, the following information is written
+ * to the syslog:
+ * Percentage of asynchronous read-ahead.
+ * Average of read-ahead fields context value.
+ * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
+ * to the syslog.
+ */
+
+#ifdef PROFILE_READAHEAD
+
+#define PROFILE_MAXREADCOUNT 1000
+
+static unsigned long total_reada;
+static unsigned long total_async;
+static unsigned long total_ramax;
+static unsigned long total_ralen;
+static unsigned long total_rawin;
+
+static void profile_readahead(int async, struct file *filp)
+{
+ unsigned long flags;
+
+ ++total_reada;
+ if (async)
+ ++total_async;
+
+ total_ramax += filp->f_ramax;
+ total_ralen += filp->f_ralen;
+ total_rawin += filp->f_rawin;
+
+ if (total_reada > PROFILE_MAXREADCOUNT) {
+ save_flags(flags);
+ cli();
+ if (!(total_reada > PROFILE_MAXREADCOUNT)) {
+ restore_flags(flags);
+ return;
+ }
+
+ printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
+ total_ramax/total_reada,
+ total_ralen/total_reada,
+ total_rawin/total_reada,
+ (total_async*100)/total_reada);
+#ifdef DEBUG_READAHEAD
+ printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
+ filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
+#endif
+
+ total_reada = 0;
+ total_async = 0;
+ total_ramax = 0;
+ total_ralen = 0;
+ total_rawin = 0;
+
+ restore_flags(flags);
+ }
+}
+#endif /* defined PROFILE_READAHEAD */
+
+/*
+ * Read-ahead context:
+ * -------------------
+ * The read ahead context fields of the "struct file" are the following:
+ * - f_raend : position of the first byte after the last page we tried to
+ * read ahead.
+ * - f_ramax : current read-ahead maximum size.
+ * - f_ralen : length of the current IO read block we tried to read-ahead.
+ * - f_rawin : length of the current read-ahead window.
+ * if last read-ahead was synchronous then
+ * f_rawin = f_ralen
+ * otherwise (was asynchronous)
+ * f_rawin = previous value of f_ralen + f_ralen
+ *
+ * Read-ahead limits:
+ * ------------------
+ * MIN_READAHEAD : minimum read-ahead size when reading ahead.
+ * MAX_READAHEAD : maximum read-ahead size when reading ahead.
+ *
+ * Synchronous read-ahead benefits:
+ * --------------------------------
+ * Using a reasonable IO transfer length when reading from peripheral
+ * devices increases system performance.
+ * Reasonable means, in this context, not too large but not too small.
+ * The actual maximum value is:
+ * MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
+ * and 32K if it is defined (4K page size assumed).
+ *
+ * Asynchronous read-ahead benefits:
+ * ---------------------------------
+ * Overlapping the next read request with user process execution increases
+ * system performance.
+ *
+ * Read-ahead risks:
+ * -----------------
+ * We have to guess which further data will be needed by the user process.
+ * If these data are often not really needed, it's bad for system
+ * performance.
+ * However, we know that files are often accessed sequentially by
+ * application programs, and it seems possible to have a reasonably good
+ * guessing strategy.
+ * We only try to read ahead in files that seem to be read sequentially.
+ *
+ * Asynchronous read-ahead risks:
+ * ------------------------------
+ * In order to maximize overlapping, we must start some asynchronous read
+ * request from the device, as soon as possible.
+ * We must be very careful about:
+ * - The number of effective pending IO read requests.
+ * ONE seems to be the only reasonable value.
+ * - The total memory pool usage for the file access stream.
+ * This maximum memory usage is implicitly 2 IO read chunks:
+ * 2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
+ * 64k if defined (4K page size assumed).
+ */
+
+#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
+
+#if 0 /* small readahead */
+#define MAX_READAHEAD PageAlignSize(4096*7)
+#define MIN_READAHEAD PageAlignSize(4096*2)
+#else /* large readahead */
+#define MAX_READAHEAD PageAlignSize(4096*18)
+#define MIN_READAHEAD PageAlignSize(4096*3)
+#endif
+
+static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
+ unsigned long ppos, struct page * page,
+ unsigned long page_cache)
+{
+ unsigned long max_ahead, ahead;
+ unsigned long raend;
+
+ raend = filp->f_raend & PAGE_MASK;
+ max_ahead = 0;
+
+/*
+ * The current page is locked.
+ * If the current position is inside the previous read IO request, do not
+ * try to reread previously read ahead pages.
+ * Otherwise decide whether or not to read ahead some pages synchronously.
+ * If we are not going to read ahead, set the read ahead context for this
+ * page only.
+ */
+ if (PageLocked(page)) {
+ if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
+ raend = ppos;
+ if (raend < inode->i_size)
+ max_ahead = filp->f_ramax;
+ filp->f_rawin = 0;
+ filp->f_ralen = PAGE_SIZE;
+ if (!max_ahead) {
+ filp->f_raend = ppos + filp->f_ralen;
+ filp->f_rawin += filp->f_ralen;
+ }
+ }
+ }
+/*
+ * The current page is not locked.
+ * If we were reading ahead and,
+ * if the current max read ahead size is not zero and,
+ * if the current position is inside the last read-ahead IO request,
+ * it is the moment to try to read ahead asynchronously.
+ * We will later force unplug device in order to force asynchronous read IO.
+ */
+ else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
+ ppos <= raend && ppos + filp->f_ralen >= raend) {
+/*
+ * Add ONE page to max_ahead in order to try to have about the same IO max size
+ * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
+ * Compute the position of the last page we have tried to read in order to
+ * begin to read ahead just at the next page.
+ */
+ raend -= PAGE_SIZE;
+ if (raend < inode->i_size)
+ max_ahead = filp->f_ramax + PAGE_SIZE;
+
+ if (max_ahead) {
+ filp->f_rawin = filp->f_ralen;
+ filp->f_ralen = 0;
+ reada_ok = 2;
+ }
+ }
+/*
+ * Try to read ahead pages.
+ * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
+ * scheduler will do well enough for us to avoid overly bad actual IO requests.
+ */
+ ahead = 0;
+ while (ahead < max_ahead) {
+ ahead += PAGE_SIZE;
+ page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
+ }
+/*
+ * If we tried to read ahead some pages and the read-ahead was asynchronous,
+ * try to force an unplug of the device in order to start an asynchronous
+ * read IO request.
+ * In either case, update the read-ahead context:
+ * store the length of the current read-ahead window and
+ * double the current max read ahead size.
+ * That heuristic avoids doing large IO for files that are not really
+ * accessed sequentially.
+ */
+ if (ahead) {
+ if (reada_ok == 2) {
+ run_task_queue(&tq_disk);
+ }
+
+ filp->f_ralen += ahead;
+ filp->f_rawin += filp->f_ralen;
+ filp->f_raend = raend + ahead + PAGE_SIZE;
+
+ filp->f_ramax += filp->f_ramax;
+
+ if (filp->f_ramax > MAX_READAHEAD)
+ filp->f_ramax = MAX_READAHEAD;
+
+#ifdef PROFILE_READAHEAD
+ profile_readahead((reada_ok == 2), filp);
+#endif
+ }
+
+ return page_cache;
+}
+
+
+/*
+ * This is a generic file read routine, and uses the
+ * inode->i_op->readpage() function for the actual low-level
+ * stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ */
+
+long generic_file_read(struct inode * inode, struct file * filp,
+ char * buf, unsigned long count)
+{
+ int error, read;
+ unsigned long pos, ppos, page_cache;
+ int reada_ok;
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+ if (!count)
+ return 0;
+ error = 0;
+ read = 0;
+ page_cache = 0;
+
+ pos = filp->f_pos;
+ ppos = pos & PAGE_MASK;
+/*
+ * If the current position is outside the previous read-ahead window,
+ * we reset the current read-ahead context and set read ahead max to zero
+ * (it will be set to just the needed value later),
+ * otherwise, we assume that the file accesses are sequential enough to
+ * continue read-ahead.
+ */
+ if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
+ reada_ok = 0;
+ filp->f_raend = 0;
+ filp->f_ralen = 0;
+ filp->f_ramax = 0;
+ filp->f_rawin = 0;
+ } else {
+ reada_ok = 1;
+ }
+/*
+ * Adjust the current value of read-ahead max.
+ * If the read operation stays within the first half page, force no readahead.
+ * Otherwise try to increase read ahead max just enough to do the read request.
+ * Then use at least MIN_READAHEAD if read ahead is ok,
+ * and at most MAX_READAHEAD in all cases.
+ */
+ if (pos + count <= (PAGE_SIZE >> 1)) {
+ filp->f_ramax = 0;
+ } else {
+ unsigned long needed;
+
+ needed = ((pos + count) & PAGE_MASK) - ppos;
+
+ if (filp->f_ramax < needed)
+ filp->f_ramax = needed;
+
+ if (reada_ok && filp->f_ramax < MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
+ if (filp->f_ramax > MAX_READAHEAD)
+ filp->f_ramax = MAX_READAHEAD;
+ }
+
+ for (;;) {
+ struct page *page, **hash;
+
+ if (pos >= inode->i_size)
+ break;
+
+ /*
+ * Try to find the data in the page cache..
+ */
+ hash = page_hash(inode, pos & PAGE_MASK);
+ page = __find_page(inode, pos & PAGE_MASK, *hash);
+ if (!page)
+ goto no_cached_page;
+
+found_page:
+/*
+ * Try to read ahead only if the current page is filled or being filled.
+ * Otherwise, if we were reading ahead, decrease max read ahead size to
+ * the minimum value.
+ * In this context, that seems to happen only on some read error or if
+ * the page has been rewritten.
+ */
+ if (PageUptodate(page) || PageLocked(page))
+ page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
+ else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
+
+ wait_on_page(page);
+
+ if (!PageUptodate(page))
+ goto page_read_error;
+
+success:
+ /*
+ * Ok, we have the page, it's up-to-date and ok,
+ * so now we can finally copy it to user space...
+ */
+ {
+ unsigned long offset, nr;
+
+ offset = pos & ~PAGE_MASK;
+ nr = PAGE_SIZE - offset;
+ if (nr > count)
+ nr = count;
+ if (nr > inode->i_size - pos)
+ nr = inode->i_size - pos;
+ nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
+ release_page(page);
+ error = -EFAULT;
+ if (!nr)
+ break;
+ buf += nr;
+ pos += nr;
+ read += nr;
+ count -= nr;
+ if (count)
+ continue;
+ break;
+ }
+
+no_cached_page:
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ if (!page_cache) {
+ page_cache = __get_free_page(GFP_KERNEL);
+ /*
+ * That could have slept, so go around to the
+ * very beginning..
+ */
+ if (page_cache)
+ continue;
+ error = -ENOMEM;
+ break;
+ }
+
+ /*
+ * Ok, add the new page to the hash-queues...
+ */
+ page = mem_map + MAP_NR(page_cache);
+ page_cache = 0;
+ add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
+
+ /*
+ * Error handling is tricky. If we get a read error,
+ * the cached page stays in the cache (but uptodate=0),
+ * and the next process that accesses it will try to
+ * re-read it. This is needed for NFS etc, where the
+ * identity of the reader can decide if we can read the
+ * page or not..
+ */
+/*
+ * We have to read the page.
+ * If we were reading ahead, we had previously tried to read this page.
+ * That means that the page has probably been removed from the cache before
+ * the application process needs it, or has been rewritten.
+ * Decrease max readahead size to the minimum value in that situation.
+ */
+ if (reada_ok && filp->f_ramax > MIN_READAHEAD)
+ filp->f_ramax = MIN_READAHEAD;
+
+ error = inode->i_op->readpage(inode, page);
+ if (!error)
+ goto found_page;
+ release_page(page);
+ break;
+
+page_read_error:
+ /*
+ * We found the page, but it wasn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because this happens only if there were errors.
+ */
+ error = inode->i_op->readpage(inode, page);
+ if (!error) {
+ wait_on_page(page);
+ if (PageUptodate(page) && !PageError(page))
+ goto success;
+ error = -EIO; /* Some unspecified error occurred.. */
+ }
+ release_page(page);
+ break;
+ }
+
+ filp->f_pos = pos;
+ filp->f_reada = 1;
+ if (page_cache)
+ free_page(page_cache);
+ if (!IS_RDONLY(inode)) {
+ inode->i_atime = CURRENT_TIME;
+ inode->i_dirt = 1;
+ }
+ if (!read)
+ read = error;
+ return read;
+}
+
+/*
+ * Semantics for shared and private memory areas are different past the end
+ * of the file. A shared mapping past the last page of the file is an error
+ * and results in a SIGBUS, while a private mapping just maps in a zero page.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ */
+static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
+{
+/* XXX: Check the flushes in this code. At least sometimes we do
+ duplicate flushes. ... */
+ unsigned long offset;
+ struct page * page, **hash;
+ struct inode * inode = area->vm_inode;
+ unsigned long old_page, new_page;
+
+ new_page = 0;
+ offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
+ if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
+ goto no_page;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+ hash = page_hash(inode, offset);
+ page = __find_page(inode, offset, *hash);
+ if (!page)
+ goto no_cached_page;
+
+found_page:
+ /*
+ * Ok, found a page in the page cache, now we need to check
+ * that it's up-to-date
+ */
+ if (PageLocked(page))
+ goto page_locked_wait;
+ if (!PageUptodate(page))
+ goto page_read_error;
+
+success:
+ /*
+ * Found the page, need to check sharing and possibly
+ * copy it over to another page..
+ */
+ old_page = page_address(page);
+ if (!no_share) {
+ /*
+ * Ok, we can share the cached page directly.. Get rid
+ * of any potential extra pages.
+ */
+ if (new_page)
+ free_page(new_page);
+
+ flush_page_to_ram(old_page);
+ return old_page;
+ }
+
+ /*
+ * Check that we have another page to copy it over to..
+ */
+ if (!new_page) {
+ new_page = __get_free_page(GFP_KERNEL);
+ if (!new_page)
+ goto failure;
+ }
+ copy_page(new_page, old_page);
+ flush_page_to_ram(new_page);
+ release_page(page);
+ return new_page;
+
+no_cached_page:
+ new_page = __get_free_page(GFP_KERNEL);
+ if (!new_page)
+ goto no_page;
+
+ /*
+ * During getting the above page we might have slept,
+ * so we need to re-check the situation with the page
+ * cache.. The page we just got may be useful if we
+ * can't share, so don't get rid of it here.
+ */
+ page = find_page(inode, offset);
+ if (page)
+ goto found_page;
+
+ /*
+ * Now, create a new page-cache page from the page we got
+ */
+ page = mem_map + MAP_NR(new_page);
+ new_page = 0;
+ add_to_page_cache(page, inode, offset, hash);
+
+ if (inode->i_op->readpage(inode, page) != 0)
+ goto failure;
+
+ /*
+ * Do a very limited read-ahead if appropriate
+ */
+ if (PageLocked(page))
+ new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
+ goto found_page;
+
+page_locked_wait:
+ __wait_on_page(page);
+ if (PageUptodate(page))
+ goto success;
+
+page_read_error:
+ /*
+ * Umm, take care of errors if the page isn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because there really aren't any performance issues here
+ * and we need to check for errors.
+ */
+ if (inode->i_op->readpage(inode, page) != 0)
+ goto failure;
+ wait_on_page(page);
+ if (PageError(page))
+ goto failure;
+ if (PageUptodate(page))
+ goto success;
+
+ /*
+ * Uhhuh.. Things didn't work out. Return zero to tell the
+ * mm layer so, possibly freeing the page cache page first.
+ */
+failure:
+ release_page(page);
+no_page:
+ return 0;
+}
+
+/*
+ * Tries to write a shared mapped page to its backing store. May return -EIO
+ * if the disk is full.
+ */
+static inline int do_write_page(struct inode * inode, struct file * file,
+ const char * page, unsigned long offset)
+{
+ int old_fs, retval;
+ unsigned long size;
+
+ size = offset + PAGE_SIZE;
+ /* refuse to extend file size.. */
+ if (S_ISREG(inode->i_mode)) {
+ if (size > inode->i_size)
+ size = inode->i_size;
+ /* Ho humm.. We should have tested for this earlier */
+ if (size < offset)
+ return -EIO;
+ }
+ size -= offset;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ retval = -EIO;
+ if (size == file->f_op->write(inode, file, (const char *) page, size))
+ retval = 0;
+ set_fs(old_fs);
+ return retval;
+}
+
+static int filemap_write_page(struct vm_area_struct * vma,
unsigned long offset,
unsigned long page)
{
+ int result;
+ struct file file;
+ struct inode * inode;
struct buffer_head * bh;
- printk("msync: %ld: [%08lx]\n", offset, page);
- bh = buffer_pages[MAP_NR(page)];
+ bh = mem_map[MAP_NR(page)].buffers;
if (bh) {
/* whee.. just mark the buffer heads dirty */
struct buffer_head * tmp = bh;
@@ -76,45 +940,125 @@ static void filemap_sync_page(struct vm_area_struct * vma,
mark_buffer_dirty(tmp, 0);
tmp = tmp->b_this_page;
} while (tmp != bh);
- return;
+ return 0;
}
- /* we'll need to go fetch the buffer heads etc.. RSN */
- printk("Can't handle non-shared page yet\n");
- return;
+
+ inode = vma->vm_inode;
+ file.f_op = inode->i_op->default_file_ops;
+ if (!file.f_op->write)
+ return -EIO;
+ file.f_mode = 3;
+ file.f_flags = 0;
+ file.f_count = 1;
+ file.f_inode = inode;
+ file.f_pos = offset;
+ file.f_reada = 0;
+
+ down(&inode->i_sem);
+ result = do_write_page(inode, &file, (const char *) page, offset);
+ up(&inode->i_sem);
+ return result;
+}
+
+
+/*
+ * Swapping to a shared file: while we're busy writing out the page
+ * (and the page still exists in memory), we save the page information
+ * in the page table, so that "filemap_swapin()" can re-use the page
+ * immediately if it is called while we're busy swapping it out..
+ *
+ * Once we've written it all out, we mark the page entry "empty", which
+ * will result in a normal page-in (instead of a swap-in) from the now
+ * up-to-date disk file.
+ */
+int filemap_swapout(struct vm_area_struct * vma,
+ unsigned long offset,
+ pte_t *page_table)
+{
+ int error;
+ unsigned long page = pte_page(*page_table);
+ unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
+
+ flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
+ set_pte(page_table, __pte(entry));
+ flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
+ error = filemap_write_page(vma, offset, page);
+ if (pte_val(*page_table) == entry)
+ pte_clear(page_table);
+ return error;
}
-static inline void filemap_sync_pte(pte_t * pte, struct vm_area_struct *vma,
+/*
+ * filemap_swapin() is called only if we have something in the page
+ * tables that is non-zero (but not present), which we know to be the
+ * page index of a page that is busy being swapped out (see above).
+ * So we just use it directly..
+ */
+static pte_t filemap_swapin(struct vm_area_struct * vma,
+ unsigned long offset,
+ unsigned long entry)
+{
+ unsigned long page = SWP_OFFSET(entry);
+
+ mem_map[page].count++;
+ page = (page << PAGE_SHIFT) + PAGE_OFFSET;
+ return mk_pte(page,vma->vm_page_prot);
+}
+
+
+static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t page = *pte;
-
- if (!pte_present(page))
- return;
- if (!pte_dirty(page))
- return;
- if (flags & MS_INVALIDATE) {
- pte_clear(pte);
+ pte_t pte = *ptep;
+ unsigned long page;
+ int error;
+
+ if (!(flags & MS_INVALIDATE)) {
+ if (!pte_present(pte))
+ return 0;
+ if (!pte_dirty(pte))
+ return 0;
+ flush_page_to_ram(pte_page(pte));
+ flush_cache_page(vma, address);
+ set_pte(ptep, pte_mkclean(pte));
+ flush_tlb_page(vma, address);
+ page = pte_page(pte);
+ mem_map[MAP_NR(page)].count++;
} else {
- mem_map[MAP_NR(pte_page(page))]++;
- *pte = pte_mkclean(page);
+ if (pte_none(pte))
+ return 0;
+ flush_cache_page(vma, address);
+ pte_clear(ptep);
+ flush_tlb_page(vma, address);
+ if (!pte_present(pte)) {
+ swap_free(pte_val(pte));
+ return 0;
+ }
+ page = pte_page(pte);
+ if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
+ free_page(page);
+ return 0;
+ }
}
- filemap_sync_page(vma, address - vma->vm_start, pte_page(page));
- free_page(pte_page(page));
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
+ free_page(page);
+ return error;
}
-static inline void filemap_sync_pte_range(pmd_t * pmd,
+static inline int filemap_sync_pte_range(pmd_t * pmd,
unsigned long address, unsigned long size,
struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
pte_t * pte;
unsigned long end;
+ int error;
if (pmd_none(*pmd))
- return;
+ return 0;
if (pmd_bad(*pmd)) {
printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
pmd_clear(pmd);
- return;
+ return 0;
}
pte = pte_offset(pmd, address);
offset += address & PMD_MASK;
@@ -122,58 +1066,65 @@ static inline void filemap_sync_pte_range(pmd_t * pmd,
end = address + size;
if (end > PMD_SIZE)
end = PMD_SIZE;
+ error = 0;
do {
- filemap_sync_pte(pte, vma, address + offset, flags);
+ error |= filemap_sync_pte(pte, vma, address + offset, flags);
address += PAGE_SIZE;
pte++;
} while (address < end);
+ return error;
}
-static inline void filemap_sync_pmd_range(pgd_t * pgd,
+static inline int filemap_sync_pmd_range(pgd_t * pgd,
unsigned long address, unsigned long size,
struct vm_area_struct *vma, unsigned int flags)
{
pmd_t * pmd;
unsigned long offset, end;
+ int error;
if (pgd_none(*pgd))
- return;
+ return 0;
if (pgd_bad(*pgd)) {
printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
pgd_clear(pgd);
- return;
+ return 0;
}
pmd = pmd_offset(pgd, address);
- offset = address & PMD_MASK;
- address &= ~PMD_MASK;
+ offset = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
+ error = 0;
do {
- filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
+ error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
+ return error;
}
-static void filemap_sync(struct vm_area_struct * vma, unsigned long address,
+static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
size_t size, unsigned int flags)
{
pgd_t * dir;
unsigned long end = address + size;
+ int error = 0;
- dir = pgd_offset(current, address);
+ dir = pgd_offset(vma->vm_mm, address);
+ flush_cache_range(vma->vm_mm, end - size, end);
while (address < end) {
- filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+ error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
- return;
+ flush_tlb_range(vma->vm_mm, end - size, end);
+ return error;
}
/*
- * This handles area unmaps..
+ * This handles (potentially partial) area unmaps..
*/
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
@@ -181,50 +1132,27 @@ static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_
}
/*
- * This handles complete area closes..
- */
-static void filemap_close(struct vm_area_struct * vma)
-{
- filemap_sync(vma, vma->vm_start, vma->vm_end - vma->vm_start, MS_ASYNC);
-}
-
-/*
- * This isn't implemented yet: you'll get a warning and incorrect behaviour.
- *
- * Note that the page is free'd by the higher-level after return,
- * so we have to either write it out or just forget it. We currently
- * forget it..
- */
-void filemap_swapout(struct vm_area_struct * vma,
- unsigned long offset,
- pte_t *page_table)
-{
- printk("swapout not implemented on shared files..\n");
- pte_clear(page_table);
-}
-
-/*
* Shared mappings need to be able to do the right thing at
* close/unmap/sync. They will also use the private file as
* backing-store for swapping..
*/
static struct vm_operations_struct file_shared_mmap = {
- NULL, /* open */
- filemap_close, /* close */
- filemap_unmap, /* unmap */
- NULL, /* protect */
+ NULL, /* no special open */
+ NULL, /* no special close */
+ filemap_unmap, /* unmap - we need to sync the pages */
+ NULL, /* no special protect */
filemap_sync, /* sync */
NULL, /* advise */
filemap_nopage, /* nopage */
NULL, /* wppage */
filemap_swapout, /* swapout */
- NULL, /* swapin */
+ filemap_swapin, /* swapin */
};
/*
- * Private mappings just need to be able to load in the map
+ * Private mappings just need to be able to load in the map.
*
- * (this is actually used for shared mappings as well, if we
+ * (This is actually used for shared mappings as well, if we
* know they can't ever get write permissions..)
*/
static struct vm_operations_struct file_private_mmap = {
@@ -241,28 +1169,25 @@ static struct vm_operations_struct file_private_mmap = {
};
/* This is used for a general mmap of a disk file */
-int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
{
struct vm_operations_struct * ops;
- if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
- return -EINVAL;
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+ ops = &file_shared_mmap;
+ /* share_page() can only guarantee proper page sharing if
+ * the offsets are all page aligned. */
+ if (vma->vm_offset & (PAGE_SIZE - 1))
+ return -EINVAL;
+ } else {
+ ops = &file_private_mmap;
+ if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
+ return -EINVAL;
+ }
if (!inode->i_sb || !S_ISREG(inode->i_mode))
return -EACCES;
- if (!inode->i_op || !inode->i_op->bmap)
+ if (!inode->i_op || !inode->i_op->readpage)
return -ENOEXEC;
- ops = &file_private_mmap;
- if (vma->vm_flags & VM_SHARED) {
- if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) {
- static int nr = 0;
- ops = &file_shared_mmap;
-#ifndef SHARED_MMAP_REALLY_WORKS /* it doesn't, yet */
- if (nr++ < 5)
- printk("%s tried to do a shared writeable mapping\n", current->comm);
- return -EINVAL;
-#endif
- }
- }
if (!IS_RDONLY(inode)) {
inode->i_atime = CURRENT_TIME;
inode->i_dirt = 1;
@@ -272,3 +1197,74 @@ int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct
vma->vm_ops = ops;
return 0;
}
+
+
+/*
+ * The msync() system call.
+ */
+
+static int msync_interval(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int flags)
+{
+ if (!vma->vm_inode)
+ return 0;
+ if (vma->vm_ops->sync) {
+ int error;
+ error = vma->vm_ops->sync(vma, start, end-start, flags);
+ if (error)
+ return error;
+ if (flags & MS_SYNC)
+ return file_fsync(vma->vm_inode, NULL);
+ return 0;
+ }
+ return 0;
+}
+
+asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error, error;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+ if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ /*
+ * If the interval [start,end) covers some unmapped address ranges,
+ * just ignore them, but return -EFAULT at the end.
+ */
+ vma = find_vma(current->mm, start);
+ unmapped_error = 0;
+ for (;;) {
+ /* Still start < end. */
+ if (!vma)
+ return -EFAULT;
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -EFAULT;
+ start = vma->vm_start;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = msync_interval(vma, start, end, flags);
+ if (error)
+ return error;
+ }
+ return unmapped_error;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = msync_interval(vma, start, vma->vm_end, flags);
+ if (error)
+ return error;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+}
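The read-ahead bookkeeping added to mm/filemap.c above (the f_raend/f_ralen/f_rawin/f_ramax fields, the window reset test in generic_file_read() and the doubling-with-cap in generic_file_readahead()) can be easier to follow as a stand-alone model. The sketch below is a user-space illustration only and not part of the patch: the struct, the helper names and the simulation loop are invented for this example, while the reset test and the accounting rules follow the code above (large-readahead values, 4K pages assumed).

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define MAX_READAHEAD  (4096UL * 18)    /* "large readahead" value above */
#define MIN_READAHEAD  (4096UL * 3)

/* Stand-in for the f_raend/f_ralen/f_rawin/f_ramax fields of struct file. */
struct ra_state {
	unsigned long raend, ralen, rawin, ramax;
};

/*
 * Window test from generic_file_read(): if the current position is outside
 * the previous read-ahead window, reset the context and report "not
 * sequential"; otherwise keep reading ahead.
 */
static int ra_ok(struct ra_state *ra, unsigned long ppos)
{
	if (ppos > ra->raend || ppos + ra->rawin < ra->raend) {
		ra->raend = ra->ralen = ra->ramax = ra->rawin = 0;
		return 0;
	}
	return 1;
}

/*
 * Accounting from generic_file_readahead() once 'ahead' bytes have been
 * requested, with the previous window ending at 'raend'.
 */
static void ra_account(struct ra_state *ra, unsigned long raend,
		       unsigned long ahead)
{
	ra->ralen += ahead;
	ra->rawin += ra->ralen;
	ra->raend  = raend + ahead + PAGE_SIZE;
	ra->ramax += ra->ramax;             /* double the maximum ...     */
	if (ra->ramax > MAX_READAHEAD)      /* ... but never past the cap */
		ra->ramax = MAX_READAHEAD;
}

int main(void)
{
	struct ra_state ra = { 0, 0, 0, 0 };
	unsigned long pos;

	/* Simulate a few strictly sequential page-sized reads. */
	for (pos = 0; pos < 8 * PAGE_SIZE; pos += PAGE_SIZE) {
		if (!ra_ok(&ra, pos) || ra.ramax < MIN_READAHEAD)
			ra.ramax = MIN_READAHEAD;   /* start from the minimum */
		ra_account(&ra, pos, ra.ramax);
		printf("pos=%lu raend=%lu ralen=%lu rawin=%lu ramax=%lu\n",
		       pos, ra.raend, ra.ralen, ra.rawin, ra.ramax);
	}
	return 0;
}

Running it shows ramax doubling per sequential read until it saturates at MAX_READAHEAD, which is the heuristic the comment block above describes.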
diff --git a/mm/kmalloc.c b/mm/kmalloc.c
index e288ecf2f..d0193f02b 100644
--- a/mm/kmalloc.c
+++ b/mm/kmalloc.c
@@ -10,44 +10,42 @@
/*
* Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple
* pages. So for 'page' throughout, read 'area'.
+ *
+ * Largely rewritten.. Linus
*/
#include <linux/mm.h>
-#include <asm/system.h>
#include <linux/delay.h>
+#include <linux/interrupt.h>
-#define GFP_LEVEL_MASK 0xf
-
-/* I want this low enough for a while to catch errors.
- I want this number to be increased in the near future:
- loadable device drivers should use this function to get memory */
-
-#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10)
-
-
-/* This defines how many times we should try to allocate a free page before
- giving up. Normally this shouldn't happen at all. */
-#define MAX_GET_FREE_PAGE_TRIES 4
+#include <asm/system.h>
+#include <asm/dma.h>
+#ifdef __mips__
+#include <asm/sgidefs.h>
+#endif
+/* Define this if you want slow routines that try to trip errors */
+#undef SADISTIC_KMALLOC
/* Private flags. */
#define MF_USED 0xffaa0055
+#define MF_DMA 0xff00aa55
#define MF_FREE 0x0055ffaa
-/*
+/*
* Much care has gone into making these routines in this file reentrant.
*
* The fancy bookkeeping of nbytesmalloced and the like are only used to
- * report them to the user (oooohhhhh, aaaaahhhhh....) are not
+ * report them to the user (oooohhhhh, aaaaahhhhh....) are not
* protected by cli(). (If that goes wrong. So what?)
*
* These routines restore the interrupt status to allow calling with ints
- * off.
+ * off.
*/
-/*
+/*
* A block header. This is in front of every malloc-block, whether free or not.
*/
struct block_header {
@@ -64,8 +62,8 @@ struct block_header {
#define BH(p) ((struct block_header *)(p))
-/*
- * The page descriptor is at the front of every page that malloc has in use.
+/*
+ * The page descriptor is at the front of every page that malloc has in use.
*/
struct page_descriptor {
struct page_descriptor *next;
@@ -84,324 +82,415 @@ struct page_descriptor {
*/
struct size_descriptor {
struct page_descriptor *firstfree;
- struct page_descriptor *dmafree; /* DMA-able memory */
- int size;
+ struct page_descriptor *dmafree; /* DMA-able memory */
int nblocks;
int nmallocs;
int nfrees;
int nbytesmalloced;
int npages;
- unsigned long gfporder; /* number of pages in the area required */
+ unsigned long gfporder; /* number of pages in the area required */
};
/*
- * For now it is unsafe to allocate bucket sizes between n & n=16 where n is
- * 4096 * any power of two
+ * For now it is unsafe to allocate bucket sizes between n and
+ * n-sizeof(page_descriptor) where n is PAGE_SIZE * any power of two
*/
-#if PAGE_SIZE == 4096
-struct size_descriptor sizes[] = {
- { NULL, NULL, 32,127, 0,0,0,0, 0},
- { NULL, NULL, 64, 63, 0,0,0,0, 0 },
- { NULL, NULL, 128, 31, 0,0,0,0, 0 },
- { NULL, NULL, 252, 16, 0,0,0,0, 0 },
- { NULL, NULL, 508, 8, 0,0,0,0, 0 },
- { NULL, NULL,1020, 4, 0,0,0,0, 0 },
- { NULL, NULL,2040, 2, 0,0,0,0, 0 },
- { NULL, NULL,4096-16, 1, 0,0,0,0, 0 },
- { NULL, NULL,8192-16, 1, 0,0,0,0, 1 },
- { NULL, NULL,16384-16, 1, 0,0,0,0, 2 },
- { NULL, NULL,32768-16, 1, 0,0,0,0, 3 },
- { NULL, NULL,65536-16, 1, 0,0,0,0, 4 },
- { NULL, NULL,131072-16, 1, 0,0,0,0, 5 },
- { NULL, NULL, 0, 0, 0,0,0,0, 0 }
+#if PAGE_SIZE == 4096 && defined (__mips__) && \
+ ((_MIPS_ISA == _MIPS_ISA_MIPS2) || \
+ (_MIPS_ISA == _MIPS_ISA_MIPS3) || \
+ (_MIPS_ISA == _MIPS_ISA_MIPS4))
+static const unsigned int blocksize[] = {
+ /*
+ * For MIPS II we need this hacked descriptor table to get
+ * doubleword alignment. Otherwise the scheduler and other code
+ * that use doublewords will bomb.
+ */
+ 32,
+ 64,
+ 128,
+ 248,
+ 504,
+ 1016,
+ 2040,
+ 4096 - 16,
+ 8192 - 16,
+ 16384 - 16,
+ 32768 - 16,
+ 65536 - 16,
+ 131072 - 16,
+ 0
+};
+
+static struct size_descriptor sizes[] =
+{
+ {NULL, NULL, 127, 0, 0, 0, 0, 0},
+ {NULL, NULL, 63, 0, 0, 0, 0, 0},
+ {NULL, NULL, 31, 0, 0, 0, 0, 0},
+ {NULL, NULL, 16, 0, 0, 0, 0, 0},
+ {NULL, NULL, 8, 0, 0, 0, 0, 0},
+ {NULL, NULL, 4, 0, 0, 0, 0, 0},
+ {NULL, NULL, 2, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 1},
+ {NULL, NULL, 1, 0, 0, 0, 0, 2},
+ {NULL, NULL, 1, 0, 0, 0, 0, 3},
+ {NULL, NULL, 1, 0, 0, 0, 0, 4},
+ {NULL, NULL, 1, 0, 0, 0, 0, 5},
+ {NULL, NULL, 0, 0, 0, 0, 0, 0}
+};
+#elif PAGE_SIZE == 4096
+static const unsigned int blocksize[] = {
+ 32,
+ 64,
+ 128,
+ 252,
+ 508,
+ 1020,
+ 2040,
+ 4096 - 16,
+ 8192 - 16,
+ 16384 - 16,
+ 32768 - 16,
+ 65536 - 16,
+ 131072 - 16,
+ 0
+};
+
+static struct size_descriptor sizes[] =
+{
+ {NULL, NULL, 127, 0, 0, 0, 0, 0},
+ {NULL, NULL, 63, 0, 0, 0, 0, 0},
+ {NULL, NULL, 31, 0, 0, 0, 0, 0},
+ {NULL, NULL, 16, 0, 0, 0, 0, 0},
+ {NULL, NULL, 8, 0, 0, 0, 0, 0},
+ {NULL, NULL, 4, 0, 0, 0, 0, 0},
+ {NULL, NULL, 2, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 1},
+ {NULL, NULL, 1, 0, 0, 0, 0, 2},
+ {NULL, NULL, 1, 0, 0, 0, 0, 3},
+ {NULL, NULL, 1, 0, 0, 0, 0, 4},
+ {NULL, NULL, 1, 0, 0, 0, 0, 5},
+ {NULL, NULL, 0, 0, 0, 0, 0, 0}
};
#elif PAGE_SIZE == 8192
-struct size_descriptor sizes[] = {
- { NULL, NULL, 64,127, 0,0,0,0, 0},
- { NULL, NULL, 128, 63, 0,0,0,0, 0 },
- { NULL, NULL, 248, 31, 0,0,0,0, 0 },
- { NULL, NULL, 504, 16, 0,0,0,0, 0 },
- { NULL, NULL,1016, 8, 0,0,0,0, 0 },
- { NULL, NULL,2040, 4, 0,0,0,0, 0 },
- { NULL, NULL,4080, 2, 0,0,0,0, 0 },
- { NULL, NULL,8192-32, 1, 0,0,0,0, 0 },
- { NULL, NULL,16384-32, 1, 0,0,0,0, 1 },
- { NULL, NULL,32768-32, 1, 0,0,0,0, 2 },
- { NULL, NULL,65536-32, 1, 0,0,0,0, 3 },
- { NULL, NULL,131072-32, 1, 0,0,0,0, 4 },
- { NULL, NULL,262144-32, 1, 0,0,0,0, 5 },
- { NULL, NULL, 0, 0, 0,0,0,0, 0 }
+static const unsigned int blocksize[] = {
+ 64,
+ 128,
+ 248,
+ 504,
+ 1016,
+ 2040,
+ 4080,
+ 8192 - 32,
+ 16384 - 32,
+ 32768 - 32,
+ 65536 - 32,
+ 131072 - 32,
+ 262144 - 32,
+ 0
+};
+
+struct size_descriptor sizes[] =
+{
+ {NULL, NULL, 127, 0, 0, 0, 0, 0},
+ {NULL, NULL, 63, 0, 0, 0, 0, 0},
+ {NULL, NULL, 31, 0, 0, 0, 0, 0},
+ {NULL, NULL, 16, 0, 0, 0, 0, 0},
+ {NULL, NULL, 8, 0, 0, 0, 0, 0},
+ {NULL, NULL, 4, 0, 0, 0, 0, 0},
+ {NULL, NULL, 2, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 0},
+ {NULL, NULL, 1, 0, 0, 0, 0, 1},
+ {NULL, NULL, 1, 0, 0, 0, 0, 2},
+ {NULL, NULL, 1, 0, 0, 0, 0, 3},
+ {NULL, NULL, 1, 0, 0, 0, 0, 4},
+ {NULL, NULL, 1, 0, 0, 0, 0, 5},
+ {NULL, NULL, 0, 0, 0, 0, 0, 0}
};
#else
#error you need to make a version for your pagesize
#endif
#define NBLOCKS(order) (sizes[order].nblocks)
-#define BLOCKSIZE(order) (sizes[order].size)
+#define BLOCKSIZE(order) (blocksize[order])
#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder))
+
-
-long kmalloc_init (long start_mem,long end_mem)
+long kmalloc_init(long start_mem, long end_mem)
{
int order;
-/*
+/*
* Check the static info array. Things will blow up terribly if it's
* incorrect. This is a late "compile time" check.....
*/
-for (order = 0;BLOCKSIZE(order);order++)
- {
- if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) >
- AREASIZE(order))
- {
- printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n",
- (int) (NBLOCKS (order) * BLOCKSIZE(order) +
- sizeof (struct page_descriptor)),
- (int) AREASIZE(order),
- BLOCKSIZE (order));
- panic ("This only happens if someone messes with kmalloc");
- }
- }
-return start_mem;
+ for (order = 0; BLOCKSIZE(order); order++) {
+ if ((NBLOCKS(order) * BLOCKSIZE(order) + sizeof(struct page_descriptor)) >
+ AREASIZE(order)) {
+ printk("Cannot use %d bytes out of %d in order = %d block mallocs\n",
+ (int) (NBLOCKS(order) * BLOCKSIZE(order) +
+ sizeof(struct page_descriptor)),
+ (int) AREASIZE(order),
+ BLOCKSIZE(order));
+ panic("This only happens if someone messes with kmalloc");
+ }
+ }
+ return start_mem;
}
+/*
+ * Create a small cache of page allocations: this helps a bit with
+ * those pesky 8kB+ allocations for NFS when we're temporarily
+ * out of memory..
+ *
+ * This is a _truly_ small cache, we just cache one single page
+ * order (for orders 0, 1 and 2, that is 4, 8 and 16kB on x86).
+ */
+#define MAX_CACHE_ORDER 3
+struct page_descriptor * kmalloc_cache[MAX_CACHE_ORDER];
-
-int get_order (int size)
+static inline struct page_descriptor * get_kmalloc_pages(unsigned long priority,
+ unsigned long order, int dma)
{
- int order;
+ return (struct page_descriptor *) __get_free_pages(priority, order, dma);
+}
- /* Add the size of the header */
- size += sizeof (struct block_header);
- for (order = 0;BLOCKSIZE(order);order++)
- if (size <= BLOCKSIZE (order))
- return order;
- return -1;
+static inline void free_kmalloc_pages(struct page_descriptor * page,
+ unsigned long order, int dma)
+{
+ if (!dma && order < MAX_CACHE_ORDER) {
+ page = xchg(kmalloc_cache+order, page);
+ if (!page)
+ return;
+ }
+ free_pages((unsigned long) page, order);
}
-void * kmalloc (size_t size, int priority)
+/*
+ * Ugh, this is ugly, but we want the default case to run
+ * straight through, which is why we have the ugly goto's
+ */
+void *kmalloc(size_t size, int priority)
{
unsigned long flags;
- int order,tries,i,sz;
- int dma_flag;
+ unsigned long type;
+ int order, dma;
struct block_header *p;
- struct page_descriptor *page;
+ struct page_descriptor *page, **pg;
+ struct size_descriptor *bucket = sizes;
+
+ /* Get order */
+ order = 0;
+ {
+ unsigned int realsize = size + sizeof(struct block_header);
+ for (;;) {
+ int ordersize = BLOCKSIZE(order);
+ if (realsize <= ordersize)
+ break;
+ order++;
+ bucket++;
+ if (ordersize)
+ continue;
+ printk("kmalloc of too large a block (%d bytes).\n", (int) size);
+ return NULL;
+ }
+ }
+
+ dma = 0;
+ type = MF_USED;
+ pg = &bucket->firstfree;
+ if (priority & GFP_DMA) {
+ dma = 1;
+ type = MF_DMA;
+ pg = &bucket->dmafree;
+ }
- dma_flag = (priority & GFP_DMA);
priority &= GFP_LEVEL_MASK;
-
+
/* Sanity check... */
if (intr_count && priority != GFP_ATOMIC) {
static int count = 0;
if (++count < 5) {
printk("kmalloc called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
+ return_address());
priority = GFP_ATOMIC;
}
}
-order = get_order (size);
-if (order < 0)
- {
- printk ("kmalloc of too large a block (%d bytes).\n",(int) size);
- return (NULL);
- }
-
-save_flags(flags);
-
-/* It seems VERY unlikely to me that it would be possible that this
- loop will get executed more than once. */
-tries = MAX_GET_FREE_PAGE_TRIES;
-while (tries --)
- {
- /* Try to allocate a "recently" freed memory block */
- cli ();
- if ((page = (dma_flag ? sizes[order].dmafree : sizes[order].firstfree)) &&
- (p = page->firstfree))
- {
- if (p->bh_flags == MF_FREE)
- {
- page->firstfree = p->bh_next;
- page->nfree--;
- if (!page->nfree)
- {
- if(dma_flag)
- sizes[order].dmafree = page->next;
- else
- sizes[order].firstfree = page->next;
- page->next = NULL;
- }
- restore_flags(flags);
-
- sizes [order].nmallocs++;
- sizes [order].nbytesmalloced += size;
- p->bh_flags = MF_USED; /* As of now this block is officially in use */
- p->bh_length = size;
- return p+1; /* Pointer arithmetic: increments past header */
- }
- printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p);
- return (NULL);
- }
- restore_flags(flags);
-
-
- /* Now we're in trouble: We need to get a new free page..... */
-
- sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */
-
- /* This can be done with ints on: This is private to this invocation */
- if (dma_flag)
- page = (struct page_descriptor *) __get_dma_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder);
- else
- page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder);
-
- if (!page) {
- static unsigned long last = 0;
- if (last + 10*HZ < jiffies) {
- last = jiffies;
- printk ("Couldn't get a free page.....\n");
- }
- return NULL;
- }
-#if 0
- printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz);
-#endif
- sizes[order].npages++;
-
- /* Loop for all but last block: */
- for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next)
- {
- p->bh_flags = MF_FREE;
- p->bh_next = BH ( ((long)p)+sz);
- }
- /* Last block: */
- p->bh_flags = MF_FREE;
- p->bh_next = NULL;
-
- page->order = order;
- page->nfree = NBLOCKS(order);
- page->firstfree = BH(page+1);
-#if 0
- printk ("%d blocks per page\n",page->nfree);
+ save_flags(flags);
+ cli();
+ page = *pg;
+ if (!page)
+ goto no_bucket_page;
+
+ p = page->firstfree;
+ if (p->bh_flags != MF_FREE)
+ goto not_free_on_freelist;
+
+found_it:
+ page->firstfree = p->bh_next;
+ page->nfree--;
+ if (!page->nfree)
+ *pg = page->next;
+ restore_flags(flags);
+ bucket->nmallocs++;
+ bucket->nbytesmalloced += size;
+ p->bh_flags = type; /* As of now this block is officially in use */
+ p->bh_length = size;
+#ifdef SADISTIC_KMALLOC
+ memset(p+1, 0xf0, size);
#endif
- /* Now we're going to muck with the "global" freelist for this size:
- this should be uninterruptible */
- cli ();
- /*
- * sizes[order].firstfree used to be NULL, otherwise we wouldn't be
- * here, but you never know....
- */
- if (dma_flag) {
- page->next = sizes[order].dmafree;
- sizes[order].dmafree = page;
- } else {
- page->next = sizes[order].firstfree;
- sizes[order].firstfree = page;
- }
- restore_flags(flags);
- }
-
-/* Pray that printk won't cause this to happen again :-) */
-
-printk ("Hey. This is very funny. I tried %d times to allocate a whole\n"
- "new page for an object only %d bytes long, but some other process\n"
- "beat me to actually allocating it. Also note that this 'error'\n"
- "message is soooo very long to catch your attention. I'd appreciate\n"
- "it if you'd be so kind as to report what conditions caused this to\n"
- "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n"
- "(Executive summary: This can't happen)\n",
- MAX_GET_FREE_PAGE_TRIES,
- (int) size);
-return NULL;
+ return p + 1; /* Pointer arithmetic: increments past header */
+
+
+no_bucket_page:
+ /*
+ * If we didn't find a page already allocated for this
+ * bucket size, we need to get one..
+ *
+ * This can be done with ints on: it is private to this invocation
+ */
+ restore_flags(flags);
+
+ {
+ int i, sz;
+
+ /* sz is the size of the blocks we're dealing with */
+ sz = BLOCKSIZE(order);
+
+ page = get_kmalloc_pages(priority, bucket->gfporder, dma);
+ if (!page)
+ goto no_free_page;
+found_cached_page:
+
+ bucket->npages++;
+
+ page->order = order;
+ /* Loop for all but last block: */
+ i = (page->nfree = bucket->nblocks) - 1;
+ p = BH(page + 1);
+ while (i > 0) {
+ i--;
+ p->bh_flags = MF_FREE;
+ p->bh_next = BH(((long) p) + sz);
+ p = p->bh_next;
+ }
+ /* Last block: */
+ p->bh_flags = MF_FREE;
+ p->bh_next = NULL;
+
+ p = BH(page+1);
+ }
+
+ /*
+ * Now we're going to muck with the "global" freelist
+ * for this size: this should be uninterruptible
+ */
+ cli();
+ page->next = *pg;
+ *pg = page;
+ goto found_it;
+
+
+no_free_page:
+ /*
+ * No free pages, check the kmalloc cache of
+ * pages to see if maybe we have something available
+ */
+ if (!dma && order < MAX_CACHE_ORDER) {
+ page = xchg(kmalloc_cache+order, page);
+ if (page)
+ goto found_cached_page;
+ }
+ {
+ static unsigned long last = 0;
+ if (priority != GFP_BUFFER && (last + 10 * HZ < jiffies)) {
+ last = jiffies;
+ printk("Couldn't get a free page.....\n");
+ }
+ return NULL;
+ }
+
+not_free_on_freelist:
+ restore_flags(flags);
+ printk("Problem: block on freelist at %08lx isn't free.\n", (long) p);
+ return NULL;
}
-void kfree_s (void *ptr,int size)
+void kfree(void *__ptr)
{
-unsigned long flags;
-int order;
-register struct block_header *p=((struct block_header *)ptr) -1;
-struct page_descriptor *page,*pg2;
-
-page = PAGE_DESC (p);
-order = page->order;
-if ((order < 0) ||
- (order > sizeof (sizes)/sizeof (sizes[0])) ||
- (((long)(page->next)) & ~PAGE_MASK) ||
- (p->bh_flags != MF_USED))
- {
- printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n",
- p, page->next, page->order);
- return;
- }
-if (size &&
- size != p->bh_length)
- {
- printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n",
- p,size,p->bh_length);
- return;
- }
-size = p->bh_length;
-p->bh_flags = MF_FREE; /* As of now this block is officially free */
-save_flags(flags);
-cli ();
-p->bh_next = page->firstfree;
-page->firstfree = p;
-page->nfree ++;
-
-if (page->nfree == 1)
- { /* Page went from full to one free block: put it on the freelist. Do not bother
- trying to put it on the DMA list. */
- if (page->next)
- {
- printk ("Page %p already on freelist dazed and confused....\n", page);
- }
- else
- {
- page->next = sizes[order].firstfree;
- sizes[order].firstfree = page;
- }
- }
-
-/* If page is completely free, free it */
-if (page->nfree == NBLOCKS (page->order))
- {
-#if 0
- printk ("Freeing page %08x.\n", (long)page);
+ int dma;
+ unsigned long flags;
+ unsigned int order;
+ struct page_descriptor *page, **pg;
+ struct size_descriptor *bucket;
+
+ if (!__ptr)
+ goto null_kfree;
+#define ptr ((struct block_header *) __ptr)
+ page = PAGE_DESC(ptr);
+ __ptr = ptr - 1;
+ if (~PAGE_MASK & (unsigned long)page->next)
+ goto bad_order;
+ order = page->order;
+ if (order >= sizeof(sizes) / sizeof(sizes[0]))
+ goto bad_order;
+ bucket = sizes + order;
+ dma = 0;
+ pg = &bucket->firstfree;
+ if (ptr->bh_flags == MF_DMA) {
+ dma = 1;
+ ptr->bh_flags = MF_USED;
+ pg = &bucket->dmafree;
+ }
+ if (ptr->bh_flags != MF_USED)
+ goto bad_order;
+ ptr->bh_flags = MF_FREE; /* As of now this block is officially free */
+#ifdef SADISTIC_KMALLOC
+ memset(ptr+1, 0x0e, ptr->bh_length);
#endif
- if (sizes[order].firstfree == page)
- {
- sizes[order].firstfree = page->next;
- }
- else if (sizes[order].dmafree == page)
- {
- sizes[order].dmafree = page->next;
- }
- else
- {
- for (pg2=sizes[order].firstfree;
- (pg2 != NULL) && (pg2->next != page);
- pg2=pg2->next)
- /* Nothing */;
- if (!pg2)
- for (pg2=sizes[order].dmafree;
- (pg2 != NULL) && (pg2->next != page);
- pg2=pg2->next)
- /* Nothing */;
- if (pg2 != NULL)
- pg2->next = page->next;
- else
- printk ("Ooops. page %p doesn't show on freelist.\n", page);
- }
-/* FIXME: I'm sure we should do something with npages here (like npages--) */
- free_pages ((long)page, sizes[order].gfporder);
- }
-restore_flags(flags);
-
-/* FIXME: ?? Are these increment & decrement operations guaranteed to be
- * atomic? Could an IRQ not occur between the read & the write?
- * Maybe yes on a x86 with GCC...??
- */
-sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */
-sizes[order].nbytesmalloced -= size;
+ save_flags(flags);
+ cli();
+
+ bucket->nfrees++;
+ bucket->nbytesmalloced -= ptr->bh_length;
+
+ ptr->bh_next = page->firstfree;
+ page->firstfree = ptr;
+ if (!page->nfree++) {
+/* Page went from full to one free block: put it on the freelist. */
+ if (bucket->nblocks == 1)
+ goto free_page;
+ page->next = *pg;
+ *pg = page;
+ }
+/* If page is completely free, free it */
+ if (page->nfree == bucket->nblocks) {
+ for (;;) {
+ struct page_descriptor *tmp = *pg;
+ if (!tmp)
+ goto not_on_freelist;
+ if (tmp == page)
+ break;
+ pg = &tmp->next;
+ }
+ *pg = page->next;
+free_page:
+ bucket->npages--;
+ free_kmalloc_pages(page, bucket->gfporder, dma);
+ }
+ restore_flags(flags);
+null_kfree:
+ return;
+
+bad_order:
+ printk("kfree of non-kmalloced memory: %p, next= %p, order=%d\n",
+ ptr+1, page->next, page->order);
+ return;
+
+not_on_freelist:
+ printk("Ooops. page %p doesn't show on freelist.\n", page);
+ restore_flags(flags);
}
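
The rewritten kfree() above works backwards from the user pointer to its page descriptor, pushes the block onto that page's free list, and hands the whole page back to the page allocator once every block on it is free. A minimal userspace sketch of that bucket/per-page bookkeeping follows; the toy_* names and the BLOCKS_PER_PAGE value are invented for illustration and only mirror the accounting, not the real allocator, its interrupt locking, or its DMA handling.

/* Toy model of the per-page free-list accounting used by kfree() above.
 * Standalone userspace program with made-up names; build with: cc toy.c */
#include <stdio.h>

#define BLOCKS_PER_PAGE 8

struct toy_block { struct toy_block *next; };

struct toy_page {
	int nfree;                      /* free blocks on this page */
	struct toy_block *firstfree;    /* head of the per-page free list */
	struct toy_page *next;          /* link on the bucket's page list */
};

struct toy_bucket {
	int nblocks;                    /* blocks a full page holds */
	struct toy_page *firstfree;     /* pages that still have free blocks */
};

/* Mirror of the kfree() path: put the block back, and if the page just
 * went from full to partially free, hook it onto the bucket's list. */
static void toy_free(struct toy_bucket *b, struct toy_page *pg, struct toy_block *blk)
{
	blk->next = pg->firstfree;
	pg->firstfree = blk;
	if (!pg->nfree++) {             /* page was full until now */
		pg->next = b->firstfree;
		b->firstfree = pg;
	}
	if (pg->nfree == b->nblocks)    /* page is now entirely free */
		printf("page %p could be handed back to the page allocator\n", (void *)pg);
}

int main(void)
{
	struct toy_bucket bucket = { BLOCKS_PER_PAGE, NULL };
	struct toy_page page = { 0, NULL, NULL };
	struct toy_block blocks[BLOCKS_PER_PAGE];
	int i;

	for (i = 0; i < BLOCKS_PER_PAGE; i++)
		toy_free(&bucket, &page, &blocks[i]);
	return 0;
}

Run, the sketch reports the page as reclaimable after the eighth free, which is the point where the real code takes the free_page label and calls free_kmalloc_pages().
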
diff --git a/mm/memory.c b/mm/memory.c
index 4fba3a4c4..9fd243a67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -33,7 +33,6 @@
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*/
-#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/head.h>
@@ -44,28 +43,29 @@
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <asm/system.h>
-#include <asm/segment.h>
+#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include <asm/string.h>
-unsigned long high_memory = 0;
+unsigned long max_mapnr = 0;
+void * high_memory = NULL;
/*
- * The free_area_list arrays point to the queue heads of the free areas
- * of different sizes
+ * We special-case the C-O-W ZERO_PAGE, because it's such
+ * a common occurrence (no need to read the page to know
+ * that it's zero - better for the cache and memory subsystem).
*/
-int nr_swap_pages = 0;
-int nr_free_pages = 0;
-struct mem_list free_area_list[NR_MEM_LISTS];
-unsigned char * free_area_map[NR_MEM_LISTS];
-
-#if 0
-/*
- * This now resides in include/asm/page.h
- */
-#define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE)
-#endif
+static inline void copy_cow_page(unsigned long from, unsigned long to)
+{
+ if (from == ZERO_PAGE) {
+ clear_page(to);
+ return;
+ }
+ copy_page(to, from);
+}
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
@@ -77,30 +77,18 @@ mem_map_t * mem_map = NULL;
*/
void oom(struct task_struct * task)
{
- printk("\nOut of memory for %s.\n", current->comm);
- task->sigaction[SIGKILL-1].sa_handler = NULL;
+ printk("\nOut of memory for %s.\n", task->comm);
+ task->sig->action[SIGKILL-1].sa_handler = NULL;
task->blocked &= ~(1<<(SIGKILL-1));
send_sig(SIGKILL,task,1);
}
-static inline void free_one_pte(pte_t * page_table)
-{
- pte_t page = *page_table;
-
- if (pte_none(page))
- return;
- pte_clear(page_table);
- if (!pte_present(page)) {
- swap_free(pte_val(page));
- return;
- }
- free_page(pte_page(page));
- return;
-}
-
+/*
+ * Note: this doesn't free the actual pages themselves. That
+ * has been handled earlier when unmapping all the memory regions.
+ */
static inline void free_one_pmd(pmd_t * dir)
{
- int j;
pte_t * pte;
if (pmd_none(*dir))
@@ -112,12 +100,6 @@ static inline void free_one_pmd(pmd_t * dir)
}
pte = pte_offset(dir, 0);
pmd_clear(dir);
- if (pte_inuse(pte)) {
- pte_free(pte);
- return;
- }
- for (j = 0; j < PTRS_PER_PTE ; j++)
- free_one_pte(pte+j);
pte_free(pte);
}
@@ -135,224 +117,195 @@ static inline void free_one_pgd(pgd_t * dir)
}
pmd = pmd_offset(dir, 0);
pgd_clear(dir);
- if (pmd_inuse(pmd)) {
- pmd_free(pmd);
- return;
- }
for (j = 0; j < PTRS_PER_PMD ; j++)
free_one_pmd(pmd+j);
pmd_free(pmd);
}
-
/*
* This function clears all user-level page tables of a process - this
- * is needed by execve(), so that old pages aren't in the way. Note that
- * unlike 'free_page_tables()', this function still leaves a valid
- * page-table-tree in memory: it just removes the user pages. The two
- * functions are similar, but there is a fundamental difference.
+ * is needed by execve(), so that old pages aren't in the way.
*/
void clear_page_tables(struct task_struct * tsk)
{
int i;
pgd_t * page_dir;
- if (!tsk)
- return;
- if (tsk == task[0])
- panic("task[0] (swapper) doesn't support exec()\n");
- page_dir = pgd_offset(tsk, 0);
+ page_dir = tsk->mm->pgd;
if (!page_dir || page_dir == swapper_pg_dir) {
printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
return;
}
- if (pgd_inuse(page_dir)) {
- pgd_t * new_pg;
-
- if (!(new_pg = pgd_alloc())) {
- oom(tsk);
- return;
- }
- for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
- new_pg[i] = page_dir[i];
- SET_PAGE_DIR(tsk, new_pg);
- pgd_free(page_dir);
- return;
- }
+ flush_cache_mm(tsk->mm);
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
- invalidate();
- return;
+ flush_tlb_mm(tsk->mm);
}
/*
- * This function frees up all page tables of a process when it exits.
+ * This function frees up all page tables of a process when it exits. It
+ * is the same as "clear_page_tables()", except it also changes the process'
+ * page table directory to the kernel page tables and then frees the old
+ * page table directory.
*/
-void free_page_tables(struct task_struct * tsk)
+void free_page_tables(struct mm_struct * mm)
{
int i;
pgd_t * page_dir;
- if (!tsk)
- return;
- if (tsk == task[0]) {
- printk("task[0] (swapper) killed: unable to recover\n");
- panic("Trying to free up swapper memory space");
- }
- page_dir = pgd_offset(tsk, 0);
+ page_dir = mm->pgd;
if (!page_dir || page_dir == swapper_pg_dir) {
- printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
- return;
- }
- SET_PAGE_DIR(tsk, swapper_pg_dir);
- if (pgd_inuse(page_dir)) {
- pgd_free(page_dir);
+ printk("Trying to free kernel page-directory: not good\n");
return;
}
- for (i = 0 ; i < PTRS_PER_PGD ; i++)
+ for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
pgd_free(page_dir);
- invalidate();
}
-/*
- * clone_page_tables() clones the page table for a process - both
- * processes will have the exact same pages in memory. There are
- * probably races in the memory management with cloning, but we'll
- * see..
- */
-int clone_page_tables(struct task_struct * tsk)
+int new_page_tables(struct task_struct * tsk)
{
- pgd_t * pg_dir;
+ pgd_t * page_dir, * new_pg;
- pg_dir = pgd_offset(current, 0);
- pgd_reuse(pg_dir);
- SET_PAGE_DIR(tsk, pg_dir);
+ if (!(new_pg = pgd_alloc()))
+ return -ENOMEM;
+ page_dir = pgd_offset(&init_mm, 0);
+ flush_cache_mm(tsk->mm);
+ memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
+ (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
+ flush_tlb_mm(tsk->mm);
+ SET_PAGE_DIR(tsk, new_pg);
+ tsk->mm->pgd = new_pg;
return 0;
}
-static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
+static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
{
pte_t pte = *old_pte;
+ unsigned long page_nr;
if (pte_none(pte))
return;
if (!pte_present(pte)) {
swap_duplicate(pte_val(pte));
- *new_pte = pte;
+ set_pte(new_pte, pte);
return;
}
- if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
- *new_pte = pte;
+ page_nr = MAP_NR(pte_page(pte));
+ if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) {
+ set_pte(new_pte, pte);
return;
}
- if (pte_cow(pte))
+ if (cow)
pte = pte_wrprotect(pte);
- if (delete_from_swap_cache(pte_page(pte)))
+ if (delete_from_swap_cache(page_nr))
pte = pte_mkdirty(pte);
- *new_pte = pte_mkold(pte);
- *old_pte = pte;
- mem_map[MAP_NR(pte_page(pte))]++;
+ set_pte(new_pte, pte_mkold(pte));
+ set_pte(old_pte, pte);
+ mem_map[page_nr].count++;
}
-static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd)
+static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
{
- int j;
- pte_t *old_pte, *new_pte;
+ pte_t * src_pte, * dst_pte;
+ unsigned long end;
- if (pmd_none(*old_pmd))
+ if (pmd_none(*src_pmd))
return 0;
- if (pmd_bad(*old_pmd)) {
- printk("copy_one_pmd: bad page table: probable memory corruption\n");
- pmd_clear(old_pmd);
- return 0;
- }
- old_pte = pte_offset(old_pmd, 0);
- if (pte_inuse(old_pte)) {
- pte_reuse(old_pte);
- *new_pmd = *old_pmd;
+ if (pmd_bad(*src_pmd)) {
+ printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
+ pmd_clear(src_pmd);
return 0;
}
- new_pte = pte_alloc(new_pmd, 0);
- if (!new_pte)
- return -ENOMEM;
- for (j = 0 ; j < PTRS_PER_PTE ; j++) {
- copy_one_pte(old_pte, new_pte);
- old_pte++;
- new_pte++;
+ src_pte = pte_offset(src_pmd, address);
+ if (pmd_none(*dst_pmd)) {
+ if (!pte_alloc(dst_pmd, 0))
+ return -ENOMEM;
}
+ dst_pte = pte_offset(dst_pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end >= PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ /* I would like to switch arguments here, to make it
+ * consistent with copy_xxx_range and memcpy syntax.
+ */
+ copy_one_pte(src_pte++, dst_pte++, cow);
+ address += PAGE_SIZE;
+ } while (address < end);
return 0;
}
-static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd)
+static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
{
- int j;
- pmd_t *old_pmd, *new_pmd;
+ pmd_t * src_pmd, * dst_pmd;
+ unsigned long end;
+ int error = 0;
- if (pgd_none(*old_pgd))
+ if (pgd_none(*src_pgd))
return 0;
- if (pgd_bad(*old_pgd)) {
- printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd));
- pgd_clear(old_pgd);
+ if (pgd_bad(*src_pgd)) {
+ printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
+ pgd_clear(src_pgd);
return 0;
}
- old_pmd = pmd_offset(old_pgd, 0);
- if (pmd_inuse(old_pmd)) {
- pmd_reuse(old_pmd);
- *new_pgd = *old_pgd;
- return 0;
+ src_pmd = pmd_offset(src_pgd, address);
+ if (pgd_none(*dst_pgd)) {
+ if (!pmd_alloc(dst_pgd, 0))
+ return -ENOMEM;
}
- new_pmd = pmd_alloc(new_pgd, 0);
- if (!new_pmd)
- return -ENOMEM;
- for (j = 0 ; j < PTRS_PER_PMD ; j++) {
- int error = copy_one_pmd(old_pmd, new_pmd);
+ dst_pmd = pmd_offset(dst_pgd, address);
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
if (error)
- return error;
- old_pmd++;
- new_pmd++;
- }
- return 0;
+ break;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ } while (address < end);
+ return error;
}
/*
- * copy_page_tables() just copies the whole process memory range:
- * note the special handling of RESERVED (ie kernel) pages, which
- * means that they are always shared by all processes.
+ * copy one vm_area from one task to the other. Assumes the page tables
+ * already present in the new task to be cleared in the whole range
+ * covered by this vma.
*/
-int copy_page_tables(struct task_struct * tsk)
-{
- int i;
- pgd_t *old_pgd;
- pgd_t *new_pgd;
-
- new_pgd = pgd_alloc();
- if (!new_pgd)
- return -ENOMEM;
- SET_PAGE_DIR(tsk, new_pgd);
- old_pgd = pgd_offset(current, 0);
- for (i = 0 ; i < PTRS_PER_PGD ; i++) {
- int errno = copy_one_pgd(old_pgd, new_pgd);
- if (errno) {
- free_page_tables(tsk);
- invalidate();
- return errno;
- }
- old_pgd++;
- new_pgd++;
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pgd_t * src_pgd, * dst_pgd;
+ unsigned long address = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ int error = 0, cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+ src_pgd = pgd_offset(src, address);
+ dst_pgd = pgd_offset(dst, address);
+ flush_cache_range(src, vma->vm_start, vma->vm_end);
+ flush_cache_range(dst, vma->vm_start, vma->vm_end);
+ while (address < end) {
+ error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
+ if (error)
+ break;
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
}
- invalidate();
- return 0;
+ /* Note that the src ptes get c-o-w treatment, so they change too. */
+ flush_tlb_range(src, vma->vm_start, vma->vm_end);
+ flush_tlb_range(dst, vma->vm_start, vma->vm_end);
+ return error;
}
-static inline void forget_pte(pte_t page)
+static inline void free_pte(pte_t page)
{
- if (pte_none(page))
- return;
if (pte_present(page)) {
- free_page(pte_page(page));
- if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED)
+ unsigned long addr = pte_page(page);
+ if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
return;
+ free_page(addr);
if (current->mm->rss <= 0)
return;
current->mm->rss--;
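
The copy_page_range() rewrite above decides once per vma whether copy-on-write applies: only private, writable mappings get write-protected in both parent and child at fork time, while shared or read-only mappings are copied as-is. A standalone illustration of that predicate; the flag values are chosen for the example, not taken from the kernel headers:

/* Illustration of the copy-on-write test used by copy_page_range() above:
 * only private (non-shared) writable mappings need COW at fork. */
#include <stdio.h>

#define VM_WRITE  0x2
#define VM_SHARED 0x8

static int needs_cow(unsigned long vm_flags)
{
	return (vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
}

int main(void)
{
	printf("private read-only : %d\n", needs_cow(0));
	printf("private writable  : %d\n", needs_cow(VM_WRITE));
	printf("shared writable   : %d\n", needs_cow(VM_WRITE | VM_SHARED));
	return 0;
}
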
@@ -361,33 +314,45 @@ static inline void forget_pte(pte_t page)
swap_free(pte_val(page));
}
-static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
+static inline void forget_pte(pte_t page)
+{
+ if (!pte_none(page)) {
+ printk("forget_pte: old mapping existed!\n");
+ free_pte(page);
+ }
+}
+
+static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
pte_t * pte;
- unsigned long end;
if (pmd_none(*pmd))
return;
if (pmd_bad(*pmd)) {
- printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
+ printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
pmd_clear(pmd);
return;
}
pte = pte_offset(pmd, address);
address &= ~PMD_MASK;
- end = address + size;
- if (end >= PMD_SIZE)
- end = PMD_SIZE;
- do {
- pte_t page = *pte;
- pte_clear(pte);
- forget_pte(page);
- address += PAGE_SIZE;
+ if (address + size > PMD_SIZE)
+ size = PMD_SIZE - address;
+ size >>= PAGE_SHIFT;
+ for (;;) {
+ pte_t page;
+ if (!size)
+ break;
+ page = *pte;
pte++;
- } while (address < end);
+ size--;
+ if (pte_none(page))
+ continue;
+ pte_clear(pte-1);
+ free_pte(page);
+ }
}
-static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
+static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
{
pmd_t * pmd;
unsigned long end;
@@ -395,7 +360,7 @@ static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned
if (pgd_none(*dir))
return;
if (pgd_bad(*dir)) {
- printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
+ printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
pgd_clear(dir);
return;
}
@@ -405,28 +370,28 @@ static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- unmap_pte_range(pmd, address, end - address);
+ zap_pte_range(pmd, address, end - address);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
}
/*
- * a more complete version of free_page_tables which performs with page
- * granularity.
+ * remove user pages in a given range.
*/
-int unmap_page_range(unsigned long address, unsigned long size)
+int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
pgd_t * dir;
unsigned long end = address + size;
- dir = pgd_offset(current, address);
+ dir = pgd_offset(mm, address);
+ flush_cache_range(mm, end - size, end);
while (address < end) {
- unmap_pmd_range(dir, address, end - address);
+ zap_pmd_range(dir, address, end - address);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_range(mm, end - size, end);
return 0;
}
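
zap_pte_range() above clamps each pass to the span covered by a single pmd entry before converting the remaining size into a page count. A small standalone check of that clamping arithmetic, using typical i386 shift values purely for illustration:

/* Model of the per-level clamping in zap_pte_range()/zap_pmd_range():
 * each level only walks as far as the current directory entry covers. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  22                       /* example two-level layout */
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x003ff000;     /* last page under one pmd entry */
	unsigned long size    = 0x00004000;     /* four pages requested */

	unsigned long off = address & ~PMD_MASK;
	if (off + size > PMD_SIZE)
		size = PMD_SIZE - off;          /* clamp to this pmd's coverage */
	printf("pages cleared in this pmd: %lu\n", size >> PAGE_SHIFT);
	return 0;
}
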
@@ -440,7 +405,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigne
end = PMD_SIZE;
do {
pte_t oldpage = *pte;
- *pte = zero_pte;
+ set_pte(pte, zero_pte);
forget_pte(oldpage);
address += PAGE_SIZE;
pte++;
@@ -470,11 +435,13 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
int error = 0;
pgd_t * dir;
+ unsigned long beg = address;
unsigned long end = address + size;
pte_t zero_pte;
zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
- dir = pgd_offset(current, address);
+ dir = pgd_offset(current->mm, address);
+ flush_cache_range(current->mm, beg, end);
while (address < end) {
pmd_t *pmd = pmd_alloc(dir, address);
error = -ENOMEM;
@@ -486,7 +453,7 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_range(current->mm, beg, end);
return error;
}
@@ -496,7 +463,7 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
* in null mappings (currently treated as "copy-on-access")
*/
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
- unsigned long offset, pgprot_t prot)
+ unsigned long phys_addr, pgprot_t prot)
{
unsigned long end;
@@ -505,23 +472,22 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
+ unsigned long mapnr;
pte_t oldpage = *pte;
pte_clear(pte);
- if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED))
- *pte = mk_pte(offset, prot);
- else if (mem_map[MAP_NR(offset)]) {
- mem_map[MAP_NR(offset)]++;
- *pte = mk_pte(offset, prot);
- }
+
+ mapnr = MAP_NR(__va(phys_addr));
+ if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr))
+ set_pte(pte, mk_pte_phys(phys_addr, prot));
forget_pte(oldpage);
address += PAGE_SIZE;
- offset += PAGE_SIZE;
+ phys_addr += PAGE_SIZE;
pte++;
} while (address < end);
}
static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
- unsigned long offset, pgprot_t prot)
+ unsigned long phys_addr, pgprot_t prot)
{
unsigned long end;
@@ -529,38 +495,40 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
- offset -= address;
+ phys_addr -= address;
do {
pte_t * pte = pte_alloc(pmd, address);
if (!pte)
return -ENOMEM;
- remap_pte_range(pte, address, end - address, address + offset, prot);
+ remap_pte_range(pte, address, end - address, address + phys_addr, prot);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
return 0;
}
-int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot)
+int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
int error = 0;
pgd_t * dir;
+ unsigned long beg = from;
unsigned long end = from + size;
- offset -= from;
- dir = pgd_offset(current, from);
+ phys_addr -= from;
+ dir = pgd_offset(current->mm, from);
+ flush_cache_range(current->mm, beg, end);
while (from < end) {
pmd_t *pmd = pmd_alloc(dir, from);
error = -ENOMEM;
if (!pmd)
break;
- error = remap_pmd_range(pmd, from, end - from, offset + from, prot);
+ error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
if (error)
break;
from = (from + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_range(current->mm, beg, end);
return error;
}
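
remap_page_range() above subtracts the starting virtual address from phys_addr once, so that at every level the target physical address is simply the current virtual address plus that fixed delta. A quick standalone check of the trick, with arbitrary example addresses:

/* Check of the offset trick in remap_page_range(): after "phys_addr -= from",
 * the sum "virtual + phys_addr" yields the physical address of every page. */
#include <stdio.h>

int main(void)
{
	unsigned long from  = 0x10000000;       /* start of the virtual range */
	unsigned long phys  = 0xe0000000;       /* start of the physical range */
	unsigned long delta = phys - from;      /* the "phys_addr -= from" step */
	unsigned long va;

	for (va = from; va < from + 0x3000; va += 0x1000)
		printf("va %08lx -> pa %08lx\n", va, va + delta);
	return 0;
}
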
@@ -570,12 +538,11 @@ int remap_page_range(unsigned long from, unsigned long offset, unsigned long siz
static void put_page(pte_t * page_table, pte_t pte)
{
if (!pte_none(*page_table)) {
- printk("put_page: page already exists %08lx\n", pte_val(*page_table));
free_page(pte_page(pte));
return;
}
-/* no need for invalidate */
- *page_table = pte;
+/* no need for flush_tlb */
+ set_pte(page_table, pte);
}
/*
@@ -588,11 +555,11 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
pmd_t * pmd;
pte_t * pte;
- if (page >= high_memory)
+ if (MAP_NR(page) >= max_mapnr)
printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
- if (mem_map[MAP_NR(page)] != 1)
+ if (mem_map[MAP_NR(page)].count != 1)
printk("mem_map disagrees with %08lx at %08lx\n",page,address);
- pgd = pgd_offset(tsk,address);
+ pgd = pgd_offset(tsk->mm,address);
pmd = pmd_alloc(pgd, address);
if (!pmd) {
free_page(page);
@@ -607,10 +574,11 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
}
if (!pte_none(*pte)) {
printk("put_dirty_page: page already exists\n");
- pte_clear(pte);
- invalidate();
+ free_page(page);
+ return 0;
}
- *pte = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)));
+ flush_page_to_ram(page);
+ set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))));
/* no need for invalidate */
return page;
}
@@ -632,8 +600,8 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*/
-void do_wp_page(struct vm_area_struct * vma, unsigned long address,
- int write_access)
+void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
+ unsigned long address, int write_access)
{
pgd_t *page_dir;
pmd_t *page_middle;
@@ -641,7 +609,7 @@ void do_wp_page(struct vm_area_struct * vma, unsigned long address,
unsigned long old_page, new_page;
new_page = __get_free_page(GFP_KERNEL);
- page_dir = pgd_offset(vma->vm_task,address);
+ page_dir = pgd_offset(vma->vm_mm, address);
if (pgd_none(*page_dir))
goto end_wp_page;
if (pgd_bad(*page_dir))
@@ -658,44 +626,49 @@ void do_wp_page(struct vm_area_struct * vma, unsigned long address,
if (pte_write(pte))
goto end_wp_page;
old_page = pte_page(pte);
- if (old_page >= high_memory)
+ if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
- vma->vm_task->mm->min_flt++;
+ tsk->min_flt++;
/*
* Do we need to copy?
*/
- if (mem_map[MAP_NR(old_page)] != 1) {
+ if (mem_map[MAP_NR(old_page)].count != 1) {
if (new_page) {
- if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED)
- ++vma->vm_task->mm->rss;
- copy_page(old_page,new_page);
- *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)));
+ if (PageReserved(mem_map + MAP_NR(old_page)))
+ ++vma->vm_mm->rss;
+ copy_cow_page(old_page,new_page);
+ flush_page_to_ram(old_page);
+ flush_page_to_ram(new_page);
+ flush_cache_page(vma, address);
+ set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
free_page(old_page);
- invalidate();
+ flush_tlb_page(vma, address);
return;
}
- *page_table = BAD_PAGE;
+ flush_cache_page(vma, address);
+ set_pte(page_table, BAD_PAGE);
+ flush_tlb_page(vma, address);
free_page(old_page);
- oom(vma->vm_task);
- invalidate();
+ oom(tsk);
return;
}
- *page_table = pte_mkdirty(pte_mkwrite(pte));
- invalidate();
+ flush_cache_page(vma, address);
+ set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
+ flush_tlb_page(vma, address);
if (new_page)
free_page(new_page);
return;
bad_wp_page:
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
- send_sig(SIGKILL, vma->vm_task, 1);
+ send_sig(SIGKILL, tsk, 1);
goto end_wp_page;
bad_wp_pagemiddle:
printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle));
- send_sig(SIGKILL, vma->vm_task, 1);
+ send_sig(SIGKILL, tsk, 1);
goto end_wp_page;
bad_wp_pagedir:
printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir));
- send_sig(SIGKILL, vma->vm_task, 1);
+ send_sig(SIGKILL, tsk, 1);
end_wp_page:
if (new_page)
free_page(new_page);
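
In the reworked do_wp_page() above, the page's reference count in mem_map decides the outcome of a write-protect fault: a count of one means the PTE can simply be made writable and dirty, while anything higher forces a private copy. A tiny userspace model of that decision, with an invented fake_page type standing in for mem_map entries:

/* Sketch of the copy-vs-reuse decision do_wp_page() makes above. */
#include <stdio.h>

struct fake_page { int count; };

static const char *wp_fault_action(const struct fake_page *page)
{
	return page->count == 1 ? "make PTE writable in place"
	                        : "copy page, point PTE at the copy";
}

int main(void)
{
	struct fake_page exclusive = { 1 }, shared = { 3 };
	printf("count=1: %s\n", wp_fault_action(&exclusive));
	printf("count=3: %s\n", wp_fault_action(&shared));
	return 0;
}
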
@@ -703,301 +676,94 @@ end_wp_page:
}
/*
- * Ugly, ugly, but the goto's result in better assembly..
+ * This function zeroes out partial mmap'ed pages at truncation time..
*/
-int verify_area(int type, const void * addr, unsigned long size)
+static void partial_clear(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct * vma;
- unsigned long start = (unsigned long) addr;
-
- /* If the current user space is mapped to kernel space (for the
- * case where we use a fake user buffer with get_fs/set_fs()) we
- * don't expect to find the address in the user vm map.
- */
- if (get_fs() == get_ds())
- return 0;
-
- vma = find_vma(current, start);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= start)
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
- if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur)
- goto bad_area;
-
-good_area:
- if (type == VERIFY_WRITE)
- goto check_write;
- for (;;) {
- struct vm_area_struct * next;
- if (!(vma->vm_flags & VM_READ))
- goto bad_area;
- if (vma->vm_end - start >= size)
- return 0;
- next = vma->vm_next;
- if (!next || vma->vm_end != next->vm_start)
- goto bad_area;
- vma = next;
- }
-
-check_write:
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- if (!wp_works_ok)
- goto check_wp_fault_by_hand;
- for (;;) {
- if (vma->vm_end - start >= size)
- break;
- if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start)
- goto bad_area;
- vma = vma->vm_next;
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- }
- return 0;
-
-check_wp_fault_by_hand:
- size--;
- size += start & ~PAGE_MASK;
- size >>= PAGE_SHIFT;
- start &= PAGE_MASK;
-
- for (;;) {
- do_wp_page(vma, start, 1);
- if (!size)
- break;
- size--;
- start += PAGE_SIZE;
- if (start < vma->vm_end)
- continue;
- vma = vma->vm_next;
- if (!vma || vma->vm_start != start)
- goto bad_area;
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;;
- }
- return 0;
-
-bad_area:
- return -EFAULT;
-}
-
-static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table)
-{
- unsigned long tmp;
+ pgd_t *page_dir;
+ pmd_t *page_middle;
+ pte_t *page_table, pte;
- if (!(tmp = get_free_page(GFP_KERNEL))) {
- oom(vma->vm_task);
- put_page(page_table, BAD_PAGE);
+ page_dir = pgd_offset(vma->vm_mm, address);
+ if (pgd_none(*page_dir))
+ return;
+ if (pgd_bad(*page_dir)) {
+ printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
+ pgd_clear(page_dir);
return;
}
- put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot)));
-}
-
-/*
- * try_to_share() checks the page at address "address" in the task "p",
- * to see if it exists, and if it is clean. If so, share it with the current
- * task.
- *
- * NOTE! This assumes we have checked that p != current, and that they
- * share the same inode and can generally otherwise be shared.
- */
-static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area,
- unsigned long from_address, struct vm_area_struct * from_area,
- unsigned long newpage)
-{
- pgd_t * from_dir, * to_dir;
- pmd_t * from_middle, * to_middle;
- pte_t * from_table, * to_table;
- pte_t from, to;
-
- from_dir = pgd_offset(from_area->vm_task,from_address);
-/* is there a page-directory at from? */
- if (pgd_none(*from_dir))
- return 0;
- if (pgd_bad(*from_dir)) {
- printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir));
- pgd_clear(from_dir);
- return 0;
- }
- from_middle = pmd_offset(from_dir, from_address);
-/* is there a mid-directory at from? */
- if (pmd_none(*from_middle))
- return 0;
- if (pmd_bad(*from_middle)) {
- printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle));
- pmd_clear(from_middle);
- return 0;
- }
- from_table = pte_offset(from_middle, from_address);
- from = *from_table;
-/* is the page present? */
- if (!pte_present(from))
- return 0;
-/* if it is dirty it must be from a shared mapping to be shared */
- if (pte_dirty(from)) {
- if (!(from_area->vm_flags & VM_SHARED))
- return 0;
- if (pte_write(from)) {
- printk("nonwritable, but dirty, shared page\n");
- return 0;
- }
- }
-/* is the page reasonable at all? */
- if (pte_page(from) >= high_memory)
- return 0;
- if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED)
- return 0;
-/* is the destination ok? */
- to_dir = pgd_offset(to_area->vm_task,to_address);
-/* is there a page-directory at to? */
- if (pgd_none(*to_dir))
- return 0;
- if (pgd_bad(*to_dir)) {
- printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir));
- return 0;
- }
- to_middle = pmd_offset(to_dir, to_address);
-/* is there a mid-directory at to? */
- if (pmd_none(*to_middle))
- return 0;
- if (pmd_bad(*to_middle)) {
- printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle));
- return 0;
- }
- to_table = pte_offset(to_middle, to_address);
- to = *to_table;
- if (!pte_none(to))
- return 0;
-/* do we copy? */
- if (newpage) {
- /* if it's in the swap cache, it's dirty by implication */
- /* so we can't use it if it's not from a shared mapping */
- if (in_swap_cache(pte_page(from))) {
- if (!(from_area->vm_flags & VM_SHARED))
- return 0;
- if (!pte_write(from)) {
- printk("nonwritable, but dirty, shared page\n");
- return 0;
- }
- }
- copy_page(pte_page(from), newpage);
- *to_table = mk_pte(newpage, to_area->vm_page_prot);
- return 1;
- }
-/*
- * do a final swap-cache test before sharing them: if it's in the swap
- * cache, we have to remove it now, as we get two pointers to the same
- * physical page and the cache can't handle it. Mark the original dirty.
- *
- * NOTE! Even if "from" is dirty, "to" will be clean: if we get here
- * with a dirty "from", the from-mapping is a shared map, so we can trust
- * the page contents to be up-to-date
- */
- if (in_swap_cache(pte_page(from))) {
- if (!(from_area->vm_flags & VM_SHARED))
- return 0;
- *from_table = pte_mkdirty(from);
- delete_from_swap_cache(pte_page(from));
+ page_middle = pmd_offset(page_dir, address);
+ if (pmd_none(*page_middle))
+ return;
+ if (pmd_bad(*page_middle)) {
+ printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir));
+ pmd_clear(page_middle);
+ return;
}
- mem_map[MAP_NR(pte_page(from))]++;
- *to_table = mk_pte(pte_page(from), to_area->vm_page_prot);
-/* Check if we need to do anything at all to the 'from' field */
- if (!pte_write(from))
- return 1;
- if (from_area->vm_flags & VM_SHARED)
- return 1;
-/* ok, need to mark it read-only, so invalidate any possible old TB entry */
- *from_table = pte_wrprotect(from);
- invalidate();
- return 1;
+ page_table = pte_offset(page_middle, address);
+ pte = *page_table;
+ if (!pte_present(pte))
+ return;
+ flush_cache_page(vma, address);
+ address &= ~PAGE_MASK;
+ address += pte_page(pte);
+ if (MAP_NR(address) >= max_mapnr)
+ return;
+ memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK));
+ flush_page_to_ram(pte_page(pte));
}
/*
- * share_page() tries to find a process that could share a page with
- * the current one.
+ * Handle all mappings that got truncated by a "truncate()"
+ * system call.
*
- * We first check if it is at all feasible by checking inode->i_count.
- * It should be >1 if there are other tasks sharing this inode.
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page. Ugly, but necessary.
*/
-static int share_page(struct vm_area_struct * area, unsigned long address,
- int write_access, unsigned long newpage)
+void vmtruncate(struct inode * inode, unsigned long offset)
{
- struct inode * inode;
- unsigned long offset;
- unsigned long from_address;
- unsigned long give_page;
struct vm_area_struct * mpnt;
- if (!area || !(inode = area->vm_inode) || inode->i_count < 2)
- return 0;
- /* do we need to copy or can we just share? */
- give_page = 0;
- if (write_access && !(area->vm_flags & VM_SHARED)) {
- if (!newpage)
- return 0;
- give_page = newpage;
- }
- offset = address - area->vm_start + area->vm_offset;
- /* See if there is something in the VM we can share pages with. */
- /* Traverse the entire circular i_mmap list, except `area' itself. */
- for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) {
- /* must be same inode */
- if (mpnt->vm_inode != inode) {
- printk("Aiee! Corrupt vm_area_struct i_mmap ring\n");
- break;
- }
- /* offsets must be mutually page-aligned */
- if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK)
- continue;
- /* the other area must actually cover the wanted page.. */
- from_address = offset + mpnt->vm_start - mpnt->vm_offset;
- if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end)
+ truncate_inode_pages(inode, offset);
+ if (!inode->i_mmap)
+ return;
+ mpnt = inode->i_mmap;
+ do {
+ unsigned long start = mpnt->vm_start;
+ unsigned long len = mpnt->vm_end - start;
+ unsigned long diff;
+
+ /* mapping wholly truncated? */
+ if (mpnt->vm_offset >= offset) {
+ zap_page_range(mpnt->vm_mm, start, len);
continue;
- /* .. NOW we can actually try to use the same physical page */
- if (!try_to_share(address, area, from_address, mpnt, give_page))
+ }
+ /* mapping wholly unaffected? */
+ diff = offset - mpnt->vm_offset;
+ if (diff >= len)
continue;
- /* free newpage if we never used it.. */
- if (give_page || !newpage)
- return 1;
- free_page(newpage);
- return 1;
- }
- return 0;
+ /* Ok, partially affected.. */
+ start += diff;
+ len = (len - diff) & PAGE_MASK;
+ if (start & ~PAGE_MASK) {
+ partial_clear(mpnt, start);
+ start = (start + ~PAGE_MASK) & PAGE_MASK;
+ }
+ zap_page_range(mpnt->vm_mm, start, len);
+ } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap);
}
-/*
- * fill in an empty page-table if none exists.
- */
-static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address)
-{
- pgd_t *pgd;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(tsk, address);
- pmd = pmd_alloc(pgd, address);
- if (!pmd) {
- oom(tsk);
- return NULL;
- }
- pte = pte_alloc(pmd, address);
- if (!pte) {
- oom(tsk);
- return NULL;
- }
- return pte;
-}
-static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address,
+static inline void do_swap_page(struct task_struct * tsk,
+ struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, pte_t entry, int write_access)
{
pte_t page;
if (!vma->vm_ops || !vma->vm_ops->swapin) {
- swap_in(vma, page_table, pte_val(entry), write_access);
+ swap_in(tsk, vma, page_table, pte_val(entry), write_access);
+ flush_page_to_ram(pte_page(*page_table));
return;
}
page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
@@ -1005,11 +771,12 @@ static inline void do_swap_page(struct vm_area_struct * vma, unsigned long addre
free_page(pte_page(page));
return;
}
- if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED))
+ if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED))
page = pte_wrprotect(page);
- ++vma->vm_task->mm->rss;
- ++vma->vm_task->mm->maj_flt;
- *page_table = page;
+ ++vma->vm_mm->rss;
+ ++tsk->maj_flt;
+ flush_page_to_ram(pte_page(page));
+ set_pte(page_table, page);
return;
}
@@ -1018,71 +785,94 @@ static inline void do_swap_page(struct vm_area_struct * vma, unsigned long addre
* tries to share with existing pages, but makes a separate copy if
* the "write_access" parameter is true in order to avoid the next
* page fault.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
*/
-void do_no_page(struct vm_area_struct * vma, unsigned long address,
- int write_access)
+void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
+ unsigned long address, int write_access)
{
+ pgd_t * pgd;
+ pmd_t * pmd;
pte_t * page_table;
pte_t entry;
unsigned long page;
- page_table = get_empty_pgtable(vma->vm_task,address);
+ pgd = pgd_offset(tsk->mm, address);
+ pmd = pmd_alloc(pgd, address);
+ if (!pmd)
+ goto no_memory;
+ page_table = pte_alloc(pmd, address);
if (!page_table)
- return;
+ goto no_memory;
entry = *page_table;
if (pte_present(entry))
- return;
- if (!pte_none(entry)) {
- do_swap_page(vma, address, page_table, entry, write_access);
- return;
- }
+ goto is_present;
+ if (!pte_none(entry))
+ goto swap_page;
address &= PAGE_MASK;
- if (!vma->vm_ops || !vma->vm_ops->nopage) {
- ++vma->vm_task->mm->rss;
- ++vma->vm_task->mm->min_flt;
- get_empty_page(vma, page_table);
- return;
- }
- page = __get_free_page(GFP_KERNEL);
- if (share_page(vma, address, write_access, page)) {
- ++vma->vm_task->mm->min_flt;
- ++vma->vm_task->mm->rss;
- return;
- }
- if (!page) {
- oom(current);
- put_page(page_table, BAD_PAGE);
- return;
- }
- ++vma->vm_task->mm->maj_flt;
- ++vma->vm_task->mm->rss;
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ goto anonymous_page;
/*
- * The fourth argument is "no_share", which tells the low-level code
+ * The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection
*/
- page = vma->vm_ops->nopage(vma, address, page,
- write_access && !(vma->vm_flags & VM_SHARED));
- if (share_page(vma, address, write_access, 0)) {
- free_page(page);
- return;
- }
+ page = vma->vm_ops->nopage(vma, address,
+ (vma->vm_flags & VM_SHARED)?0:write_access);
+ if (!page)
+ goto sigbus;
+ ++tsk->maj_flt;
+ ++vma->vm_mm->rss;
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
- * a exclusive copy of the page, or this is a shared mapping,
+ * an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
+/* do_no_page might already have flushed the page ... */
+ flush_page_to_ram(page);
entry = mk_pte(page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED))
+ } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
put_page(page_table, entry);
+ /* no need to invalidate: a not-present page shouldn't be cached */
+ return;
+
+anonymous_page:
+ entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
+ if (write_access) {
+ unsigned long page = get_free_page(GFP_KERNEL);
+ if (!page)
+ goto sigbus;
+ entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ vma->vm_mm->rss++;
+ tsk->min_flt++;
+ flush_page_to_ram(page);
+ }
+ put_page(page_table, entry);
+ return;
+
+sigbus:
+ force_sig(SIGBUS, current);
+ put_page(page_table, BAD_PAGE);
+ /* no need to invalidate, wasn't present */
+ return;
+
+swap_page:
+ do_swap_page(tsk, vma, address, page_table, entry, write_access);
+ return;
+
+no_memory:
+ oom(tsk);
+is_present:
+ return;
}
/*
@@ -1102,17 +892,19 @@ static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long a
int write_access, pte_t * pte)
{
if (!pte_present(*pte)) {
- do_no_page(vma, address, write_access);
+ do_no_page(current, vma, address, write_access);
return;
}
- *pte = pte_mkyoung(*pte);
+ set_pte(pte, pte_mkyoung(*pte));
+ flush_tlb_page(vma, address);
if (!write_access)
return;
if (pte_write(*pte)) {
- *pte = pte_mkdirty(*pte);
+ set_pte(pte, pte_mkdirty(*pte));
+ flush_tlb_page(vma, address);
return;
}
- do_wp_page(vma, address, write_access);
+ do_wp_page(current, vma, address, write_access);
}
void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
@@ -1122,7 +914,7 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
pmd_t *pmd;
pte_t *pte;
- pgd = pgd_offset(vma->vm_task, address);
+ pgd = pgd_offset(vma->vm_mm, address);
pmd = pmd_alloc(pgd, address);
if (!pmd)
goto no_memory;
@@ -1133,5 +925,5 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
update_mmu_cache(vma, address, *pte);
return;
no_memory:
- oom(vma->vm_task);
+ oom(current);
}
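
Taken together, handle_pte_fault() and do_no_page() above dispatch a fault down one of four paths: demand-paging a never-mapped address, swapping a page back in, breaking copy-on-write, or merely refreshing the young/dirty bits. A rough standalone model of that classification; the enum and the integer flags are invented for illustration, whereas the real code inspects hardware PTE bits:

/* Rough model of the fault dispatch performed by handle_pte_fault()
 * and do_no_page() above. */
#include <stdio.h>

enum fault_path { NO_PAGE, SWAP_PAGE, WP_PAGE, MINOR_ONLY };

static enum fault_path classify_fault(int pte_present, int pte_none,
				      int pte_writable, int write_access)
{
	if (!pte_present)
		return pte_none ? NO_PAGE : SWAP_PAGE;  /* demand page vs. swap-in */
	if (write_access && !pte_writable)
		return WP_PAGE;                         /* copy-on-write fault */
	return MINOR_ONLY;                              /* just mark young/dirty */
}

static const char *path_name[] = {
	"do_no_page", "do_swap_page", "do_wp_page", "minor fault only"
};

int main(void)
{
	printf("never mapped, read : %s\n", path_name[classify_fault(0, 1, 0, 0)]);
	printf("swap entry, write  : %s\n", path_name[classify_fault(0, 0, 0, 1)]);
	printf("present RO, write  : %s\n", path_name[classify_fault(1, 0, 0, 1)]);
	printf("present RW, write  : %s\n", path_name[classify_fault(1, 0, 1, 1)]);
	return 0;
}
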
diff --git a/mm/mlock.c b/mm/mlock.c
new file mode 100644
index 000000000..65b9e5407
--- /dev/null
+++ b/mm/mlock.c
@@ -0,0 +1,272 @@
+/*
+ * linux/mm/mlock.c
+ *
+ * (C) Copyright 1995 Linus Torvalds
+ */
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/errno.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <linux/malloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+
+static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
+{
+ vma->vm_flags = newflags;
+ return 0;
+}
+
+static inline int mlock_fixup_start(struct vm_area_struct * vma,
+ unsigned long end, int newflags)
+{
+ struct vm_area_struct * n;
+
+ n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ vma->vm_start = end;
+ n->vm_end = end;
+ vma->vm_offset += vma->vm_start - n->vm_start;
+ n->vm_flags = newflags;
+ if (n->vm_inode)
+ n->vm_inode->i_count++;
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ insert_vm_struct(current->mm, n);
+ return 0;
+}
+
+static inline int mlock_fixup_end(struct vm_area_struct * vma,
+ unsigned long start, int newflags)
+{
+ struct vm_area_struct * n;
+
+ n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!n)
+ return -EAGAIN;
+ *n = *vma;
+ vma->vm_end = start;
+ n->vm_start = start;
+ n->vm_offset += n->vm_start - vma->vm_start;
+ n->vm_flags = newflags;
+ if (n->vm_inode)
+ n->vm_inode->i_count++;
+ if (n->vm_ops && n->vm_ops->open)
+ n->vm_ops->open(n);
+ insert_vm_struct(current->mm, n);
+ return 0;
+}
+
+static inline int mlock_fixup_middle(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int newflags)
+{
+ struct vm_area_struct * left, * right;
+
+ left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!left)
+ return -EAGAIN;
+ right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!right) {
+ kfree(left);
+ return -EAGAIN;
+ }
+ *left = *vma;
+ *right = *vma;
+ left->vm_end = start;
+ vma->vm_start = start;
+ vma->vm_end = end;
+ right->vm_start = end;
+ vma->vm_offset += vma->vm_start - left->vm_start;
+ right->vm_offset += right->vm_start - left->vm_start;
+ vma->vm_flags = newflags;
+ if (vma->vm_inode)
+ vma->vm_inode->i_count += 2;
+ if (vma->vm_ops && vma->vm_ops->open) {
+ vma->vm_ops->open(left);
+ vma->vm_ops->open(right);
+ }
+ insert_vm_struct(current->mm, left);
+ insert_vm_struct(current->mm, right);
+ return 0;
+}
+
+static int mlock_fixup(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, unsigned int newflags)
+{
+ int pages, retval;
+
+ if (newflags == vma->vm_flags)
+ return 0;
+
+ if (start == vma->vm_start) {
+ if (end == vma->vm_end)
+ retval = mlock_fixup_all(vma, newflags);
+ else
+ retval = mlock_fixup_start(vma, end, newflags);
+ } else {
+ if (end == vma->vm_end)
+ retval = mlock_fixup_end(vma, start, newflags);
+ else
+ retval = mlock_fixup_middle(vma, start, end, newflags);
+ }
+ if (!retval) {
+ /* keep track of amount of locked VM */
+ pages = (end - start) >> PAGE_SHIFT;
+ if (!(newflags & VM_LOCKED))
+ pages = -pages;
+ vma->vm_mm->locked_vm += pages;
+
+ if (newflags & VM_LOCKED)
+ while (start < end) {
+ char c;
+ get_user(c,(char *) start);
+ __asm__ __volatile__("": :"r" (c));
+ start += PAGE_SIZE;
+ }
+ }
+ return retval;
+}
+
+static int do_mlock(unsigned long start, size_t len, int on)
+{
+ unsigned long nstart, end, tmp;
+ struct vm_area_struct * vma, * next;
+ int error;
+
+ if (!suser())
+ return -EPERM;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ vma = find_vma(current->mm, start);
+ if (!vma || vma->vm_start > start)
+ return -ENOMEM;
+
+ for (nstart = start ; ; ) {
+ unsigned int newflags;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!on)
+ newflags &= ~VM_LOCKED;
+
+ if (vma->vm_end >= end) {
+ error = mlock_fixup(vma, nstart, end, newflags);
+ break;
+ }
+
+ tmp = vma->vm_end;
+ next = vma->vm_next;
+ error = mlock_fixup(vma, nstart, tmp, newflags);
+ if (error)
+ break;
+ nstart = tmp;
+ vma = next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ merge_segments(current->mm, start, end);
+ return error;
+}
+
+asmlinkage int sys_mlock(unsigned long start, size_t len)
+{
+ unsigned long locked;
+ unsigned long lock_limit;
+
+ len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
+ start &= PAGE_MASK;
+
+ locked = len >> PAGE_SHIFT;
+ locked += current->mm->locked_vm;
+
+ lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ /* check against resource limits */
+ if (locked > lock_limit)
+ return -ENOMEM;
+
+ /* we may lock at most half of physical memory... */
+ /* (this check is pretty bogus, but doesn't hurt) */
+ if (locked > max_mapnr/2)
+ return -ENOMEM;
+
+ return do_mlock(start, len, 1);
+}
+
+asmlinkage int sys_munlock(unsigned long start, size_t len)
+{
+ len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
+ start &= PAGE_MASK;
+ return do_mlock(start, len, 0);
+}
+
+static int do_mlockall(int flags)
+{
+ int error;
+ unsigned int def_flags;
+ struct vm_area_struct * vma;
+
+ if (!suser())
+ return -EPERM;
+
+ def_flags = 0;
+ if (flags & MCL_FUTURE)
+ def_flags = VM_LOCKED;
+ current->mm->def_flags = def_flags;
+
+ error = 0;
+ for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
+ unsigned int newflags;
+
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!(flags & MCL_CURRENT))
+ newflags &= ~VM_LOCKED;
+ error = mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
+ if (error)
+ break;
+ }
+ merge_segments(current->mm, 0, TASK_SIZE);
+ return error;
+}
+
+asmlinkage int sys_mlockall(int flags)
+{
+ unsigned long lock_limit;
+
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+ return -EINVAL;
+
+ lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ if (current->mm->total_vm > lock_limit)
+ return -ENOMEM;
+
+ /* we may lock at most half of physical memory... */
+ /* (this check is pretty bogus, but doesn't hurt) */
+ if (current->mm->total_vm > max_mapnr/2)
+ return -ENOMEM;
+
+ return do_mlockall(flags);
+}
+
+asmlinkage int sys_munlockall(void)
+{
+ return do_mlockall(0);
+}
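
Each mlock/munlock request in the new mlock.c above is reduced to one of four fixups per vma, depending on how the requested range lines up with the vma's boundaries; locking then touches every page with get_user() so it is faulted in immediately. The range-to-fixup mapping can be exercised on its own; the sketch below is a plain-integer model with made-up addresses, not kernel code:

/* Sketch of how mlock_fixup() above picks a splitting strategy for the
 * [start,end) range against a vma [vm_start,vm_end). */
#include <stdio.h>

static const char *fixup_case(unsigned long vm_start, unsigned long vm_end,
			      unsigned long start, unsigned long end)
{
	if (start == vm_start)
		return end == vm_end ? "fixup_all (flip flags in place)"
		                     : "fixup_start (split off the head)";
	return end == vm_end ? "fixup_end (split off the tail)"
	                     : "fixup_middle (carve the vma into three)";
}

int main(void)
{
	printf("%s\n", fixup_case(0x1000, 0x9000, 0x1000, 0x9000));
	printf("%s\n", fixup_case(0x1000, 0x9000, 0x1000, 0x4000));
	printf("%s\n", fixup_case(0x1000, 0x9000, 0x4000, 0x9000));
	printf("%s\n", fixup_case(0x1000, 0x9000, 0x3000, 0x6000));
	return 0;
}
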
diff --git a/mm/mmap.c b/mm/mmap.c
index 3253a06c0..ac245a17f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -12,13 +12,13 @@
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
-#include <asm/segment.h>
+#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/pgtable.h>
-static int anon_map(struct inode *, struct file *, struct vm_area_struct *);
-
/*
* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
@@ -41,22 +41,126 @@ pgprot_t protection_map[16] = {
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
+/*
+ * Check that a process has enough memory to allocate a
+ * new virtual mapping.
+ */
+static inline int vm_enough_memory(long pages)
+{
+ /*
+ * stupid algorithm to decide if we have enough memory: while
+ * simple, it hopefully works in most obvious cases.. Easy to
+ * fool it, but this should catch most mistakes.
+ */
+ long freepages;
+ freepages = buffermem >> PAGE_SHIFT;
+ freepages += page_cache_size;
+ freepages >>= 1;
+ freepages += nr_free_pages;
+ freepages += nr_swap_pages;
+ freepages -= max_mapnr >> 4;
+ return freepages > pages;
+}
+
+asmlinkage unsigned long sys_brk(unsigned long brk)
+{
+ unsigned long rlim;
+ unsigned long newbrk, oldbrk;
+ struct mm_struct *mm = current->mm;
+
+ if (brk < mm->end_code)
+ return mm->brk;
+ newbrk = PAGE_ALIGN(brk);
+ oldbrk = PAGE_ALIGN(mm->brk);
+ if (oldbrk == newbrk)
+ return mm->brk = brk;
+
+ /*
+ * Always allow shrinking brk
+ */
+ if (brk <= mm->brk) {
+ mm->brk = brk;
+ do_munmap(newbrk, oldbrk-newbrk);
+ return brk;
+ }
+ /*
+ * Check against rlimit and stack..
+ */
+ rlim = current->rlim[RLIMIT_DATA].rlim_cur;
+ if (rlim >= RLIM_INFINITY)
+ rlim = ~0;
+ if (brk - mm->end_code > rlim)
+ return mm->brk;
+
+ /*
+ * Check against existing mmap mappings.
+ */
+ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ return mm->brk;
+
+ /*
+ * Check if we have enough memory..
+ */
+ if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
+ return mm->brk;
+
+ /*
+ * Ok, looks good - let it rip.
+ */
+ if(do_mmap(NULL, oldbrk, newbrk-oldbrk,
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE, 0) != oldbrk)
+ return mm->brk;
+ return mm->brk = brk;
+}
+
+/*
+ * Combine the mmap "prot" and "flags" argument into one "vm_flags" used
+ * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
+ * into "VM_xxx".
+ */
+static inline unsigned long vm_flags(unsigned long prot, unsigned long flags)
+{
+#define _trans(x,bit1,bit2) \
+((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
+
+ unsigned long prot_bits, flag_bits;
+ prot_bits =
+ _trans(prot, PROT_READ, VM_READ) |
+ _trans(prot, PROT_WRITE, VM_WRITE) |
+ _trans(prot, PROT_EXEC, VM_EXEC);
+ flag_bits =
+ _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
+ _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
+ _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
+ return prot_bits | flag_bits;
+#undef _trans
+}
+
unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long off)
{
- int error;
+ struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
if ((len = PAGE_ALIGN(len)) == 0)
return addr;
- if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len)
+ if (len > TASK_SIZE || addr > TASK_SIZE-len)
return -EINVAL;
/* offset overflow? */
if (off + len < off)
return -EINVAL;
+ /* mlock MCL_FUTURE? */
+ if (mm->def_flags & VM_LOCKED) {
+ unsigned long locked = mm->locked_vm << PAGE_SHIFT;
+ locked += len;
+ if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
+ return -EAGAIN;
+ }
+
/*
* do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
@@ -68,6 +172,11 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
case MAP_SHARED:
if ((prot & PROT_WRITE) && !(file->f_mode & 2))
return -EACCES;
+ /*
+ * make sure there are no mandatory locks on the file.
+ */
+ if (locks_verify_locked(file->f_inode))
+ return -EAGAIN;
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & 1))
@@ -77,8 +186,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
default:
return -EINVAL;
}
- if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0))
- return -ETXTBSY;
+ if (flags & MAP_DENYWRITE) {
+ if (file->f_inode->i_writecount > 0)
+ return -ETXTBSY;
+ }
} else if ((flags & MAP_TYPE) != MAP_PRIVATE)
return -EINVAL;
@@ -90,8 +201,6 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if (flags & MAP_FIXED) {
if (addr & ~PAGE_MASK)
return -EINVAL;
- if (len > TASK_SIZE || addr > TASK_SIZE - len)
- return -EINVAL;
} else {
addr = get_unmapped_area(addr, len);
if (!addr)
@@ -111,11 +220,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if (!vma)
return -ENOMEM;
- vma->vm_task = current;
+ vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC);
- vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE);
+ vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
if (file) {
if (file->f_mode & 1)
@@ -145,17 +253,48 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
do_munmap(addr, len); /* Clear old maps */
- if (file)
- error = file->f_op->mmap(file->f_inode, file, vma);
- else
- error = anon_map(NULL, NULL, vma);
-
- if (error) {
+ /* Check against address space limit. */
+ if ((mm->total_vm << PAGE_SHIFT) + len
+ > current->rlim[RLIMIT_AS].rlim_cur) {
kfree(vma);
- return error;
+ return -ENOMEM;
+ }
+
+ /* Private writable mapping? Check memory availability.. */
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE) {
+ if (!(flags & MAP_NORESERVE) &&
+ !vm_enough_memory(len >> PAGE_SHIFT)) {
+ kfree(vma);
+ return -ENOMEM;
+ }
+ }
+
+ if (file) {
+ int error = file->f_op->mmap(file->f_inode, file, vma);
+
+ if (error) {
+ kfree(vma);
+ return error;
+ }
+ }
+
+ flags = vma->vm_flags;
+ insert_vm_struct(mm, vma);
+ merge_segments(mm, vma->vm_start, vma->vm_end);
+
+ /* merge_segments might have merged our vma, so we can't use it any more */
+ mm->total_vm += len >> PAGE_SHIFT;
+ if (flags & VM_LOCKED) {
+ unsigned long start = addr;
+ mm->locked_vm += len >> PAGE_SHIFT;
+ do {
+ char c;
+ get_user(c,(char *) start);
+ len -= PAGE_SIZE;
+ start += PAGE_SIZE;
+ __asm__ __volatile__("": :"r" (c));
+ } while (len > 0);
}
- insert_vm_struct(current, vma);
- merge_segments(current, vma->vm_start, vma->vm_end);
return addr;
}
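
Before committing a mapping, the do_mmap() changes above check the RLIMIT_AS address-space limit and, for private writable mappings, the vm_enough_memory() heuristic added earlier in this file. The standalone model below mirrors that admission logic; all names, page counts and limits are invented for the example:

/* Userspace model of the admission checks do_mmap() performs above. */
#include <stdio.h>

#define PAGE_SHIFT 12

static int enough_memory(long buffer_pages, long cache_pages,
			 long free_pages, long swap_pages,
			 long total_ram_pages, long want)
{
	long freepages = (buffer_pages + cache_pages) >> 1; /* trust half the caches */
	freepages += free_pages + swap_pages;
	freepages -= total_ram_pages >> 4;                  /* keep a reserve */
	return freepages > want;
}

static int may_map(unsigned long total_vm_bytes, unsigned long len,
		   unsigned long rlimit_as, int private_writable)
{
	if (total_vm_bytes + len > rlimit_as)
		return 0;                                   /* -ENOMEM in the real code */
	if (private_writable &&
	    !enough_memory(2000, 4000, 1000, 8000, 16384, len >> PAGE_SHIFT))
		return 0;
	return 1;
}

int main(void)
{
	printf("small map ok: %d\n", may_map(1UL << 20, 64UL << 10, 1UL << 30, 1));
	printf("huge map ok : %d\n", may_map(1UL << 20, 1UL << 31, 1UL << 30, 1));
	return 0;
}
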
@@ -174,41 +313,16 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
addr = TASK_SIZE / 3;
addr = PAGE_ALIGN(addr);
- for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) {
+ for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
+ /* At this point: (!vmm || addr < vmm->vm_end). */
if (TASK_SIZE - len < addr)
return 0;
- if (!vmm)
+ if (!vmm || addr + len <= vmm->vm_start)
return addr;
- if (addr > vmm->vm_end)
- continue;
- if (addr + len > vmm->vm_start) {
- addr = vmm->vm_end;
- continue;
- }
- return addr;
+ addr = vmm->vm_end;
}
}
-asmlinkage int sys_mmap(unsigned long *buffer)
-{
- int error;
- unsigned long flags;
- struct file * file = NULL;
-
- error = verify_area(VERIFY_READ, buffer, 6*sizeof(long));
- if (error)
- return error;
- flags = get_fs_long(buffer+3);
- if (!(flags & MAP_ANONYMOUS)) {
- unsigned long fd = get_fs_long(buffer+4);
- if (fd >= NR_OPEN || !(file = current->files->fd[fd]))
- return -EBADF;
- }
- return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1),
- get_fs_long(buffer+2), flags, get_fs_long(buffer+5));
-}
-
-
/*
* Searching a VMA in the linear list task->mm->mmap is horribly slow.
* Use an AVL (Adelson-Velskii and Landis) tree to speed up this search
@@ -230,7 +344,6 @@ asmlinkage int sys_mmap(unsigned long *buffer)
* vm_avl_height 1+max(heightof(left),heightof(right))
* The empty tree is represented as NULL.
*/
-#define avl_empty (struct vm_area_struct *) NULL
/* Since the trees are balanced, their height will never be large. */
#define avl_maxheight 41 /* why this? a small exercise */
@@ -243,60 +356,8 @@ asmlinkage int sys_mmap(unsigned long *buffer)
* foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key.
*/
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma (struct task_struct * task, unsigned long addr)
-{
-#if 0 /* equivalent, but slow */
- struct vm_area_struct * vma;
-
- for (vma = task->mm->mmap ; ; vma = vma->vm_next) {
- if (!vma)
- return NULL;
- if (vma->vm_end > addr)
- return vma;
- }
-#else
- struct vm_area_struct * result = NULL;
- struct vm_area_struct * tree;
-
- for (tree = task->mm->mmap_avl ; ; ) {
- if (tree == avl_empty)
- return result;
- if (tree->vm_end > addr) {
- if (tree->vm_start <= addr)
- return tree;
- result = tree;
- tree = tree->vm_avl_left;
- } else
- tree = tree->vm_avl_right;
- }
-#endif
-}
-
-/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
- NULL if none. Assume start_addr < end_addr. */
-struct vm_area_struct * find_vma_intersection (struct task_struct * task, unsigned long start_addr, unsigned long end_addr)
-{
- struct vm_area_struct * vma;
-
-#if 0 /* equivalent, but slow */
- for (vma = task->mm->mmap; vma; vma = vma->vm_next) {
- if (end_addr <= vma->vm_start)
- break;
- if (start_addr < vma->vm_end)
- return vma;
- }
- return NULL;
-#else
- vma = find_vma(task,start_addr);
- if (!vma || end_addr <= vma->vm_start)
- return NULL;
- return vma;
-#endif
-}
-
/* Look up the nodes at the left and at the right of a given node. */
-static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
+static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
{
vm_avl_key_t key = node->vm_avl_key;
@@ -342,7 +403,7 @@ static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct
* nodes[0]..nodes[k-1] such that
* nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}.
*/
-static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
+static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
{
for ( ; count > 0 ; count--) {
struct vm_area_struct ** nodeplace = *--nodeplaces_ptr;
@@ -419,7 +480,7 @@ static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
}
/* Insert a node into a tree. */
-static void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
+static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree)
{
vm_avl_key_t key = new_node->vm_avl_key;
struct vm_area_struct ** nodeplace = ptree;
@@ -446,7 +507,7 @@ static void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct
/* Insert a node into a tree, and
* return the node to the left of it and the node to the right of it.
*/
-static void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
+static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,
struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
{
vm_avl_key_t key = new_node->vm_avl_key;
@@ -476,7 +537,7 @@ static void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_a
}
/* Removes a node out of a tree. */
-static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
+static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
{
vm_avl_key_t key = node_to_delete->vm_avl_key;
struct vm_area_struct ** nodeplace = ptree;
@@ -652,7 +713,7 @@ static void avl_check (struct task_struct * task, char *caller)
* Case 4 involves the creation of 2 new areas, for each side of
* the hole.
*/
-void unmap_fixup(struct vm_area_struct *area,
+static void unmap_fixup(struct vm_area_struct *area,
unsigned long addr, size_t len)
{
struct vm_area_struct *mpnt;
@@ -666,6 +727,9 @@ void unmap_fixup(struct vm_area_struct *area,
area->vm_start, area->vm_end, addr, end);
return;
}
+ area->vm_mm->total_vm -= len >> PAGE_SHIFT;
+ if (area->vm_flags & VM_LOCKED)
+ area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
/* Unmapping the whole area */
if (addr == area->vm_start && end == area->vm_end) {
@@ -699,7 +763,7 @@ void unmap_fixup(struct vm_area_struct *area,
if (mpnt->vm_ops && mpnt->vm_ops->open)
mpnt->vm_ops->open(mpnt);
area->vm_end = addr; /* Truncate area */
- insert_vm_struct(current, mpnt);
+ insert_vm_struct(current->mm, mpnt);
}
/* construct whatever mapping is needed */
@@ -713,7 +777,7 @@ void unmap_fixup(struct vm_area_struct *area,
area->vm_end = area->vm_start;
area->vm_ops->close(area);
}
- insert_vm_struct(current, mpnt);
+ insert_vm_struct(current->mm, mpnt);
}
asmlinkage int sys_munmap(unsigned long addr, size_t len)
@@ -743,7 +807,7 @@ int do_munmap(unsigned long addr, size_t len)
* every area affected in some way (by any overlap) is put
* on the list. If nothing is put on, nothing is affected.
*/
- mpnt = find_vma(current, addr);
+ mpnt = find_vma(current->mm, addr);
if (!mpnt)
return 0;
avl_neighbours(mpnt, current->mm->mmap_avl, &prev, &next);
@@ -768,7 +832,7 @@ int do_munmap(unsigned long addr, size_t len)
* If the one of the segments is only being partially unmapped,
* it will put new vm_area_struct(s) into the address space.
*/
- while (free) {
+ do {
unsigned long st, end;
mpnt = free;
@@ -782,38 +846,47 @@ int do_munmap(unsigned long addr, size_t len)
if (mpnt->vm_ops && mpnt->vm_ops->unmap)
mpnt->vm_ops->unmap(mpnt, st, end-st);
-
+ zap_page_range(current->mm, st, end-st);
unmap_fixup(mpnt, st, end-st);
kfree(mpnt);
- }
+ } while (free);
+
+ /* we could zap the page tables here too.. */
- unmap_page_range(addr, len);
return 0;
}
/* Build the AVL tree corresponding to the VMA list. */
-void build_mmap_avl(struct task_struct * task)
+void build_mmap_avl(struct mm_struct * mm)
{
struct vm_area_struct * vma;
- task->mm->mmap_avl = NULL;
- for (vma = task->mm->mmap; vma; vma = vma->vm_next)
- avl_insert(vma, &task->mm->mmap_avl);
+ mm->mmap_avl = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ avl_insert(vma, &mm->mmap_avl);
}
/* Release all mmaps. */
-void exit_mmap(struct task_struct * task)
+void exit_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt;
- mpnt = task->mm->mmap;
- task->mm->mmap = NULL;
- task->mm->mmap_avl = NULL;
+ mpnt = mm->mmap;
+ mm->mmap = NULL;
+ mm->mmap_avl = NULL;
+ mm->rss = 0;
+ mm->total_vm = 0;
+ mm->locked_vm = 0;
while (mpnt) {
struct vm_area_struct * next = mpnt->vm_next;
- if (mpnt->vm_ops && mpnt->vm_ops->close)
- mpnt->vm_ops->close(mpnt);
+ if (mpnt->vm_ops) {
+ if (mpnt->vm_ops->unmap)
+ mpnt->vm_ops->unmap(mpnt, mpnt->vm_start, mpnt->vm_end-mpnt->vm_start);
+ if (mpnt->vm_ops->close)
+ mpnt->vm_ops->close(mpnt);
+ }
remove_shared_vm_struct(mpnt);
+ zap_page_range(mm, mpnt->vm_start, mpnt->vm_end-mpnt->vm_start);
if (mpnt->vm_inode)
iput(mpnt->vm_inode);
kfree(mpnt);
@@ -825,7 +898,7 @@ void exit_mmap(struct task_struct * task)
* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring.
*/
-void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp)
+void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
{
struct vm_area_struct *share;
struct inode * inode;
@@ -833,7 +906,7 @@ void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp)
#if 0 /* equivalent, but slow */
struct vm_area_struct **p, *mpnt;
- p = &t->mm->mmap;
+ p = &mm->mmap;
while ((mpnt = *p) != NULL) {
if (mpnt->vm_start > vmp->vm_start)
break;
@@ -846,13 +919,13 @@ void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp)
#else
struct vm_area_struct * prev, * next;
- avl_insert_neighbours(vmp, &t->mm->mmap_avl, &prev, &next);
- if ((prev ? prev->vm_next : t->mm->mmap) != next)
+ avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
+ if ((prev ? prev->vm_next : mm->mmap) != next)
printk("insert_vm_struct: tree inconsistent with list\n");
if (prev)
prev->vm_next = vmp;
else
- t->mm->mmap = vmp;
+ mm->mmap = vmp;
vmp->vm_next = next;
#endif
@@ -901,14 +974,16 @@ void remove_shared_vm_struct(struct vm_area_struct *mpnt)
* We don't need to traverse the entire list, only those segments
* which intersect or are adjacent to a given interval.
*/
-void merge_segments (struct task_struct * task, unsigned long start_addr, unsigned long end_addr)
+void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
struct vm_area_struct *prev, *mpnt, *next;
- mpnt = find_vma(task, start_addr);
+ down(&mm->mmap_sem);
+ mpnt = find_vma(mm, start_addr);
if (!mpnt)
- return;
- avl_neighbours(mpnt, task->mm->mmap_avl, &prev, &next);
+ goto no_vma;
+
+ avl_neighbours(mpnt, mm->mmap_avl, &prev, &next);
/* we have prev->vm_next == mpnt && mpnt->vm_next = next */
if (!prev) {
@@ -952,7 +1027,7 @@ void merge_segments (struct task_struct * task, unsigned long start_addr, unsign
* big segment can possibly merge with the next one.
* The old unused mpnt is freed.
*/
- avl_remove(mpnt, &task->mm->mmap_avl);
+ avl_remove(mpnt, &mm->mmap_avl);
prev->vm_end = mpnt->vm_end;
prev->vm_next = mpnt->vm_next;
if (mpnt->vm_ops && mpnt->vm_ops->close) {
@@ -966,15 +1041,6 @@ void merge_segments (struct task_struct * task, unsigned long start_addr, unsign
kfree_s(mpnt, sizeof(*mpnt));
mpnt = prev;
}
-}
-
-/*
- * Map memory not associated with any file into a process
- * address space. Adjacent memory is merged.
- */
-static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma)
-{
- if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
- return -ENOMEM;
- return 0;
+no_vma:
+ up(&mm->mmap_sem);
}
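
The visible theme of the mm/mmap.c changes is that the VMA helpers (insert_vm_struct, merge_segments, build_mmap_avl, exit_mmap) now operate on a struct mm_struct instead of a struct task_struct, so callers pass current->mm. find_vma() and find_vma_intersection() are deleted from this file but are still called throughout the patch with the same semantics, taking the mm directly: find_vma() returns the first VMA with addr < vm_end. A minimal sketch of a caller under the new interface (the helper name is hypothetical; only find_vma() and the vm_start/vm_end fields come from the kernel):

	/* Return non-zero if addr lies inside a mapped region of this mm. */
	static int addr_is_mapped(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma = find_vma(mm, addr);

		return vma && vma->vm_start <= addr;
	}
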
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ecf73730c..5aa7794a4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -13,7 +13,7 @@
#include <linux/string.h>
#include <linux/malloc.h>
-#include <asm/segment.h>
+#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/pgtable.h>
@@ -38,7 +38,7 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address,
do {
pte_t entry = *pte;
if (pte_present(entry))
- *pte = pte_modify(entry, newprot);
+ set_pte(pte, pte_modify(entry, newprot));
address += PAGE_SIZE;
pte++;
} while (address < end);
@@ -72,14 +72,16 @@ static inline void change_pmd_range(pgd_t * pgd, unsigned long address,
static void change_protection(unsigned long start, unsigned long end, pgprot_t newprot)
{
pgd_t *dir;
+ unsigned long beg = start;
- dir = pgd_offset(current, start);
+ dir = pgd_offset(current->mm, start);
+ flush_cache_range(current->mm, beg, end);
while (start < end) {
change_pmd_range(dir, start, end - start, newprot);
start = (start + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_range(current->mm, beg, end);
return;
}
@@ -110,7 +112,7 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
n->vm_inode->i_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- insert_vm_struct(current, n);
+ insert_vm_struct(current->mm, n);
return 0;
}
@@ -133,7 +135,7 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
n->vm_inode->i_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- insert_vm_struct(current, n);
+ insert_vm_struct(current->mm, n);
return 0;
}
@@ -167,8 +169,8 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- insert_vm_struct(current, left);
- insert_vm_struct(current, right);
+ insert_vm_struct(current->mm, left);
+ insert_vm_struct(current->mm, right);
return 0;
}
@@ -214,7 +216,7 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current, start);
+ vma = find_vma(current->mm, start);
if (!vma || vma->vm_start > start)
return -EFAULT;
@@ -246,6 +248,6 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
break;
}
}
- merge_segments(current, start, end);
+ merge_segments(current->mm, start, end);
return error;
}
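
At the system-call level, the checks visible in sys_mprotect() above require a page-aligned start and an address that actually falls inside a mapping (otherwise -EFAULT). A minimal userspace sketch of that contract (ordinary libc usage; nothing here is specific to this patch):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/mman.h>

	int main(void)
	{
		long pg = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		/* Drop write permission on the first page only; the second page
		 * keeps its old protection, which is why the kernel may have to
		 * split the vma (mprotect_fixup_start/_end/_middle above). */
		if (mprotect(p, pg, PROT_READ) != 0)
			perror("mprotect");
		return 0;
	}
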
diff --git a/mm/mremap.c b/mm/mremap.c
new file mode 100644
index 000000000..a3e941055
--- /dev/null
+++ b/mm/mremap.c
@@ -0,0 +1,224 @@
+/*
+ * linux/mm/mremap.c
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ */
+
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/errno.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <linux/malloc.h>
+#include <linux/swap.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+
+static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t * pgd;
+ pmd_t * pmd;
+ pte_t * pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ goto end;
+ if (pgd_bad(*pgd)) {
+ printk("move_one_page: bad source pgd (%08lx)\n", pgd_val(*pgd));
+ pgd_clear(pgd);
+ goto end;
+ }
+
+ pmd = pmd_offset(pgd, addr);
+ if (pmd_none(*pmd))
+ goto end;
+ if (pmd_bad(*pmd)) {
+ printk("move_one_page: bad source pmd (%08lx)\n", pmd_val(*pmd));
+ pmd_clear(pmd);
+ goto end;
+ }
+
+ pte = pte_offset(pmd, addr);
+ if (pte_none(*pte))
+ pte = NULL;
+end:
+ return pte;
+}
+
+static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
+{
+ pmd_t * pmd;
+ pte_t * pte = NULL;
+
+ pmd = pmd_alloc(pgd_offset(mm, addr), addr);
+ if (pmd)
+ pte = pte_alloc(pmd, addr);
+ return pte;
+}
+
+static inline int copy_one_pte(pte_t * src, pte_t * dst)
+{
+ int error = 0;
+ pte_t pte = *src;
+
+ if (!pte_none(pte)) {
+ error++;
+ if (dst) {
+ pte_clear(src);
+ set_pte(dst, pte);
+ error--;
+ }
+ }
+ return error;
+}
+
+static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr)
+{
+ int error = 0;
+ pte_t * src;
+
+ src = get_one_pte(mm, old_addr);
+ if (src)
+ error = copy_one_pte(src, alloc_one_pte(mm, new_addr));
+ return error;
+}
+
+static int move_page_tables(struct mm_struct * mm,
+ unsigned long new_addr, unsigned long old_addr, unsigned long len)
+{
+ unsigned long offset = len;
+
+ flush_cache_range(mm, old_addr, old_addr + len);
+ flush_tlb_range(mm, old_addr, old_addr + len);
+
+ /*
+ * This is not the clever way to do this, but we're taking the
+ * easy way out on the assumption that most remappings will be
+ * only a few pages.. This also makes error recovery easier.
+ */
+ while (offset) {
+ offset -= PAGE_SIZE;
+ if (move_one_page(mm, old_addr + offset, new_addr + offset))
+ goto oops_we_failed;
+ }
+ return 0;
+
+ /*
+ * Ok, the move failed because we didn't have enough pages for
+ * the new page table tree. This is unlikely, but we have to
+ * take the possibility into account. In that case we just move
+ * all the pages back (this will work, because we still have
+ * the old page tables)
+ */
+oops_we_failed:
+ flush_cache_range(mm, new_addr, new_addr + len);
+ while ((offset += PAGE_SIZE) < len)
+ move_one_page(mm, new_addr + offset, old_addr + offset);
+ flush_tlb_range(mm, new_addr, new_addr + len);
+ zap_page_range(mm, new_addr, new_addr + len);
+ return -1;
+}
+
+static inline unsigned long move_vma(struct vm_area_struct * vma,
+ unsigned long addr, unsigned long old_len, unsigned long new_len)
+{
+ struct vm_area_struct * new_vma;
+
+ new_vma = (struct vm_area_struct *)
+ kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (new_vma) {
+ unsigned long new_addr = get_unmapped_area(addr, new_len);
+
+ if (new_addr && !move_page_tables(current->mm, new_addr, addr, old_len)) {
+ *new_vma = *vma;
+ new_vma->vm_start = new_addr;
+ new_vma->vm_end = new_addr+new_len;
+ new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start);
+ if (new_vma->vm_inode)
+ new_vma->vm_inode->i_count++;
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ insert_vm_struct(current->mm, new_vma);
+ merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
+ do_munmap(addr, old_len);
+ current->mm->total_vm += new_len >> PAGE_SHIFT;
+ return new_addr;
+ }
+ kfree(new_vma);
+ }
+ return -ENOMEM;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ */
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+ unsigned long old_len, unsigned long new_len,
+ unsigned long flags)
+{
+ struct vm_area_struct *vma;
+
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
+
+ /*
+ * Always allow a shrinking remap: that just unmaps
+ * the unnecessary pages..
+ */
+ if (old_len > new_len) {
+ do_munmap(addr+new_len, old_len - new_len);
+ return addr;
+ }
+
+ /*
+ * Ok, we need to grow..
+ */
+ vma = find_vma(current->mm, addr);
+ if (!vma || vma->vm_start > addr)
+ return -EFAULT;
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ return -EFAULT;
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked = current->mm->locked_vm << PAGE_SHIFT;
+ locked += new_len - old_len;
+ if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
+ return -EAGAIN;
+ }
+ if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
+ > current->rlim[RLIMIT_AS].rlim_cur)
+ return -ENOMEM;
+
+ /* old_len exactly to the end of the area.. */
+ if (old_len == vma->vm_end - addr &&
+ (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
+ unsigned long max_addr = TASK_SIZE;
+ if (vma->vm_next)
+ max_addr = vma->vm_next->vm_start;
+ /* can we just expand the current mapping? */
+ if (max_addr - addr >= new_len) {
+ int pages = (new_len - old_len) >> PAGE_SHIFT;
+ vma->vm_end = addr + new_len;
+ current->mm->total_vm += pages;
+ if (vma->vm_flags & VM_LOCKED)
+ current->mm->locked_vm += pages;
+ return addr;
+ }
+ }
+
+ /*
+ * We weren't able to just expand or shrink the area,
+ * we need to create a new one and move it..
+ */
+ if (flags & MREMAP_MAYMOVE)
+ return move_vma(vma, addr, old_len, new_len);
+ return -ENOMEM;
+}
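
sys_mremap() shrinks in place, grows in place when the gap to the next vma is big enough, and otherwise moves the mapping only if MREMAP_MAYMOVE was passed. A userspace sketch of that behaviour, assuming the libc mremap() wrapper is available (the raw system call takes the same four arguments):

	#define _GNU_SOURCE
	#include <unistd.h>
	#include <sys/mman.h>

	int main(void)
	{
		long pg = sysconf(_SC_PAGESIZE);
		void *p = mmap(NULL, pg, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		void *q;

		if (p == MAP_FAILED)
			return 1;
		/* Grow from one page to four; without MREMAP_MAYMOVE this
		 * fails with ENOMEM if the pages after p are already taken. */
		q = mremap(p, pg, 4 * pg, MREMAP_MAYMOVE);
		return q == MAP_FAILED;
	}
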
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
new file mode 100644
index 000000000..09373b3c9
--- /dev/null
+++ b/mm/page_alloc.c
@@ -0,0 +1,339 @@
+/*
+ * linux/mm/page_alloc.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/swapctl.h>
+#include <linux/interrupt.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+int nr_swap_pages = 0;
+int nr_free_pages = 0;
+
+/*
+ * Free area management
+ *
+ * The free_area_list arrays point to the queue heads of the free areas
+ * of different sizes
+ */
+
+#define NR_MEM_LISTS 6
+
+/* The start of this MUST match the start of "struct page" */
+struct free_area_struct {
+ struct page *next;
+ struct page *prev;
+ unsigned int * map;
+};
+
+#define memory_head(x) ((struct page *)(x))
+
+static struct free_area_struct free_area[NR_MEM_LISTS];
+
+static inline void init_mem_queue(struct free_area_struct * head)
+{
+ head->next = memory_head(head);
+ head->prev = memory_head(head);
+}
+
+static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
+{
+ struct page * next = head->next;
+
+ entry->prev = memory_head(head);
+ entry->next = next;
+ next->prev = entry;
+ head->next = entry;
+}
+
+static inline void remove_mem_queue(struct page * entry)
+{
+ struct page * next = entry->next;
+ struct page * prev = entry->prev;
+ next->prev = prev;
+ prev->next = next;
+}
+
+/*
+ * Free_page() adds the page to the free lists. This is optimized for
+ * fast normal cases (no error jumps taken normally).
+ *
+ * The way to optimize jumps for gcc-2.2.2 is to:
+ * - select the "normal" case and put it inside the if () { XXX }
+ * - no else-statements if you can avoid them
+ *
+ * With the above two rules, you get a straight-line execution path
+ * for the normal case, giving better asm-code.
+ *
+ * free_page() may sleep since the page being freed may be a buffer
+ * page or present in the swap cache. It will not sleep, however,
+ * for a freshly allocated page (get_free_page()).
+ */
+
+/*
+ * Buddy system. Hairy. You really aren't expected to understand this
+ *
+ * Hint: -mask = 1+~mask
+ */
+static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
+{
+ struct free_area_struct *area = free_area + order;
+ unsigned long index = map_nr >> (1 + order);
+ unsigned long mask = (~0UL) << order;
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+
+#define list(x) (mem_map+(x))
+
+ map_nr &= mask;
+ nr_free_pages -= mask;
+ while (mask + (1 << (NR_MEM_LISTS-1))) {
+ if (!change_bit(index, area->map))
+ break;
+ remove_mem_queue(list(map_nr ^ -mask));
+ mask <<= 1;
+ area++;
+ index >>= 1;
+ map_nr &= mask;
+ }
+ add_mem_queue(area, list(map_nr));
+
+#undef list
+
+ restore_flags(flags);
+}
+
+void __free_page(struct page *page)
+{
+ if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
+ unsigned long map_nr = page->map_nr;
+ delete_from_swap_cache(map_nr);
+ free_pages_ok(map_nr, 0);
+ }
+}
+
+void free_pages(unsigned long addr, unsigned long order)
+{
+ unsigned long map_nr = MAP_NR(addr);
+
+ if (map_nr < max_mapnr) {
+ mem_map_t * map = mem_map + map_nr;
+ if (PageReserved(map))
+ return;
+ if (atomic_dec_and_test(&map->count)) {
+ delete_from_swap_cache(map_nr);
+ free_pages_ok(map_nr, order);
+ return;
+ }
+ }
+}
+
+/*
+ * Some ugly macros to speed up __get_free_pages()..
+ */
+#define MARK_USED(index, order, area) \
+ change_bit((index) >> (1+(order)), (area)->map)
+#define CAN_DMA(x) (PageDMA(x))
+#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
+#define RMQUEUE(order, dma) \
+do { struct free_area_struct * area = free_area+order; \
+ unsigned long new_order = order; \
+ do { struct page *prev = memory_head(area), *ret; \
+ while (memory_head(area) != (ret = prev->next)) { \
+ if (!dma || CAN_DMA(ret)) { \
+ unsigned long map_nr = ret->map_nr; \
+ (prev->next = ret->next)->prev = prev; \
+ MARK_USED(map_nr, new_order, area); \
+ nr_free_pages -= 1 << order; \
+ EXPAND(ret, map_nr, order, new_order, area); \
+ restore_flags(flags); \
+ return ADDRESS(map_nr); \
+ } \
+ prev = ret; \
+ } \
+ new_order++; area++; \
+ } while (new_order < NR_MEM_LISTS); \
+} while (0)
+
+#define EXPAND(map,index,low,high,area) \
+do { unsigned long size = 1 << high; \
+ while (high > low) { \
+ area--; high--; size >>= 1; \
+ add_mem_queue(area, map); \
+ MARK_USED(index, high, area); \
+ index += size; \
+ map += size; \
+ } \
+ map->count = 1; \
+ map->age = PAGE_INITIAL_AGE; \
+} while (0)
+
+unsigned long __get_free_pages(int priority, unsigned long order, int dma)
+{
+ unsigned long flags;
+ int reserved_pages;
+
+ if (order >= NR_MEM_LISTS)
+ return 0;
+ if (intr_count && priority != GFP_ATOMIC) {
+ static int count = 0;
+ if (++count < 5) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ return_address());
+ priority = GFP_ATOMIC;
+ }
+ }
+ reserved_pages = 5;
+ if (priority != GFP_NFS)
+ reserved_pages = min_free_pages;
+ save_flags(flags);
+repeat:
+ cli();
+ if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
+ RMQUEUE(order, dma);
+ restore_flags(flags);
+ return 0;
+ }
+ restore_flags(flags);
+ if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1))
+ goto repeat;
+ return 0;
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ */
+void show_free_areas(void)
+{
+ unsigned long order, flags;
+ unsigned long total = 0;
+
+ printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
+ save_flags(flags);
+ cli();
+ for (order=0 ; order < NR_MEM_LISTS; order++) {
+ struct page * tmp;
+ unsigned long nr = 0;
+ for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
+ nr ++;
+ }
+ total += nr * ((PAGE_SIZE>>10) << order);
+ printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
+ }
+ restore_flags(flags);
+ printk("= %lukB)\n", total);
+#ifdef SWAP_CACHE_INFO
+ show_swap_cache_info();
+#endif
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+/*
+ * set up the free-area data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ */
+unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
+{
+ mem_map_t * p;
+ unsigned long mask = PAGE_MASK;
+ int i;
+
+ /*
+ * select nr of pages we try to keep free for important stuff
+ * with a minimum of 16 pages. This is totally arbitrary
+ */
+ i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
+ if (i < 16)
+ i = 16;
+ min_free_pages = i;
+ free_pages_low = i + (i>>1);
+ free_pages_high = i + i;
+ start_mem = init_swap_cache(start_mem, end_mem);
+ mem_map = (mem_map_t *) start_mem;
+ p = mem_map + MAP_NR(end_mem);
+ start_mem = LONG_ALIGN((unsigned long) p);
+ memset(mem_map, 0, start_mem - (unsigned long) mem_map);
+ do {
+ --p;
+ p->flags = (1 << PG_DMA) | (1 << PG_reserved);
+ p->map_nr = p - mem_map;
+ } while (p > mem_map);
+
+ for (i = 0 ; i < NR_MEM_LISTS ; i++) {
+ unsigned long bitmap_size;
+ init_mem_queue(free_area+i);
+ mask += mask;
+ end_mem = (end_mem + ~mask) & mask;
+ bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
+ bitmap_size = (bitmap_size + 7) >> 3;
+ bitmap_size = LONG_ALIGN(bitmap_size);
+ free_area[i].map = (unsigned int *) start_mem;
+ memset((void *) start_mem, 0, bitmap_size);
+ start_mem += bitmap_size;
+ }
+ return start_mem;
+}
+
+/*
+ * The tests may look silly, but they essentially make sure that
+ * no other process did a swap-in on us just as we were waiting.
+ *
+ * Also, don't bother to add to the swap cache if this page-in
+ * was due to a write access.
+ */
+void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
+ pte_t * page_table, unsigned long entry, int write_access)
+{
+ unsigned long page = __get_free_page(GFP_KERNEL);
+
+ if (pte_val(*page_table) != entry) {
+ free_page(page);
+ return;
+ }
+ if (!page) {
+ set_pte(page_table, BAD_PAGE);
+ swap_free(entry);
+ oom(tsk);
+ return;
+ }
+ read_swap_page(entry, (char *) page);
+ if (pte_val(*page_table) != entry) {
+ free_page(page);
+ return;
+ }
+ vma->vm_mm->rss++;
+ tsk->maj_flt++;
+ if (!write_access && add_to_swap_cache(MAP_NR(page), entry)) {
+ /* keep swap page allocated for the moment (swap cache) */
+ set_pte(page_table, mk_pte(page, vma->vm_page_prot));
+ return;
+ }
+ set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
+ swap_free(entry);
+ return;
+}
+
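
Two small illustrations of the arithmetic above. First, the sizing in free_area_init(): assuming 4 kB pages, a machine with 16 MB of usable memory gives i = 16 MB >> (PAGE_SHIFT + 7) = 32, hence min_free_pages = 32, free_pages_low = 48 and free_pages_high = 64; below roughly 8 MB the 16-page floor takes over. Second, the index/mask arithmetic that drives the buddy coalescing in free_pages_ok() (note the hint -mask = 1+~mask); the following stands alone and uses an arbitrary frame number purely for illustration:

	#include <stdio.h>

	int main(void)
	{
		unsigned long map_nr = 13;		/* frame being freed */
		unsigned long order;

		for (order = 0; order < 6; order++) {	/* NR_MEM_LISTS == 6 */
			unsigned long mask = (~0UL) << order;
			unsigned long base = map_nr & mask;

			printf("order %lu: block %lu, buddy %lu, bitmap index %lu\n",
			       order, base, base ^ -mask, map_nr >> (1 + order));
		}
		return 0;
	}
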
diff --git a/mm/page_io.c b/mm/page_io.c
new file mode 100644
index 000000000..9980c52b7
--- /dev/null
+++ b/mm/page_io.c
@@ -0,0 +1,193 @@
+/*
+ * linux/mm/page_io.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95,
+ * Asynchronous swapping added 30.12.95. Stephen Tweedie
+ * Removed race in async swapping. 14.4.1996. Bruno Haible
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/locks.h>
+#include <linux/swapctl.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+static struct wait_queue * lock_queue = NULL;
+
+/*
+ * Reads or writes a swap page.
+ * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
+ *
+ * Important prevention of race condition: The first thing we do is set a lock
+ * on this swap page, which lasts until I/O completes. This way a
+ * write_swap_page(entry) immediately followed by a read_swap_page(entry)
+ * on the same entry will first complete the write_swap_page(). Fortunately,
+ * not more than one write_swap_page() request can be pending per entry, so
+ * the only races the caller must catch are multiple read_swap_page() requests
+ * on the same entry.
+ */
+void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
+{
+ unsigned long type, offset;
+ struct swap_info_struct * p;
+ struct page *page;
+
+ type = SWP_TYPE(entry);
+ if (type >= nr_swapfiles) {
+ printk("Internal error: bad swap-device\n");
+ return;
+ }
+ p = &swap_info[type];
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max) {
+ printk("rw_swap_page: weirdness\n");
+ return;
+ }
+ if (p->swap_map && !p->swap_map[offset]) {
+ printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
+ return;
+ }
+ if (!(p->flags & SWP_USED)) {
+ printk("Trying to swap to unused swap-device\n");
+ return;
+ }
+ /* Make sure we are the only process doing I/O with this swap page. */
+ while (set_bit(offset,p->swap_lockmap)) {
+ run_task_queue(&tq_disk);
+ sleep_on(&lock_queue);
+ }
+ if (rw == READ)
+ kstat.pswpin++;
+ else
+ kstat.pswpout++;
+ page = mem_map + MAP_NR(buf);
+ atomic_inc(&page->count);
+ wait_on_page(page);
+ if (p->swap_device) {
+ if (!wait) {
+ set_bit(PG_free_after, &page->flags);
+ set_bit(PG_decr_after, &page->flags);
+ set_bit(PG_swap_unlock_after, &page->flags);
+ page->swap_unlock_entry = entry;
+ atomic_inc(&nr_async_pages);
+ }
+ ll_rw_page(rw,p->swap_device,offset,buf);
+ /*
+ * NOTE! We don't decrement the page count if we
+ * don't wait - that will happen asynchronously
+ * when the IO completes.
+ */
+ if (!wait)
+ return;
+ wait_on_page(page);
+ } else if (p->swap_file) {
+ struct inode *swapf = p->swap_file;
+ unsigned int zones[PAGE_SIZE/512];
+ int i;
+ if (swapf->i_op->bmap == NULL
+ && swapf->i_op->smap != NULL){
+ /*
+			With MsDOS, we use msdos_smap, which returns
+			a sector number (not a cluster or block number).
+			It is a patch to enable the UMSDOS project.
+			Other people are working on a better solution.
+
+			It sounds like ll_rw_swap_file defines its
+			operation size (sector size) based on PAGE_SIZE
+			and the number of blocks to read, so using bmap
+			or smap should work even if smap requires
+			more blocks.
+ */
+ int j;
+ unsigned int block = offset << 3;
+
+ for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
+ if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
+ printk("rw_swap_page: bad swap file\n");
+ return;
+ }
+ }
+ }else{
+ int j;
+ unsigned int block = offset
+ << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
+
+ for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
+ if (!(zones[i] = bmap(swapf,block++))) {
+ printk("rw_swap_page: bad swap file\n");
+ }
+ }
+ ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
+ } else
+ printk("rw_swap_page: no swap file or device\n");
+ atomic_dec(&page->count);
+ if (offset && !clear_bit(offset,p->swap_lockmap))
+ printk("rw_swap_page: lock already cleared\n");
+ wake_up(&lock_queue);
+}
+
+/* This is run when asynchronous page I/O has completed. */
+void swap_after_unlock_page (unsigned long entry)
+{
+ unsigned long type, offset;
+ struct swap_info_struct * p;
+
+ type = SWP_TYPE(entry);
+ if (type >= nr_swapfiles) {
+ printk("swap_after_unlock_page: bad swap-device\n");
+ return;
+ }
+ p = &swap_info[type];
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max) {
+ printk("swap_after_unlock_page: weirdness\n");
+ return;
+ }
+ if (!clear_bit(offset,p->swap_lockmap))
+ printk("swap_after_unlock_page: lock already cleared\n");
+ wake_up(&lock_queue);
+}
+
+/*
+ * Swap partitions are now read via brw_page. ll_rw_page is an
+ * asynchronous function now --- we must call wait_on_page afterwards
+ * if synchronous IO is required.
+ */
+void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer)
+{
+ int block = offset;
+ struct page *page;
+
+ switch (rw) {
+ case READ:
+ break;
+ case WRITE:
+ if (is_read_only(dev)) {
+ printk("Can't page to read-only device %s\n",
+ kdevname(dev));
+ return;
+ }
+ break;
+ default:
+ panic("ll_rw_page: bad block dev cmd, must be R/W");
+ }
+ page = mem_map + MAP_NR(buffer);
+ if (set_bit(PG_locked, &page->flags))
+ panic ("ll_rw_page: page already locked");
+ brw_page(rw, page, dev, &block, PAGE_SIZE, 0);
+}
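
rw_swap_page() is normally reached through thin wrappers: the callers in this patch use read_swap_page(entry, buf) and write_swap_page(entry, buf) for the synchronous (wait=1) case. A sketch of what those wrappers amount to, assuming they are simple macros in <linux/swap.h> (their exact definitions are not part of this diff):

	#define read_swap_page(entry, buf)	rw_swap_page(READ, (entry), (buf), 1)
	#define write_swap_page(entry, buf)	rw_swap_page(WRITE, (entry), (buf), 1)
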
diff --git a/mm/swap.c b/mm/swap.c
index 2906df9c2..7076ec2f6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -7,6 +7,8 @@
/*
* This file should contain most things doing the swapping from/to disk.
* Started 18.12.91
+ *
+ * Swap aging added 23.2.95, Stephen Tweedie.
*/
#include <linux/mm.h>
@@ -17,1215 +19,88 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
+#include <linux/swap.h>
#include <linux/fs.h>
+#include <linux/swapctl.h>
+#include <linux/pagemap.h>
#include <asm/dma.h>
#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/bitops.h>
#include <asm/pgtable.h>
-#define MAX_SWAPFILES 8
-
-#define SWP_USED 1
-#define SWP_WRITEOK 3
-
-int min_free_pages = 20;
-
-static int nr_swapfiles = 0;
-static struct wait_queue * lock_queue = NULL;
-
-static struct swap_info_struct {
- unsigned long flags;
- struct inode * swap_file;
- unsigned int swap_device;
- unsigned char * swap_map;
- unsigned char * swap_lockmap;
- int pages;
- int lowest_bit;
- int highest_bit;
- unsigned long max;
-} swap_info[MAX_SWAPFILES];
-
-extern int shm_swap (int);
-
-unsigned long *swap_cache;
-
-#ifdef SWAP_CACHE_INFO
-unsigned long swap_cache_add_total = 0;
-unsigned long swap_cache_add_success = 0;
-unsigned long swap_cache_del_total = 0;
-unsigned long swap_cache_del_success = 0;
-unsigned long swap_cache_find_total = 0;
-unsigned long swap_cache_find_success = 0;
-
-extern inline void show_swap_cache_info(void)
-{
- printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
- swap_cache_add_total, swap_cache_add_success,
- swap_cache_del_total, swap_cache_del_success,
- swap_cache_find_total, swap_cache_find_success);
-}
-#endif
-
-static int add_to_swap_cache(unsigned long addr, unsigned long entry)
-{
- struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
-
-#ifdef SWAP_CACHE_INFO
- swap_cache_add_total++;
-#endif
- if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry);
- if (entry) {
- printk("swap_cache: replacing non-NULL entry\n");
- }
-#ifdef SWAP_CACHE_INFO
- swap_cache_add_success++;
-#endif
- return 1;
- }
- return 0;
-}
-
-static unsigned long init_swap_cache(unsigned long mem_start,
- unsigned long mem_end)
-{
- unsigned long swap_cache_size;
-
- mem_start = (mem_start + 15) & ~15;
- swap_cache = (unsigned long *) mem_start;
- swap_cache_size = MAP_NR(mem_end);
- memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
- return (unsigned long) (swap_cache + swap_cache_size);
-}
-
-void rw_swap_page(int rw, unsigned long entry, char * buf)
-{
- unsigned long type, offset;
- struct swap_info_struct * p;
-
- type = SWP_TYPE(entry);
- if (type >= nr_swapfiles) {
- printk("Internal error: bad swap-device\n");
- return;
- }
- p = &swap_info[type];
- offset = SWP_OFFSET(entry);
- if (offset >= p->max) {
- printk("rw_swap_page: weirdness\n");
- return;
- }
- if (p->swap_map && !p->swap_map[offset]) {
- printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
- return;
- }
- if (!(p->flags & SWP_USED)) {
- printk("Trying to swap to unused swap-device\n");
- return;
- }
- while (set_bit(offset,p->swap_lockmap))
- sleep_on(&lock_queue);
- if (rw == READ)
- kstat.pswpin++;
- else
- kstat.pswpout++;
- if (p->swap_device) {
- ll_rw_page(rw,p->swap_device,offset,buf);
- } else if (p->swap_file) {
- struct inode *swapf = p->swap_file;
- unsigned int zones[PAGE_SIZE/512];
- int i;
- if (swapf->i_op->bmap == NULL
- && swapf->i_op->smap != NULL){
- /*
- With MsDOS, we use msdos_smap which return
- a sector number (not a cluster or block number).
- It is a patch to enable the UMSDOS project.
- Other people are working on better solution.
-
- It sounds like ll_rw_swap_file defined
- it operation size (sector size) based on
- PAGE_SIZE and the number of block to read.
- So using bmap or smap should work even if
- smap will require more blocks.
- */
- int j;
- unsigned int block = offset << 3;
-
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
- if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
- }
- }else{
- int j;
- unsigned int block = offset
- << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
-
- for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
- if (!(zones[i] = bmap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
- }
- ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
- } else
- printk("re_swap_page: no swap file or device\n");
- if (offset && !clear_bit(offset,p->swap_lockmap))
- printk("rw_swap_page: lock already cleared\n");
- wake_up(&lock_queue);
-}
-
-unsigned long get_swap_page(void)
-{
- struct swap_info_struct * p;
- unsigned long offset, type;
-
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++) {
- if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
- continue;
- for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
- if (p->swap_map[offset])
- continue;
- if (test_bit(offset, p->swap_lockmap))
- continue;
- p->swap_map[offset] = 1;
- nr_swap_pages--;
- if (offset == p->highest_bit)
- p->highest_bit--;
- p->lowest_bit = offset;
- return SWP_ENTRY(type,offset);
- }
- }
- return 0;
-}
-
-void swap_duplicate(unsigned long entry)
-{
- struct swap_info_struct * p;
- unsigned long offset, type;
-
- if (!entry)
- return;
- offset = SWP_OFFSET(entry);
- type = SWP_TYPE(entry);
- if (type == SHM_SWP_TYPE)
- return;
- if (type >= nr_swapfiles) {
- printk("Trying to duplicate nonexistent swap-page\n");
- return;
- }
- p = type + swap_info;
- if (offset >= p->max) {
- printk("swap_duplicate: weirdness\n");
- return;
- }
- if (!p->swap_map[offset]) {
- printk("swap_duplicate: trying to duplicate unused page\n");
- return;
- }
- p->swap_map[offset]++;
- return;
-}
-
-void swap_free(unsigned long entry)
-{
- struct swap_info_struct * p;
- unsigned long offset, type;
-
- if (!entry)
- return;
- type = SWP_TYPE(entry);
- if (type == SHM_SWP_TYPE)
- return;
- if (type >= nr_swapfiles) {
- printk("Trying to free nonexistent swap-page\n");
- return;
- }
- p = & swap_info[type];
- offset = SWP_OFFSET(entry);
- if (offset >= p->max) {
- printk("swap_free: weirdness\n");
- return;
- }
- if (!(p->flags & SWP_USED)) {
- printk("Trying to free swap from unused swap-device\n");
- return;
- }
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
- if (!p->swap_map[offset])
- printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
- else
- if (!--p->swap_map[offset])
- nr_swap_pages++;
-}
-
-/*
- * The tests may look silly, but it essentially makes sure that
- * no other process did a swap-in on us just as we were waiting.
- *
- * Also, don't bother to add to the swap cache if this page-in
- * was due to a write access.
- */
-void swap_in(struct vm_area_struct * vma, pte_t * page_table,
- unsigned long entry, int write_access)
-{
- unsigned long page = get_free_page(GFP_KERNEL);
-
- if (pte_val(*page_table) != entry) {
- free_page(page);
- return;
- }
- if (!page) {
- *page_table = BAD_PAGE;
- swap_free(entry);
- oom(current);
- return;
- }
- read_swap_page(entry, (char *) page);
- if (pte_val(*page_table) != entry) {
- free_page(page);
- return;
- }
- vma->vm_task->mm->rss++;
- vma->vm_task->mm->maj_flt++;
- if (!write_access && add_to_swap_cache(page, entry)) {
- *page_table = mk_pte(page, vma->vm_page_prot);
- return;
- }
- *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
- swap_free(entry);
- return;
-}
-
-/*
- * The swap-out functions return 1 if they successfully
- * threw something out, and we got a free page. It returns
- * zero if it couldn't do anything, and any other value
- * indicates it decreased rss, but the page was shared.
- *
- * NOTE! If it sleeps, it *must* return 1 to make sure we
- * don't continue with the swap-out. Otherwise we may be
- * using a process that no longer actually exists (it might
- * have died while we slept).
- */
-static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table)
-{
- pte_t pte;
- unsigned long entry;
- unsigned long page;
-
- pte = *page_table;
- if (!pte_present(pte))
- return 0;
- page = pte_page(pte);
- if (page >= high_memory)
- return 0;
- if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
- return 0;
- if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte)) {
- *page_table = pte_mkold(pte);
- return 0;
- }
- if (pte_dirty(pte)) {
- if (mem_map[MAP_NR(page)] != 1)
- return 0;
- if (vma->vm_ops && vma->vm_ops->swapout) {
- vma->vm_task->mm->rss--;
- vma->vm_ops->swapout(vma, address-vma->vm_start, page_table);
- } else {
- if (!(entry = get_swap_page()))
- return 0;
- vma->vm_task->mm->rss--;
- pte_val(*page_table) = entry;
- invalidate();
- write_swap_page(entry, (char *) page);
- }
- free_page(page);
- return 1; /* we slept: the process may not exist any more */
- }
- if ((entry = find_in_swap_cache(page))) {
- if (mem_map[MAP_NR(page)] != 1) {
- *page_table = pte_mkdirty(pte);
- printk("Aiee.. duplicated cached swap-cache entry\n");
- return 0;
- }
- vma->vm_task->mm->rss--;
- pte_val(*page_table) = entry;
- invalidate();
- free_page(page);
- return 1;
- }
- vma->vm_task->mm->rss--;
- pte_clear(page_table);
- invalidate();
- entry = mem_map[MAP_NR(page)];
- free_page(page);
- return entry;
-}
-
-/*
- * A new implementation of swap_out(). We do not swap complete processes,
- * but only a small number of blocks, before we continue with the next
- * process. The number of blocks actually swapped is determined on the
- * number of page faults, that this process actually had in the last time,
- * so we won't swap heavily used processes all the time ...
- *
- * Note: the priority argument is a hint on much CPU to waste with the
- * swap block search, not a hint, of how much blocks to swap with
- * each process.
- *
- * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
- */
-
-/*
- * These are the minimum and maximum number of pages to swap from one process,
- * before proceeding to the next:
- */
-#define SWAP_MIN 4
-#define SWAP_MAX 32
-
-/*
- * The actual number of pages to swap is determined as:
- * SWAP_RATIO / (number of recent major page faults)
- */
-#define SWAP_RATIO 128
-
-static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir,
- unsigned long address, unsigned long end)
-{
- pte_t * pte;
- unsigned long pmd_end;
-
- if (pmd_none(*dir))
- return 0;
- if (pmd_bad(*dir)) {
- printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
- pmd_clear(dir);
- return 0;
- }
-
- pte = pte_offset(dir, address);
-
- pmd_end = (address + PMD_SIZE) & PMD_MASK;
- if (end > pmd_end)
- end = pmd_end;
-
- do {
- int result;
- vma->vm_task->mm->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(vma, address, pte);
- if (result)
- return result;
- address += PAGE_SIZE;
- pte++;
- } while (address < end);
- return 0;
-}
-
-static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir,
- unsigned long address, unsigned long end)
-{
- pmd_t * pmd;
- unsigned long pgd_end;
-
- if (pgd_none(*dir))
- return 0;
- if (pgd_bad(*dir)) {
- printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
- pgd_clear(dir);
- return 0;
- }
-
- pmd = pmd_offset(dir, address);
-
- pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
- if (end > pgd_end)
- end = pgd_end;
-
- do {
- int result = swap_out_pmd(vma, pmd, address, end);
- if (result)
- return result;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
- return 0;
-}
-
-static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- unsigned long start)
-{
- unsigned long end;
-
- /* Don't swap out areas like shared memory which have their
- own separate swapping mechanism. */
- if (vma->vm_flags & VM_SHM)
- return 0;
-
- end = vma->vm_end;
- while (start < end) {
- int result = swap_out_pgd(vma, pgdir, start, end);
- if (result)
- return result;
- start = (start + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- }
- return 0;
-}
-
-static int swap_out_process(struct task_struct * p)
-{
- unsigned long address;
- struct vm_area_struct* vma;
-
- /*
- * Go through process' page directory.
- */
- address = p->mm->swap_address;
- p->mm->swap_address = 0;
-
- /*
- * Find the proper vm-area
- */
- vma = find_vma(p, address);
- if (!vma)
- return 0;
- if (address < vma->vm_start)
- address = vma->vm_start;
-
- for (;;) {
- int result = swap_out_vma(vma, pgd_offset(p, address), address);
- if (result)
- return result;
- vma = vma->vm_next;
- if (!vma)
- break;
- address = vma->vm_start;
- }
- p->mm->swap_address = 0;
- return 0;
-}
-
-static int swap_out(unsigned int priority)
-{
- static int swap_task;
- int loop, counter;
- struct task_struct *p;
-
- counter = 6*nr_tasks >> priority;
- for(; counter >= 0; counter--) {
- /*
- * Check that swap_task is suitable for swapping. If not, look for
- * the next suitable process.
- */
- loop = 0;
- while(1) {
- if (swap_task >= NR_TASKS) {
- swap_task = 1;
- if (loop)
- /* all processes are unswappable or already swapped out */
- return 0;
- loop = 1;
- }
-
- p = task[swap_task];
- if (p && p->mm->swappable && p->mm->rss)
- break;
-
- swap_task++;
- }
-
- /*
- * Determine the number of pages to swap from this process.
- */
- if (!p->mm->swap_cnt) {
- p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
- p->mm->old_maj_flt = p->mm->maj_flt;
-
- if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
- p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
- p->mm->swap_cnt = SWAP_MIN;
- } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
- p->mm->swap_cnt = SWAP_MAX;
- else
- p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
- }
- if (!--p->mm->swap_cnt)
- swap_task++;
- switch (swap_out_process(p)) {
- case 0:
- if (p->mm->swap_cnt)
- swap_task++;
- break;
- case 1:
- return 1;
- default:
- break;
- }
- }
- return 0;
-}
-
/*
- * we keep on shrinking one resource until it's considered "too hard",
- * and then switch to the next one (priority being an indication on how
- * hard we should try with the resource).
+ * We identify three levels of free memory. We never let free mem
+ * fall below the min_free_pages except for atomic allocations. We
+ * start background swapping if we fall below free_pages_high free
+ * pages, and we begin intensive swapping below free_pages_low.
*
- * This should automatically find the resource that can most easily be
- * free'd, so hopefully we'll get reasonable behaviour even under very
- * different circumstances.
+ * Keep these three variables contiguous for sysctl(2).
*/
-static int try_to_free_page(int priority)
-{
- static int state = 0;
- int i=6;
-
- switch (state) {
- do {
- case 0:
- if (priority != GFP_NOBUFFER && shrink_buffers(i))
- return 1;
- state = 1;
- case 1:
- if (shm_swap(i))
- return 1;
- state = 2;
- default:
- if (swap_out(i))
- return 1;
- state = 0;
- } while(i--);
- }
- return 0;
-}
-
-static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
-{
- entry->prev = head;
- (entry->next = head->next)->prev = entry;
- head->next = entry;
-}
-
-static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
-{
- entry->next->prev = entry->prev;
- entry->prev->next = entry->next;
-}
-
-/*
- * Free_page() adds the page to the free lists. This is optimized for
- * fast normal cases (no error jumps taken normally).
- *
- * The way to optimize jumps for gcc-2.2.2 is to:
- * - select the "normal" case and put it inside the if () { XXX }
- * - no else-statements if you can avoid them
- *
- * With the above two rules, you get a straight-line execution path
- * for the normal case, giving better asm-code.
- *
- * free_page() may sleep since the page being freed may be a buffer
- * page or present in the swap cache. It will not sleep, however,
- * for a freshly allocated page (get_free_page()).
- */
-
-/*
- * Buddy system. Hairy. You really aren't expected to understand this
- */
-static inline void free_pages_ok(unsigned long addr, unsigned long order)
-{
- unsigned long index = MAP_NR(addr) >> (1 + order);
- unsigned long mask = PAGE_MASK << order;
-
- addr &= mask;
- nr_free_pages += 1 << order;
- while (order < NR_MEM_LISTS-1) {
- if (!change_bit(index, free_area_map[order]))
- break;
- remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
- order++;
- index >>= 1;
- mask <<= 1;
- addr &= mask;
- }
- add_mem_queue(free_area_list+order, (struct mem_list *) addr);
-}
-
-static inline void check_free_buffers(unsigned long addr)
-{
- struct buffer_head * bh;
-
- bh = buffer_pages[MAP_NR(addr)];
- if (bh) {
- struct buffer_head *tmp = bh;
- do {
- if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff)
- refile_buffer(tmp);
- tmp = tmp->b_this_page;
- } while (tmp != bh);
- }
-}
-
-void free_pages(unsigned long addr, unsigned long order)
-{
- if (addr < high_memory) {
- unsigned long flag;
- mem_map_t * map = mem_map + MAP_NR(addr);
- if (*map) {
- if (!(*map & MAP_PAGE_RESERVED)) {
- save_flags(flag);
- cli();
- if (!--*map) {
- free_pages_ok(addr, order);
- delete_from_swap_cache(addr);
- }
- restore_flags(flag);
- if (*map == 1)
- check_free_buffers(addr);
- }
- return;
- }
- printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
- printk("PC = %p\n", __builtin_return_address(0));
- return;
- }
-}
-
-/*
- * Some ugly macros to speed up __get_free_pages()..
- */
-#define RMQUEUE(order) \
-do { struct mem_list * queue = free_area_list+order; \
- unsigned long new_order = order; \
- do { struct mem_list *next = queue->next; \
- if (queue != next) { \
- (queue->next = next->next)->prev = queue; \
- mark_used((unsigned long) next, new_order); \
- nr_free_pages -= 1 << order; \
- restore_flags(flags); \
- EXPAND(next, order, new_order); \
- return (unsigned long) next; \
- } new_order++; queue++; \
- } while (new_order < NR_MEM_LISTS); \
-} while (0)
-
-static inline int mark_used(unsigned long addr, unsigned long order)
-{
- return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
-}
-
-#define EXPAND(addr,low,high) \
-do { unsigned long size = PAGE_SIZE << high; \
- while (high > low) { \
- high--; size >>= 1; cli(); \
- add_mem_queue(free_area_list+high, addr); \
- mark_used((unsigned long) addr, high); \
- restore_flags(flags); \
- addr = (struct mem_list *) (size + (unsigned long) addr); \
- } mem_map[MAP_NR((unsigned long) addr)] = 1; \
-} while (0)
-
-unsigned long __get_free_pages(int priority, unsigned long order)
-{
- unsigned long flags;
- int reserved_pages;
-
- if (intr_count && priority != GFP_ATOMIC) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
- priority = GFP_ATOMIC;
- }
- }
- reserved_pages = 5;
- if (priority != GFP_NFS)
- reserved_pages = min_free_pages;
- save_flags(flags);
-repeat:
- cli();
- if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
- RMQUEUE(order);
- restore_flags(flags);
- return 0;
- }
- restore_flags(flags);
- if (priority != GFP_BUFFER && try_to_free_page(priority))
- goto repeat;
- return 0;
-}
-
-/*
- * Yes, I know this is ugly. Don't tell me.
- */
-unsigned long __get_dma_pages(int priority, unsigned long order)
-{
- unsigned long list = 0;
- unsigned long result;
- unsigned long limit = MAX_DMA_ADDRESS;
-
- /* if (EISA_bus) limit = ~0UL; */
- if (priority != GFP_ATOMIC)
- priority = GFP_BUFFER;
- for (;;) {
- result = __get_free_pages(priority, order);
- if (result < limit) /* covers failure as well */
- break;
- *(unsigned long *) result = list;
- list = result;
- }
- while (list) {
- unsigned long tmp = list;
- list = *(unsigned long *) list;
- free_pages(tmp, order);
- }
- return result;
-}
-
-/*
- * Show free area list (used inside shift_scroll-lock stuff)
- * We also calculate the percentage fragmentation. We do this by counting the
- * memory on each free list with the exception of the first item on the list.
- */
-void show_free_areas(void)
-{
- unsigned long order, flags;
- unsigned long total = 0;
-
- printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
- save_flags(flags);
- cli();
- for (order=0 ; order < NR_MEM_LISTS; order++) {
- struct mem_list * tmp;
- unsigned long nr = 0;
- for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
- nr ++;
- }
- total += nr * ((PAGE_SIZE>>10) << order);
- printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
- }
- restore_flags(flags);
- printk("= %lukB)\n", total);
-#ifdef SWAP_CACHE_INFO
- show_swap_cache_info();
-#endif
-}
-
-/*
- * Trying to stop swapping from a file is fraught with races, so
- * we repeat quite a bit here when we have to pause. swapoff()
- * isn't exactly timing-critical, so who cares (but this is /really/
- * inefficient, ugh).
- *
- * We return 1 after having slept, which makes the process start over
- * from the beginning for this process..
- */
-static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
- pte_t *dir, unsigned int type, unsigned long page)
-{
- pte_t pte = *dir;
-
- if (pte_none(pte))
- return 0;
- if (pte_present(pte)) {
- unsigned long page = pte_page(pte);
- if (page >= high_memory)
- return 0;
- if (!in_swap_cache(page))
- return 0;
- if (SWP_TYPE(in_swap_cache(page)) != type)
- return 0;
- delete_from_swap_cache(page);
- *dir = pte_mkdirty(pte);
- return 0;
- }
- if (SWP_TYPE(pte_val(pte)) != type)
- return 0;
- read_swap_page(pte_val(pte), (char *) page);
- if (pte_val(*dir) != pte_val(pte)) {
- free_page(page);
- return 1;
- }
- *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
- ++vma->vm_task->mm->rss;
- swap_free(pte_val(pte));
- return 1;
-}
-
-static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
- unsigned long address, unsigned long size, unsigned long offset,
- unsigned int type, unsigned long page)
-{
- pte_t * pte;
- unsigned long end;
-
- if (pmd_none(*dir))
- return 0;
- if (pmd_bad(*dir)) {
- printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
- pmd_clear(dir);
- return 0;
- }
- pte = pte_offset(dir, address);
- offset += address & PMD_MASK;
- address &= ~PMD_MASK;
- end = address + size;
- if (end > PMD_SIZE)
- end = PMD_SIZE;
- do {
- if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
- return 1;
- address += PAGE_SIZE;
- pte++;
- } while (address < end);
- return 0;
-}
-
-static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
- unsigned long address, unsigned long size,
- unsigned int type, unsigned long page)
-{
- pmd_t * pmd;
- unsigned long offset, end;
-
- if (pgd_none(*dir))
- return 0;
- if (pgd_bad(*dir)) {
- printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
- pgd_clear(dir);
- return 0;
- }
- pmd = pmd_offset(dir, address);
- offset = address & PGDIR_MASK;
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
- return 1;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
- return 0;
-}
-
-static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- unsigned long start, unsigned long end,
- unsigned int type, unsigned long page)
-{
- while (start < end) {
- if (unuse_pgd(vma, pgdir, start, end - start, type, page))
- return 1;
- start = (start + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- }
- return 0;
-}
-
-static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
-{
- struct vm_area_struct* vma;
-
- /*
- * Go through process' page directory.
- */
- vma = p->mm->mmap;
- while (vma) {
- pgd_t * pgd = pgd_offset(p, vma->vm_start);
- if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
- return 1;
- vma = vma->vm_next;
- }
- return 0;
-}
-
-/*
- * To avoid races, we repeat for each process after having
- * swapped something in. That gets rid of a few pesky races,
- * and "swapoff" isn't exactly timing critical.
- */
-static int try_to_unuse(unsigned int type)
-{
- int nr;
- unsigned long page = get_free_page(GFP_KERNEL);
-
- if (!page)
- return -ENOMEM;
- nr = 0;
- while (nr < NR_TASKS) {
- if (task[nr]) {
- if (unuse_process(task[nr], type, page)) {
- page = get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- continue;
- }
- }
- nr++;
- }
- free_page(page);
- return 0;
-}
-
-asmlinkage int sys_swapoff(const char * specialfile)
-{
- struct swap_info_struct * p;
- struct inode * inode;
- unsigned int type;
- struct file filp;
+int min_free_pages = 20;
+int free_pages_low = 30;
+int free_pages_high = 40;
+
+/* We track the number of pages currently being asynchronously swapped
+ out, so that we don't try to swap TOO many pages out at once */
+atomic_t nr_async_pages = 0;
+
+/*
+ * Constants for the page aging mechanism: the maximum age (actually,
+ * the maximum "youthfulness"); the quanta by which pages rejuvenate
+ * and age; and the initial age for new pages.
+ */
+
+swap_control_t swap_control = {
+ 20, 3, 1, 3, /* Page aging */
+ 10, 2, 2, 4, /* Buffer aging */
+ 32, 4, /* Aging cluster */
+ 8192, 8192, /* Pageout and bufferout weights */
+ -200, /* Buffer grace */
+ 1, 1, /* Buffs/pages to free */
+ RCL_ROUND_ROBIN /* Balancing policy */
+};
+
+swapstat_t swapstats = {0};
+
+/* General swap control */
+
+/* Parse the kernel command line "swap=" option at load time: */
+void swap_setup(char *str, int *ints)
+{
+ int * swap_vars[8] = {
+ &MAX_PAGE_AGE,
+ &PAGE_ADVANCE,
+ &PAGE_DECLINE,
+ &PAGE_INITIAL_AGE,
+ &AGE_CLUSTER_FRACT,
+ &AGE_CLUSTER_MIN,
+ &PAGEOUT_WEIGHT,
+ &BUFFEROUT_WEIGHT
+ };
int i;
-
- if (!suser())
- return -EPERM;
- i = namei(specialfile,&inode);
- if (i)
- return i;
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++) {
- if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
- continue;
- if (p->swap_file) {
- if (p->swap_file == inode)
- break;
- } else {
- if (!S_ISBLK(inode->i_mode))
- continue;
- if (p->swap_device == inode->i_rdev)
- break;
- }
+ for (i=0; i < ints[0] && i < 8; i++) {
+ if (ints[i+1])
+ *(swap_vars[i]) = ints[i+1];
}
-
- if (type >= nr_swapfiles){
- iput(inode);
- return -EINVAL;
- }
- p->flags = SWP_USED;
- i = try_to_unuse(type);
- if (i) {
- iput(inode);
- p->flags = SWP_WRITEOK;
- return i;
- }
-
- if(p->swap_device){
- memset(&filp, 0, sizeof(filp));
- filp.f_inode = inode;
- filp.f_mode = 3; /* read write */
- /* open it again to get fops */
- if( !blkdev_open(inode, &filp) &&
- filp.f_op && filp.f_op->release){
- filp.f_op->release(inode,&filp);
- filp.f_op->release(inode,&filp);
- }
- }
- iput(inode);
-
- nr_swap_pages -= p->pages;
- iput(p->swap_file);
- p->swap_file = NULL;
- p->swap_device = 0;
- vfree(p->swap_map);
- p->swap_map = NULL;
- free_page((long) p->swap_lockmap);
- p->swap_lockmap = NULL;
- p->flags = 0;
- return 0;
-}
-
-/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
- */
-asmlinkage int sys_swapon(const char * specialfile)
-{
- struct swap_info_struct * p;
- struct inode * swap_inode;
- unsigned int type;
- int i,j;
- int error;
- struct file filp;
-
- memset(&filp, 0, sizeof(filp));
- if (!suser())
- return -EPERM;
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++)
- if (!(p->flags & SWP_USED))
- break;
- if (type >= MAX_SWAPFILES)
- return -EPERM;
- if (type >= nr_swapfiles)
- nr_swapfiles = type+1;
- p->flags = SWP_USED;
- p->swap_file = NULL;
- p->swap_device = 0;
- p->swap_map = NULL;
- p->swap_lockmap = NULL;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- p->max = 1;
- error = namei(specialfile,&swap_inode);
- if (error)
- goto bad_swap_2;
- p->swap_file = swap_inode;
- error = -EBUSY;
- if (swap_inode->i_count != 1)
- goto bad_swap_2;
- error = -EINVAL;
-
- if (S_ISBLK(swap_inode->i_mode)) {
- p->swap_device = swap_inode->i_rdev;
-
- filp.f_inode = swap_inode;
- filp.f_mode = 3; /* read write */
- error = blkdev_open(swap_inode, &filp);
- p->swap_file = NULL;
- iput(swap_inode);
- if(error)
- goto bad_swap_2;
- error = -ENODEV;
- if (!p->swap_device)
- goto bad_swap;
- error = -EBUSY;
- for (i = 0 ; i < nr_swapfiles ; i++) {
- if (i == type)
- continue;
- if (p->swap_device == swap_info[i].swap_device)
- goto bad_swap;
- }
- } else if (!S_ISREG(swap_inode->i_mode))
- goto bad_swap;
- p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
- if (!p->swap_lockmap) {
- printk("Unable to start swapping: out of memory :-)\n");
- error = -ENOMEM;
- goto bad_swap;
- }
- read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
- if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
- printk("Unable to find swap-space signature\n");
- error = -EINVAL;
- goto bad_swap;
- }
- memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
- j = 0;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
- if (test_bit(i,p->swap_lockmap)) {
- if (!p->lowest_bit)
- p->lowest_bit = i;
- p->highest_bit = i;
- p->max = i+1;
- j++;
- }
- }
- if (!j) {
- printk("Empty swap-file\n");
- error = -EINVAL;
- goto bad_swap;
- }
- p->swap_map = (unsigned char *) vmalloc(p->max);
- if (!p->swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
- for (i = 1 ; i < p->max ; i++) {
- if (test_bit(i,p->swap_lockmap))
- p->swap_map[i] = 0;
- else
- p->swap_map[i] = 0x80;
- }
- p->swap_map[0] = 0x80;
- memset(p->swap_lockmap,0,PAGE_SIZE);
- p->flags = SWP_WRITEOK;
- p->pages = j;
- nr_swap_pages += j;
- printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10));
- return 0;
-bad_swap:
- if(filp.f_op && filp.f_op->release)
- filp.f_op->release(filp.f_inode,&filp);
-bad_swap_2:
- free_page((long) p->swap_lockmap);
- vfree(p->swap_map);
- iput(p->swap_file);
- p->swap_device = 0;
- p->swap_file = NULL;
- p->swap_map = NULL;
- p->swap_lockmap = NULL;
- p->flags = 0;
- return error;
}
-void si_swapinfo(struct sysinfo *val)
+/* Parse the kernel command line "buff=" option at load time: */
+void buff_setup(char *str, int *ints)
{
- unsigned int i, j;
-
- val->freeswap = val->totalswap = 0;
- for (i = 0; i < nr_swapfiles; i++) {
- if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
- continue;
- for (j = 0; j < swap_info[i].max; ++j)
- switch (swap_info[i].swap_map[j]) {
- case 128:
- continue;
- case 0:
- ++val->freeswap;
- default:
- ++val->totalswap;
- }
- }
- val->freeswap <<= PAGE_SHIFT;
- val->totalswap <<= PAGE_SHIFT;
- return;
-}
-
-/*
- * set up the free-area data structures:
- * - mark all pages MAP_PAGE_RESERVED
- * - mark all memory queues empty
- * - clear the memory bitmaps
- */
-unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
-{
- mem_map_t * p;
- unsigned long mask = PAGE_MASK;
+ int * buff_vars[6] = {
+ &MAX_BUFF_AGE,
+ &BUFF_ADVANCE,
+ &BUFF_DECLINE,
+ &BUFF_INITIAL_AGE,
+ &BUFFEROUT_WEIGHT,
+ &BUFFERMEM_GRACE
+ };
int i;
-
- /*
- * select nr of pages we try to keep free for important stuff
- * with a minimum of 16 pages. This is totally arbitrary
- */
- i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
- if (i < 16)
- i = 16;
- min_free_pages = i;
- start_mem = init_swap_cache(start_mem, end_mem);
- mem_map = (mem_map_t *) start_mem;
- p = mem_map + MAP_NR(end_mem);
- start_mem = (unsigned long) p;
- while (p > mem_map)
- *--p = MAP_PAGE_RESERVED;
-
- for (i = 0 ; i < NR_MEM_LISTS ; i++) {
- unsigned long bitmap_size;
- free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
- mask += mask;
- end_mem = (end_mem + ~mask) & mask;
- bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
- bitmap_size = (bitmap_size + 7) >> 3;
- bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
- free_area_map[i] = (unsigned char *) start_mem;
- memset((void *) start_mem, 0, bitmap_size);
- start_mem += bitmap_size;
+ for (i=0; i < ints[0] && i < 6; i++) {
+ if (ints[i+1])
+ *(buff_vars[i]) = ints[i+1];
}
- return start_mem;
}
+
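The swap_setup()/buff_setup() handlers above map the positional integers of the "swap=" and "buff=" boot options onto the tuning variables, skipping any value given as zero. Below is a minimal user-space sketch of the same convention; the parse loop, the variable names and the "30,5" payload are illustrative only, not kernel code.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-ins for MAX_PAGE_AGE, PAGE_ADVANCE, PAGE_DECLINE. */
    static int max_page_age = 20, page_advance = 3, page_decline = 1;

    int main(void)
    {
        /* "swap=30,5" would arrive as ints[0]=2, ints[1]=30, ints[2]=5 */
        int ints[9] = { 0 };
        char buf[] = "30,5";            /* pretend boot-option payload */
        char *tok = strtok(buf, ",");
        int *swap_vars[3] = { &max_page_age, &page_advance, &page_decline };
        int i;

        while (tok && ints[0] < 8) {
            ints[0]++;
            ints[ints[0]] = atoi(tok);
            tok = strtok(NULL, ",");
        }
        /* Same convention as swap_setup(): a zero leaves the default alone. */
        for (i = 0; i < ints[0] && i < 3; i++)
            if (ints[i + 1])
                *(swap_vars[i]) = ints[i + 1];

        printf("MAX_PAGE_AGE=%d PAGE_ADVANCE=%d PAGE_DECLINE=%d\n",
               max_page_age, page_advance, page_decline);
        return 0;
    }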
diff --git a/mm/swap_state.c b/mm/swap_state.c
new file mode 100644
index 000000000..044180721
--- /dev/null
+++ b/mm/swap_state.c
@@ -0,0 +1,111 @@
+/*
+ * linux/mm/swap_state.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/swapctl.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+/*
+ * To save us from swapping out pages which have just been swapped in and
+ * have not been modified since then, we keep in swap_cache[page>>PAGE_SHIFT]
+ * the swap entry which was last used to fill the page, or zero if the
+ * page does not currently correspond to a page in swap. PAGE_DIRTY makes
+ * this info useless.
+ */
+unsigned long *swap_cache;
+
+#ifdef SWAP_CACHE_INFO
+unsigned long swap_cache_add_total = 0;
+unsigned long swap_cache_add_success = 0;
+unsigned long swap_cache_del_total = 0;
+unsigned long swap_cache_del_success = 0;
+unsigned long swap_cache_find_total = 0;
+unsigned long swap_cache_find_success = 0;
+
+void show_swap_cache_info(void)
+{
+ printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
+ swap_cache_add_total, swap_cache_add_success,
+ swap_cache_del_total, swap_cache_del_success,
+ swap_cache_find_total, swap_cache_find_success);
+}
+#endif
+
+int add_to_swap_cache(unsigned long index, unsigned long entry)
+{
+ struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
+
+#ifdef SWAP_CACHE_INFO
+ swap_cache_add_total++;
+#endif
+ if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
+ entry = xchg(swap_cache + index, entry);
+ if (entry) {
+ printk("swap_cache: replacing non-NULL entry\n");
+ }
+#ifdef SWAP_CACHE_INFO
+ swap_cache_add_success++;
+#endif
+ return 1;
+ }
+ return 0;
+}
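add_to_swap_cache() above treats the swap cache as nothing more than an array indexed by physical page number, holding the swap entry that last filled the page (zero meaning "not from swap"). The user-space sketch below mirrors that bookkeeping with made-up demo_* helpers; it illustrates the data structure, not the kernel interface.

    #include <stdio.h>

    #define NPAGES 16
    static unsigned long swap_cache_demo[NPAGES];   /* 0 == "not from swap" */

    /* Remember which swap entry filled page 'nr'; warn on overwrite,
       mirroring the printk in add_to_swap_cache(). */
    static int demo_add(unsigned long nr, unsigned long entry)
    {
        if (swap_cache_demo[nr])
            printf("replacing non-NULL entry for page %lu\n", nr);
        swap_cache_demo[nr] = entry;
        return 1;
    }

    /* Drop the association, e.g. once the page has been dirtied. */
    static unsigned long demo_del(unsigned long nr)
    {
        unsigned long entry = swap_cache_demo[nr];
        swap_cache_demo[nr] = 0;
        return entry;
    }

    int main(void)
    {
        demo_add(3, 0x4200);            /* page 3 was filled from entry 0x4200 */
        printf("page 3 -> %#lx\n", demo_del(3));
        printf("page 3 -> %#lx\n", demo_del(3));   /* now 0: not in swap */
        return 0;
    }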
+
+unsigned long init_swap_cache(unsigned long mem_start,
+ unsigned long mem_end)
+{
+ unsigned long swap_cache_size;
+
+ mem_start = (mem_start + 15) & ~15;
+ swap_cache = (unsigned long *) mem_start;
+ swap_cache_size = MAP_NR(mem_end);
+ memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
+ return (unsigned long) (swap_cache + swap_cache_size);
+}
+
+void swap_duplicate(unsigned long entry)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, type;
+
+ if (!entry)
+ return;
+ offset = SWP_OFFSET(entry);
+ type = SWP_TYPE(entry);
+ if (type & SHM_SWP_TYPE)
+ return;
+ if (type >= nr_swapfiles) {
+ printk("Trying to duplicate nonexistent swap-page\n");
+ return;
+ }
+ p = type + swap_info;
+ if (offset >= p->max) {
+ printk("swap_duplicate: weirdness\n");
+ return;
+ }
+ if (!p->swap_map[offset]) {
+ printk("swap_duplicate: trying to duplicate unused page\n");
+ return;
+ }
+ p->swap_map[offset]++;
+ return;
+}
+
diff --git a/mm/swapfile.c b/mm/swapfile.c
new file mode 100644
index 000000000..0ee8b30c1
--- /dev/null
+++ b/mm/swapfile.c
@@ -0,0 +1,577 @@
+/*
+ * linux/mm/swapfile.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/swapctl.h>
+#include <linux/blkdev.h> /* for blk_size */
+#include <linux/vmalloc.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+unsigned int nr_swapfiles = 0;
+
+static struct {
+ int head; /* head of priority-ordered swapfile list */
+ int next; /* swapfile to be used next */
+} swap_list = {-1, -1};
+
+struct swap_info_struct swap_info[MAX_SWAPFILES];
+
+
+static inline int scan_swap_map(struct swap_info_struct *si)
+{
+ unsigned long offset;
+ /*
+ * We try to cluster swap pages by allocating them
+ * sequentially in swap. Once we've allocated
+ * SWAP_CLUSTER_MAX pages this way, however, we resort to
+ * first-free allocation, starting a new cluster. This
+ * prevents us from scattering swap pages all over the entire
+ * swap partition, so that we reduce overall disk seek times
+ * between swap pages. -- sct */
+ if (si->cluster_nr) {
+ while (si->cluster_next <= si->highest_bit) {
+ offset = si->cluster_next++;
+ if (si->swap_map[offset])
+ continue;
+ if (test_bit(offset, si->swap_lockmap))
+ continue;
+ si->cluster_nr--;
+ goto got_page;
+ }
+ }
+ si->cluster_nr = SWAP_CLUSTER_MAX;
+ for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
+ if (si->swap_map[offset])
+ continue;
+ if (test_bit(offset, si->swap_lockmap))
+ continue;
+ si->lowest_bit = offset;
+got_page:
+ si->swap_map[offset] = 1;
+ nr_swap_pages--;
+ if (offset == si->highest_bit)
+ si->highest_bit--;
+ si->cluster_next = offset;
+ return offset;
+ }
+ return 0;
+}
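scan_swap_map() above hands out up to SWAP_CLUSTER_MAX consecutive slots from cluster_next before falling back to a first-free scan from lowest_bit, which keeps swap pages physically close together on disk. A compact user-space sketch of that policy over a plain byte map follows; all names and sizes are illustrative.

    #include <stdio.h>

    #define MAP_SIZE    64
    #define CLUSTER_MAX 8

    static unsigned char map[MAP_SIZE];     /* 0 == free, non-zero == in use */
    static int cluster_next, cluster_nr, lowest = 1, highest = MAP_SIZE - 1;

    /* Return a free slot, clustering sequential allocations; 0 means full. */
    static int demo_scan(void)
    {
        int offset;

        if (cluster_nr) {
            while (cluster_next <= highest) {
                offset = cluster_next++;
                if (map[offset])
                    continue;
                cluster_nr--;
                goto got;
            }
        }
        cluster_nr = CLUSTER_MAX;
        for (offset = lowest; offset <= highest; offset++) {
            if (map[offset])
                continue;
            lowest = offset;
    got:
            map[offset] = 1;
            cluster_next = offset;
            return offset;
        }
        return 0;
    }

    int main(void)
    {
        int i;
        for (i = 0; i < 12; i++)
            printf("%d ", demo_scan());    /* sequential while nothing is freed */
        printf("\n");
        return 0;
    }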
+
+unsigned long get_swap_page(void)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, entry;
+ int type, wrapped = 0;
+
+ type = swap_list.next;
+ if (type < 0)
+ return 0;
+
+ while (1) {
+ p = &swap_info[type];
+ if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
+ offset = scan_swap_map(p);
+ if (offset) {
+ entry = SWP_ENTRY(type,offset);
+ type = swap_info[type].next;
+ if (type < 0 ||
+ p->prio != swap_info[type].prio)
+ {
+ swap_list.next = swap_list.head;
+ }
+ else
+ {
+ swap_list.next = type;
+ }
+ return entry;
+ }
+ }
+ type = p->next;
+ if (!wrapped) {
+ if (type < 0 || p->prio != swap_info[type].prio) {
+ type = swap_list.head;
+ wrapped = 1;
+ }
+ } else if (type < 0) {
+ return 0; /* out of swap space */
+ }
+ }
+}
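get_swap_page() returns an opaque entry that packs a swap-area index (SWP_TYPE) and a slot offset (SWP_OFFSET); swap_free() and swap_duplicate() unpack it again. The real packing is architecture specific, so the sketch below uses a purely illustrative layout just to show the round trip.

    #include <stdio.h>

    /* Illustrative packing, NOT the arch-specific SWP_ENTRY layout. */
    #define DEMO_ENTRY(type, offset)  (((unsigned long)(type) << 24) | (offset))
    #define DEMO_TYPE(entry)          ((entry) >> 24)
    #define DEMO_OFFSET(entry)        ((entry) & 0xffffff)

    int main(void)
    {
        unsigned long entry = DEMO_ENTRY(1, 4711);  /* area 1, slot 4711 */

        printf("entry=%#lx type=%lu offset=%lu\n",
               entry, DEMO_TYPE(entry), DEMO_OFFSET(entry));
        return 0;
    }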
+
+void swap_free(unsigned long entry)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, type;
+
+ if (!entry)
+ return;
+ type = SWP_TYPE(entry);
+ if (type & SHM_SWP_TYPE)
+ return;
+ if (type >= nr_swapfiles) {
+ printk("Trying to free nonexistent swap-page\n");
+ return;
+ }
+ p = & swap_info[type];
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max) {
+ printk("swap_free: weirdness\n");
+ return;
+ }
+ if (!(p->flags & SWP_USED)) {
+ printk("Trying to free swap from unused swap-device\n");
+ return;
+ }
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
+ if (!p->swap_map[offset])
+ printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
+ else
+ if (!--p->swap_map[offset])
+ nr_swap_pages++;
+ if (p->prio > swap_info[swap_list.next].prio) {
+ swap_list.next = swap_list.head;
+ }
+}
+
+/*
+ * Trying to stop swapping from a file is fraught with races, so
+ * we repeat quite a bit here when we have to pause. swapoff()
+ * isn't exactly timing-critical, so who cares (but this is /really/
+ * inefficient, ugh).
+ *
+ * We return 1 after having slept, which makes the caller start over
+ * from the beginning for this process.
+ */
+static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
+ pte_t *dir, unsigned int type, unsigned long page)
+{
+ pte_t pte = *dir;
+
+ if (pte_none(pte))
+ return 0;
+ if (pte_present(pte)) {
+ unsigned long page_nr = MAP_NR(pte_page(pte));
+ if (page_nr >= max_mapnr)
+ return 0;
+ if (!in_swap_cache(page_nr))
+ return 0;
+ if (SWP_TYPE(in_swap_cache(page_nr)) != type)
+ return 0;
+ delete_from_swap_cache(page_nr);
+ set_pte(dir, pte_mkdirty(pte));
+ return 0;
+ }
+ if (SWP_TYPE(pte_val(pte)) != type)
+ return 0;
+ read_swap_page(pte_val(pte), (char *) page);
+ if (pte_val(*dir) != pte_val(pte)) {
+ free_page(page);
+ return 1;
+ }
+ set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
+ flush_tlb_page(vma, address);
+ ++vma->vm_mm->rss;
+ swap_free(pte_val(pte));
+ return 1;
+}
+
+static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+ unsigned long address, unsigned long size, unsigned long offset,
+ unsigned int type, unsigned long page)
+{
+ pte_t * pte;
+ unsigned long end;
+
+ if (pmd_none(*dir))
+ return 0;
+ if (pmd_bad(*dir)) {
+ printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
+ pmd_clear(dir);
+ return 0;
+ }
+ pte = pte_offset(dir, address);
+ offset += address & PMD_MASK;
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page))
+ return 1;
+ address += PAGE_SIZE;
+ pte++;
+ } while (address < end);
+ return 0;
+}
+
+static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+ unsigned long address, unsigned long size,
+ unsigned int type, unsigned long page)
+{
+ pmd_t * pmd;
+ unsigned long offset, end;
+
+ if (pgd_none(*dir))
+ return 0;
+ if (pgd_bad(*dir)) {
+ printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
+ pgd_clear(dir);
+ return 0;
+ }
+ pmd = pmd_offset(dir, address);
+ offset = address & PGDIR_MASK;
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ if (unuse_pmd(vma, pmd, address, end - address, offset, type, page))
+ return 1;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+ return 0;
+}
+
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+ unsigned long start, unsigned long end,
+ unsigned int type, unsigned long page)
+{
+ while (start < end) {
+ if (unuse_pgd(vma, pgdir, start, end - start, type, page))
+ return 1;
+ start = (start + PGDIR_SIZE) & PGDIR_MASK;
+ pgdir++;
+ }
+ return 0;
+}
+
+static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long page)
+{
+ struct vm_area_struct* vma;
+
+ /*
+ * Go through process' page directory.
+ */
+ if (!mm || mm == &init_mm)
+ return 0;
+ vma = mm->mmap;
+ while (vma) {
+ pgd_t * pgd = pgd_offset(mm, vma->vm_start);
+ if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
+ return 1;
+ vma = vma->vm_next;
+ }
+ return 0;
+}
+
+/*
+ * To avoid races, we repeat for each process after having
+ * swapped something in. That gets rid of a few pesky races,
+ * and "swapoff" isn't exactly timing critical.
+ */
+static int try_to_unuse(unsigned int type)
+{
+ int nr;
+ unsigned long page = get_free_page(GFP_KERNEL);
+
+ if (!page)
+ return -ENOMEM;
+ nr = 0;
+ while (nr < NR_TASKS) {
+ struct task_struct * p = task[nr];
+ if (p) {
+ if (unuse_process(p->mm, type, page)) {
+ page = get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ continue;
+ }
+ }
+ nr++;
+ }
+ free_page(page);
+ return 0;
+}
+
+asmlinkage int sys_swapoff(const char * specialfile)
+{
+ struct swap_info_struct * p;
+ struct inode * inode;
+ struct file filp;
+ int i, type, prev;
+ int err;
+
+ if (!suser())
+ return -EPERM;
+ err = namei(specialfile,&inode);
+ if (err)
+ return err;
+ prev = -1;
+ for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
+ p = swap_info + type;
+ if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
+ if (p->swap_file) {
+ if (p->swap_file == inode)
+ break;
+ } else {
+ if (S_ISBLK(inode->i_mode)
+ && (p->swap_device == inode->i_rdev))
+ break;
+ }
+ }
+ prev = type;
+ }
+ if (type < 0){
+ iput(inode);
+ return -EINVAL;
+ }
+ if (prev < 0) {
+ swap_list.head = p->next;
+ } else {
+ swap_info[prev].next = p->next;
+ }
+ if (type == swap_list.next) {
+ /* just pick something that's safe... */
+ swap_list.next = swap_list.head;
+ }
+ p->flags = SWP_USED;
+ err = try_to_unuse(type);
+ if (err) {
+ iput(inode);
+ /* re-insert swap space back into swap_list */
+ for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
+ if (p->prio >= swap_info[i].prio)
+ break;
+ p->next = i;
+ if (prev < 0)
+ swap_list.head = swap_list.next = p - swap_info;
+ else
+ swap_info[prev].next = p - swap_info;
+ p->flags = SWP_WRITEOK;
+ return err;
+ }
+ if(p->swap_device){
+ memset(&filp, 0, sizeof(filp));
+ filp.f_inode = inode;
+ filp.f_mode = 3; /* read write */
+ /* open it again to get fops */
+ if( !blkdev_open(inode, &filp) &&
+ filp.f_op && filp.f_op->release){
+ filp.f_op->release(inode,&filp);
+ filp.f_op->release(inode,&filp);
+ }
+ }
+ iput(inode);
+
+ nr_swap_pages -= p->pages;
+ iput(p->swap_file);
+ p->swap_file = NULL;
+ p->swap_device = 0;
+ vfree(p->swap_map);
+ p->swap_map = NULL;
+ free_page((long) p->swap_lockmap);
+ p->swap_lockmap = NULL;
+ p->flags = 0;
+ return 0;
+}
+
+/*
+ * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
+ *
+ * The swapon system call
+ */
+asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
+{
+ struct swap_info_struct * p;
+ struct inode * swap_inode;
+ unsigned int type;
+ int i, j, prev;
+ int error;
+ struct file filp;
+ static int least_priority = 0;
+
+ memset(&filp, 0, sizeof(filp));
+ if (!suser())
+ return -EPERM;
+ p = swap_info;
+ for (type = 0 ; type < nr_swapfiles ; type++,p++)
+ if (!(p->flags & SWP_USED))
+ break;
+ if (type >= MAX_SWAPFILES)
+ return -EPERM;
+ if (type >= nr_swapfiles)
+ nr_swapfiles = type+1;
+ p->flags = SWP_USED;
+ p->swap_file = NULL;
+ p->swap_device = 0;
+ p->swap_map = NULL;
+ p->swap_lockmap = NULL;
+ p->lowest_bit = 0;
+ p->highest_bit = 0;
+ p->cluster_nr = 0;
+ p->max = 1;
+ p->next = -1;
+ if (swap_flags & SWAP_FLAG_PREFER) {
+ p->prio =
+ (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
+ } else {
+ p->prio = --least_priority;
+ }
+ error = namei(specialfile,&swap_inode);
+ if (error)
+ goto bad_swap_2;
+ p->swap_file = swap_inode;
+ error = -EBUSY;
+ if (swap_inode->i_count != 1)
+ goto bad_swap_2;
+ error = -EINVAL;
+
+ if (S_ISBLK(swap_inode->i_mode)) {
+ p->swap_device = swap_inode->i_rdev;
+ set_blocksize(p->swap_device, PAGE_SIZE);
+
+ filp.f_inode = swap_inode;
+ filp.f_mode = 3; /* read write */
+ error = blkdev_open(swap_inode, &filp);
+ p->swap_file = NULL;
+ iput(swap_inode);
+ if(error)
+ goto bad_swap_2;
+ error = -ENODEV;
+ if (!p->swap_device ||
+ (blk_size[MAJOR(p->swap_device)] &&
+ !blk_size[MAJOR(p->swap_device)][MINOR(p->swap_device)]))
+ goto bad_swap;
+ error = -EBUSY;
+ for (i = 0 ; i < nr_swapfiles ; i++) {
+ if (i == type)
+ continue;
+ if (p->swap_device == swap_info[i].swap_device)
+ goto bad_swap;
+ }
+ } else if (!S_ISREG(swap_inode->i_mode))
+ goto bad_swap;
+ p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
+ if (!p->swap_lockmap) {
+ printk("Unable to start swapping: out of memory :-)\n");
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+ read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
+ if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
+ printk("Unable to find swap-space signature\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
+ j = 0;
+ p->lowest_bit = 0;
+ p->highest_bit = 0;
+ for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
+ if (test_bit(i,p->swap_lockmap)) {
+ if (!p->lowest_bit)
+ p->lowest_bit = i;
+ p->highest_bit = i;
+ p->max = i+1;
+ j++;
+ }
+ }
+ if (!j) {
+ printk("Empty swap-file\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ p->swap_map = (unsigned char *) vmalloc(p->max);
+ if (!p->swap_map) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+ for (i = 1 ; i < p->max ; i++) {
+ if (test_bit(i,p->swap_lockmap))
+ p->swap_map[i] = 0;
+ else
+ p->swap_map[i] = 0x80;
+ }
+ p->swap_map[0] = 0x80;
+ clear_page(p->swap_lockmap);
+ p->flags = SWP_WRITEOK;
+ p->pages = j;
+ nr_swap_pages += j;
+ printk("Adding Swap: %dk swap-space (priority %d)\n",
+ j<<(PAGE_SHIFT-10), p->prio);
+
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+ if (p->prio >= swap_info[i].prio) {
+ break;
+ }
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0) {
+ swap_list.head = swap_list.next = p - swap_info;
+ } else {
+ swap_info[prev].next = p - swap_info;
+ }
+ return 0;
+bad_swap:
+ if(filp.f_op && filp.f_op->release)
+ filp.f_op->release(filp.f_inode,&filp);
+bad_swap_2:
+ free_page((long) p->swap_lockmap);
+ vfree(p->swap_map);
+ iput(p->swap_file);
+ p->swap_device = 0;
+ p->swap_file = NULL;
+ p->swap_map = NULL;
+ p->swap_lockmap = NULL;
+ p->flags = 0;
+ return error;
+}
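The insertion loop at the end of sys_swapon() (and the matching re-insertion in sys_swapoff()) keeps the active swap areas on a singly linked list threaded through array indices, sorted by descending priority. A small user-space sketch of that insert-by-priority walk follows, with illustrative demo types.

    #include <stdio.h>

    struct demo_area { int prio; int next; };

    static struct demo_area areas[4];
    static int head = -1;

    /* Insert area 'idx' keeping the list sorted by descending priority. */
    static void demo_insert(int idx, int prio)
    {
        int i, prev = -1;

        areas[idx].prio = prio;
        for (i = head; i >= 0; prev = i, i = areas[i].next)
            if (prio >= areas[i].prio)
                break;
        areas[idx].next = i;
        if (prev < 0)
            head = idx;
        else
            areas[prev].next = idx;
    }

    int main(void)
    {
        int i;

        demo_insert(0, -1);     /* default priorities count down... */
        demo_insert(1, -2);
        demo_insert(2, 5);      /* ...an explicit priority jumps ahead */
        for (i = head; i >= 0; i = areas[i].next)
            printf("area %d (prio %d)\n", i, areas[i].prio);
        return 0;
    }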
+
+void si_swapinfo(struct sysinfo *val)
+{
+ unsigned int i, j;
+
+ val->freeswap = val->totalswap = 0;
+ for (i = 0; i < nr_swapfiles; i++) {
+ if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
+ continue;
+ for (j = 0; j < swap_info[i].max; ++j)
+ switch (swap_info[i].swap_map[j]) {
+ case 128:
+ continue;
+ case 0:
+ ++val->freeswap;
+ default:
+ ++val->totalswap;
+ }
+ }
+ val->freeswap <<= PAGE_SHIFT;
+ val->totalswap <<= PAGE_SHIFT;
+ return;
+}
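The fall-through switch in si_swapinfo() counts each slot once: a map value of 128 (a bad or reserved slot) is skipped entirely, 0 falls through from freeswap into totalswap, and any other use count only bumps totalswap; the page totals are then shifted by PAGE_SHIFT into bytes. A tiny worked example with a made-up swap_map:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical swap_map: 0=free, 128=bad/reserved, otherwise in use. */
        unsigned char map[] = { 128, 0, 0, 3, 1, 0, 128, 2 };
        unsigned long freeswap = 0, totalswap = 0;
        unsigned int j, page_shift = 12;            /* 4k pages */

        for (j = 0; j < sizeof(map); j++)
            switch (map[j]) {
            case 128:
                continue;
            case 0:
                ++freeswap;
                /* fall through: a free slot is also part of the total */
            default:
                ++totalswap;
            }
        printf("free=%lu total=%lu (%lu/%lu bytes)\n",
               freeswap, totalswap,
               freeswap << page_shift, totalswap << page_shift);
        return 0;
    }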
+
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 107be5546..142e6d256 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4,37 +4,14 @@
* Copyright (C) 1993 Linus Torvalds
*/
-#include <asm/system.h>
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/head.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
#include <linux/malloc.h>
-#include <linux/mm.h>
-
-#include <asm/segment.h>
-#include <asm/pgtable.h>
+#include <linux/vmalloc.h>
-struct vm_struct {
- unsigned long flags;
- void * addr;
- unsigned long size;
- struct vm_struct * next;
-};
+#include <asm/uaccess.h>
+#include <asm/system.h>
static struct vm_struct * vmlist = NULL;
-static inline void set_pgdir(unsigned long address, pgd_t entry)
-{
- struct task_struct * p;
-
- for_each_task(p)
- *pgd_offset(p,address) = entry;
-}
-
static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
{
pte_t * pte;
@@ -96,13 +73,14 @@ static void free_area_pages(unsigned long address, unsigned long size)
pgd_t * dir;
unsigned long end = address + size;
- dir = pgd_offset(&init_task, address);
+ dir = pgd_offset(&init_mm, address);
+ flush_cache_all();
while (address < end) {
free_area_pmd(dir, address, end - address);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_all();
}
static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned long size)
@@ -120,7 +98,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo
page = __get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- *pte = mk_pte(page, PAGE_KERNEL);
+ set_pte(pte, mk_pte(page, PAGE_KERNEL));
address += PAGE_SIZE;
pte++;
}
@@ -152,7 +130,8 @@ static int alloc_area_pages(unsigned long address, unsigned long size)
pgd_t * dir;
unsigned long end = address + size;
- dir = pgd_offset(&init_task, address);
+ dir = pgd_offset(&init_mm, address);
+ flush_cache_all();
while (address < end) {
pmd_t *pmd = pmd_alloc_kernel(dir, address);
if (!pmd)
@@ -163,10 +142,32 @@ static int alloc_area_pages(unsigned long address, unsigned long size)
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
- invalidate();
+ flush_tlb_all();
return 0;
}
+struct vm_struct * get_vm_area(unsigned long size)
+{
+ void *addr;
+ struct vm_struct **p, *tmp, *area;
+
+ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+ if (!area)
+ return NULL;
+ addr = (void *) VMALLOC_START;
+ area->size = size + PAGE_SIZE;
+ area->next = NULL;
+ for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
+ if (size + (unsigned long) addr < (unsigned long) tmp->addr)
+ break;
+ addr = (void *) (tmp->size + (unsigned long) tmp->addr);
+ }
+ area->addr = addr;
+ area->next = *p;
+ *p = area;
+ return area;
+}
+
void vfree(void * addr)
{
struct vm_struct **p, *tmp;
@@ -191,25 +192,15 @@ void vfree(void * addr)
void * vmalloc(unsigned long size)
{
void * addr;
- struct vm_struct **p, *tmp, *area;
+ struct vm_struct *area;
size = PAGE_ALIGN(size);
- if (!size || size > high_memory)
+ if (!size || size > (max_mapnr << PAGE_SHIFT))
return NULL;
- area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+ area = get_vm_area(size);
if (!area)
return NULL;
- addr = (void *) VMALLOC_START;
- area->size = size + PAGE_SIZE;
- area->next = NULL;
- for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
- if (size + (unsigned long) addr < (unsigned long) tmp->addr)
- break;
- addr = (void *) (tmp->size + (unsigned long) tmp->addr);
- }
- area->addr = addr;
- area->next = *p;
- *p = area;
+ addr = area->addr;
if (alloc_area_pages(VMALLOC_VMADDR(addr), size)) {
vfree(addr);
return NULL;
@@ -228,7 +219,10 @@ int vread(char *buf, char *addr, int count)
while (addr < vaddr) {
if (count == 0)
goto finished;
- put_fs_byte('\0', buf++), addr++, count--;
+ put_user('\0', buf);
+ buf++;
+ addr++;
+ count--;
}
n = tmp->size - PAGE_SIZE;
if (addr > vaddr)
@@ -236,7 +230,10 @@ int vread(char *buf, char *addr, int count)
while (--n >= 0) {
if (count == 0)
goto finished;
- put_fs_byte(*addr++, buf++), count--;
+ put_user(*addr, buf);
+ buf++;
+ addr++;
+ count--;
}
}
finished:
diff --git a/mm/vmscan.c b/mm/vmscan.c
new file mode 100644
index 000000000..d14a82f0b
--- /dev/null
+++ b/mm/vmscan.c
@@ -0,0 +1,453 @@
+/*
+ * linux/mm/vmscan.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95, Stephen Tweedie.
+ * kswapd added: 7.1.96 sct
+ * Version: $Id: vmscan.c,v 1.4.2.2 1996/01/20 18:22:47 linux Exp $
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/swapctl.h>
+#include <linux/smp_lock.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+/*
+ * When are we next due for a page scan?
+ */
+static int next_swap_jiffies = 0;
+
+/*
+ * How often do we do a pageout scan during normal conditions?
+ * Default is four times a second.
+ */
+int swapout_interval = HZ / 4;
+
+/*
+ * The wait queue for waking up the pageout daemon:
+ */
+static struct wait_queue * kswapd_wait = NULL;
+
+/*
+ * We avoid doing a reschedule if the pageout daemon is already awake.
+ */
+static int kswapd_awake = 0;
+
+/*
+ * sysctl-modifiable parameters to control the aggressiveness of the
+ * page-searching within the kswapd page recovery daemon.
+ */
+kswapd_control_t kswapd_ctl = {4, -1, -1, -1, -1};
+
+static void init_swap_timer(void);
+
+/*
+ * The swap-out functions return 1 if they successfully
+ * threw something out, and we got a free page. They return
+ * zero if they couldn't do anything, and any other value
+ * indicates they decreased rss, but the page was shared.
+ *
+ * NOTE! If it sleeps, it *must* return 1 to make sure we
+ * don't continue with the swap-out. Otherwise we may be
+ * using a process that no longer actually exists (it might
+ * have died while we slept).
+ */
+static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
+ unsigned long address, pte_t * page_table, int dma, int wait)
+{
+ pte_t pte;
+ unsigned long entry;
+ unsigned long page;
+ struct page * page_map;
+
+ pte = *page_table;
+ if (!pte_present(pte))
+ return 0;
+ page = pte_page(pte);
+ if (MAP_NR(page) >= max_mapnr)
+ return 0;
+
+ page_map = mem_map + MAP_NR(page);
+ if (PageReserved(page_map)
+ || PageLocked(page_map)
+ || (dma && !PageDMA(page_map)))
+ return 0;
+ /* Deal with page aging. Pages age from being unused; they
+ * rejuvenate on being accessed. Only swap old pages (age==0
+ * is oldest). */
+ if ((pte_dirty(pte) && delete_from_swap_cache(MAP_NR(page)))
+ || pte_young(pte)) {
+ set_pte(page_table, pte_mkold(pte));
+ touch_page(page_map);
+ return 0;
+ }
+ age_page(page_map);
+ if (page_map->age)
+ return 0;
+ if (pte_dirty(pte)) {
+ if (vma->vm_ops && vma->vm_ops->swapout) {
+ pid_t pid = tsk->pid;
+ vma->vm_mm->rss--;
+ if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
+ kill_proc(pid, SIGBUS, 1);
+ } else {
+ if (page_map->count != 1)
+ return 0;
+ if (!(entry = get_swap_page())) {
+ /* Aieee!!! Out of swap space! */
+ int retval = -1;
+ if (nr_swapfiles == 0)
+ retval = 0;
+ return retval;
+ }
+ vma->vm_mm->rss--;
+ flush_cache_page(vma, address);
+ set_pte(page_table, __pte(entry));
+ flush_tlb_page(vma, address);
+ tsk->nswap++;
+ rw_swap_page(WRITE, entry, (char *) page, wait);
+ }
+ free_page(page);
+ return 1; /* we slept: the process may not exist any more */
+ }
+ if ((entry = find_in_swap_cache(MAP_NR(page)))) {
+ if (page_map->count != 1) {
+ set_pte(page_table, pte_mkdirty(pte));
+ printk("Aiee.. duplicated cached swap-cache entry\n");
+ return 0;
+ }
+ vma->vm_mm->rss--;
+ flush_cache_page(vma, address);
+ set_pte(page_table, __pte(entry));
+ flush_tlb_page(vma, address);
+ free_page(page);
+ return 1;
+ }
+ vma->vm_mm->rss--;
+ flush_cache_page(vma, address);
+ pte_clear(page_table);
+ flush_tlb_page(vma, address);
+ entry = page_unuse(page);
+ free_page(page);
+ return entry;
+}
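try_to_swap_out() only evicts a page once its age has decayed to zero: a referenced (or freshly swap-cache-dirtied) page is rejuvenated, otherwise each scan ages it down. Below is a minimal sketch of that aging policy, using illustrative helpers and the swap_control defaults quoted earlier in this patch (advance 3, decline 1, max 20, initial 3).

    #include <stdio.h>

    #define MAX_AGE      20
    #define AGE_ADVANCE   3
    #define AGE_DECLINE   1

    static int age = 3;                 /* a new page starts at the initial age */

    static void touch_page_demo(void)   /* page was referenced: rejuvenate */
    {
        age += AGE_ADVANCE;
        if (age > MAX_AGE)
            age = MAX_AGE;
    }

    static void age_page_demo(void)     /* scanned and found unreferenced */
    {
        if (age > AGE_DECLINE)
            age -= AGE_DECLINE;
        else
            age = 0;
    }

    int main(void)
    {
        int scan;

        for (scan = 1; scan <= 6; scan++) {
            if (scan == 2)
                touch_page_demo();      /* one reference buys AGE_ADVANCE scans */
            age_page_demo();
            printf("after scan %d: age=%d%s\n", scan, age,
                   age ? "" : "  -> candidate for swap-out");
        }
        return 0;
    }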
+
+/*
+ * A new implementation of swap_out(). We do not swap complete processes,
+ * but only a small number of blocks, before we continue with the next
+ * process. The number of blocks actually swapped is determined on the
+ * number of page faults, that this process actually had in the last time,
+ * so we won't swap heavily used processes all the time ...
+ *
+ * Note: the priority argument is a hint on how much CPU to waste with
+ * the swap block search, not a hint of how many blocks to swap with
+ * each process.
+ *
+ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
+ */
+
+static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
+ pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+{
+ pte_t * pte;
+ unsigned long pmd_end;
+
+ if (pmd_none(*dir))
+ return 0;
+ if (pmd_bad(*dir)) {
+ printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
+ pmd_clear(dir);
+ return 0;
+ }
+
+ pte = pte_offset(dir, address);
+
+ pmd_end = (address + PMD_SIZE) & PMD_MASK;
+ if (end > pmd_end)
+ end = pmd_end;
+
+ do {
+ int result;
+ tsk->swap_address = address + PAGE_SIZE;
+ result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
+ if (result)
+ return result;
+ address += PAGE_SIZE;
+ pte++;
+ } while (address < end);
+ return 0;
+}
+
+static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
+ pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+{
+ pmd_t * pmd;
+ unsigned long pgd_end;
+
+ if (pgd_none(*dir))
+ return 0;
+ if (pgd_bad(*dir)) {
+ printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
+ pgd_clear(dir);
+ return 0;
+ }
+
+ pmd = pmd_offset(dir, address);
+
+ pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
+ if (end > pgd_end)
+ end = pgd_end;
+
+ do {
+ int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
+ if (result)
+ return result;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+ return 0;
+}
+
+static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
+ pgd_t *pgdir, unsigned long start, int dma, int wait)
+{
+ unsigned long end;
+
+ /* Don't swap out areas like shared memory which have their
+ own separate swapping mechanism or areas which are locked down */
+ if (vma->vm_flags & (VM_SHM | VM_LOCKED))
+ return 0;
+
+ end = vma->vm_end;
+ while (start < end) {
+ int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
+ if (result)
+ return result;
+ start = (start + PGDIR_SIZE) & PGDIR_MASK;
+ pgdir++;
+ }
+ return 0;
+}
+
+static int swap_out_process(struct task_struct * p, int dma, int wait)
+{
+ unsigned long address;
+ struct vm_area_struct* vma;
+
+ /*
+ * Go through process' page directory.
+ */
+ address = p->swap_address;
+ p->swap_address = 0;
+
+ /*
+ * Find the proper vm-area
+ */
+ vma = find_vma(p->mm, address);
+ if (!vma)
+ return 0;
+ if (address < vma->vm_start)
+ address = vma->vm_start;
+
+ for (;;) {
+ int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
+ if (result)
+ return result;
+ vma = vma->vm_next;
+ if (!vma)
+ break;
+ address = vma->vm_start;
+ }
+ p->swap_address = 0;
+ return 0;
+}
+
+static int swap_out(unsigned int priority, int dma, int wait)
+{
+ static int swap_task;
+ int loop, counter;
+ struct task_struct *p;
+
+ counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+ for(; counter >= 0; counter--) {
+ /*
+ * Check that swap_task is suitable for swapping. If not, look for
+ * the next suitable process.
+ */
+ loop = 0;
+ while(1) {
+ if (swap_task >= NR_TASKS) {
+ swap_task = 1;
+ if (loop)
+ /* all processes are unswappable or already swapped out */
+ return 0;
+ loop = 1;
+ }
+
+ p = task[swap_task];
+ if (p && p->swappable && p->mm->rss)
+ break;
+
+ swap_task++;
+ }
+
+ /*
+ * Determine the number of pages to swap from this process.
+ */
+ if (!p->swap_cnt) {
+ /* Normalise the number of pages swapped by
+ multiplying by (RSS / 1MB) */
+ p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
+ }
+ if (!--p->swap_cnt)
+ swap_task++;
+ switch (swap_out_process(p, dma, wait)) {
+ /* out of swap space? */
+ case -1:
+ return 0;
+ case 0:
+ if (p->swap_cnt)
+ swap_task++;
+ break;
+ case 1:
+ return 1;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
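The work swap_out() is willing to do is bounded up front by counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; with the default PAGEOUT_WEIGHT of 8192, 50 tasks and priority 6 that comes to (8192*50)>>10 = 400, then 400>>6 = 6 candidate passes. A one-liner to check the arithmetic (the sample task count is made up):

    #include <stdio.h>

    int main(void)
    {
        int pageout_weight = 8192, nr_tasks = 50, priority = 6;
        int counter = ((pageout_weight * nr_tasks) >> 10) >> priority;

        printf("counter = %d\n", counter);   /* prints 6 */
        return 0;
    }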
+
+/*
+ * We are much more aggressive about trying to swap out than we used
+ * to be. This works out OK, because we now do proper aging on page
+ * contents.
+ */
+int try_to_free_page(int priority, int dma, int wait)
+{
+ static int state = 0;
+ int i=6;
+ int stop;
+
+ /* we don't try as hard if we're not waiting.. */
+ stop = 3;
+ if (wait)
+ stop = 0;
+ switch (state) {
+ do {
+ case 0:
+ if (shrink_mmap(i, dma))
+ return 1;
+ state = 1;
+ case 1:
+ if (shm_swap(i, dma))
+ return 1;
+ state = 2;
+ default:
+ if (swap_out(i, dma, wait))
+ return 1;
+ state = 0;
+ i--;
+ } while ((i - stop) >= 0);
+ }
+ return 0;
+}
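The switch wrapped around a do-while in try_to_free_page() is a resumable state machine: the state variable records which reclaim stage (shrink_mmap, shm_swap, swap_out) to try first, it only advances when a stage fails, and every full rotation drops the priority i until stop is reached. An equivalent, more explicit user-space sketch, with stub stages standing in for the kernel functions:

    #include <stdio.h>

    /* Stub stages standing in for shrink_mmap(), shm_swap(), swap_out(). */
    static int stage0(int prio) { (void)prio; return 0; }
    static int stage1(int prio) { (void)prio; return 0; }
    static int stage2(int prio) { (void)prio; return prio <= 4; }

    static int (*stages[3])(int) = { stage0, stage1, stage2 };
    static int state;                       /* which stage to try first */

    static int demo_try_to_free_page(int wait)
    {
        int i = 6, stop = wait ? 0 : 3;

        do {
            if (stages[state](i))
                return 1;           /* success: retry this stage first next time */
            state = (state + 1) % 3;
            if (state == 0)
                i--;                /* completed a rotation: raise the effort */
        } while (i - stop >= 0);
        return 0;
    }

    int main(void)
    {
        printf("%d (state=%d)\n", demo_try_to_free_page(1), state);
        return 0;
    }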
+
+
+/*
+ * The background pageout daemon.
+ * Started as a kernel thread from the init process.
+ */
+int kswapd(void *unused)
+{
+ int i;
+ char *revision="$Revision: 1.4.2.2 $", *s, *e;
+
+ current->session = 1;
+ current->pgrp = 1;
+ sprintf(current->comm, "kswapd");
+ current->blocked = ~0UL;
+
+ /*
+ * As a kernel thread we want to tamper with system buffers
+ * and other internals and thus be subject to the SMP locking
+ * rules. (On a uniprocessor box this does nothing).
+ */
+
+#ifdef __SMP__
+ lock_kernel();
+ syscall_count++;
+#endif
+
+ /* Give kswapd a realtime priority. */
+ current->policy = SCHED_FIFO;
+ current->priority = 32; /* Fixme --- we need to standardise our
+ namings for POSIX.4 realtime scheduling
+ priorities. */
+
+ init_swap_timer();
+
+ if ((s = strchr(revision, ':')) &&
+ (e = strchr(s, '$')))
+ s++, i = e - s;
+ else
+ s = revision, i = -1;
+ printk ("Started kswapd v%.*s\n", i, s);
+
+ while (1) {
+ kswapd_awake = 0;
+ current->signal = 0;
+ run_task_queue(&tq_disk);
+ interruptible_sleep_on(&kswapd_wait);
+ kswapd_awake = 1;
+ swapstats.wakeups++;
+ /* Do the background pageout: */
+ for (i=0; i < kswapd_ctl.maxpages; i++)
+ try_to_free_page(GFP_KERNEL, 0, 0);
+ }
+}
+
+/*
+ * The swap_tick function gets called on every clock tick.
+ */
+
+void swap_tick(void)
+{
+ if ((nr_free_pages + nr_async_pages) < free_pages_low ||
+ ((nr_free_pages + nr_async_pages) < free_pages_high &&
+ jiffies >= next_swap_jiffies)) {
+ if (!kswapd_awake && kswapd_ctl.maxpages > 0) {
+ wake_up(&kswapd_wait);
+ need_resched = 1;
+ }
+ next_swap_jiffies = jiffies + swapout_interval;
+ }
+ timer_active |= (1<<SWAP_TIMER);
+}
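The wakeup rule in swap_tick() is two-tier: below free_pages_low kswapd is woken on every tick, while between free_pages_low and free_pages_high it is woken at most once per swapout_interval. A small sketch of that predicate using the defaults declared near the top of this patch (30 and 40 pages); the helper name is illustrative.

    #include <stdio.h>

    #define FREE_PAGES_LOW  30
    #define FREE_PAGES_HIGH 40

    /* Should this tick wake the pageout daemon? */
    static int should_wake(int free_pages, int interval_elapsed)
    {
        if (free_pages < FREE_PAGES_LOW)
            return 1;                            /* urgent: wake every tick */
        if (free_pages < FREE_PAGES_HIGH && interval_elapsed)
            return 1;                            /* low-ish: wake periodically */
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               should_wake(25, 0),   /* 1: below the low-water mark */
               should_wake(35, 0),   /* 0: between marks, interval not due */
               should_wake(35, 1));  /* 1: between marks, interval elapsed */
        return 0;
    }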
+
+
+/*
+ * Initialise the swap timer
+ */
+
+void init_swap_timer(void)
+{
+ timer_table[SWAP_TIMER].expires = 0;
+ timer_table[SWAP_TIMER].fn = swap_tick;
+ timer_active |= (1<<SWAP_TIMER);
+}