author     Ralf Baechle <ralf@linux-mips.org>          1999-10-09 00:00:47 +0000
committer  Ralf Baechle <ralf@linux-mips.org>          1999-10-09 00:00:47 +0000
commit     d6434e1042f3b0a6dfe1b1f615af369486f9b1fa (patch)
tree       e2be02f33984c48ec019c654051d27964e42c441 /mm
parent     609d1e803baf519487233b765eb487f9ec227a18 (diff)

    Merge with 2.3.19.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |    4
-rw-r--r--  mm/bigmem.c     |   71
-rw-r--r--  mm/filemap.c    |  533
-rw-r--r--  mm/memory.c     |  245
-rw-r--r--  mm/mlock.c      |   10
-rw-r--r--  mm/mmap.c       |   11
-rw-r--r--  mm/mprotect.c   |    2
-rw-r--r--  mm/mremap.c     |    2
-rw-r--r--  mm/page_alloc.c |   95
-rw-r--r--  mm/page_io.c    |   82
-rw-r--r--  mm/slab.c       |  152
-rw-r--r--  mm/swap_state.c |   19
-rw-r--r--  mm/swapfile.c   |   98
-rw-r--r--  mm/vmalloc.c    |    3
-rw-r--r--  mm/vmscan.c     |  171
15 files changed, 951 insertions, 547 deletions
diff --git a/mm/Makefile b/mm/Makefile
index c64eefbd2..68404aa67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,4 +12,8 @@ O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o \
swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o
+ifeq ($(CONFIG_BIGMEM),y)
+O_OBJS += bigmem.o
+endif
+
include $(TOPDIR)/Rules.make
diff --git a/mm/bigmem.c b/mm/bigmem.c
new file mode 100644
index 000000000..af63e860c
--- /dev/null
+++ b/mm/bigmem.c
@@ -0,0 +1,71 @@
+/*
+ * BIGMEM common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bigmem.h>
+
+unsigned long bigmem_mapnr;
+int nr_free_bigpages = 0;
+
+struct page * prepare_bigmem_swapout(struct page * page)
+{
+ /* if this is a bigmem page so it can't be swapped out directly
+ otherwise the b_data buffer addresses will break
+ the lowlevel device drivers. */
+ if (PageBIGMEM(page)) {
+ unsigned long regular_page;
+ unsigned long vaddr;
+
+ regular_page = __get_free_page(GFP_ATOMIC);
+ if (!regular_page)
+ return NULL;
+
+ vaddr = kmap(page_address(page), KM_READ);
+ copy_page(regular_page, vaddr);
+ kunmap(vaddr, KM_READ);
+
+ /* ok, we can just forget about our bigmem page since
+ we stored its data into the new regular_page. */
+ __free_page(page);
+
+ page = MAP_NR(regular_page) + mem_map;
+ }
+ return page;
+}
+
+struct page * replace_with_bigmem(struct page * page)
+{
+ if (!PageBIGMEM(page) && nr_free_bigpages) {
+ unsigned long kaddr;
+
+ kaddr = __get_free_page(GFP_ATOMIC|GFP_BIGMEM);
+ if (kaddr) {
+ struct page * bigmem_page;
+
+ bigmem_page = MAP_NR(kaddr) + mem_map;
+ if (PageBIGMEM(bigmem_page)) {
+ unsigned long vaddr;
+
+ vaddr = kmap(kaddr, KM_WRITE);
+ copy_page(vaddr, page_address(page));
+ kunmap(vaddr, KM_WRITE);
+
+ /* Preserve the caching of the swap_entry. */
+ bigmem_page->offset = page->offset;
+
+ /* We can just forget the old page since
+ we stored its data into the new
+ bigmem_page. */
+ __free_page(page);
+
+ page = bigmem_page;
+ }
+ }
+ }
+ return page;
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 668c6c99f..5efa9aaf7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,8 @@
*
* finished 'unifying' the page and buffer cache and SMP-threaded the
* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
*/
atomic_t page_cache_size = ATOMIC_INIT(0);
@@ -40,7 +42,16 @@ unsigned int page_hash_bits;
struct page **page_hash_table;
spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
+ * the pagemap_lru_lock held.
+ */
+spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
+#define CLUSTER_PAGES (1 << page_cluster)
+#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
+#define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
+#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
@@ -117,6 +128,7 @@ repeat:
}
if (page_count(page) != 2)
printk("hm, busy page invalidated? (not necesserily a bug)\n");
+ lru_cache_del(page);
remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
@@ -151,8 +163,9 @@ repeat:
lock_page(page);
- if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
+ if (!inode->i_op->flushpage ||
+ inode->i_op->flushpage(inode, page, 0))
+ lru_cache_del(page);
/*
* We remove the page from the page cache
@@ -212,93 +225,75 @@ repeat:
spin_unlock(&pagecache_lock);
}
-extern atomic_t too_many_dirty_buffers;
-
int shrink_mmap(int priority, int gfp_mask)
{
- static unsigned long clock = 0;
- unsigned long limit = num_physpages << 1;
+ int ret = 0, count;
+ LIST_HEAD(young);
+ LIST_HEAD(old);
+ LIST_HEAD(forget);
+ struct list_head * page_lru, * dispose;
struct page * page;
- int count, users;
- count = limit >> priority;
+ count = nr_lru_pages / (priority+1);
- page = mem_map + clock;
- do {
- int referenced;
+ spin_lock(&pagemap_lru_lock);
- /* This works even in the presence of PageSkip because
- * the first two entries at the beginning of a hole will
- * be marked, not just the first.
- */
- page++;
- clock++;
- if (clock >= max_mapnr) {
- clock = 0;
- page = mem_map;
- }
- if (PageSkip(page)) {
- /* next_hash is overloaded for PageSkip */
- page = page->next_hash;
- clock = page - mem_map;
- }
-
- referenced = test_and_clear_bit(PG_referenced, &page->flags);
+ while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+ page = list_entry(page_lru, struct page, lru);
+ list_del(page_lru);
+ dispose = &lru_cache;
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ /* Roll the page at the top of the lru list,
+ * we could also be more aggressive putting
+ * the page in the young-dispose-list, so
+ * avoiding to free young pages in each pass.
+ */
+ goto dispose_continue;
+
+ dispose = &old;
+ /* don't account passes over not DMA pages */
if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
- continue;
+ goto dispose_continue;
+ if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page))
+ goto dispose_continue;
count--;
- /*
- * Some common cases that we just short-circuit without
- * getting the locks - we need to re-check this once we
- * have the lock, but that's fine.
- */
- users = page_count(page);
- if (!users)
- continue;
- if (!page->buffers) {
- if (!page->inode)
- continue;
- if (users > 1)
- continue;
- }
-
- /*
- * ok, now the page looks interesting. Re-check things
- * and keep the lock.
- */
+ dispose = &young;
+ if (TryLockPage(page))
+ goto dispose_continue;
+
+ /* Release the pagemap_lru lock even if the page is not yet
+ queued in any lru queue since we have just locked down
+ the page so nobody else may SMP race with us running
+ a lru_cache_del() (lru_cache_del() always run with the
+ page locked down ;). */
+ spin_unlock(&pagemap_lru_lock);
+
+ /* avoid unscalable SMP locking */
+ if (!page->buffers && page_count(page) > 1)
+ goto unlock_noput_continue;
+
+ /* Take the pagecache_lock spinlock held to avoid
+ other tasks to notice the page while we are looking at its
+ page count. If it's a pagecache-page we'll free it
+ in one atomic transaction after checking its page count. */
spin_lock(&pagecache_lock);
- if (!page->inode && !page->buffers) {
- spin_unlock(&pagecache_lock);
- continue;
- }
- if (!page_count(page)) {
- spin_unlock(&pagecache_lock);
- BUG();
- continue;
- }
- get_page(page);
- if (TryLockPage(page)) {
- spin_unlock(&pagecache_lock);
- goto put_continue;
- }
- /*
- * we keep pagecache_lock locked and unlock it in
- * each branch, so that the page->inode case doesnt
- * have to re-grab it. Here comes the 'real' logic
- * to free memory:
- */
+ /* avoid freeing the page while it's locked */
+ get_page(page);
/* Is it a buffer page? */
if (page->buffers) {
- int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
spin_unlock(&pagecache_lock);
if (!try_to_free_buffers(page))
goto unlock_continue;
- atomic_sub(mem, &buffermem);
+ /* page was locked, inode can't go away under us */
+ if (!page->inode) {
+ atomic_sub(PAGE_CACHE_SIZE, &buffermem);
+ goto made_buffer_progress;
+ }
spin_lock(&pagecache_lock);
}
@@ -307,7 +302,7 @@ int shrink_mmap(int priority, int gfp_mask)
* (count == 2 because we added one ourselves above).
*/
if (page_count(page) != 2)
- goto spin_unlock_continue;
+ goto cache_unlock_continue;
/*
* Is it a page swap page? If so, we want to
@@ -316,35 +311,68 @@ int shrink_mmap(int priority, int gfp_mask)
*/
if (PageSwapCache(page)) {
spin_unlock(&pagecache_lock);
- if (referenced && swap_count(page->offset) != 2)
- goto unlock_continue;
__delete_from_swap_cache(page);
- page_cache_release(page);
- goto made_progress;
+ goto made_inode_progress;
}
/* is it a page-cache page? */
- if (!referenced && page->inode && !pgcache_under_min()) {
- remove_page_from_inode_queue(page);
- remove_page_from_hash_queue(page);
- page->inode = NULL;
- spin_unlock(&pagecache_lock);
-
- page_cache_release(page);
- goto made_progress;
+ if (page->inode)
+ {
+ dispose = &old;
+ if (!pgcache_under_min())
+ {
+ remove_page_from_inode_queue(page);
+ remove_page_from_hash_queue(page);
+ page->inode = NULL;
+ spin_unlock(&pagecache_lock);
+ goto made_inode_progress;
+ }
+ goto cache_unlock_continue;
}
-spin_unlock_continue:
+
+ dispose = &forget;
+ printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
+
+cache_unlock_continue:
spin_unlock(&pagecache_lock);
unlock_continue:
UnlockPage(page);
-put_continue:
put_page(page);
- } while (count > 0);
- return 0;
-made_progress:
+dispose_relock_continue:
+ /* even if the dispose list is local, a truncate_inode_page()
+ may remove a page from its queue so always
+ synchronize with the lru lock while accesing the
+ page->lru field */
+ spin_lock(&pagemap_lru_lock);
+ list_add(page_lru, dispose);
+ continue;
+
+unlock_noput_continue:
+ UnlockPage(page);
+ goto dispose_relock_continue;
+
+dispose_continue:
+ list_add(page_lru, dispose);
+ }
+ goto out;
+
+made_inode_progress:
+ page_cache_release(page);
+made_buffer_progress:
UnlockPage(page);
put_page(page);
- return 1;
+ ret = 1;
+ spin_lock(&pagemap_lru_lock);
+ /* nr_lru_pages needs the spinlock */
+ nr_lru_pages--;
+
+out:
+ list_splice(&young, &lru_cache);
+ list_splice(&old, lru_cache.prev);
+
+ spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
@@ -461,13 +489,14 @@ static inline void __add_to_page_cache(struct page * page,
{
unsigned long flags;
- flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
- page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
- page->owner = (int)current; /* REMOVEME */
+ flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
+ page->flags = flags | (1 << PG_locked);
+ page->owner = current; /* REMOVEME */
get_page(page);
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
+ lru_cache_add(page);
}
void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
@@ -498,39 +527,58 @@ int add_to_page_cache_unique(struct page * page,
}
/*
- * Try to read ahead in the file. "page_cache" is a potentially free page
- * that we could use for the cache (if it is 0 we can try to create one,
- * this is all overlapped with the IO on the previous page finishing anyway)
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
*/
-static unsigned long try_to_read_ahead(struct file * file,
- unsigned long offset, unsigned long page_cache)
+static inline void page_cache_read(struct file * file, unsigned long offset)
{
+ unsigned long new_page;
struct inode *inode = file->f_dentry->d_inode;
- struct page * page;
- struct page ** hash;
+ struct page ** hash = page_hash(inode, offset);
+ struct page * page;
- offset &= PAGE_CACHE_MASK;
- switch (page_cache) {
- case 0:
- page_cache = page_cache_alloc();
- if (!page_cache)
- break;
- default:
- if (offset >= inode->i_size)
- break;
- hash = page_hash(inode, offset);
- page = page_cache_entry(page_cache);
- if (!add_to_page_cache_unique(page, inode, offset, hash)) {
- /*
- * We do not have to check the return value here
- * because it's a readahead.
- */
- inode->i_op->readpage(file, page);
- page_cache = 0;
- page_cache_release(page);
- }
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ spin_unlock(&pagecache_lock);
+ if (page)
+ return;
+
+ new_page = page_cache_alloc();
+ if (!new_page)
+ return;
+ page = page_cache_entry(new_page);
+
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
+ inode->i_op->readpage(file, page);
+ page_cache_release(page);
+ return;
+ }
+
+ /*
+ * We arrive here in the unlikely event that someone
+ * raced with us and added our page to the cache first.
+ */
+ page_cache_free(new_page);
+ return;
+}
+
+/*
+ * Read in an entire cluster at once. A cluster is usually a 64k-
+ * aligned block that includes the address requested in "offset."
+ */
+static void read_cluster_nonblocking(struct file * file,
+ unsigned long offset)
+{
+ off_t filesize = file->f_dentry->d_inode->i_size;
+ unsigned long pages = CLUSTER_PAGES;
+
+ offset = CLUSTER_OFFSET(offset);
+ while ((pages-- > 0) && (offset < filesize)) {
+ page_cache_read(file, offset);
+ offset += PAGE_CACHE_SIZE;
}
- return page_cache;
+
+ return;
}
/*
@@ -547,8 +595,8 @@ void ___wait_on_page(struct page *page)
add_wait_queue(&page->wait, &wait);
do {
- tsk->state = TASK_UNINTERRUPTIBLE;
run_task_queue(&tq_disk);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!PageLocked(page))
break;
schedule();
@@ -562,23 +610,8 @@ void ___wait_on_page(struct page *page)
*/
void lock_page(struct page *page)
{
- if (TryLockPage(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, current);
-
- run_task_queue(&tq_disk);
- add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
-
- while (TryLockPage(page)) {
- run_task_queue(&tq_disk);
- schedule();
- tsk->state = TASK_UNINTERRUPTIBLE;
- }
-
- remove_wait_queue(&page->wait, &wait);
- tsk->state = TASK_RUNNING;
- }
+ while (TryLockPage(page))
+ ___wait_on_page(page);
}
@@ -607,13 +640,14 @@ repeat:
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
+ run_task_queue(&tq_disk);
+
+ __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
if (PageLocked(page))
schedule();
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(&page->wait, &wait);
/*
@@ -656,13 +690,14 @@ repeat:
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
+ run_task_queue(&tq_disk);
+
+ __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
add_wait_queue(&page->wait, &wait);
- tsk->state = TASK_UNINTERRUPTIBLE;
- run_task_queue(&tq_disk);
if (PageLocked(page))
schedule();
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(&page->wait, &wait);
/*
@@ -811,9 +846,9 @@ static inline int get_max_readahead(struct inode * inode)
return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
-static inline unsigned long generic_file_readahead(int reada_ok,
+static void generic_file_readahead(int reada_ok,
struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page, unsigned long page_cache)
+ unsigned long ppos, struct page * page)
{
unsigned long max_ahead, ahead;
unsigned long raend;
@@ -877,8 +912,7 @@ static inline unsigned long generic_file_readahead(int reada_ok,
ahead = 0;
while (ahead < max_ahead) {
ahead += PAGE_CACHE_SIZE;
- page_cache = try_to_read_ahead(filp, raend + ahead,
- page_cache);
+ page_cache_read(filp, raend + ahead);
}
/*
* If we tried to read ahead some pages,
@@ -910,26 +944,9 @@ static inline unsigned long generic_file_readahead(int reada_ok,
#endif
}
- return page_cache;
+ return;
}
-/*
- * "descriptor" for what we're up to with a read.
- * This allows us to use the same read code yet
- * have multiple different users of the data that
- * we read from a file.
- *
- * The simplest case just copies the data to user
- * mode.
- */
-typedef struct {
- size_t written;
- size_t count;
- char * buf;
- int error;
-} read_descriptor_t;
-
-typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
* This is a generic file read routine, and uses the
@@ -939,7 +956,7 @@ typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
+void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
@@ -1044,7 +1061,8 @@ page_ok:
* Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
*/
page_not_up_to_date:
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1065,7 +1083,8 @@ readpage:
goto page_ok;
/* Again, try some read-ahead while waiting for the page to finish.. */
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);
wait_on_page(page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1267,31 +1286,36 @@ out:
}
/*
- * Semantics for shared and private memory areas are different past the end
- * of the file. A shared mapping past the last page of the file is an error
- * and results in a SIGBUS, while a private mapping just maps in a zero page.
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
- * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
- * ahead of the wait if we're sure to need it.
+ * XXX - at some point, this should return unique values to indicate to
+ * the caller whether this is EIO, OOM, or SIGBUS.
*/
-static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
+static unsigned long filemap_nopage(struct vm_area_struct * area,
+ unsigned long address, int no_share)
{
struct file * file = area->vm_file;
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- unsigned long offset, reada, i;
struct page * page, **hash;
- unsigned long old_page, new_page;
- int error;
+ unsigned long old_page;
+
+ unsigned long offset = address - area->vm_start + area->vm_offset;
- new_page = 0;
- offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
- if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ /*
+ * Semantics for shared and private memory areas are different
+ * past the end of the file. A shared mapping past the last page
+ * of the file is an error and results in a SIGBUS, while a
+ * private mapping just maps in a zero page.
+ */
+ if ((offset >= inode->i_size) &&
+ (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
+ return 0;
/*
* Do we have something in the page cache already?
@@ -1302,24 +1326,12 @@ retry_find:
if (!page)
goto no_cached_page;
-found_page:
/*
* Ok, found a page in the page cache, now we need to check
- * that it's up-to-date. First check whether we'll need an
- * extra page -- better to overlap the allocation with the I/O.
+ * that it's up-to-date.
*/
- if (no_share && !new_page) {
- new_page = page_cache_alloc();
- if (!new_page)
- goto failure;
- }
-
- if (!Page_Uptodate(page)) {
- lock_page(page);
- if (!Page_Uptodate(page))
- goto page_not_uptodate;
- UnlockPage(page);
- }
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;
success:
/*
@@ -1327,100 +1339,76 @@ success:
* and possibly copy it over to another page..
*/
old_page = page_address(page);
- if (!no_share) {
- /*
- * Ok, we can share the cached page directly.. Get rid
- * of any potential extra pages.
- */
- if (new_page)
- page_cache_free(new_page);
+ if (no_share) {
+ unsigned long new_page = page_cache_alloc();
- flush_page_to_ram(old_page);
- return old_page;
+ if (new_page) {
+ copy_page(new_page, old_page);
+ flush_page_to_ram(new_page);
+ }
+ page_cache_release(page);
+ return new_page;
}
-
- /*
- * No sharing ... copy to the new page.
- */
- copy_page(new_page, old_page);
- flush_page_to_ram(new_page);
- page_cache_release(page);
- return new_page;
+
+ flush_page_to_ram(old_page);
+ return old_page;
no_cached_page:
/*
- * Try to read in an entire cluster at once.
- */
- reada = offset;
- reada >>= PAGE_CACHE_SHIFT + page_cluster;
- reada <<= PAGE_CACHE_SHIFT + page_cluster;
-
- for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
- new_page = try_to_read_ahead(file, reada, new_page);
-
- if (!new_page)
- new_page = page_cache_alloc();
- if (!new_page)
- goto no_page;
-
- /*
- * During getting the above page we might have slept,
- * so we need to re-check the situation with the page
- * cache.. The page we just got may be useful if we
- * can't share, so don't get rid of it here.
- */
- page = __find_get_page(inode, offset, hash);
- if (page)
- goto found_page;
-
- /*
- * Now, create a new page-cache page from the page we got
+ * If the requested offset is within our file, try to read a whole
+ * cluster of pages at once.
+ *
+ * Otherwise, we're off the end of a privately mapped file,
+ * so we need to map a zero page.
*/
- page = page_cache_entry(new_page);
- if (add_to_page_cache_unique(page, inode, offset, hash))
- goto retry_find;
+ if (offset < inode->i_size)
+ read_cluster_nonblocking(file, offset);
+ else
+ page_cache_read(file, offset);
/*
- * Now it's ours and locked, we can do initial IO to it:
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
*/
- new_page = 0;
+ goto retry_find;
page_not_uptodate:
- error = inode->i_op->readpage(file, page);
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto success;
+ }
- if (!error) {
+ if (!inode->i_op->readpage(file, page)) {
wait_on_page(page);
- if (PageError(page))
- goto page_read_error;
- goto success;
+ if (Page_Uptodate(page))
+ goto success;
}
-page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (!PageLocked(page))
- PAGE_BUG(page);
- ClearPageError(page);
- error = inode->i_op->readpage(file, page);
- if (error)
- goto failure;
- wait_on_page(page);
- if (Page_Uptodate(page))
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
goto success;
+ }
+ ClearPageError(page);
+ if (!inode->i_op->readpage(file, page)) {
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto success;
+ }
/*
* Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
-failure:
page_cache_release(page);
- if (new_page)
- page_cache_free(new_page);
-no_page:
return 0;
}
@@ -1702,7 +1690,7 @@ static int msync_interval(struct vm_area_struct * vma,
return 0;
}
-asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct * vma;
@@ -1855,28 +1843,29 @@ repeat_find:
if (!PageLocked(page)) {
PAGE_BUG(page);
} else {
- if (page->owner != (int)current) {
+ if (page->owner != current) {
PAGE_BUG(page);
}
}
status = write_one_page(file, page, offset, bytes, buf);
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ if (pos > inode->i_size)
+ inode->i_size = pos;
+ }
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
page_cache_release(page);
if (status < 0)
break;
-
- written += status;
- count -= status;
- pos += status;
- buf += status;
}
*ppos = pos;
- if (pos > inode->i_size)
- inode->i_size = pos;
if (page_cache)
page_cache_free(page_cache);
diff --git a/mm/memory.c b/mm/memory.c
index a31e862b2..5498dbcf0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -31,6 +31,9 @@
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ * (Gerhard.Wichert@pdb.siemens.de)
*/
#include <linux/mm.h>
@@ -39,6 +42,8 @@
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
+#include <linux/iobuf.h>
+#include <linux/bigmem.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -55,10 +60,10 @@ void * high_memory = NULL;
static inline void copy_cow_page(unsigned long from, unsigned long to)
{
if (from == ZERO_PAGE(to)) {
- clear_page(to);
+ clear_bigpage(to);
return;
}
- copy_page(to, from);
+ copy_bigpage(to, from);
}
mem_map_t * mem_map = NULL;
@@ -142,39 +147,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
check_pgt_cache();
}
-/*
- * This function just free's the page directory - the
- * pages tables themselves have been freed earlier by
- * clear_page_tables().
- */
-void free_page_tables(struct mm_struct * mm)
-{
- pgd_t * page_dir = mm->pgd;
-
- if (page_dir) {
- if (page_dir == swapper_pg_dir)
- goto out_bad;
- pgd_free(page_dir);
- }
- return;
-
-out_bad:
- printk(KERN_ERR
- "free_page_tables: Trying to free kernel pgd\n");
- return;
-}
-
-int new_page_tables(struct task_struct * tsk)
-{
- pgd_t * new_pg;
-
- if (!(new_pg = pgd_alloc()))
- return -ENOMEM;
- SET_PAGE_DIR(tsk, new_pg);
- tsk->mm->pgd = new_pg;
- return 0;
-}
-
#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
@@ -417,6 +389,192 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s
}
}
+
+/*
+ * Do a quick page-table lookup for a single page.
+ */
+static unsigned long follow_page(unsigned long address)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(current->mm, address);
+ pmd = pmd_offset(pgd, address);
+ if (pmd) {
+ pte_t * pte = pte_offset(pmd, address);
+ if (pte && pte_present(*pte)) {
+ return pte_page(*pte);
+ }
+ }
+
+ printk(KERN_ERR "Missing page in follow_page\n");
+ return 0;
+}
+
+/*
+ * Given a physical address, is there a useful struct page pointing to it?
+ */
+
+static struct page * get_page_map(unsigned long page)
+{
+ struct page *map;
+
+ if (MAP_NR(page) >= max_mapnr)
+ return 0;
+ if (page == ZERO_PAGE(page))
+ return 0;
+ map = mem_map + MAP_NR(page);
+ if (PageReserved(map))
+ return 0;
+ return map;
+}
+
+/*
+ * Force in an entire range of pages from the current process's user VA,
+ * and pin and lock the pages for IO.
+ */
+
+#define dprintk(x...)
+int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
+{
+ unsigned long ptr, end;
+ int err;
+ struct mm_struct * mm;
+ struct vm_area_struct * vma = 0;
+ unsigned long page;
+ struct page * map;
+ int doublepage = 0;
+ int repeat = 0;
+ int i;
+
+ /* Make sure the iobuf is not already mapped somewhere. */
+ if (iobuf->nr_pages)
+ return -EINVAL;
+
+ mm = current->mm;
+ dprintk ("map_user_kiobuf: begin\n");
+
+ ptr = va & PAGE_MASK;
+ end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
+ err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
+ if (err)
+ return err;
+
+ repeat:
+ down(&mm->mmap_sem);
+
+ err = -EFAULT;
+ iobuf->locked = 1;
+ iobuf->offset = va & ~PAGE_MASK;
+ iobuf->length = len;
+
+ i = 0;
+
+ /*
+ * First of all, try to fault in all of the necessary pages
+ */
+ while (ptr < end) {
+ if (!vma || ptr >= vma->vm_end) {
+ vma = find_vma(current->mm, ptr);
+ if (!vma)
+ goto out_unlock;
+ }
+ if (handle_mm_fault(current, vma, ptr, (rw==READ)) <= 0)
+ goto out_unlock;
+ spin_lock(&mm->page_table_lock);
+ page = follow_page(ptr);
+ if (!page) {
+ dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
+ map = NULL;
+ goto retry;
+ }
+ map = get_page_map(page);
+ if (map) {
+ if (TryLockPage(map)) {
+ goto retry;
+ }
+ atomic_inc(&map->count);
+ }
+ spin_unlock(&mm->page_table_lock);
+ dprintk ("Installing page %p %p: %d\n", (void *)page, map, i);
+ iobuf->pagelist[i] = page;
+ iobuf->maplist[i] = map;
+ iobuf->nr_pages = ++i;
+
+ ptr += PAGE_SIZE;
+ }
+
+ up(&mm->mmap_sem);
+ dprintk ("map_user_kiobuf: end OK\n");
+ return 0;
+
+ out_unlock:
+ up(&mm->mmap_sem);
+ unmap_kiobuf(iobuf);
+ dprintk ("map_user_kiobuf: end %d\n", err);
+ return err;
+
+ retry:
+
+ /*
+ * Undo the locking so far, wait on the page we got to, and try again.
+ */
+ spin_unlock(&mm->page_table_lock);
+ unmap_kiobuf(iobuf);
+ up(&mm->mmap_sem);
+
+ /*
+ * Did the release also unlock the page we got stuck on?
+ */
+ if (map) {
+ if (!PageLocked(map)) {
+ /* If so, we may well have the page mapped twice
+ * in the IO address range. Bad news. Of
+ * course, it _might_ * just be a coincidence,
+ * but if it happens more than * once, chances
+ * are we have a double-mapped page. */
+ if (++doublepage >= 3) {
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Try again...
+ */
+ wait_on_page(map);
+ }
+
+ if (++repeat < 16) {
+ ptr = va & PAGE_MASK;
+ goto repeat;
+ }
+ return -EAGAIN;
+}
+
+
+/*
+ * Unmap all of the pages referenced by a kiobuf. We release the pages,
+ * and unlock them if they were locked.
+ */
+
+void unmap_kiobuf (struct kiobuf *iobuf)
+{
+ int i;
+ struct page *map;
+
+ for (i = 0; i < iobuf->nr_pages; i++) {
+ map = iobuf->maplist[i];
+
+ if (map && iobuf->locked) {
+ __free_page(map);
+ UnlockPage(map);
+ }
+ }
+
+ iobuf->nr_pages = 0;
+ iobuf->locked = 0;
+}
+
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
unsigned long size, pgprot_t prot)
{
@@ -655,7 +813,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* Ok, we need to copy. Oh, well..
*/
spin_unlock(&tsk->mm->page_table_lock);
- new_page = __get_free_page(GFP_USER);
+ new_page = __get_free_page(GFP_BIGUSER);
if (!new_page)
return -1;
spin_lock(&tsk->mm->page_table_lock);
@@ -667,7 +825,6 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
if (PageReserved(page))
++vma->vm_mm->rss;
copy_cow_page(old_page,new_page);
- flush_page_to_ram(old_page);
flush_page_to_ram(new_page);
flush_cache_page(vma, address);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
@@ -681,6 +838,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
return 1;
bad_wp_page:
+ spin_unlock(&tsk->mm->page_table_lock);
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
return -1;
}
@@ -781,7 +939,7 @@ out_unlock:
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*/
-static void swapin_readahead(unsigned long entry)
+void swapin_readahead(unsigned long entry)
{
int i;
struct page *new_page;
@@ -833,12 +991,17 @@ static int do_swap_page(struct task_struct * tsk,
vma->vm_mm->rss++;
tsk->min_flt++;
+ lock_kernel();
swap_free(entry);
+ unlock_kernel();
pte = mk_pte(page_address(page), vma->vm_page_prot);
+ set_bit(PG_swap_entry, &page->flags);
if (write_access && !is_page_shared(page)) {
delete_from_swap_cache(page);
+ page = replace_with_bigmem(page);
+ pte = mk_pte(page_address(page), vma->vm_page_prot);
pte = pte_mkwrite(pte_mkdirty(pte));
}
set_pte(page_table, pte);
@@ -854,10 +1017,10 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
{
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
if (write_access) {
- unsigned long page = __get_free_page(GFP_USER);
+ unsigned long page = __get_free_page(GFP_BIGUSER);
if (!page)
return -1;
- clear_page(page);
+ clear_bigpage(page);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
tsk->min_flt++;
@@ -898,6 +1061,8 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
if (!page)
return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
+ if (page == -1)
+ return -1; /* OOM */
++tsk->maj_flt;
++vma->vm_mm->rss;
diff --git a/mm/mlock.c b/mm/mlock.c
index d6b19cfb1..be5e07cbf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -130,7 +130,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * next;
int error;
- if (!capable(CAP_IPC_LOCK))
+ if (on && !capable(CAP_IPC_LOCK))
return -EPERM;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
@@ -172,7 +172,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-asmlinkage int sys_mlock(unsigned long start, size_t len)
+asmlinkage long sys_mlock(unsigned long start, size_t len)
{
unsigned long locked;
unsigned long lock_limit;
@@ -203,7 +203,7 @@ out:
return error;
}
-asmlinkage int sys_munlock(unsigned long start, size_t len)
+asmlinkage long sys_munlock(unsigned long start, size_t len)
{
int ret;
@@ -244,7 +244,7 @@ static int do_mlockall(int flags)
return error;
}
-asmlinkage int sys_mlockall(int flags)
+asmlinkage long sys_mlockall(int flags)
{
unsigned long lock_limit;
int ret = -EINVAL;
@@ -271,7 +271,7 @@ out:
return ret;
}
-asmlinkage int sys_munlockall(void)
+asmlinkage long sys_munlockall(void)
{
int ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 61826cfa2..3bddb5c18 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -275,7 +275,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
vma->vm_ops = NULL;
vma->vm_offset = off;
vma->vm_file = NULL;
- vma->vm_pte = 0;
+ vma->vm_private_data = NULL;
/* Clear old maps */
error = -ENOMEM;
@@ -547,7 +547,7 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
mpnt->vm_ops = area->vm_ops;
mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
mpnt->vm_file = area->vm_file;
- mpnt->vm_pte = area->vm_pte;
+ mpnt->vm_private_data = area->vm_private_data;
if (mpnt->vm_file)
get_file(mpnt->vm_file);
if (mpnt->vm_ops && mpnt->vm_ops->open)
@@ -707,7 +707,7 @@ int do_munmap(unsigned long addr, size_t len)
return 0;
}
-asmlinkage int sys_munmap(unsigned long addr, size_t len)
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
int ret;
@@ -778,7 +778,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_ops = NULL;
vma->vm_offset = 0;
vma->vm_file = NULL;
- vma->vm_pte = 0;
+ vma->vm_private_data = NULL;
/*
* merge_segments may merge our vma, so we can't refer to it
@@ -813,6 +813,7 @@ void exit_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt;
+ release_segments(mm);
mpnt = mm->mmap;
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
mm->rss = 0;
@@ -919,7 +920,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
/* To share, we must have the same file, operations.. */
if ((mpnt->vm_file != prev->vm_file)||
- (mpnt->vm_pte != prev->vm_pte) ||
+ (mpnt->vm_private_data != prev->vm_private_data) ||
(mpnt->vm_ops != prev->vm_ops) ||
(mpnt->vm_flags != prev->vm_flags) ||
(prev->vm_end != mpnt->vm_start))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b1504af83..61ef3116d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -194,7 +194,7 @@ static int mprotect_fixup(struct vm_area_struct * vma,
return 0;
}
-asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
+asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * next;
diff --git a/mm/mremap.c b/mm/mremap.c
index 2852f9b06..95f4b4f90 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -118,7 +118,7 @@ oops_we_failed:
flush_cache_range(mm, new_addr, new_addr + len);
while ((offset += PAGE_SIZE) < len)
move_one_page(mm, new_addr + offset, old_addr + offset);
- zap_page_range(mm, new_addr, new_addr + len);
+ zap_page_range(mm, new_addr, len);
flush_tlb_range(mm, new_addr, new_addr + len);
return -1;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22ce7ac00..b62783c72 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/config.h>
@@ -13,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>
+#include <linux/bigmem.h> /* export bigmem vars */
#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
@@ -20,6 +22,8 @@
int nr_swap_pages = 0;
int nr_free_pages = 0;
+int nr_lru_pages;
+LIST_HEAD(lru_cache);
/*
* Free area management
@@ -45,7 +49,12 @@ struct free_area_struct {
#define memory_head(x) ((struct page *)(x))
+#ifdef CONFIG_BIGMEM
+#define BIGMEM_LISTS_OFFSET NR_MEM_LISTS
+static struct free_area_struct free_area[NR_MEM_LISTS*2];
+#else
static struct free_area_struct free_area[NR_MEM_LISTS];
+#endif
static inline void init_mem_queue(struct free_area_struct * head)
{
@@ -101,6 +110,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
#define list(x) (mem_map+(x))
+#ifdef CONFIG_BIGMEM
+ if (map_nr >= bigmem_mapnr) {
+ area += BIGMEM_LISTS_OFFSET;
+ nr_free_bigpages -= mask;
+ }
+#endif
map_nr &= mask;
nr_free_pages -= mask;
while (mask + (1 << (NR_MEM_LISTS-1))) {
@@ -127,7 +142,6 @@ int __free_page(struct page *page)
if (PageLocked(page))
PAGE_BUG(page);
- page->flags &= ~(1 << PG_referenced);
free_pages_ok(page - mem_map, 0);
return 1;
}
@@ -145,7 +159,6 @@ int free_pages(unsigned long addr, unsigned long order)
PAGE_BUG(map);
if (PageLocked(map))
PAGE_BUG(map);
- map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
return 1;
}
@@ -160,6 +173,29 @@ int free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
+
+#ifdef CONFIG_BIGMEM
+#define RMQUEUEBIG(order, gfp_mask) \
+if (gfp_mask & __GFP_BIGMEM) { \
+ struct free_area_struct * area = free_area+order+BIGMEM_LISTS_OFFSET; \
+ unsigned long new_order = order; \
+ do { struct page *prev = memory_head(area), *ret = prev->next; \
+ if (memory_head(area) != ret) { \
+ unsigned long map_nr; \
+ (prev->next = ret->next)->prev = prev; \
+ map_nr = ret - mem_map; \
+ MARK_USED(map_nr, new_order, area); \
+ nr_free_pages -= 1 << order; \
+ nr_free_bigpages -= 1 << order; \
+ EXPAND(ret, map_nr, order, new_order, area); \
+ spin_unlock_irqrestore(&page_alloc_lock, flags); \
+ return ADDRESS(map_nr); \
+ } \
+ new_order++; area++; \
+ } while (new_order < NR_MEM_LISTS); \
+}
+#endif
+
#define RMQUEUE(order, gfp_mask) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
@@ -194,8 +230,6 @@ do { unsigned long size = 1 << high; \
set_page_count(map, 1); \
} while (0)
-int low_on_memory = 0;
-
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
@@ -221,7 +255,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
*/
if (!(current->flags & PF_MEMALLOC)) {
int freed;
+ static int low_on_memory = 0;
+#ifndef CONFIG_BIGMEM
if (nr_free_pages > freepages.min) {
if (!low_on_memory)
goto ok_to_allocate;
@@ -232,6 +268,32 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
}
low_on_memory = 1;
+#else
+ static int low_on_bigmemory = 0;
+
+ if (gfp_mask & __GFP_BIGMEM)
+ {
+ if (nr_free_pages > freepages.min) {
+ if (!low_on_bigmemory)
+ goto ok_to_allocate;
+ if (nr_free_pages >= freepages.high) {
+ low_on_bigmemory = 0;
+ goto ok_to_allocate;
+ }
+ }
+ low_on_bigmemory = 1;
+ } else {
+ if (nr_free_pages-nr_free_bigpages > freepages.min) {
+ if (!low_on_memory)
+ goto ok_to_allocate;
+ if (nr_free_pages-nr_free_bigpages >= freepages.high) {
+ low_on_memory = 0;
+ goto ok_to_allocate;
+ }
+ }
+ low_on_memory = 1;
+ }
+#endif
current->flags |= PF_MEMALLOC;
freed = try_to_free_pages(gfp_mask);
current->flags &= ~PF_MEMALLOC;
@@ -241,6 +303,9 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order)
}
ok_to_allocate:
spin_lock_irqsave(&page_alloc_lock, flags);
+#ifdef CONFIG_BIGMEM
+ RMQUEUEBIG(order, gfp_mask);
+#endif
RMQUEUE(order, gfp_mask);
spin_unlock_irqrestore(&page_alloc_lock, flags);
@@ -268,9 +333,12 @@ void show_free_areas(void)
unsigned long order, flags;
unsigned long total = 0;
- printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
- printk("Free: %d (%d %d %d)\n",
+ printk("Free pages: %6dkB (%6dkB BigMem)\n ( ",
+ nr_free_pages<<(PAGE_SHIFT-10),
+ nr_free_bigpages<<(PAGE_SHIFT-10));
+ printk("Free: %d, lru_cache: %d (%d %d %d)\n",
nr_free_pages,
+ nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
@@ -281,6 +349,13 @@ void show_free_areas(void)
for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
nr ++;
}
+#ifdef CONFIG_BIGMEM
+ for (tmp = free_area[BIGMEM_LISTS_OFFSET+order].next;
+ tmp != memory_head(free_area+BIGMEM_LISTS_OFFSET+order);
+ tmp = tmp->next) {
+ nr ++;
+ }
+#endif
total += nr * ((PAGE_SIZE>>10) << order);
printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
}
@@ -334,6 +409,9 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
for (i = 0 ; i < NR_MEM_LISTS ; i++) {
unsigned long bitmap_size;
init_mem_queue(free_area+i);
+#ifdef CONFIG_BIGMEM
+ init_mem_queue(free_area+BIGMEM_LISTS_OFFSET+i);
+#endif
mask += mask;
end_mem = (end_mem + ~mask) & mask;
bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
@@ -342,6 +420,11 @@ unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_m
free_area[i].map = (unsigned int *) start_mem;
memset((void *) start_mem, 0, bitmap_size);
start_mem += bitmap_size;
+#ifdef CONFIG_BIGMEM
+ free_area[BIGMEM_LISTS_OFFSET+i].map = (unsigned int *) start_mem;
+ memset((void *) start_mem, 0, bitmap_size);
+ start_mem += bitmap_size;
+#endif
}
return start_mem;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 0f7e6d199..72e8cb95a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,8 +18,6 @@
#include <asm/pgtable.h>
-static DECLARE_WAIT_QUEUE_HEAD(lock_queue);
-
/*
* Reads or writes a swap page.
* wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
@@ -35,7 +33,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lock_queue);
* that shared pages stay shared while being swapped.
*/
-static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait, int dolock)
+static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, int wait)
{
unsigned long type, offset;
struct swap_info_struct * p;
@@ -90,7 +88,6 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
} else
kstat.pswpout++;
- get_page(page);
if (p->swap_device) {
zones[0] = offset;
zones_used = 1;
@@ -99,58 +96,26 @@ static void rw_swap_page_base(int rw, unsigned long entry, struct page *page, in
} else if (p->swap_file) {
struct inode *swapf = p->swap_file->d_inode;
int i;
- if (swapf->i_op->get_block == NULL
- && swapf->i_op->smap != NULL){
- /*
- With MS-DOS, we use msdos_smap which returns
- a sector number (not a cluster or block number).
- It is a patch to enable the UMSDOS project.
- Other people are working on better solution.
-
- It sounds like ll_rw_swap_file defined
- its operation size (sector size) based on
- PAGE_SIZE and the number of blocks to read.
- So using get_block or smap should work even if
- smap will require more blocks.
- */
- int j;
- unsigned int block = offset << 3;
-
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
- if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
+ int j;
+ unsigned int block = offset
+ << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
+
+ block_size = swapf->i_sb->s_blocksize;
+ for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
+ if (!(zones[i] = bmap(swapf,block++))) {
+ printk("rw_swap_page: bad swap file\n");
+ return;
}
- block_size = 512;
- }else{
- int j;
- unsigned int block = offset
- << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
-
- block_size = swapf->i_sb->s_blocksize;
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
- if (!(zones[i] = bmap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return;
- }
- zones_used = i;
- dev = swapf->i_dev;
- }
+ zones_used = i;
+ dev = swapf->i_dev;
} else {
printk(KERN_ERR "rw_swap_page: no swap file or device\n");
- put_page(page);
return;
}
if (!wait) {
set_bit(PG_decr_after, &page->flags);
atomic_inc(&nr_async_pages);
}
- if (dolock) {
- set_bit(PG_free_swap_after, &page->flags);
- p->swap_map[offset]++;
- }
- set_bit(PG_free_after, &page->flags);
/* block_size == PAGE_SIZE/zones_used */
brw_page(rw, page, dev, zones, block_size, 0);
@@ -192,29 +157,10 @@ void rw_swap_page(int rw, struct page *page, int wait)
PAGE_BUG(page);
if (page->inode != &swapper_inode)
PAGE_BUG(page);
- rw_swap_page_base(rw, entry, page, wait, 1);
-}
-
-/*
- * Setting up a new swap file needs a simple wrapper just to read the
- * swap signature. SysV shared memory also needs a simple wrapper.
- */
-void rw_swap_page_nocache(int rw, unsigned long entry, char *buf)
-{
- struct page *page = mem_map + MAP_NR(buf);
-
- if (TryLockPage(page))
- PAGE_BUG(page);
- if (PageSwapCache(page))
- PAGE_BUG(page);
- if (page->inode)
- PAGE_BUG(page);
- page->offset = entry;
- rw_swap_page_base(rw, entry, page, 1, 1);
+ rw_swap_page_base(rw, entry, page, wait);
}
/*
- * shmfs needs a version that doesn't put the page in the page cache!
* The swap lock map insists that pages be in the page cache!
* Therefore we can't use it. Later when we can remove the need for the
* lock map and we can reduce the number of functions exported.
@@ -227,5 +173,5 @@ void rw_swap_page_nolock(int rw, unsigned long entry, char *buf, int wait)
PAGE_BUG(page);
if (PageSwapCache(page))
PAGE_BUG(page);
- rw_swap_page_base(rw, entry, page, wait, 0);
+ rw_swap_page_base(rw, entry, page, wait);
}
diff --git a/mm/slab.c b/mm/slab.c
index ef7ec9279..0350f3370 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3,6 +3,8 @@
* Written by Mark Hemment, 1996/97.
* (markhe@nextd.demon.co.uk)
*
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
* 11 April '97. Started multi-threading - markhe
* The global cache-chain is protected by the semaphore 'cache_chain_sem'.
* The sem is only needed when accessing/extending the cache-chain, which
@@ -100,16 +102,10 @@
* is less than 512 (PAGE_SIZE<<3), but greater than 256.
*/
-#include <linux/mm.h>
+#include <linux/config.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
-#include <linux/config.h>
#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/system.h>
-#include <asm/atomic.h>
-#include <asm/spinlock.h>
#ifdef __mips__
#include <asm/pgtable.h>
#include <asm/addrspace.h>
@@ -989,6 +985,58 @@ opps:
return cachep;
}
+/*
+ * This check if the kmem_cache_t pointer is chained in the cache_cache
+ * list. -arca
+ */
+static int is_chained_kmem_cache(kmem_cache_t * cachep)
+{
+ kmem_cache_t * searchp;
+ int ret = 0;
+
+ /* Find the cache in the chain of caches. */
+ down(&cache_chain_sem);
+ for (searchp = &cache_cache; searchp->c_nextp != &cache_cache;
+ searchp = searchp->c_nextp) {
+ if (searchp->c_nextp != cachep)
+ continue;
+
+ /* Accessing clock_searchp is safe - we hold the mutex. */
+ if (cachep == clock_searchp)
+ clock_searchp = cachep->c_nextp;
+ ret = 1;
+ break;
+ }
+ up(&cache_chain_sem);
+
+ return ret;
+}
+
+/* returns 0 if every slab is been freed -arca */
+static int __kmem_cache_shrink(kmem_cache_t *cachep)
+{
+ kmem_slab_t *slabp;
+ int ret;
+
+ spin_lock_irq(&cachep->c_spinlock);
+
+ /* If the cache is growing, stop shrinking. */
+ while (!cachep->c_growing) {
+ slabp = cachep->c_lastp;
+ if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
+ break;
+ kmem_slab_unlink(slabp);
+ spin_unlock_irq(&cachep->c_spinlock);
+ kmem_slab_destroy(cachep, slabp);
+ spin_lock_irq(&cachep->c_spinlock);
+ }
+ ret = 1;
+ if (cachep->c_lastp == kmem_slab_end(cachep))
+ ret = 0; /* Cache is empty. */
+ spin_unlock_irq(&cachep->c_spinlock);
+ return ret;
+}
+
/* Shrink a cache. Releases as many slabs as possible for a cache.
* It is expected this function will be called by a module when it is
* unloaded. The cache is _not_ removed, this creates too many problems and
@@ -1000,10 +1048,6 @@ opps:
int
kmem_cache_shrink(kmem_cache_t *cachep)
{
- kmem_cache_t *searchp;
- kmem_slab_t *slabp;
- int ret;
-
if (!cachep) {
printk(KERN_ERR "kmem_shrink: NULL ptr\n");
return 2;
@@ -1013,43 +1057,73 @@ kmem_cache_shrink(kmem_cache_t *cachep)
return 2;
}
+ if (!is_chained_kmem_cache(cachep)) {
+ printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n",
+ cachep);
+ return 2;
+ }
+
+ return __kmem_cache_shrink(cachep);
+}
+
+/*
+ * Remove a kmem_cache_t object from the slab cache. When returns 0 it
+ * completed succesfully. -arca
+ */
+int kmem_cache_destroy(kmem_cache_t * cachep)
+{
+ kmem_cache_t * prev;
+ int ret;
+
+ if (!cachep) {
+ printk(KERN_ERR "kmem_destroy: NULL ptr\n");
+ return 1;
+ }
+ if (in_interrupt()) {
+ printk(KERN_ERR "kmem_destroy: Called during int - %s\n",
+ cachep->c_name);
+ return 1;
+ }
+
+ ret = 0;
/* Find the cache in the chain of caches. */
- down(&cache_chain_sem); /* Semaphore is needed. */
- searchp = &cache_cache;
- for (;searchp->c_nextp != &cache_cache; searchp = searchp->c_nextp) {
- if (searchp->c_nextp != cachep)
+ down(&cache_chain_sem);
+ for (prev = &cache_cache; prev->c_nextp != &cache_cache;
+ prev = prev->c_nextp) {
+ if (prev->c_nextp != cachep)
continue;
/* Accessing clock_searchp is safe - we hold the mutex. */
if (cachep == clock_searchp)
clock_searchp = cachep->c_nextp;
- goto found;
+
+ /* remove the cachep from the cache_cache list. -arca */
+ prev->c_nextp = cachep->c_nextp;
+
+ ret = 1;
+ break;
}
up(&cache_chain_sem);
- printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
- return 2;
-found:
- /* Release the semaphore before getting the cache-lock. This could
- * mean multiple engines are shrinking the cache, but so what.
- */
- up(&cache_chain_sem);
- spin_lock_irq(&cachep->c_spinlock);
- /* If the cache is growing, stop shrinking. */
- while (!cachep->c_growing) {
- slabp = cachep->c_lastp;
- if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
- break;
- kmem_slab_unlink(slabp);
- spin_unlock_irq(&cachep->c_spinlock);
- kmem_slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->c_spinlock);
+ if (!ret) {
+ printk(KERN_ERR "kmem_destroy: Invalid cache addr %p\n",
+ cachep);
+ return 1;
}
- ret = 1;
- if (cachep->c_lastp == kmem_slab_end(cachep))
- ret--; /* Cache is empty. */
- spin_unlock_irq(&cachep->c_spinlock);
- return ret;
+
+ if (__kmem_cache_shrink(cachep)) {
+ printk(KERN_ERR "kmem_destroy: Can't free all objects %p\n",
+ cachep);
+ down(&cache_chain_sem);
+ cachep->c_nextp = cache_cache.c_nextp;
+ cache_cache.c_nextp = cachep;
+ up(&cache_chain_sem);
+ return 1;
+ }
+
+ kmem_cache_free(&cache_cache, cachep);
+
+ return 0;
}
/* Get the memory for a slab management obj. */
@@ -1587,7 +1661,7 @@ bad_slab:
#if 1
/* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
-*(int *) 0 = 0;
+ BUG();
#endif
return;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2aa17d3a4..5cfc686dd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -214,8 +214,6 @@ static inline void remove_from_swap_cache(struct page *page)
page_address(page), page_count(page));
#endif
PageClearSwapCache(page);
- if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
remove_inode_page(page);
}
@@ -239,6 +237,15 @@ void __delete_from_swap_cache(struct page *page)
swap_free (entry);
}
+static void delete_from_swap_cache_nolock(struct page *page)
+{
+ if (!swapper_inode.i_op->flushpage ||
+ swapper_inode.i_op->flushpage(&swapper_inode, page, 0))
+ lru_cache_del(page);
+
+ __delete_from_swap_cache(page);
+}
+
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
@@ -247,7 +254,7 @@ void delete_from_swap_cache(struct page *page)
{
lock_page(page);
- __delete_from_swap_cache(page);
+ delete_from_swap_cache_nolock(page);
UnlockPage(page);
page_cache_release(page);
@@ -267,13 +274,13 @@ void free_page_and_swap_cache(unsigned long addr)
*/
lock_page(page);
if (PageSwapCache(page) && !is_page_shared(page)) {
- long entry = page->offset;
- remove_from_swap_cache(page);
- swap_free(entry);
+ delete_from_swap_cache_nolock(page);
page_cache_release(page);
}
UnlockPage(page);
+ clear_bit(PG_swap_entry, &page->flags);
+
__free_page(page);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ce18f34f5..c4ce5377d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -46,16 +46,36 @@ static inline int scan_swap_map(struct swap_info_struct *si)
}
}
si->cluster_nr = SWAPFILE_CLUSTER;
+
+ /* try to find an empty (even not aligned) cluster. */
+ offset = si->lowest_bit;
+ check_next_cluster:
+ if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
+ {
+ int nr;
+ for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
+ if (si->swap_map[nr])
+ {
+ offset = nr+1;
+ goto check_next_cluster;
+ }
+ /* We found a completly empty cluster, so start
+ * using it.
+ */
+ goto got_page;
+ }
+ /* No luck, so now go finegrined as usual. -Andrea */
for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
if (si->swap_map[offset])
continue;
- si->lowest_bit = offset;
-got_page:
- si->swap_map[offset] = 1;
- nr_swap_pages--;
+ got_page:
+ if (offset == si->lowest_bit)
+ si->lowest_bit++;
if (offset == si->highest_bit)
si->highest_bit--;
- si->cluster_next = offset;
+ si->swap_map[offset] = 1;
+ nr_swap_pages--;
+ si->cluster_next = offset+1;
return offset;
}
return 0;
@@ -81,12 +101,9 @@ unsigned long get_swap_page(void)
entry = SWP_ENTRY(type,offset);
type = swap_info[type].next;
if (type < 0 ||
- p->prio != swap_info[type].prio)
- {
+ p->prio != swap_info[type].prio) {
swap_list.next = swap_list.head;
- }
- else
- {
+ } else {
swap_list.next = type;
}
return entry;
@@ -126,15 +143,16 @@ void swap_free(unsigned long entry)
offset = SWP_OFFSET(entry);
if (offset >= p->max)
goto bad_offset;
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
if (!p->swap_map[offset])
goto bad_free;
if (p->swap_map[offset] < SWAP_MAP_MAX) {
- if (!--p->swap_map[offset])
+ if (!--p->swap_map[offset]) {
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
nr_swap_pages++;
+ }
}
#ifdef DEBUG_SWAP
printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
@@ -157,6 +175,44 @@ bad_free:
goto out;
}
+/* needs the big kernel lock */
+unsigned long acquire_swap_entry(struct page *page)
+{
+ struct swap_info_struct * p;
+ unsigned long offset, type;
+ unsigned long entry;
+
+ if (!test_bit(PG_swap_entry, &page->flags))
+ goto new_swap_entry;
+
+ /* We have the old entry in the page offset still */
+ entry = page->offset;
+ if (!entry)
+ goto new_swap_entry;
+ type = SWP_TYPE(entry);
+ if (type & SHM_SWP_TYPE)
+ goto new_swap_entry;
+ if (type >= nr_swapfiles)
+ goto new_swap_entry;
+ p = type + swap_info;
+ if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
+ goto new_swap_entry;
+ offset = SWP_OFFSET(entry);
+ if (offset >= p->max)
+ goto new_swap_entry;
+ /* Has it been re-used for something else? */
+ if (p->swap_map[offset])
+ goto new_swap_entry;
+
+ /* We're cool, we can just use the old one */
+ p->swap_map[offset] = 1;
+ nr_swap_pages--;
+ return entry;
+
+new_swap_entry:
+ return get_swap_page();
+}
+
/*
* The swap entry has been read in advance, and we return 1 to indicate
* that the page has been used or is no longer needed.
@@ -266,7 +322,7 @@ static void unuse_process(struct mm_struct * mm, unsigned long entry,
/*
* Go through process' page directory.
*/
- if (!mm || mm == &init_mm)
+ if (!mm)
return;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
@@ -340,7 +396,7 @@ static int try_to_unuse(unsigned int type)
return 0;
}
-asmlinkage int sys_swapoff(const char * specialfile)
+asmlinkage long sys_swapoff(const char * specialfile)
{
struct swap_info_struct * p = NULL;
struct dentry * dentry;
@@ -484,7 +540,7 @@ int is_swap_partition(kdev_t dev) {
*
* The swapon system call
*/
-asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
+asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
{
struct swap_info_struct * p;
struct dentry * swap_dentry;
@@ -495,7 +551,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
static int least_priority = 0;
union swap_header *swap_header = 0;
int swap_header_version;
- int lock_map_size = PAGE_SIZE;
int nr_good_pages = 0;
unsigned long maxpages;
int swapfilesize;
@@ -661,8 +716,9 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
else
p->swap_map[page] = SWAP_MAP_BAD;
}
- nr_good_pages = swap_header->info.last_page - i;
- lock_map_size = (p->max + 7) / 8;
+ nr_good_pages = swap_header->info.last_page -
+ swap_header->info.nr_badpages -
+ 1 /* header page */;
if (error)
goto bad_swap;
}
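
As a worked example of the corrected accounting above (numbers invented for illustration): a version-2 swap header reporting last_page = 10000 and nr_badpages = 12 now yields nr_good_pages = 10000 - 12 - 1 = 9987, the final 1 being the header page itself, instead of deriving the count from the loop index as the old "last_page - i" expression did.
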
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a4eeb1dc5..9bd4142c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2,6 +2,7 @@
* linux/mm/vmalloc.c
*
* Copyright (C) 1993 Linus Torvalds
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/malloc.h>
@@ -94,7 +95,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo
unsigned long page;
if (!pte_none(*pte))
printk("alloc_area_pte: page already exists\n");
- page = __get_free_page(GFP_KERNEL);
+ page = __get_free_page(GFP_KERNEL|GFP_BIGMEM);
if (!page)
return -ENOMEM;
set_pte(pte, mk_pte(page, prot));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ae052b94..8ee000fc0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -17,6 +17,7 @@
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
+#include <linux/bigmem.h>
#include <asm/pgtable.h>
@@ -31,8 +32,7 @@
* using a process that no longer actually exists (it might
* have died while we slept).
*/
-static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
- unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
unsigned long entry;
@@ -47,15 +47,12 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
goto out_failed;
page = mem_map + MAP_NR(page_addr);
- spin_lock(&tsk->mm->page_table_lock);
+ spin_lock(&vma->vm_mm->page_table_lock);
if (pte_val(pte) != pte_val(*page_table))
goto out_failed_unlock;
- /*
- * Dont be too eager to get aging right if
- * memory is dangerously low.
- */
- if (!low_on_memory && pte_young(pte)) {
+ /* Don't look at this pte if it's been accessed recently. */
+ if (pte_young(pte)) {
/*
* Transfer the "accessed" bit from the page
* tables to the global page map.
@@ -67,7 +64,8 @@ static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
if (PageReserved(page)
|| PageLocked(page)
- || ((gfp_mask & __GFP_DMA) && !PageDMA(page)))
+ || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)))
goto out_failed_unlock;
/*
@@ -136,15 +134,16 @@ drop_pte:
*/
flush_cache_page(vma, address);
if (vma->vm_ops && vma->vm_ops->swapout) {
- pid_t pid = tsk->pid;
+ int error;
pte_clear(page_table);
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
flush_tlb_page(vma, address);
vma->vm_mm->rss--;
-
- if (vma->vm_ops->swapout(vma, page))
- kill_proc(pid, SIGBUS, 1);
- goto out_free_success;
+ error = vma->vm_ops->swapout(vma, page);
+ if (!error)
+ goto out_free_success;
+ __free_page(page);
+ return error;
}
/*
@@ -153,14 +152,16 @@ drop_pte:
* we have the swap cache set up to associate the
* page with that swap entry.
*/
- entry = get_swap_page();
+ entry = acquire_swap_entry(page);
if (!entry)
- goto out_failed; /* No swap space left */
+ goto out_failed_unlock; /* No swap space left */
+ if (!(page = prepare_bigmem_swapout(page)))
+ goto out_swap_free_unlock;
+
vma->vm_mm->rss--;
- tsk->nswap++;
set_pte(page_table, __pte(entry));
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
flush_tlb_page(vma, address);
swap_duplicate(entry); /* One for the process, one for the swap cache */
@@ -175,9 +176,14 @@ out_free_success:
__free_page(page);
return 1;
out_failed_unlock:
- spin_unlock(&tsk->mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
out_failed:
return 0;
+out_swap_free_unlock:
+ swap_free(entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+
}
/*
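
The reworked tail of try_to_swap_out() above now allocates the swap entry while the page-table lock is still held and hands it back with swap_free() if the BIGMEM page cannot be bounced into a regular page, rather than leaking the entry. A small userspace sketch of that acquire-then-undo-on-failure ordering, with made-up helpers in place of the kernel primitives:

#include <stdio.h>
#include <stdlib.h>

/* Made-up stand-ins: a slot allocator and a bounce copy that can fail. */
static int alloc_slot(void)             { return 42; }
static void free_slot(int slot)         { printf("slot %d given back\n", slot); }
static void *bounce_copy(int fail)      { return fail ? NULL : malloc(16); }

/* Mirrors the new ordering at the tail of try_to_swap_out(): take the
 * swap slot first, then attempt the bounce copy, and undo the slot
 * allocation if the copy cannot be made (like swap_free(entry) on the
 * out_swap_free_unlock path). */
static int swap_one(int make_copy_fail)
{
        int slot = alloc_slot();
        void *copy = bounce_copy(make_copy_fail);

        if (!copy) {
                free_slot(slot);
                return 0;               /* nothing swapped out */
        }
        free(copy);
        return 1;                       /* page went to 'slot' */
}

int main(void)
{
        printf("success path -> %d\n", swap_one(0));
        printf("failure path -> %d\n", swap_one(1));
        return 0;
}
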
@@ -194,8 +200,7 @@ out_failed:
* (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
*/
-static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
- pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pte_t * pte;
unsigned long pmd_end;
@@ -216,8 +221,8 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
do {
int result;
- tsk->mm->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+ vma->vm_mm->swap_address = address + PAGE_SIZE;
+ result = try_to_swap_out(vma, address, pte, gfp_mask);
if (result)
return result;
address += PAGE_SIZE;
@@ -226,8 +231,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
return 0;
}
-static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -247,7 +251,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
end = pgd_end;
do {
- int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+ int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
if (result)
return result;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -256,8 +260,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
return 0;
}
-static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
pgd_t *pgdir;
unsigned long end;
@@ -266,11 +269,11 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
if (vma->vm_flags & VM_LOCKED)
return 0;
- pgdir = pgd_offset(tsk->mm, address);
+ pgdir = pgd_offset(vma->vm_mm, address);
end = vma->vm_end;
while (address < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+ int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
if (result)
return result;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -279,7 +282,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
return 0;
}
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -287,18 +290,18 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
/*
* Go through process' page directory.
*/
- address = p->mm->swap_address;
+ address = mm->swap_address;
/*
* Find the proper vm-area
*/
- vma = find_vma(p->mm, address);
+ vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
address = vma->vm_start;
for (;;) {
- int result = swap_out_vma(p, vma, address, gfp_mask);
+ int result = swap_out_vma(vma, address, gfp_mask);
if (result)
return result;
vma = vma->vm_next;
@@ -309,8 +312,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
}
/* We didn't find anything for the process */
- p->mm->swap_cnt = 0;
- p->mm->swap_address = 0;
+ mm->swap_cnt = 0;
+ mm->swap_address = 0;
return 0;
}
@@ -321,9 +324,11 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
*/
static int swap_out(unsigned int priority, int gfp_mask)
{
- struct task_struct * p, * pbest;
- int counter, assign, max_cnt;
+ struct task_struct * p;
+ int counter;
+ int __ret = 0;
+ lock_kernel();
/*
* We make one or two passes through the task list, indexed by
* assign = {0, 1}:
@@ -338,46 +343,61 @@ static int swap_out(unsigned int priority, int gfp_mask)
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = nr_tasks / (priority+1);
+ counter = nr_threads / (priority+1);
if (counter < 1)
counter = 1;
- if (counter > nr_tasks)
- counter = nr_tasks;
+ if (counter > nr_threads)
+ counter = nr_threads;
for (; counter >= 0; counter--) {
- assign = 0;
- max_cnt = 0;
- pbest = NULL;
+ int assign = 0;
+ int max_cnt = 0;
+ struct mm_struct *best = NULL;
+ int pid = 0;
select:
read_lock(&tasklist_lock);
p = init_task.next_task;
for (; p != &init_task; p = p->next_task) {
- if (!p->swappable)
+ struct mm_struct *mm = p->mm;
+ if (!p->swappable || !mm)
continue;
- if (p->mm->rss <= 0)
+ if (mm->rss <= 0)
continue;
/* Refresh swap_cnt? */
if (assign)
- p->mm->swap_cnt = p->mm->rss;
- if (p->mm->swap_cnt > max_cnt) {
- max_cnt = p->mm->swap_cnt;
- pbest = p;
+ mm->swap_cnt = mm->rss;
+ if (mm->swap_cnt > max_cnt) {
+ max_cnt = mm->swap_cnt;
+ best = mm;
+ pid = p->pid;
}
}
read_unlock(&tasklist_lock);
- if (!pbest) {
+ if (!best) {
if (!assign) {
assign = 1;
goto select;
}
goto out;
- }
+ } else {
+ int ret;
+
+ atomic_inc(&best->mm_count);
+ ret = swap_out_mm(best, gfp_mask);
+ mmdrop(best);
+
+ if (!ret)
+ continue;
- if (swap_out_process(pbest, gfp_mask))
- return 1;
+ if (ret < 0)
+ kill_proc(pid, SIGBUS, 1);
+ __ret = 1;
+ goto out;
+ }
}
out:
- return 0;
+ unlock_kernel();
+ return __ret;
}
/*
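
The rewritten swap_out() above picks an mm_struct rather than a task, pins it with atomic_inc(&best->mm_count) for the duration of swap_out_mm(), releases it with mmdrop(), and converts a negative return (a failed vma swapout()) into a SIGBUS for the remembered pid. A userspace sketch of that pin/work/drop-and-propagate shape (the struct, get_ref/drop_ref and do_work are stand-ins, not the kernel's types):

#include <stdio.h>

/* Illustrative stand-in for an mm_struct with a bare reference count. */
struct victim {
        int refcount;
        long rss;
};

static void get_ref(struct victim *v)   { v->refcount++; }
static void drop_ref(struct victim *v)  { v->refcount--; }

/* Stand-in for swap_out_mm(): >0 progress, 0 nothing to do, <0 hard error. */
static int do_work(struct victim *v)
{
        if (v->rss <= 0)
                return 0;
        v->rss--;
        return 1;
}

int main(void)
{
        struct victim victims[2] = { { 1, 0 }, { 1, 3 } };
        int i;

        for (i = 0; i < 2; i++) {
                struct victim *best = &victims[i];
                int ret;

                get_ref(best);          /* keep the victim alive while working on it */
                ret = do_work(best);
                drop_ref(best);         /* like mmdrop(best) */

                if (!ret)
                        continue;       /* nothing reclaimed, try the next candidate */
                if (ret < 0)
                        fprintf(stderr, "swapout failed; kernel would SIGBUS the owner\n");
                printf("made progress on victim %d\n", i);
                break;
        }
        return 0;
}
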
@@ -394,8 +414,6 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
int priority;
int count = SWAP_CLUSTER_MAX;
- lock_kernel();
-
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
@@ -423,32 +441,10 @@ static int do_try_to_free_pages(unsigned int gfp_mask)
shrink_dcache_memory(priority, gfp_mask);
} while (--priority >= 0);
done:
- unlock_kernel();
return priority >= 0;
}
-/*
- * Before we start the kernel thread, print out the
- * kswapd initialization message (otherwise the init message
- * may be printed in the middle of another driver's init
- * message). It looks very bad when that happens.
- */
-void __init kswapd_setup(void)
-{
- int i;
- char *revision="$Revision: 1.5 $", *s, *e;
-
- swap_setup();
-
- if ((s = strchr(revision, ':')) &&
- (e = strchr(s, '$')))
- s++, i = e - s;
- else
- s = revision, i = -1;
- printk ("Starting kswapd v%.*s\n", i, s);
-}
-
static struct task_struct *kswapd_process;
/*
@@ -499,7 +495,9 @@ int kswapd(void *unused)
* up on a more timely basis.
*/
do {
- if (nr_free_pages >= freepages.high)
+ /* kswapd is critical for providing GFP_ATOMIC
+ allocations (not GFP_BIGMEM ones). */
+ if (nr_free_pages - nr_free_bigpages >= freepages.high)
break;
if (!do_try_to_free_pages(GFP_KSWAPD))
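
With the change above, kswapd keeps reclaiming until the pool of regular pages reaches freepages.high, because GFP_ATOMIC allocations cannot be served from BIGMEM and so only nr_free_pages - nr_free_bigpages counts toward that goal. A tiny sketch of the wakeup condition with invented numbers (field names are stand-ins for the kernel globals):

#include <stdio.h>

/* Illustrative stand-ins for the kernel's global free-page counters. */
struct counters {
        int nr_free_pages;
        int nr_free_bigpages;
        int freepages_high;
};

/* kswapd keeps working while the pool usable for GFP_ATOMIC allocations
 * (total free pages minus free BIGMEM pages) is below the high-water mark. */
static int keep_reclaiming(const struct counters *c)
{
        return c->nr_free_pages - c->nr_free_bigpages < c->freepages_high;
}

int main(void)
{
        struct counters c = { 300, 250, 128 };

        /* 300 pages look free, but only 50 of them can back GFP_ATOMIC */
        printf("keep reclaiming: %d\n", keep_reclaiming(&c));
        return 0;
}
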
@@ -535,4 +533,13 @@ int try_to_free_pages(unsigned int gfp_mask)
retval = do_try_to_free_pages(gfp_mask);
return retval;
}
-
+
+static int __init kswapd_init(void)
+{
+ printk("Starting kswapd v1.6\n");
+ swap_setup();
+ kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ return 0;
+}
+
+module_init(kswapd_init)