Diffstat (limited to 'mm')
-rw-r--r--  mm/.cvsignore   |   1
-rw-r--r--  mm/filemap.c    | 187
-rw-r--r--  mm/memory.c     | 231
-rw-r--r--  mm/mlock.c      |  10
-rw-r--r--  mm/mmap.c       |  73
-rw-r--r--  mm/mprotect.c   |  10
-rw-r--r--  mm/mremap.c     |   4
-rw-r--r--  mm/page_alloc.c | 128
-rw-r--r--  mm/page_io.c    | 133
-rw-r--r--  mm/simp.c       |   4
-rw-r--r--  mm/slab.c       |  48
-rw-r--r--  mm/swap_state.c | 219
-rw-r--r--  mm/swapfile.c   | 169
-rw-r--r--  mm/vmscan.c     | 253
14 files changed, 954 insertions, 516 deletions
diff --git a/mm/.cvsignore b/mm/.cvsignore
index 4671378ae..857dd22e9 100644
--- a/mm/.cvsignore
+++ b/mm/.cvsignore
@@ -1 +1,2 @@
.depend
+.*.flags
diff --git a/mm/filemap.c b/mm/filemap.c
index 6d718c01d..7a4e20e21 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -25,6 +25,8 @@
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/swapctl.h>
#include <asm/system.h>
#include <asm/pgtable.h>
@@ -115,7 +117,7 @@ repeat:
}
}
-int shrink_mmap(int priority, int dma)
+int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
struct page * page;
@@ -134,7 +136,7 @@ int shrink_mmap(int priority, int dma)
if (PageLocked(page))
goto next;
- if (dma && !PageDMA(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
goto next;
/* First of all, regenerate the page's referenced bit
from any buffers in the page */
@@ -158,20 +160,31 @@ int shrink_mmap(int priority, int dma)
switch (atomic_read(&page->count)) {
case 1:
- /* If it has been referenced recently, don't free it */
- if (test_and_clear_bit(PG_referenced, &page->flags))
- break;
-
- /* is it a page cache page? */
+ /* is it a swap-cache or page-cache page? */
if (page->inode) {
+ if (test_and_clear_bit(PG_referenced, &page->flags)) {
+ touch_page(page);
+ break;
+ }
+ age_page(page);
+ if (page->age)
+ break;
+ if (PageSwapCache(page)) {
+ delete_from_swap_cache(page);
+ return 1;
+ }
remove_page_from_hash_queue(page);
remove_page_from_inode_queue(page);
__free_page(page);
return 1;
}
+ /* It's not a cache page, so we don't do aging.
+ * If it has been referenced recently, don't free it */
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ break;
/* is it a buffer cache page? */
- if (bh && try_to_free_buffer(bh, &bh, 6))
+ if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
return 1;
break;
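
The hunk above is part of a wider interface change in this patch: shrink_mmap() (and, further down, __get_free_pages() and kmem_cache_reap()) stop taking separate priority/dma arguments and instead receive a single gfp_mask whose bits describe what the caller can tolerate. A minimal user-space sketch of that style of interface follows; the flag values are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

/* Hypothetical flag values for illustration only -- the real kernel
 * definitions live elsewhere and differ between versions. */
#define __GFP_WAIT 0x01   /* caller may sleep */
#define __GFP_IO   0x02   /* caller may start disk I/O */
#define __GFP_DMA  0x04   /* caller needs DMA-capable memory */

#define GFP_ATOMIC (0)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO)

/* Model of the new-style callee: one mask, tested bit by bit. */
static int can_free_buffer_page(int gfp_mask, int page_is_dma)
{
    if ((gfp_mask & __GFP_DMA) && !page_is_dma)
        return 0;    /* caller needs DMA memory, this page is not */
    if (!(gfp_mask & __GFP_IO))
        return 0;    /* freeing buffers would require I/O */
    return 1;
}

int main(void)
{
    printf("%d\n", can_free_buffer_page(GFP_KERNEL, 0));             /* 1 */
    printf("%d\n", can_free_buffer_page(GFP_ATOMIC | __GFP_DMA, 0)); /* 0 */
    return 0;
}
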
@@ -208,6 +221,8 @@ unsigned long page_unuse(unsigned long page)
return count;
if (!p->inode)
return count;
+ if (PageSwapCache(p))
+ panic ("Doing a normal page_unuse of a swap cache page");
remove_page_from_hash_queue(p);
remove_page_from_inode_queue(p);
free_page(page);
@@ -260,8 +275,10 @@ static inline void add_to_page_cache(struct page * page,
* that we could use for the cache (if it is 0 we can try to create one,
* this is all overlapped with the IO on the previous page finishing anyway)
*/
-static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
+static unsigned long try_to_read_ahead(struct file * file,
+ unsigned long offset, unsigned long page_cache)
{
+ struct inode *inode = file->f_dentry->d_inode;
struct page * page;
struct page ** hash;
@@ -282,7 +299,7 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse
*/
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, offset, hash);
- inode->i_op->readpage(inode, page);
+ inode->i_op->readpage(file, page);
page_cache = 0;
}
release_page(page);
@@ -299,18 +316,20 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse
*/
void __wait_on_page(struct page *page)
{
- struct wait_queue wait = { current, NULL };
+ struct task_struct *tsk = current;
+ struct wait_queue wait;
+ wait.task = tsk;
add_wait_queue(&page->wait, &wait);
repeat:
+ tsk->state = TASK_UNINTERRUPTIBLE;
run_task_queue(&tq_disk);
- current->state = TASK_UNINTERRUPTIBLE;
if (PageLocked(page)) {
schedule();
goto repeat;
}
+ tsk->state = TASK_RUNNING;
remove_wait_queue(&page->wait, &wait);
- current->state = TASK_RUNNING;
}
#if 0
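
The reordering of __wait_on_page() above sets the task state to TASK_UNINTERRUPTIBLE before re-testing PageLocked(), so a wake-up arriving between the test and schedule() is not lost. The same discipline, re-checking the condition every time the sleeper wakes, is what a condition-variable loop expresses in portable code; a rough user-space analogy follows (pthreads standing in for the page wait queue, not kernel code).

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the page: "page_locked" plays the role of PG_locked,
 * the mutex + condvar play the role of page->wait. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  unlocked = PTHREAD_COND_INITIALIZER;
static int page_locked = 1;

static void wait_on_page(void)
{
    pthread_mutex_lock(&lock);
    /* Re-test the condition after every wake-up, exactly as the
     * kernel loop re-tests PageLocked() after schedule(). */
    while (page_locked)
        pthread_cond_wait(&unlocked, &lock);
    pthread_mutex_unlock(&lock);
}

static void *unlock_page(void *arg)
{
    pthread_mutex_lock(&lock);
    page_locked = 0;
    pthread_cond_broadcast(&unlocked);   /* like wake_up(&page->wait) */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, unlock_page, NULL);
    wait_on_page();
    pthread_join(t, NULL);
    puts("page unlocked");
    return 0;
}
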
@@ -436,16 +455,6 @@ static void profile_readahead(int async, struct file *filp)
* 64k if defined (4K page size assumed).
*/
-#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
-
-#if 0 /* small readahead */
-#define MAX_READAHEAD PageAlignSize(4096*7)
-#define MIN_READAHEAD PageAlignSize(4096*2)
-#else /* large readahead */
-#define MAX_READAHEAD PageAlignSize(4096*18)
-#define MIN_READAHEAD PageAlignSize(4096*3)
-#endif
-
static inline int get_max_readahead(struct inode * inode)
{
if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
@@ -453,9 +462,9 @@ static inline int get_max_readahead(struct inode * inode)
return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
-static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page,
- unsigned long page_cache)
+static inline unsigned long generic_file_readahead(int reada_ok,
+ struct file * filp, struct inode * inode,
+ unsigned long ppos, struct page * page, unsigned long page_cache)
{
unsigned long max_ahead, ahead;
unsigned long raend;
@@ -519,7 +528,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f
ahead = 0;
while (ahead < max_ahead) {
ahead += PAGE_SIZE;
- page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
+ page_cache = try_to_read_ahead(filp, raend + ahead,
+ page_cache);
}
/*
* If we tried to read ahead some pages,
@@ -567,7 +577,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f
ssize_t generic_file_read(struct file * filp, char * buf,
size_t count, loff_t *ppos)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct dentry *dentry = filp->f_dentry;
+ struct inode *inode = dentry->d_inode;
ssize_t error, read;
size_t pos, pgpos, page_cache;
int reada_ok;
@@ -724,7 +735,7 @@ no_cached_page:
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
- error = inode->i_op->readpage(inode, page);
+ error = inode->i_op->readpage(filp, page);
if (!error)
goto found_page;
release_page(page);
@@ -736,7 +747,7 @@ page_read_error:
* Try to re-read it _once_. We do this synchronously,
* because this happens only if there were errors.
*/
- error = inode->i_op->readpage(inode, page);
+ error = inode->i_op->readpage(filp, page);
if (!error) {
wait_on_page(page);
if (PageUptodate(page) && !PageError(page))
@@ -751,7 +762,7 @@ page_read_error:
filp->f_reada = 1;
if (page_cache)
free_page(page_cache);
- UPDATE_ATIME(inode)
+ UPDATE_ATIME(inode);
if (!read)
read = error;
return read;
@@ -771,11 +782,11 @@ page_read_error:
*/
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
-/* XXX: Check the flushes in this code. At least sometimes we do
- duplicate flushes. ... */
+ struct file * file = area->vm_file;
+ struct dentry * dentry = file->f_dentry;
+ struct inode * inode = dentry->d_inode;
unsigned long offset;
struct page * page, **hash;
- struct inode * inode = area->vm_dentry->d_inode;
unsigned long old_page, new_page;
new_page = 0;
@@ -856,14 +867,14 @@ no_cached_page:
new_page = 0;
add_to_page_cache(page, inode, offset, hash);
- if (inode->i_op->readpage(inode, page) != 0)
+ if (inode->i_op->readpage(file, page) != 0)
goto failure;
/*
* Do a very limited read-ahead if appropriate
*/
if (PageLocked(page))
- new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
+ new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
goto found_page;
page_locked_wait:
@@ -878,7 +889,7 @@ page_read_error:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- if (inode->i_op->readpage(inode, page) != 0)
+ if (inode->i_op->readpage(file, page) != 0)
goto failure;
wait_on_page(page);
if (PageError(page))
@@ -907,6 +918,7 @@ static inline int do_write_page(struct inode * inode, struct file * file,
{
int retval;
unsigned long size;
+ loff_t loff = offset;
mm_segment_t old_fs;
size = offset + PAGE_SIZE;
@@ -922,8 +934,7 @@ static inline int do_write_page(struct inode * inode, struct file * file,
old_fs = get_fs();
set_fs(KERNEL_DS);
retval = -EIO;
- if (size == file->f_op->write(file, (const char *) page,
- size, &file->f_pos))
+ if (size == file->f_op->write(file, (const char *) page, size, &loff))
retval = 0;
set_fs(old_fs);
return retval;
@@ -934,7 +945,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
unsigned long page)
{
int result;
- struct file file;
+ struct file * file;
struct dentry * dentry;
struct inode * inode;
struct buffer_head * bh;
@@ -954,27 +965,21 @@ static int filemap_write_page(struct vm_area_struct * vma,
return 0;
}
- dentry = vma->vm_dentry;
+ file = vma->vm_file;
+ dentry = file->f_dentry;
inode = dentry->d_inode;
- file.f_op = inode->i_op->default_file_ops;
- if (!file.f_op->write)
+ if (!file->f_op->write)
return -EIO;
- file.f_mode = 3;
- file.f_flags = 0;
- file.f_count = 1;
- file.f_dentry = dentry;
- file.f_pos = offset;
- file.f_reada = 0;
/*
* If a task terminates while we're swapping the page, the vma and
- * and dentry could be released ... increment the count to be safe.
+ * and file could be released ... increment the count to be safe.
*/
- dget(dentry);
+ file->f_count++;
down(&inode->i_sem);
- result = do_write_page(inode, &file, (const char *) page, offset);
+ result = do_write_page(inode, file, (const char *) page, offset);
up(&inode->i_sem);
- dput(dentry);
+ fput(file);
return result;
}
@@ -1209,7 +1214,8 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
if (!inode->i_op || !inode->i_op->readpage)
return -ENOEXEC;
UPDATE_ATIME(inode);
- vma->vm_dentry = dget(file->f_dentry);
+ vma->vm_file = file;
+ file->f_count++;
vma->vm_ops = ops;
return 0;
}
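
generic_file_mmap() now stores the struct file itself in the vma and bumps f_count, where it previously pinned only the dentry with dget(); the matching releases elsewhere in this patch use fput(). The extra reference lets ->readpage()/->write() calls made on behalf of the mapping keep a complete file context even after the mapping process closes its descriptor. A toy sketch of the reference-counting pattern, with invented names rather than the kernel's structures:

#include <stdlib.h>
#include <stdio.h>

struct toy_file {
    int f_count;
    const char *name;
};

struct toy_vma {
    struct toy_file *vm_file;
};

static void toy_fput(struct toy_file *file)
{
    if (--file->f_count == 0) {
        printf("releasing %s\n", file->name);
        free(file);
    }
}

static void toy_mmap(struct toy_vma *vma, struct toy_file *file)
{
    vma->vm_file = file;
    file->f_count++;          /* the vma now pins the file */
}

static void toy_munmap(struct toy_vma *vma)
{
    toy_fput(vma->vm_file);   /* drop the vma's reference */
    vma->vm_file = NULL;
}

int main(void)
{
    struct toy_file *f = malloc(sizeof(*f));
    struct toy_vma vma;

    f->f_count = 1;           /* reference held by the opener */
    f->name = "mapped file";
    toy_mmap(&vma, f);
    toy_fput(f);              /* opener closes the file ...          */
    toy_munmap(&vma);         /* ... object lives until the last unmap */
    return 0;
}
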
@@ -1222,15 +1228,16 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
static int msync_interval(struct vm_area_struct * vma,
unsigned long start, unsigned long end, int flags)
{
- if (vma->vm_dentry && vma->vm_ops && vma->vm_ops->sync) {
+ if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
int error;
error = vma->vm_ops->sync(vma, start, end-start, flags);
if (!error && (flags & MS_SYNC)) {
- struct dentry * dentry = vma->vm_dentry;
- if (dentry) {
+ struct file * file = vma->vm_file;
+ if (file) {
+ struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
down(&inode->i_sem);
- error = file_fsync(NULL,dentry);
+ error = file_fsync(file, dentry);
up(&inode->i_sem);
}
}
@@ -1315,7 +1322,8 @@ ssize_t
generic_file_write(struct file *file, const char *buf,
size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct dentry *dentry = file->f_dentry;
+ struct inode *inode = dentry->d_inode;
struct page *page, **hash;
unsigned long page_cache = 0;
unsigned long pgpos, offset;
@@ -1349,11 +1357,10 @@ generic_file_write(struct file *file, const char *buf,
if (!(page = __find_page(inode, pgpos, *hash))) {
if (!page_cache) {
page_cache = __get_free_page(GFP_KERNEL);
- if (!page_cache) {
- status = -ENOMEM;
- break;
- }
- continue;
+ if (page_cache)
+ continue;
+ status = -ENOMEM;
+ break;
}
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, pgpos, hash);
@@ -1361,36 +1368,47 @@ generic_file_write(struct file *file, const char *buf,
}
/*
- * WSH 06/05/97: restructured slightly to make sure we release
- * the page on an error exit. Removed explicit setting of
- * PG_locked, as that's handled below the i_op->xxx interface.
+ * Note: setting of the PG_locked bit is handled
+ * below the i_op->xxx interface.
*/
didread = 0;
page_wait:
wait_on_page(page);
+ if (PageUptodate(page))
+ goto do_update_page;
/*
- * If the page is not uptodate, and we're writing less
+ * The page is not up-to-date ... if we're writing less
* than a full page of data, we may have to read it first.
- * However, don't bother with reading the page when it's
- * after the current end of file.
+ * But if the page is past the current end of file, we must
+ * clear it before updating.
*/
- if (!PageUptodate(page)) {
- if (bytes < PAGE_SIZE && pgpos < inode->i_size) {
- if (didread < 2)
- status = inode->i_op->readpage(inode, page);
- else
- status = -EIO; /* two tries ... error out */
+ if (bytes < PAGE_SIZE) {
+ if (pgpos < inode->i_size) {
+ status = -EIO;
+ if (didread >= 2)
+ goto done_with_page;
+ status = inode->i_op->readpage(file, page);
if (status < 0)
goto done_with_page;
didread++;
goto page_wait;
+ } else {
+ /* Must clear for partial writes */
+ memset((void *) page_address(page), 0,
+ PAGE_SIZE);
}
- set_bit(PG_uptodate, &page->flags);
}
+ /*
+ * N.B. We should defer setting PG_uptodate at least until
+ * the data is copied. A failure in i_op->updatepage() could
+ * leave the page with garbage data.
+ */
+ set_bit(PG_uptodate, &page->flags);
+do_update_page:
/* Alright, the page is there. Now update it. */
- status = inode->i_op->updatepage(inode, page, buf,
+ status = inode->i_op->updatepage(file, page, buf,
offset, bytes, sync);
done_with_page:
__free_page(page);
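
The rewritten write loop above splits the not-up-to-date case three ways: a full-page write needs no read at all, a partial write inside i_size must read the page first (retrying once before giving up with -EIO), and a partial write past the current end of file only needs the page cleared so stale memory cannot leak into the file. A small stand-alone sketch of that decision, with an illustrative page size:

#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 16      /* illustrative, not the real PAGE_SIZE */

/* Decide how to prepare a not-up-to-date cache page before copying
 * "bytes" bytes of user data into it, mirroring the logic above. */
static const char *prepare_page(char *page, size_t bytes,
                                unsigned long pgpos, unsigned long i_size)
{
    if (bytes == TOY_PAGE_SIZE)
        return "full page write: no read needed";
    if (pgpos < i_size)
        return "partial write inside the file: read the page first";
    /* Partial write past EOF: old disk contents are irrelevant, but
     * the rest of the page must not expose stale memory. */
    memset(page, 0, TOY_PAGE_SIZE);
    return "partial write past EOF: page cleared";
}

int main(void)
{
    char page[TOY_PAGE_SIZE];

    puts(prepare_page(page, TOY_PAGE_SIZE, 0, 64));
    puts(prepare_page(page, 4, 0, 64));
    puts(prepare_page(page, 4, 128, 64));
    return 0;
}
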
@@ -1408,9 +1426,7 @@ done_with_page:
if (page_cache)
free_page(page_cache);
- if (written)
- return written;
- return status;
+ return written ? written : status;
}
/*
@@ -1429,7 +1445,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset,
{
struct page * page;
struct page ** hash;
- unsigned long page_cache;
+ unsigned long page_cache = 0;
hash = page_hash(inode, offset);
page = __find_page(inode, offset, *hash);
@@ -1443,14 +1459,15 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset,
add_to_page_cache(page, inode, offset, hash);
}
if (atomic_read(&page->count) != 2)
- printk("get_cached_page: page count=%d\n",
+ printk(KERN_ERR "get_cached_page: page count=%d\n",
atomic_read(&page->count));
if (test_bit(PG_locked, &page->flags))
- printk("get_cached_page: page already locked!\n");
+ printk(KERN_ERR "get_cached_page: page already locked!\n");
set_bit(PG_locked, &page->flags);
+ page_cache = page_address(page);
out:
- return page_address(page);
+ return page_cache;
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 82ed6c986..66cdf0bc1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -175,100 +175,16 @@ int new_page_tables(struct task_struct * tsk)
return 0;
}
-static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow)
-{
- pte_t pte = *old_pte;
- unsigned long page_nr;
-
- if (pte_none(pte))
- return;
- if (!pte_present(pte)) {
- swap_duplicate(pte_val(pte));
- set_pte(new_pte, pte);
- return;
- }
- page_nr = MAP_NR(pte_page(pte));
- if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) {
- set_pte(new_pte, pte);
- return;
- }
- if (cow)
- pte = pte_wrprotect(pte);
- if (delete_from_swap_cache(&mem_map[page_nr]))
- pte = pte_mkdirty(pte);
- set_pte(new_pte, pte_mkold(pte));
- set_pte(old_pte, pte);
- atomic_inc(&mem_map[page_nr].count);
-}
-
-static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow)
-{
- pte_t * src_pte, * dst_pte;
- unsigned long end;
-
- if (pmd_none(*src_pmd))
- return 0;
- if (pmd_bad(*src_pmd)) {
- printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
- pmd_clear(src_pmd);
- return 0;
- }
- src_pte = pte_offset(src_pmd, address);
- if (pmd_none(*dst_pmd)) {
- if (!pte_alloc(dst_pmd, 0))
- return -ENOMEM;
- }
- dst_pte = pte_offset(dst_pmd, address);
- address &= ~PMD_MASK;
- end = address + size;
- if (end >= PMD_SIZE)
- end = PMD_SIZE;
- do {
- /* I would like to switch arguments here, to make it
- * consistent with copy_xxx_range and memcpy syntax.
- */
- copy_one_pte(src_pte++, dst_pte++, cow);
- address += PAGE_SIZE;
- } while (address < end);
- return 0;
-}
-
-static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow)
-{
- pmd_t * src_pmd, * dst_pmd;
- unsigned long end;
- int error = 0;
-
- if (pgd_none(*src_pgd))
- return 0;
- if (pgd_bad(*src_pgd)) {
- printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd));
- pgd_clear(src_pgd);
- return 0;
- }
- src_pmd = pmd_offset(src_pgd, address);
- if (pgd_none(*dst_pgd)) {
- if (!pmd_alloc(dst_pgd, 0))
- return -ENOMEM;
- }
- dst_pmd = pmd_offset(dst_pgd, address);
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow);
- if (error)
- break;
- address = (address + PMD_SIZE) & PMD_MASK;
- } while (address < end);
- return error;
-}
+#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
+#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
+ *
+ * 08Jan98 Merged into one routine from several inline routines to reduce
+ * variable count and make things faster. -jj
*/
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -276,18 +192,105 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
pgd_t * src_pgd, * dst_pgd;
unsigned long address = vma->vm_start;
unsigned long end = vma->vm_end;
- int error = 0, cow;
+ unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
+
+ src_pgd = pgd_offset(src, address)-1;
+ dst_pgd = pgd_offset(dst, address)-1;
+
+ for (;;) {
+ pmd_t * src_pmd, * dst_pmd;
+
+ src_pgd++; dst_pgd++;
+
+ /* copy_pmd_range */
+
+ if (pgd_none(*src_pgd))
+ goto skip_copy_pmd_range;
+ if (pgd_bad(*src_pgd)) {
+ printk("copy_pmd_range: bad pgd (%08lx)\n",
+ pgd_val(*src_pgd));
+ pgd_clear(src_pgd);
+skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ if (address >= end)
+ goto out;
+ continue;
+ }
+ if (pgd_none(*dst_pgd)) {
+ if (!pmd_alloc(dst_pgd, 0))
+ goto nomem;
+ }
+
+ src_pmd = pmd_offset(src_pgd, address);
+ dst_pmd = pmd_offset(dst_pgd, address);
+
+ do {
+ pte_t * src_pte, * dst_pte;
+
+ /* copy_pte_range */
+
+ if (pmd_none(*src_pmd))
+ goto skip_copy_pte_range;
+ if (pmd_bad(*src_pmd)) {
+ printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd));
+ pmd_clear(src_pmd);
+skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
+ if (address >= end)
+ goto out;
+ goto cont_copy_pmd_range;
+ }
+ if (pmd_none(*dst_pmd)) {
+ if (!pte_alloc(dst_pmd, 0))
+ goto nomem;
+ }
+
+ src_pte = pte_offset(src_pmd, address);
+ dst_pte = pte_offset(dst_pmd, address);
+
+ do {
+ pte_t pte = *src_pte;
+ unsigned long page_nr;
+
+ /* copy_one_pte */
+
+ if (pte_none(pte))
+ goto cont_copy_pte_range;
+ if (!pte_present(pte)) {
+ swap_duplicate(pte_val(pte));
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range;
+ }
+ page_nr = MAP_NR(pte_page(pte));
+ if (page_nr >= max_mapnr ||
+ PageReserved(mem_map+page_nr)) {
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range;
+ }
+ if (cow)
+ pte = pte_wrprotect(pte);
+#if 0 /* No longer needed with the new swap cache code */
+ if (delete_from_swap_cache(&mem_map[page_nr]))
+ pte = pte_mkdirty(pte);
+#endif
+ set_pte(dst_pte, pte_mkold(pte));
+ set_pte(src_pte, pte);
+ atomic_inc(&mem_map[page_nr].count);
+
+cont_copy_pte_range: address += PAGE_SIZE;
+ if (address >= end)
+ goto out;
+ src_pte++;
+ dst_pte++;
+ } while ((unsigned long)src_pte & PTE_TABLE_MASK);
+
+cont_copy_pmd_range: src_pmd++;
+ dst_pmd++;
+ } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
+ }
+out:
+ return 0;
- cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
- src_pgd = pgd_offset(src, address);
- dst_pgd = pgd_offset(dst, address);
- while (address < end) {
- error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow);
- if (error)
- break;
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- }
- return error;
+nomem:
+ return -ENOMEM;
}
/*
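
The merged copy_page_range() above walks the source and destination page tables with raw pte/pmd pointers and detects the end of a table by alignment: a page-table page is naturally aligned, so advancing the pointer makes (unsigned long)src_pte & PTE_TABLE_MASK reach zero exactly when the last entry has been copied. A user-space demonstration of the same trick, assuming an illustrative geometry of 1024 four-byte entries per table:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Assumed geometry for illustration: 1024 4-byte entries per table,
 * with the table naturally aligned, as a real page-table page is. */
#define PTRS_PER_PTE    1024
typedef uint32_t pte_t;
#define PTE_TABLE_MASK  ((PTRS_PER_PTE - 1) * sizeof(pte_t))

int main(void)
{
    pte_t *table, *p;
    int entries = 0;

    /* aligned_alloc gives the natural alignment a pte page would have */
    table = aligned_alloc(PTRS_PER_PTE * sizeof(pte_t),
                          PTRS_PER_PTE * sizeof(pte_t));
    if (!table)
        return 1;

    p = table + 5;    /* start mid-table, as copy_page_range may */
    do {
        entries++;
        p++;
    } while ((uintptr_t)p & PTE_TABLE_MASK);   /* 0 only at table end */

    printf("copied %d entries (expected %d)\n",
           entries, PTRS_PER_PTE - 5);
    free(table);
    return 0;
}
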
@@ -299,7 +302,11 @@ static inline int free_pte(pte_t page)
unsigned long addr = pte_page(page);
if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr)))
return 0;
- free_page(addr);
+ /*
+ * free_page() used to be able to clear swap cache
+ * entries. We may now have to do it manually.
+ */
+ free_page_and_swap_cache(addr);
return 1;
}
swap_free(pte_val(page));
@@ -542,7 +549,7 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
static void put_page(pte_t * page_table, pte_t pte)
{
if (!pte_none(*page_table)) {
- free_page(pte_page(pte));
+ free_page_and_swap_cache(pte_page(pte));
return;
}
/* no need for flush_tlb */
@@ -609,9 +616,13 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
{
pte_t pte;
unsigned long old_page, new_page;
-
- new_page = __get_free_page(GFP_KERNEL);
+ struct page * page_map;
+
pte = *page_table;
+ new_page = __get_free_page(GFP_KERNEL);
+ /* Did someone else copy this page for us while we slept? */
+ if (pte_val(*page_table) != pte_val(pte))
+ goto end_wp_page;
if (!pte_present(pte))
goto end_wp_page;
if (pte_write(pte))
@@ -620,10 +631,12 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
tsk->min_flt++;
+ page_map = mem_map + MAP_NR(old_page);
+
/*
* Do we need to copy?
*/
- if (atomic_read(&mem_map[MAP_NR(old_page)].count) != 1) {
+ if (is_page_shared(page_map)) {
if (new_page) {
if (PageReserved(mem_map + MAP_NR(old_page)))
++vma->vm_mm->rss;
@@ -643,6 +656,8 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
oom(tsk);
return;
}
+ if (PageSwapCache(page_map))
+ delete_from_swap_cache(page_map);
flush_cache_page(vma, address);
set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
flush_tlb_page(vma, address);
@@ -867,12 +882,14 @@ static inline void handle_pte_fault(struct task_struct *tsk,
do_no_page(tsk, vma, address, write_access, pte, entry);
return;
}
- set_pte(pte, pte_mkyoung(entry));
+ entry = pte_mkyoung(entry);
+ set_pte(pte, entry);
flush_tlb_page(vma, address);
if (!write_access)
return;
if (pte_write(entry)) {
- set_pte(pte, pte_mkdirty(*pte));
+ entry = pte_mkdirty(entry);
+ set_pte(pte, entry);
flush_tlb_page(vma, address);
return;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index eea100add..5bffab93f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -38,7 +38,8 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma,
n->vm_end = end;
vma->vm_offset += vma->vm_start - n->vm_start;
n->vm_flags = newflags;
- n->vm_dentry = dget(vma->vm_dentry);
+ if (n->vm_file)
+ n->vm_file->f_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
@@ -58,7 +59,8 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma,
n->vm_start = start;
n->vm_offset += n->vm_start - vma->vm_start;
n->vm_flags = newflags;
- n->vm_dentry = dget(vma->vm_dentry);
+ if (n->vm_file)
+ n->vm_file->f_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
@@ -87,8 +89,8 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_offset += vma->vm_start - left->vm_start;
right->vm_offset += right->vm_start - left->vm_start;
vma->vm_flags = newflags;
- if (vma->vm_dentry)
- vma->vm_dentry->d_count += 2;
+ if (vma->vm_file)
+ vma->vm_file->f_count += 2;
if (vma->vm_ops && vma->vm_ops->open) {
vma->vm_ops->open(left);
diff --git a/mm/mmap.c b/mm/mmap.c
index 501b31913..52c185e85 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -17,6 +17,7 @@
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -74,11 +75,11 @@ int vm_enough_memory(long pages)
/* Remove one vm structure from the inode's i_mmap ring. */
static inline void remove_shared_vm_struct(struct vm_area_struct *vma)
{
- struct dentry * dentry = vma->vm_dentry;
+ struct file * file = vma->vm_file;
- if (dentry) {
+ if (file) {
if (vma->vm_flags & VM_DENYWRITE)
- dentry->d_inode->i_writecount++;
+ file->f_dentry->d_inode->i_writecount++;
if(vma->vm_next_share)
vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
*vma->vm_pprev_share = vma->vm_next_share;
@@ -173,6 +174,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if (off + len < off)
return -EINVAL;
+ /* Too many mappings? */
+ if (mm->map_count > MAX_MAP_COUNT)
+ return -ENOMEM;
+
/* mlock MCL_FUTURE? */
if (mm->def_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
@@ -257,7 +262,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_offset = off;
- vma->vm_dentry = NULL;
+ vma->vm_file = NULL;
vma->vm_pte = 0;
/* Clear old maps */
@@ -390,8 +395,8 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
if (addr == area->vm_start && end == area->vm_end) {
if (area->vm_ops && area->vm_ops->close)
area->vm_ops->close(area);
- if (area->vm_dentry)
- dput(area->vm_dentry);
+ if (area->vm_file)
+ fput(area->vm_file);
return 0;
}
@@ -414,7 +419,9 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
mpnt->vm_flags = area->vm_flags;
mpnt->vm_ops = area->vm_ops;
mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
- mpnt->vm_dentry = dget(area->vm_dentry);
+ mpnt->vm_file = area->vm_file;
+ if (mpnt->vm_file)
+ mpnt->vm_file->f_count++;
if (mpnt->vm_ops && mpnt->vm_ops->open)
mpnt->vm_ops->open(mpnt);
area->vm_end = addr; /* Truncate area */
@@ -452,6 +459,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len)
*/
int do_munmap(unsigned long addr, size_t len)
{
+ struct mm_struct * mm;
struct vm_area_struct *mpnt, *next, *free, *extra;
int freed;
@@ -466,7 +474,8 @@ int do_munmap(unsigned long addr, size_t len)
* every area affected in some way (by any overlap) is put
* on the list. If nothing is put on, nothing is affected.
*/
- mpnt = current->mm->mmap;
+ mm = current->mm;
+ mpnt = mm->mmap;
while(mpnt && mpnt->vm_end <= addr)
mpnt = mpnt->vm_next;
if (!mpnt)
@@ -496,6 +505,13 @@ int do_munmap(unsigned long addr, size_t len)
mpnt = next;
}
+ if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) {
+ if (mm->map_count > MAX_MAP_COUNT) {
+ kmem_cache_free(vm_area_cachep, extra);
+ return -ENOMEM;
+ }
+ }
+
/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
* If the one of the segments is only being partially unmapped,
@@ -508,6 +524,7 @@ int do_munmap(unsigned long addr, size_t len)
free = free->vm_next;
freed = 1;
+ mm->map_count--;
remove_shared_vm_struct(mpnt);
st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
@@ -518,9 +535,9 @@ int do_munmap(unsigned long addr, size_t len)
if (mpnt->vm_ops && mpnt->vm_ops->unmap)
mpnt->vm_ops->unmap(mpnt, st, size);
- flush_cache_range(current->mm, st, end);
- zap_page_range(current->mm, st, size);
- flush_tlb_range(current->mm, st, end);
+ flush_cache_range(mm, st, end);
+ zap_page_range(mm, st, size);
+ flush_tlb_range(mm, st, end);
/*
* Fix the mapping, and free the old area if it wasn't reused.
@@ -534,7 +551,7 @@ int do_munmap(unsigned long addr, size_t len)
kmem_cache_free(vm_area_cachep, extra);
if (freed)
- current->mm->mmap_cache = NULL; /* Kill the cache. */
+ mm->mmap_cache = NULL; /* Kill the cache. */
return 0;
}
@@ -560,13 +577,18 @@ void exit_mmap(struct mm_struct * mm)
if (mpnt->vm_ops->close)
mpnt->vm_ops->close(mpnt);
}
+ mm->map_count--;
remove_shared_vm_struct(mpnt);
zap_page_range(mm, start, size);
- if (mpnt->vm_dentry)
- dput(mpnt->vm_dentry);
+ if (mpnt->vm_file)
+ fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
mpnt = next;
}
+
+ /* This is just debugging */
+ if (mm->map_count)
+ printk("exit_mmap: map count is %d\n", mm->map_count);
}
/* Insert vm structure into process list sorted by address
@@ -575,7 +597,9 @@ void exit_mmap(struct mm_struct * mm)
void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
{
struct vm_area_struct **pprev = &mm->mmap;
- struct dentry * dentry;
+ struct file * file;
+
+ mm->map_count++;
/* Find where to link it in. */
while(*pprev && (*pprev)->vm_start <= vmp->vm_start)
@@ -587,9 +611,9 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
*pprev = vmp;
vmp->vm_pprev = pprev;
- dentry = vmp->vm_dentry;
- if (dentry) {
- struct inode * inode = dentry->d_inode;
+ file = vmp->vm_file;
+ if (file) {
+ struct inode * inode = file->f_dentry->d_inode;
if (vmp->vm_flags & VM_DENYWRITE)
inode->i_writecount--;
@@ -636,8 +660,8 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) {
next = mpnt->vm_next;
- /* To share, we must have the same dentry, operations.. */
- if ((mpnt->vm_dentry != prev->vm_dentry)||
+ /* To share, we must have the same file, operations.. */
+ if ((mpnt->vm_file != prev->vm_file)||
(mpnt->vm_pte != prev->vm_pte) ||
(mpnt->vm_ops != prev->vm_ops) ||
(mpnt->vm_flags != prev->vm_flags) ||
@@ -645,10 +669,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
continue;
/*
- * If we have a dentry or it's a shared memory area
+ * If we have a file or it's a shared memory area
* the offsets must be contiguous..
*/
- if ((mpnt->vm_dentry != NULL) || (mpnt->vm_flags & VM_SHM)) {
+ if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) {
unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start;
if (off != mpnt->vm_offset)
continue;
@@ -668,9 +692,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
mpnt->vm_start = mpnt->vm_end;
mpnt->vm_ops->close(mpnt);
}
+ mm->map_count--;
remove_shared_vm_struct(mpnt);
- if (mpnt->vm_dentry)
- dput(mpnt->vm_dentry);
+ if (mpnt->vm_file)
+ fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
mpnt = prev;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ddf4f4ed6..a34225d83 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -110,7 +110,8 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
vma->vm_offset += vma->vm_start - n->vm_start;
n->vm_flags = newflags;
n->vm_page_prot = prot;
- n->vm_dentry = dget(n->vm_dentry);
+ if (n->vm_file)
+ n->vm_file->f_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
@@ -132,7 +133,8 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
n->vm_offset += n->vm_start - vma->vm_start;
n->vm_flags = newflags;
n->vm_page_prot = prot;
- n->vm_dentry = dget(n->vm_dentry);
+ if (n->vm_file)
+ n->vm_file->f_count++;
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
insert_vm_struct(current->mm, n);
@@ -163,8 +165,8 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
right->vm_offset += right->vm_start - left->vm_start;
vma->vm_flags = newflags;
vma->vm_page_prot = prot;
- if (vma->vm_dentry)
- vma->vm_dentry->d_count += 2;
+ if (vma->vm_file)
+ vma->vm_file->f_count += 2;
if (vma->vm_ops && vma->vm_ops->open) {
vma->vm_ops->open(left);
vma->vm_ops->open(right);
diff --git a/mm/mremap.c b/mm/mremap.c
index aaabde322..a31a0ae14 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -140,7 +140,9 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
new_vma->vm_start = new_addr;
new_vma->vm_end = new_addr+new_len;
new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start);
- new_vma->vm_dentry = dget(vma->vm_dentry);
+ new_vma->vm_file = vma->vm_file;
+ if (new_vma->vm_file)
+ new_vma->vm_file->f_count++;
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
insert_vm_struct(current->mm, new_vma);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07264f81e..ed748bbfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/pagemap.h>
#include <asm/dma.h>
#include <asm/system.h> /* for cli()/sti() */
@@ -101,6 +102,46 @@ static inline void remove_mem_queue(struct page * entry)
static spinlock_t page_alloc_lock;
#endif
+/*
+ * This routine is used by the kernel swap deamon to determine
+ * whether we have "enough" free pages. It is fairly arbitrary,
+ * but this had better return false if any reasonable "get_free_page()"
+ * allocation could currently fail..
+ *
+ * Currently we approve of the following situations:
+ * - the highest memory order has two entries
+ * - the highest memory order has one free entry and:
+ * - the next-highest memory order has two free entries
+ * - the highest memory order has one free entry and:
+ * - the next-highest memory order has one free entry
+ * - the next-next-highest memory order has two free entries
+ *
+ * [previously, there had to be two entries of the highest memory
+ * order, but this lead to problems on large-memory machines.]
+ */
+int free_memory_available(void)
+{
+ int i, retval = 0;
+ unsigned long flags;
+ struct free_area_struct * list = NULL;
+
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ /* We fall through the loop if the list contains one
+ * item. -- thanks to Colin Plumb <colin@nyx.net>
+ */
+ for (i = 1; i < 4; ++i) {
+ list = free_area + NR_MEM_LISTS - i;
+ if (list->next == memory_head(list))
+ break;
+ if (list->next->next == memory_head(list))
+ continue;
+ retval = 1;
+ break;
+ }
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+ return retval;
+}
+
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
struct free_area_struct *area = free_area + order;
@@ -133,9 +174,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
void __free_page(struct page *page)
{
if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
- delete_from_swap_cache(page);
+ if (PageSwapCache(page))
+ panic ("Freeing swap cache page");
free_pages_ok(page->map_nr, 0);
}
+ if (PageSwapCache(page) && atomic_read(&page->count) == 1)
+ panic ("Releasing swap cache page");
}
void free_pages(unsigned long addr, unsigned long order)
@@ -147,10 +191,14 @@ void free_pages(unsigned long addr, unsigned long order)
if (PageReserved(map))
return;
if (atomic_dec_and_test(&map->count)) {
- delete_from_swap_cache(map);
+ if (PageSwapCache(map))
+ panic ("Freeing swap cache pages");
free_pages_ok(map_nr, order);
return;
}
+ if (PageSwapCache(map) && atomic_read(&map->count) == 1)
+ panic ("Releasing swap cache pages at %p",
+ __builtin_return_address(0));
}
}
@@ -161,11 +209,13 @@ void free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-#define RMQUEUE(order, dma) \
+#define RMQUEUE(order, maxorder, dma) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
- do { struct page *prev = memory_head(area), *ret; \
- while (memory_head(area) != (ret = prev->next)) { \
+ do { struct page *prev = memory_head(area), *ret = prev->next; \
+ while (memory_head(area) != ret) { \
+ if (new_order >= maxorder && ret->next == prev) \
+ break; \
if (!dma || CAN_DMA(ret)) { \
unsigned long map_nr = ret->map_nr; \
(prev->next = ret->next)->prev = prev; \
@@ -176,6 +226,7 @@ do { struct free_area_struct * area = free_area+order; \
return ADDRESS(map_nr); \
} \
prev = ret; \
+ ret = ret->next; \
} \
new_order++; area++; \
} while (new_order < NR_MEM_LISTS); \
@@ -194,36 +245,40 @@ do { unsigned long size = 1 << high; \
map->age = PAGE_INITIAL_AGE; \
} while (0)
-unsigned long __get_free_pages(int priority, unsigned long order, int dma)
+unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
- unsigned long flags;
- int reserved_pages;
+ unsigned long flags, maxorder;
if (order >= NR_MEM_LISTS)
- return 0;
+ goto nopage;
- if (in_interrupt() && priority != GFP_ATOMIC) {
+ /*
+ * "maxorder" is the highest order number that we're allowed
+ * to empty in order to find a free page..
+ */
+ maxorder = order + NR_MEM_LISTS/3;
+ if (gfp_mask & __GFP_MED)
+ maxorder += NR_MEM_LISTS/3;
+ if ((gfp_mask & __GFP_HIGH) || maxorder > NR_MEM_LISTS)
+ maxorder = NR_MEM_LISTS;
+
+ if (in_interrupt() && (gfp_mask & __GFP_WAIT)) {
static int count = 0;
if (++count < 5) {
printk("gfp called nonatomically from interrupt %p\n",
- return_address());
- priority = GFP_ATOMIC;
+ return_address());
+ gfp_mask &= ~__GFP_WAIT;
}
}
- reserved_pages = 5;
- if (priority != GFP_NFS)
- reserved_pages = min_free_pages;
repeat:
spin_lock_irqsave(&page_alloc_lock, flags);
- if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
- RMQUEUE(order, dma);
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- return 0;
- }
+ RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);
- if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1))
+ if ((gfp_mask & __GFP_WAIT) && try_to_free_page(gfp_mask))
goto repeat;
+
+nopage:
return 0;
}
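
The maxorder computed above limits how far an ordinary allocation may dig into the higher-order free lists before giving up, with __GFP_MED widening the limit and __GFP_HIGH (or overflow of the sum) removing it. A worked example of the arithmetic, assuming NR_MEM_LISTS of 6 and invented flag values purely for illustration:

#include <stdio.h>

/* Illustration only: NR_MEM_LISTS and the flag values are assumed,
 * not taken from any particular kernel configuration. */
#define NR_MEM_LISTS    6
#define __GFP_MED       0x08
#define __GFP_HIGH      0x10

static unsigned long max_order(unsigned long order, int gfp_mask)
{
    unsigned long maxorder = order + NR_MEM_LISTS / 3;

    if (gfp_mask & __GFP_MED)
        maxorder += NR_MEM_LISTS / 3;
    if ((gfp_mask & __GFP_HIGH) || maxorder > NR_MEM_LISTS)
        maxorder = NR_MEM_LISTS;
    return maxorder;
}

int main(void)
{
    /* An order-0 request normally searches up to order 2 ... */
    printf("normal: %lu\n", max_order(0, 0));           /* 2 */
    /* ... a medium-priority request up to order 4 ...       */
    printf("med:    %lu\n", max_order(0, __GFP_MED));   /* 4 */
    /* ... and a high-priority request may empty every list. */
    printf("high:   %lu\n", max_order(0, __GFP_HIGH));  /* 6 */
    return 0;
}
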
@@ -315,31 +370,38 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
pte_t * page_table, unsigned long entry, int write_access)
{
- unsigned long page = __get_free_page(GFP_KERNEL);
+ unsigned long page;
+ struct page *page_map;
+
+ page_map = read_swap_cache(entry);
if (pte_val(*page_table) != entry) {
- free_page(page);
+ if (page_map)
+ free_page_and_swap_cache(page_address(page_map));
return;
}
- if (!page) {
+ if (!page_map) {
set_pte(page_table, BAD_PAGE);
swap_free(entry);
oom(tsk);
return;
}
- read_swap_page(entry, (char *) page);
- if (pte_val(*page_table) != entry) {
- free_page(page);
- return;
- }
+
+ page = page_address(page_map);
vma->vm_mm->rss++;
- tsk->maj_flt++;
- if (!write_access && add_to_swap_cache(&mem_map[MAP_NR(page)], entry)) {
- /* keep swap page allocated for the moment (swap cache) */
+ tsk->min_flt++;
+ swap_free(entry);
+
+ if (!write_access || is_page_shared(page_map)) {
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
return;
}
+
+ /* The page is unshared, and we want write access. In this
+ case, it is safe to tear down the swap cache and give the
+ page over entirely to this process. */
+
+ delete_from_swap_cache(page_map);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
- swap_free(entry);
return;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 5ebea3f09..e02565def 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -6,6 +6,7 @@
* Swap reorganised 29.12.95,
* Asynchronous swapping added 30.12.95. Stephen Tweedie
* Removed race in async swapping. 14.4.1996. Bruno Haible
+ * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
*/
#include <linux/mm.h>
@@ -27,26 +28,38 @@
#include <asm/bitops.h>
#include <asm/pgtable.h>
-static struct wait_queue * lock_queue = NULL;
-
/*
* Reads or writes a swap page.
* wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
+ * All IO to swap files (as opposed to swap partitions) is done
+ * synchronously.
*
- * Important prevention of race condition: The first thing we do is set a lock
- * on this swap page, which lasts until I/O completes. This way a
- * write_swap_page(entry) immediately followed by a read_swap_page(entry)
- * on the same entry will first complete the write_swap_page(). Fortunately,
- * not more than one write_swap_page() request can be pending per entry. So
- * all races the caller must catch are: multiple read_swap_page() requests
- * on the same entry.
+ * Important prevention of race condition: the caller *must* atomically
+ * create a unique swap cache entry for this swap page before calling
+ * rw_swap_page, and must lock that page. By ensuring that there is a
+ * single page of memory reserved for the swap entry, the normal VM page
+ * lock on that page also doubles as a lock on swap entries. Having only
+ * one lock to deal with per swap entry (rather than locking swap and memory
+ * independently) also makes it easier to make certain swapping operations
+ * atomic, which is particularly important when we are trying to ensure
+ * that shared pages stay shared while being swapped.
*/
+
void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
{
unsigned long type, offset;
struct swap_info_struct * p;
- struct page *page;
-
+ struct page *page = mem_map + MAP_NR(buf);
+
+#ifdef DEBUG_SWAP
+ printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n",
+ (rw == READ) ? "read" : "write",
+ entry, buf, atomic_read(&page->count),
+ wait ? "wait" : "nowait");
+#endif
+
+ if (page->inode && page->inode != &swapper_inode)
+ panic ("Tried to swap a non-swapper page");
type = SWP_TYPE(entry);
if (type >= nr_swapfiles) {
printk("Internal error: bad swap-device\n");
@@ -59,33 +72,49 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
return;
}
if (p->swap_map && !p->swap_map[offset]) {
- printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
+ printk("Hmm.. Trying to %s unallocated swap (%08lx)\n",
+ (rw == READ) ? "read" : "write",
+ entry);
return;
}
if (!(p->flags & SWP_USED)) {
printk("Trying to swap to unused swap-device\n");
return;
}
- /* Make sure we are the only process doing I/O with this swap page. */
- while (test_and_set_bit(offset,p->swap_lockmap)) {
- run_task_queue(&tq_disk);
- sleep_on(&lock_queue);
+
+ if (!PageLocked(page)) {
+ printk("VM: swap page is unlocked\n");
+ return;
}
- if (rw == READ)
+
+ if (rw == READ) {
+ clear_bit(PG_uptodate, &page->flags);
kstat.pswpin++;
- else
+ } else
kstat.pswpout++;
- page = mem_map + MAP_NR(buf);
+
atomic_inc(&page->count);
- wait_on_page(page);
+ /*
+ * Make sure that we have a swap cache association for this
+ * page. We need this to find which swap page to unlock once
+ * the swap IO has completed to the physical page. If the page
+ * is not already in the cache, just overload the offset entry
+ * as if it were: we are not allowed to manipulate the inode
+ * hashing for locked pages.
+ */
+ if (!PageSwapCache(page)) {
+ printk("VM: swap page is not in swap cache\n");
+ return;
+ }
+ if (page->offset != entry) {
+ printk ("swap entry mismatch");
+ return;
+ }
+
if (p->swap_device) {
if (!wait) {
set_bit(PG_free_after, &page->flags);
set_bit(PG_decr_after, &page->flags);
- set_bit(PG_swap_unlock_after, &page->flags);
- /* swap-cache shouldn't be set, but play safe */
- PageClearSwapCache(page);
- page->pg_swap_entry = entry;
atomic_inc(&nr_async_pages);
}
ll_rw_page(rw,p->swap_device,offset,buf);
@@ -132,39 +161,55 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
if (!(zones[i] = bmap(swapf,block++))) {
printk("rw_swap_page: bad swap file\n");
+ return;
}
}
ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
+ /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the
+ page for us. */
+ clear_bit(PG_locked, &page->flags);
+ wake_up(&page->wait);
} else
printk("rw_swap_page: no swap file or device\n");
+
atomic_dec(&page->count);
- if (offset && !test_and_clear_bit(offset,p->swap_lockmap))
- printk("rw_swap_page: lock already cleared\n");
- wake_up(&lock_queue);
+#ifdef DEBUG_SWAP
+ printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
+ (rw == READ) ? "read" : "write",
+ buf, atomic_read(&page->count));
+#endif
}
-/* This is run when asynchronous page I/O has completed. */
-void swap_after_unlock_page (unsigned long entry)
+/*
+ * Setting up a new swap file needs a simple wrapper just to read the
+ * swap signature. SysV shared memory also needs a simple wrapper.
+ */
+void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer)
{
- unsigned long type, offset;
- struct swap_info_struct * p;
-
- type = SWP_TYPE(entry);
- if (type >= nr_swapfiles) {
- printk("swap_after_unlock_page: bad swap-device\n");
+ struct page *page;
+
+ page = mem_map + MAP_NR((unsigned long) buffer);
+ wait_on_page(page);
+ set_bit(PG_locked, &page->flags);
+ if (test_and_set_bit(PG_swap_cache, &page->flags)) {
+ printk ("VM: read_swap_page: page already in swap cache!\n");
return;
}
- p = &swap_info[type];
- offset = SWP_OFFSET(entry);
- if (offset >= p->max) {
- printk("swap_after_unlock_page: weirdness\n");
+ if (page->inode) {
+ printk ("VM: read_swap_page: page already in page cache!\n");
return;
}
- if (!test_and_clear_bit(offset,p->swap_lockmap))
- printk("swap_after_unlock_page: lock already cleared\n");
- wake_up(&lock_queue);
+ page->inode = &swapper_inode;
+ page->offset = entry;
+ atomic_inc(&page->count); /* Protect from shrink_mmap() */
+ rw_swap_page(rw, entry, buffer, 1);
+ atomic_dec(&page->count);
+ page->inode = 0;
+ clear_bit(PG_swap_cache, &page->flags);
}
+
+
/*
* Swap partitions are now read via brw_page. ll_rw_page is an
* asynchronous function now --- we must call wait_on_page afterwards
@@ -189,7 +234,7 @@ void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer)
panic("ll_rw_page: bad block dev cmd, must be R/W");
}
page = mem_map + MAP_NR(buffer);
- if (test_and_set_bit(PG_locked, &page->flags))
- panic ("ll_rw_page: page already locked");
+ if (!PageLocked(page))
+ panic ("ll_rw_page: page not already locked");
brw_page(rw, page, dev, &block, PAGE_SIZE, 0);
}
diff --git a/mm/simp.c b/mm/simp.c
index 6ad6bc73c..ba7864604 100644
--- a/mm/simp.c
+++ b/mm/simp.c
@@ -115,7 +115,7 @@ struct simp * simp_create(char * name, long size,
if(!global) {
#ifdef __SMP__
- global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER, 0);
+ global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER);
memset(global, 0, CHUNK_SIZE);
#else
global = (struct global_data*)get_free_page(GFP_KERNEL);
@@ -167,7 +167,7 @@ static void alloc_header(struct simp * simp)
spin_unlock(&simp->lock);
for(;;) {
- hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER, 0);
+ hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER);
if(hdr)
break;
if(!simp_garbage())
diff --git a/mm/slab.c b/mm/slab.c
index 2d6466656..d0b4214f9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -506,8 +506,7 @@ kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
void *addr;
*dma = flags & SLAB_DMA;
- addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK,
- cachep->c_gfporder, *dma);
+ addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
/* Assume that now we have the pages no one else can legally
* messes with the 'struct page's.
* However vm_scan() might try to test the structure to see if
@@ -1732,19 +1731,18 @@ kmem_find_general_cachep(size_t size)
* This function _cannot_ be called within a int, but it
* can be interrupted.
*/
-int
-kmem_cache_reap(int pri, int dma, int wait)
+void
+kmem_cache_reap(int gfp_mask)
{
kmem_slab_t *slabp;
kmem_cache_t *searchp;
kmem_cache_t *best_cachep;
unsigned int scan;
unsigned int reap_level;
- static unsigned long call_count = 0;
if (in_interrupt()) {
printk("kmem_cache_reap() called within int!\n");
- return 0;
+ return;
}
/* We really need a test semphore op so we can avoid sleeping when
@@ -1752,28 +1750,8 @@ kmem_cache_reap(int pri, int dma, int wait)
*/
down(&cache_chain_sem);
- scan = 10-pri;
- if (pri == 6 && !dma) {
- if (++call_count == 199) {
- /* Hack Alert!
- * Occassionally we try hard to reap a slab.
- */
- call_count = 0UL;
- reap_level = 0;
- scan += 2;
- } else
- reap_level = 3;
- } else {
- if (pri >= 5) {
- /* We also come here for dma==1 at pri==6, just
- * to try that bit harder (assumes that there are
- * less DMAable pages in a system - not always true,
- * but this doesn't hurt).
- */
- reap_level = 2;
- } else
- reap_level = 0;
- }
+ scan = 10;
+ reap_level = 0;
best_cachep = NULL;
searchp = clock_searchp;
@@ -1812,7 +1790,7 @@ kmem_cache_reap(int pri, int dma, int wait)
}
spin_unlock_irq(&searchp->c_spinlock);
- if (dma && !dma_flag)
+ if ((gfp_mask & GFP_DMA) && !dma_flag)
goto next;
if (full_free) {
@@ -1825,10 +1803,6 @@ kmem_cache_reap(int pri, int dma, int wait)
* more than one page per slab (as it can be difficult
* to get high orders from gfp()).
*/
- if (pri == 6) { /* magic '6' from try_to_free_page() */
- if (searchp->c_gfporder || searchp->c_ctor)
- full_free--;
- }
if (full_free >= reap_level) {
reap_level = full_free;
best_cachep = searchp;
@@ -1846,12 +1820,12 @@ next:
if (!best_cachep) {
/* couldn't find anthying to reap */
- return 0;
+ return;
}
spin_lock_irq(&best_cachep->c_spinlock);
if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) {
- if (dma) {
+ if (gfp_mask & GFP_DMA) {
do {
if (slabp->s_dma)
goto good_dma;
@@ -1874,11 +1848,11 @@ good_dma:
*/
spin_unlock_irq(&best_cachep->c_spinlock);
kmem_slab_destroy(best_cachep, slabp);
- return 1;
+ return;
}
dma_fail:
spin_unlock_irq(&best_cachep->c_spinlock);
- return 0;
+ return;
}
#if SLAB_SELFTEST
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 75f284124..4ebc5c05f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -3,6 +3,8 @@
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
+ *
+ * Rewritten to use page cache, (C) 1998 Stephen Tweedie
*/
#include <linux/mm.h>
@@ -17,6 +19,7 @@
#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/init.h>
+#include <linux/pagemap.h>
#include <asm/bitops.h>
#include <asm/pgtable.h>
@@ -29,6 +32,18 @@ unsigned long swap_cache_del_success = 0;
unsigned long swap_cache_find_total = 0;
unsigned long swap_cache_find_success = 0;
+/*
+ * Keep a reserved false inode which we will use to mark pages in the
+ * page cache are acting as swap cache instead of file cache.
+ *
+ * We only need a unique pointer to satisfy the page cache, but we'll
+ * reserve an entire zeroed inode structure for the purpose just to
+ * ensure that any mistaken dereferences of this structure cause a
+ * kernel oops.
+ */
+struct inode swapper_inode;
+
+
void show_swap_cache_info(void)
{
printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
@@ -40,21 +55,33 @@ void show_swap_cache_info(void)
int add_to_swap_cache(struct page *page, unsigned long entry)
{
- struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
-
#ifdef SWAP_CACHE_INFO
swap_cache_add_total++;
#endif
- if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- page->pg_swap_entry = entry;
- if (PageTestandSetSwapCache(page))
- printk("swap_cache: replacing non-empty entry\n");
-#ifdef SWAP_CACHE_INFO
- swap_cache_add_success++;
+#ifdef DEBUG_SWAP
+ printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n",
+ page_address(page), atomic_read(&page->count), entry);
#endif
- return 1;
+ if (PageTestandSetSwapCache(page)) {
+ printk("swap_cache: replacing non-empty entry %08lx "
+ "on page %08lx",
+ page->offset, page_address(page));
+ return 0;
}
- return 0;
+ if (page->inode) {
+ printk("swap_cache: replacing page-cached entry "
+ "on page %08lx", page_address(page));
+ return 0;
+ }
+ atomic_inc(&page->count);
+ page->inode = &swapper_inode;
+ page->offset = entry;
+ add_page_to_hash_queue(page, &swapper_inode, entry);
+ add_page_to_inode_queue(&swapper_inode, page);
+#ifdef SWAP_CACHE_INFO
+ swap_cache_add_success++;
+#endif
+ return 1;
}
/*
@@ -87,6 +114,10 @@ void swap_duplicate(unsigned long entry)
entry, p->swap_map[offset]);
p->swap_map[offset] = 127;
}
+#ifdef DEBUG_SWAP
+ printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n",
+ entry, p->swap_map[offset]);
+#endif
out:
return;
@@ -97,7 +128,173 @@ bad_offset:
printk("swap_duplicate: offset exceeds max\n");
goto out;
bad_unused:
- printk("swap_duplicate: unused page\n");
+ printk("swap_duplicate at %8p: unused page\n",
+ __builtin_return_address(0));
goto out;
}
+
+void remove_from_swap_cache(struct page *page)
+{
+ if (!page->inode) {
+ printk ("VM: Removing swap cache page with zero inode hash "
+ "on page %08lx", page_address(page));
+ return;
+ }
+ if (page->inode != &swapper_inode) {
+ printk ("VM: Removing swap cache page with wrong inode hash "
+ "on page %08lx", page_address(page));
+ }
+ /*
+ * This will be a legal case once we have a more mature swap cache.
+ */
+ if (atomic_read(&page->count) == 1) {
+ printk ("VM: Removing page cache on unshared page %08lx",
+ page_address(page));
+ return;
+ }
+
+
+#ifdef DEBUG_SWAP
+ printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
+ page_address(page), atomic_read(&page->count));
+#endif
+ remove_page_from_hash_queue (page);
+ remove_page_from_inode_queue (page);
+ PageClearSwapCache (page);
+ __free_page (page);
+}
+
+
+long find_in_swap_cache(struct page *page)
+{
+#ifdef SWAP_CACHE_INFO
+ swap_cache_find_total++;
+#endif
+ if (PageSwapCache (page)) {
+ long entry = page->offset;
+#ifdef SWAP_CACHE_INFO
+ swap_cache_find_success++;
+#endif
+ remove_from_swap_cache (page);
+ return entry;
+ }
+ return 0;
+}
+
+int delete_from_swap_cache(struct page *page)
+{
+#ifdef SWAP_CACHE_INFO
+ swap_cache_del_total++;
+#endif
+ if (PageSwapCache (page)) {
+ long entry = page->offset;
+#ifdef SWAP_CACHE_INFO
+ swap_cache_del_success++;
+#endif
+#ifdef DEBUG_SWAP
+ printk("DebugVM: delete_from_swap_cache(%08lx count %d, "
+ "entry %08lx)\n",
+ page_address(page), atomic_read(&page->count), entry);
+#endif
+ remove_from_swap_cache (page);
+ swap_free (entry);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Perform a free_page(), also freeing any swap cache associated with
+ * this page if it is the last user of the page.
+ */
+
+void free_page_and_swap_cache(unsigned long addr)
+{
+ struct page *page = mem_map + MAP_NR(addr);
+ /*
+ * If we are the only user, then free up the swap cache.
+ */
+ if (PageSwapCache(page) && !is_page_shared(page)) {
+ delete_from_swap_cache(page);
+ }
+
+ free_page(addr);
+}
+
+
+/*
+ * Lookup a swap entry in the swap cache. We need to be careful about
+ * locked pages. A found page will be returned with its refcount
+ * incremented.
+ */
+
+static struct page * lookup_swap_cache(unsigned long entry)
+{
+ struct page *found;
+
+ while (1) {
+ found = find_page(&swapper_inode, entry);
+ if (!found)
+ return 0;
+ if (found->inode != &swapper_inode
+ || !PageSwapCache(found)) {
+ __free_page(found);
+ printk ("VM: Found a non-swapper swap page!\n");
+ return 0;
+ }
+ if (!PageLocked(found))
+ return found;
+ __free_page(found);
+ __wait_on_page(found);
+ }
+}
+
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached. If wait==0, we are
+ * only doing readahead, so don't worry if the page is already locked.
+ */
+
+struct page * read_swap_cache_async(unsigned long entry, int wait)
+{
+ struct page *found_page, *new_page = 0;
+ unsigned long new_page_addr = 0;
+
+#ifdef DEBUG_SWAP
+ printk("DebugVM: read_swap_cache_async entry %08lx%s\n",
+ entry, wait ? ", wait" : "");
+#endif
+repeat:
+ found_page = lookup_swap_cache(entry);
+ if (found_page) {
+ if (new_page)
+ __free_page(new_page);
+ return found_page;
+ }
+
+ /* The entry is not present. Lock down a new page, add it to
+ * the swap cache and read its contents. */
+ if (!new_page) {
+ new_page_addr = __get_free_page(GFP_KERNEL);
+ if (!new_page_addr)
+ return 0; /* Out of memory */
+ new_page = mem_map + MAP_NR(new_page_addr);
+ goto repeat; /* We might have stalled */
+ }
+
+ if (!add_to_swap_cache(new_page, entry)) {
+ free_page(new_page_addr);
+ return 0;
+ }
+ swap_duplicate(entry); /* Account for the swap cache */
+ set_bit(PG_locked, &new_page->flags);
+ rw_swap_page(READ, entry, (char *) new_page_addr, wait);
+#ifdef DEBUG_SWAP
+ printk("DebugVM: read_swap_cache_async created "
+ "entry %08lx at %p\n",
+ entry, (char *) page_address(new_page));
+#endif
+ return new_page;
+}
+
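
With this patch, swapped-out pages live in the ordinary page cache, hashed under the dummy swapper_inode with the swap entry playing the role of the file offset, which is why lookup_swap_cache() above is essentially find_page(&swapper_inode, entry). A toy model of keying a single cache by an (owner, offset) pair, with a sentinel owner for swap pages; the structures and the linear lookup are invented for illustration.

#include <stdio.h>

struct toy_inode { const char *name; };
struct toy_page  { struct toy_inode *inode; unsigned long offset; };

static struct toy_inode swapper_inode = { "swapper" };
static struct toy_page cache[16];
static int cached;

static void add_to_cache(struct toy_inode *owner, unsigned long offset)
{
    cache[cached].inode = owner;
    cache[cached].offset = offset;
    cached++;
}

static struct toy_page *find_page(struct toy_inode *owner, unsigned long offset)
{
    for (int i = 0; i < cached; i++)
        if (cache[i].inode == owner && cache[i].offset == offset)
            return &cache[i];
    return NULL;
}

int main(void)
{
    struct toy_inode file = { "some file" };

    add_to_cache(&file, 0);                 /* ordinary file page */
    add_to_cache(&swapper_inode, 0x42);     /* swap-cache page    */

    /* The same lookup routine serves both kinds of page: */
    printf("%s\n", find_page(&file, 0)->inode->name);
    printf("%s\n", find_page(&swapper_inode, 0x42)->inode->name);
    return 0;
}
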
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 13d2436ba..8608db8d8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,6 +21,7 @@
#include <linux/malloc.h>
#include <linux/blkdev.h> /* for blk_size */
#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
#include <asm/bitops.h>
#include <asm/pgtable.h>
@@ -51,8 +52,6 @@ static inline int scan_swap_map(struct swap_info_struct *si)
offset = si->cluster_next++;
if (si->swap_map[offset])
continue;
- if (test_bit(offset, si->swap_lockmap))
- continue;
si->cluster_nr--;
goto got_page;
}
@@ -61,8 +60,6 @@ static inline int scan_swap_map(struct swap_info_struct *si)
for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
if (si->swap_map[offset])
continue;
- if (test_bit(offset, si->swap_lockmap))
- continue;
si->lowest_bit = offset;
got_page:
si->swap_map[offset] = 1;
@@ -129,6 +126,7 @@ void swap_free(unsigned long entry)
if (!entry)
goto out;
+
type = SWP_TYPE(entry);
if (type & SHM_SWP_TYPE)
goto out;
@@ -152,6 +150,10 @@ void swap_free(unsigned long entry)
if (!--p->swap_map[offset])
nr_swap_pages++;
}
+#ifdef DEBUG_SWAP
+ printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
+ entry, p->swap_map[offset]);
+#endif
out:
return;
@@ -172,42 +174,38 @@ bad_free:
/*
* The swap entry has been read in advance, and we return 1 to indicate
* that the page has been used or is no longer needed.
+ *
+ * Always set the resulting pte to be nowrite (the same as COW pages
+ * after one process has exited). We don't know just how many ptes will
+ * share this swap entry, so be cautious and let do_wp_page work out
+ * what to do if a write is requested later.
*/
-static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
+static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
pte_t *dir, unsigned long entry, unsigned long page)
{
pte_t pte = *dir;
if (pte_none(pte))
- return 0;
+ return;
if (pte_present(pte)) {
- struct page *pg;
- unsigned long page_nr = MAP_NR(pte_page(pte));
- unsigned long pg_swap_entry;
-
- if (page_nr >= max_mapnr)
- return 0;
- pg = mem_map + page_nr;
- if (!(pg_swap_entry = in_swap_cache(pg)))
- return 0;
- if (SWP_TYPE(pg_swap_entry) != SWP_TYPE(entry))
- return 0;
- delete_from_swap_cache(pg);
+ /* If this entry is swap-cached, then page must already
+ hold the right address for any copies in physical
+ memory */
+ if (pte_page(pte) != page)
+ return;
+ /* We will be removing the swap cache in a moment, so... */
set_pte(dir, pte_mkdirty(pte));
- if (pg_swap_entry != entry)
- return 0;
- free_page(page);
- return 1;
+ return;
}
if (pte_val(pte) != entry)
- return 0;
- set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
- ++vma->vm_mm->rss;
+ return;
+ set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
- return 1;
+ atomic_inc(&mem_map[MAP_NR(page)].count);
+ ++vma->vm_mm->rss;
}
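
As a reading aid for the rewritten unuse_pte(), here is a userspace restatement of its three cases with plain flags standing in for real ptes (everything is illustrative); the point is that a freshly swapped-in mapping ends up read-only and dirty, leaving do_wp_page() to sort out write access later:

/* Userspace model of the three cases unuse_pte() distinguishes. */
#include <stdio.h>

enum pte_kind { PTE_NONE, PTE_PRESENT, PTE_SWAP };

struct model_pte {
	enum pte_kind kind;
	unsigned long value;   /* page address or swap entry */
	int dirty, writable;
};

static void model_unuse_pte(struct model_pte *pte,
			    unsigned long entry, unsigned long page)
{
	switch (pte->kind) {
	case PTE_NONE:
		return;
	case PTE_PRESENT:
		/* Already mapped: if it maps our page, just dirty it so the
		 * contents cannot be silently dropped once the swap copy goes. */
		if (pte->value == page)
			pte->dirty = 1;
		return;
	case PTE_SWAP:
		if (pte->value != entry)
			return;
		/* Swap it in: read-only + dirty, as the comment above explains. */
		pte->kind = PTE_PRESENT;
		pte->value = page;
		pte->dirty = 1;
		pte->writable = 0;
		return;
	}
}

int main(void)
{
	struct model_pte pte = { .kind = PTE_SWAP, .value = 0x1234 };

	model_unuse_pte(&pte, 0x1234, 0xa000);
	printf("kind=%d value=%#lx dirty=%d writable=%d\n",
	       pte.kind, pte.value, pte.dirty, pte.writable);
	return 0;
}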
-static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
unsigned long entry, unsigned long page)
{
@@ -215,11 +213,11 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long end;
if (pmd_none(*dir))
- return 0;
+ return;
if (pmd_bad(*dir)) {
printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
pmd_clear(dir);
- return 0;
+ return;
}
pte = pte_offset(dir, address);
offset += address & PMD_MASK;
@@ -228,16 +226,13 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- if (unuse_pte(vma, offset+address-vma->vm_start, pte, entry,
- page))
- return 1;
+ unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
address += PAGE_SIZE;
pte++;
} while (address < end);
- return 0;
}
-static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
unsigned long entry, unsigned long page)
{
@@ -245,11 +240,11 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long offset, end;
if (pgd_none(*dir))
- return 0;
+ return;
if (pgd_bad(*dir)) {
printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
pgd_clear(dir);
- return 0;
+ return;
}
pmd = pmd_offset(dir, address);
offset = address & PGDIR_MASK;
@@ -258,30 +253,26 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- if (unuse_pmd(vma, pmd, address, end - address, offset, entry,
- page))
- return 1;
+ unuse_pmd(vma, pmd, address, end - address, offset, entry,
+ page);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
- return 0;
}
-static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
unsigned long entry, unsigned long page)
{
unsigned long start = vma->vm_start, end = vma->vm_end;
while (start < end) {
- if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
- return 1;
+ unuse_pgd(vma, pgdir, start, end - start, entry, page);
start = (start + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
}
- return 0;
}
-static int unuse_process(struct mm_struct * mm, unsigned long entry,
+static void unuse_process(struct mm_struct * mm, unsigned long entry,
unsigned long page)
{
struct vm_area_struct* vma;
@@ -290,13 +281,12 @@ static int unuse_process(struct mm_struct * mm, unsigned long entry,
* Go through process' page directory.
*/
if (!mm || mm == &init_mm)
- return 0;
+ return;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
- if (unuse_vma(vma, pgd, entry, page))
- return 1;
+ unuse_vma(vma, pgd, entry, page);
}
- return 0;
+ return;
}
/*
@@ -309,19 +299,14 @@ static int try_to_unuse(unsigned int type)
struct swap_info_struct * si = &swap_info[type];
struct task_struct *p;
unsigned long page = 0;
+ struct page *page_map;
unsigned long entry;
int i;
while (1) {
- if (!page) {
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- }
-
/*
- * Find a swap page in use and read it in.
- */
+ * Find a swap page in use and read it in.
+ */
for (i = 1 , entry = 0; i < si->max ; i++) {
if (si->swap_map[i] > 0 && si->swap_map[i] != 0x80) {
entry = SWP_ENTRY(type, i);
@@ -330,36 +315,31 @@ static int try_to_unuse(unsigned int type)
}
if (!entry)
break;
- read_swap_page(entry, (char *) page);
+ /* Get a page for the entry, using the existing swap
+ cache page if there is one. Otherwise, get a clean
+ page and read the swap into it. */
+ page_map = read_swap_cache(entry);
+ if (!page_map)
+ return -ENOMEM;
+ page = page_address(page_map);
read_lock(&tasklist_lock);
- for_each_task(p) {
- if (unuse_process(p->mm, entry, page)) {
- page = 0;
- goto unlock;
- }
- }
- unlock:
+ for_each_task(p)
+ unuse_process(p->mm, entry, page);
read_unlock(&tasklist_lock);
- if (page) {
- /*
- * If we couldn't find an entry, there are several
- * possible reasons: someone else freed it first,
- * we freed the last reference to an overflowed entry,
- * or the system has lost track of the use counts.
- */
- if (si->swap_map[i] != 0) {
- if (si->swap_map[i] != 127)
- printk("try_to_unuse: entry %08lx "
- "not in use\n", entry);
- si->swap_map[i] = 0;
- nr_swap_pages++;
- }
+ /* Now get rid of the extra reference to the temporary
+ page we've been using. */
+ if (PageSwapCache(page_map))
+ delete_from_swap_cache(page_map);
+ free_page(page);
+ if (si->swap_map[i] != 0) {
+ if (si->swap_map[i] != 127)
+ printk("try_to_unuse: entry %08lx "
+ "not in use\n", entry);
+ si->swap_map[i] = 0;
+ nr_swap_pages++;
}
}
-
- if (page)
- free_page(page);
return 0;
}
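
try_to_unuse() manufactures entries with SWP_ENTRY(type, i), and the rest of the file takes them apart with SWP_TYPE()/SWP_OFFSET(). The real encoding is architecture-specific; the sketch below only illustrates packing a swap-device index and a page offset into one word, with shift widths invented for the example:

/* Illustrative (not the real, arch-specific) swap entry packing. */
#include <stdio.h>

#define MODEL_TYPE_SHIFT 1
#define MODEL_TYPE_BITS  6
#define MODEL_OFFSET_SHIFT (MODEL_TYPE_SHIFT + MODEL_TYPE_BITS)

#define MODEL_SWP_ENTRY(type, offset) \
	((((unsigned long)(type)) << MODEL_TYPE_SHIFT) | \
	 (((unsigned long)(offset)) << MODEL_OFFSET_SHIFT))
#define MODEL_SWP_TYPE(entry) \
	(((entry) >> MODEL_TYPE_SHIFT) & ((1UL << MODEL_TYPE_BITS) - 1))
#define MODEL_SWP_OFFSET(entry) ((entry) >> MODEL_OFFSET_SHIFT)

int main(void)
{
	unsigned long entry = MODEL_SWP_ENTRY(2, 1234);

	printf("entry=%#lx type=%lu offset=%lu\n",
	       entry, MODEL_SWP_TYPE(entry), MODEL_SWP_OFFSET(entry));
	return 0;
}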
@@ -370,7 +350,7 @@ asmlinkage int sys_swapoff(const char * specialfile)
struct file filp;
int i, type, prev;
int err = -EPERM;
-
+
lock_kernel();
if (!suser())
goto out;
@@ -444,8 +424,6 @@ asmlinkage int sys_swapoff(const char * specialfile)
p->swap_device = 0;
vfree(p->swap_map);
p->swap_map = NULL;
- free_page((long) p->swap_lockmap);
- p->swap_lockmap = NULL;
p->flags = 0;
err = 0;
out:
@@ -505,6 +483,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
int error = -EPERM;
struct file filp;
static int least_priority = 0;
+ unsigned char *avail_map = 0;
lock_kernel();
if (!suser())
@@ -522,7 +501,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
p->swap_file = NULL;
p->swap_device = 0;
p->swap_map = NULL;
- p->swap_lockmap = NULL;
p->lowest_bit = 0;
p->highest_bit = 0;
p->cluster_nr = 0;
@@ -565,24 +543,24 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
}
} else if (!S_ISREG(swap_dentry->d_inode->i_mode))
goto bad_swap;
- p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
- if (!p->swap_lockmap) {
+ avail_map = (unsigned char *) get_free_page(GFP_USER);
+ if (!avail_map) {
printk("Unable to start swapping: out of memory :-)\n");
error = -ENOMEM;
goto bad_swap;
}
- read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
- if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
+ rw_swap_page_nocache(READ, SWP_ENTRY(type,0), (char *) avail_map);
+ if (memcmp("SWAP-SPACE",avail_map+PAGE_SIZE-10,10)) {
printk("Unable to find swap-space signature\n");
error = -EINVAL;
goto bad_swap;
}
- memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
+ memset(avail_map+PAGE_SIZE-10,0,10);
j = 0;
p->lowest_bit = 0;
p->highest_bit = 0;
for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
- if (test_bit(i,p->swap_lockmap)) {
+ if (test_bit(i,avail_map)) {
if (!p->lowest_bit)
p->lowest_bit = i;
p->highest_bit = i;
@@ -601,13 +579,12 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
goto bad_swap;
}
for (i = 1 ; i < p->max ; i++) {
- if (test_bit(i,p->swap_lockmap))
+ if (test_bit(i,avail_map))
p->swap_map[i] = 0;
else
p->swap_map[i] = 0x80;
}
p->swap_map[0] = 0x80;
- clear_page(p->swap_lockmap);
p->flags = SWP_WRITEOK;
p->pages = j;
nr_swap_pages += j;
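
The signature test and bitmap walk above follow the old-style swap header layout: an availability bitmap in the device's first page, with "SWAP-SPACE" in its last ten bytes and bit 0 (the header page itself) reserved. A userspace sketch of that parsing, assuming a 4096-byte page and x86-style least-significant-bit-first bit numbering (both assumptions for illustration, not taken from this patch):

/* Minimal userspace parse of an old-style "SWAP-SPACE" header page,
 * producing swap_map[] values the way sys_swapon() does above
 * (0 = usable, 0x80 = reserved/bad). */
#include <stdio.h>
#include <string.h>

#define MODEL_PAGE_SIZE 4096

static int parse_swap_header(const unsigned char *hdr,
			     unsigned char *swap_map, int max)
{
	int i, usable = 0;

	if (memcmp("SWAP-SPACE", hdr + MODEL_PAGE_SIZE - 10, 10) != 0)
		return -1;                       /* no signature */

	swap_map[0] = 0x80;                      /* header page is reserved */
	for (i = 1; i < max && i < 8 * MODEL_PAGE_SIZE; i++) {
		int avail = (hdr[i >> 3] >> (i & 7)) & 1;
		swap_map[i] = avail ? 0 : 0x80;
		usable += avail;
	}
	return usable;                           /* number of usable pages */
}

int main(void)
{
	static unsigned char hdr[MODEL_PAGE_SIZE];
	static unsigned char map[64];

	memcpy(hdr + MODEL_PAGE_SIZE - 10, "SWAP-SPACE", 10);
	hdr[0] = 0xFE;                           /* pages 1..7 available */
	printf("usable pages: %d\n", parse_swap_header(hdr, map, 64));
	return 0;
}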
@@ -634,15 +611,15 @@ bad_swap:
if(filp.f_op && filp.f_op->release)
filp.f_op->release(filp.f_dentry->d_inode,&filp);
bad_swap_2:
- free_page((long) p->swap_lockmap);
vfree(p->swap_map);
dput(p->swap_file);
p->swap_device = 0;
p->swap_file = NULL;
p->swap_map = NULL;
- p->swap_lockmap = NULL;
p->flags = 0;
out:
+ if (avail_map)
+ free_page((long) avail_map);
unlock_kernel();
return error;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a50684973..ebef7a362 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7,7 +7,7 @@
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to free_pages_high: 2.4.97, Rik van Riel.
- * Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $
+ * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
*/
#include <linux/mm.h>
@@ -61,7 +61,7 @@ static void init_swap_timer(void);
* have died while we slept).
*/
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
- unsigned long address, pte_t * page_table, int dma, int wait)
+ unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
unsigned long entry;
@@ -78,20 +78,62 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
page_map = mem_map + MAP_NR(page);
if (PageReserved(page_map)
|| PageLocked(page_map)
- || (dma && !PageDMA(page_map)))
+ || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
return 0;
- /* Deal with page aging. Pages age from being unused; they
- * rejuvenate on being accessed. Only swap old pages (age==0
- * is oldest). */
- if ((pte_dirty(pte) && delete_from_swap_cache(page_map))
- || pte_young(pte)) {
+
+ /*
+ * Deal with page aging. There are several special cases to
+ * consider:
+ *
+ * Page has been accessed, but is swap cached. If the page is
+ * getting sufficiently "interesting" --- its age is getting
+ * high --- then if we are sufficiently short of free swap
+ * high --- and we are sufficiently short of free swap
+ * pages, then delete the swap cache. We can only do this if
+ * the swap page's reference count is one: ie. there are no
+ * other references to it beyond the swap cache (as there must
+ * still be pte's pointing to it if count > 1).
+ *
+ * If the page has NOT been touched, and its age reaches zero,
+ * then we are swapping it out:
+ *
+ * If there is already a swap cache page for this page, then
+ * another process has already allocated swap space, so just
+ * dereference the physical page and copy in the swap entry
+ * from the swap cache.
+ *
+ * Note, we rely on all pages read in from swap either having
+ * the swap cache flag set, OR being marked writable in the pte,
+ * but NEVER BOTH. (It IS legal to be neither cached nor dirty,
+ * however.)
+ *
+ * -- Stephen Tweedie 1998 */
+
+ if (PageSwapCache(page_map)) {
+ if (pte_write(pte)) {
+ printk ("VM: Found a writable swap-cached page!\n");
+ return 0;
+ }
+ }
+
+ if (pte_young(pte)) {
set_pte(page_table, pte_mkold(pte));
touch_page(page_map);
+ /*
+ * We should test here to see if we want to recover any
+ * swap cache page. We do this if the page is seeing
+ * enough activity, AND we are sufficiently low on swap.
+ *
+ * We need to track both the number of available swap
+ * pages and the total number present before we can do
+ * this...
+ */
return 0;
}
+
age_page(page_map);
if (page_map->age)
return 0;
+
if (pte_dirty(pte)) {
if (vma->vm_ops && vma->vm_ops->swapout) {
pid_t pid = tsk->pid;
@@ -99,33 +141,83 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
kill_proc(pid, SIGBUS, 1);
} else {
- if (atomic_read(&page_map->count) != 1)
- return 0;
- if (!(entry = get_swap_page()))
- return 0;
+ /*
+ * This is a dirty, swappable page. First of all,
+ * get a suitable swap entry for it, and make sure
+ * we have the swap cache set up to associate the
+ * page with that swap entry.
+ */
+ if (PageSwapCache(page_map)) {
+ entry = page_map->offset;
+ } else {
+ entry = get_swap_page();
+ if (!entry)
+ return 0; /* No swap space left */
+ }
+
vma->vm_mm->rss--;
+ tsk->nswap++;
flush_cache_page(vma, address);
set_pte(page_table, __pte(entry));
flush_tlb_page(vma, address);
- tsk->nswap++;
- rw_swap_page(WRITE, entry, (char *) page, wait);
+ swap_duplicate(entry);
+
+ /* Now to write back the page. We have two
+ * cases: if the page is already part of the
+ * swap cache, then it is already on disk. Just
+ * free the page and return (we release the swap
+ * cache on the last accessor too).
+ *
+ * If we have made a new swap entry, then we
+ * start the write out to disk. If the page is
+ * shared, however, we still need to keep the
+ * copy in memory, so we add it to the swap
+ * cache. */
+ if (PageSwapCache(page_map)) {
+ free_page_and_swap_cache(page);
+ return (atomic_read(&page_map->count) == 0);
+ }
+ add_to_swap_cache(page_map, entry);
+ /* We checked we were unlocked way up above, and we
+ have been careful not to stall until here */
+ set_bit(PG_locked, &page_map->flags);
+ /* OK, do a physical write to swap. */
+ rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
}
- free_page(page);
+ /* Now we can free the current physical page. We also
+ * free up the swap cache if this is the last use of the
+ * page. Note that there is a race here: the page may
+ * still be shared COW by another process, but that
+ * process may exit while we are writing out the page
+ * asynchronously. That's no problem, shrink_mmap() can
+ * correctly clean up the occasional unshared page
+ * which gets left behind in the swap cache. */
+ free_page_and_swap_cache(page);
return 1; /* we slept: the process may not exist any more */
}
- if ((entry = find_in_swap_cache(page_map))) {
- if (atomic_read(&page_map->count) != 1) {
- set_pte(page_table, pte_mkdirty(pte));
- printk("Aiee.. duplicated cached swap-cache entry\n");
- return 0;
- }
+
+ /* The page was _not_ dirty, but still has a zero age. It must
+ * already be uptodate on disk. If it is in the swap cache,
+ * then we can just unlink the page now. Remove the swap cache
+ * too if this is the last user. */
+ if ((entry = in_swap_cache(page_map))) {
vma->vm_mm->rss--;
flush_cache_page(vma, address);
set_pte(page_table, __pte(entry));
flush_tlb_page(vma, address);
- free_page(page);
- return 1;
+ swap_duplicate(entry);
+ free_page_and_swap_cache(page);
+ return (atomic_read(&page_map->count) == 0);
}
+ /*
+ * A clean page to be discarded? Must be mmap()ed from
+ * somewhere. Unlink the pte, and tell the filemap code to
+ * discard any cached backing page if this is the last user.
+ */
+ if (PageSwapCache(page_map)) {
+ printk ("VM: How can this page _still_ be cached?");
+ return 0;
+ }
vma->vm_mm->rss--;
flush_cache_page(vma, address);
pte_clear(page_table);
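
Summarising the rewritten try_to_swap_out() for a page whose age has reached zero, the sketch below restates its decision tree in userspace; the enum and decide() helper are invented for illustration and deliberately omit locking, rss accounting and the vm_ops->swapout() path:

/* Compact restatement of the outcomes of the new try_to_swap_out(). */
#include <stdio.h>

enum outcome { KEEP, WRITE_TO_SWAP, REUSE_SWAP_ENTRY, DISCARD_CLEAN };

static enum outcome decide(int young, int age, int dirty, int swap_cached,
			   int has_swap_space)
{
	if (young || age > 0)
		return KEEP;                      /* still being used            */
	if (dirty) {
		if (swap_cached)
			return REUSE_SWAP_ENTRY;  /* already on disk, just unmap */
		return has_swap_space ? WRITE_TO_SWAP : KEEP;
	}
	if (swap_cached)
		return REUSE_SWAP_ENTRY;          /* clean and already on swap   */
	return DISCARD_CLEAN;                     /* clean mmap()ed file page    */
}

int main(void)
{
	printf("%d %d %d\n",
	       decide(0, 0, 1, 0, 1),   /* dirty, no cache: write out   */
	       decide(0, 0, 0, 1, 1),   /* clean, cached: drop to entry */
	       decide(0, 0, 0, 0, 1));  /* clean, uncached: discard     */
	return 0;
}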
@@ -150,7 +242,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
*/
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
- pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+ pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pte_t * pte;
unsigned long pmd_end;
@@ -172,7 +264,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
do {
int result;
tsk->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
+ result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
if (result)
return result;
address += PAGE_SIZE;
@@ -182,7 +274,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
}
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+ pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -202,7 +294,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
end = pgd_end;
do {
- int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
+ int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
if (result)
return result;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -212,7 +304,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
}
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *pgdir, unsigned long start, int dma, int wait)
+ pgd_t *pgdir, unsigned long start, int gfp_mask)
{
unsigned long end;
@@ -223,7 +315,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
end = vma->vm_end;
while (start < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
+ int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
if (result)
return result;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -232,7 +324,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
return 0;
}
-static int swap_out_process(struct task_struct * p, int dma, int wait)
+static int swap_out_process(struct task_struct * p, int gfp_mask)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -241,19 +333,20 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
* Go through process' page directory.
*/
address = p->swap_address;
- p->swap_address = 0;
/*
* Find the proper vm-area
*/
vma = find_vma(p->mm, address);
- if (!vma)
+ if (!vma) {
+ p->swap_address = 0;
return 0;
+ }
if (address < vma->vm_start)
address = vma->vm_start;
for (;;) {
- int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
+ int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
if (result)
return result;
vma = vma->vm_next;
@@ -270,7 +363,7 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-static int swap_out(unsigned int priority, int dma, int wait)
+static int swap_out(unsigned int priority, int gfp_mask)
{
struct task_struct * p, * pbest;
int counter, assign, max_cnt;
@@ -321,7 +414,7 @@ static int swap_out(unsigned int priority, int dma, int wait)
}
pbest->swap_cnt--;
- switch (swap_out_process(pbest, dma, wait)) {
+ switch (swap_out_process(pbest, gfp_mask)) {
case 0:
/*
* Clear swap_cnt so we don't look at this task
@@ -345,7 +438,7 @@ out:
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
-static inline int do_try_to_free_page(int priority, int dma, int wait)
+static inline int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
int i=6;
@@ -353,25 +446,27 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
/* Let the dcache know we're looking for memory ... */
shrink_dcache_memory();
+
/* Always trim SLAB caches when memory gets low. */
- (void) kmem_cache_reap(0, dma, wait);
+ kmem_cache_reap(gfp_mask);
- /* we don't try as hard if we're not waiting.. */
+ /* We try harder if we are waiting .. */
stop = 3;
- if (wait)
+ if (gfp_mask & __GFP_WAIT)
stop = 0;
+
switch (state) {
do {
case 0:
- if (shrink_mmap(i, dma))
+ if (shrink_mmap(i, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, dma))
+ if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
return 1;
state = 2;
default:
- if (swap_out(i, dma, wait))
+ if (swap_out(i, gfp_mask))
return 1;
state = 0;
i--;
@@ -387,12 +482,12 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
* now we need this so that we can do page allocations
* without holding the kernel lock etc.
*/
-int try_to_free_page(int priority, int dma, int wait)
+int try_to_free_page(int gfp_mask)
{
int retval;
lock_kernel();
- retval = do_try_to_free_page(priority,dma,wait);
+ retval = do_try_to_free_page(gfp_mask);
unlock_kernel();
return retval;
}
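
The state machine in do_try_to_free_page() depends on the switch falling through into a loop, so successive calls rotate the starting reclaimer between shrink_mmap(), shm_swap() and swap_out(). A self-contained userspace model with stub reclaimers (names and the exact loop condition are invented) shows the control flow:

/* Userspace model of the switch-into-loop state machine in
 * do_try_to_free_page(): `state` remembers where to start next time. */
#include <stdio.h>

static int model_shrink_mmap(int prio) { (void)prio; return 0; }
static int model_shm_swap(int prio)    { (void)prio; return 0; }
static int model_swap_out(int prio)    { (void)prio; return 1; }

static int model_try_to_free_page(int can_do_io, int can_wait)
{
	static int state = 0;
	int i = 6;
	int stop = can_wait ? 0 : 3;   /* try harder when allowed to sleep */

	switch (state) {
		do {
	case 0:
			if (model_shrink_mmap(i))
				return 1;
			state = 1;
	case 1:
			if (can_do_io && model_shm_swap(i))
				return 1;
			state = 2;
	default:
			if (model_swap_out(i))
				return 1;
			state = 0;
			i--;
		} while (i > stop);
	}
	return 0;
}

int main(void)
{
	printf("freed: %d\n", model_try_to_free_page(1, 1));
	return 0;
}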
@@ -406,7 +501,7 @@ int try_to_free_page(int priority, int dma, int wait)
void kswapd_setup(void)
{
int i;
- char *revision="$Revision: 1.23 $", *s, *e;
+ char *revision="$Revision: 1.5 $", *s, *e;
if ((s = strchr(revision, ':')) &&
(e = strchr(s, '$')))
@@ -423,6 +518,7 @@ void kswapd_setup(void)
*/
int kswapd(void *unused)
{
+ struct wait_queue wait = { current, NULL };
current->session = 1;
current->pgrp = 1;
sprintf(current->comm, "kswapd");
@@ -442,42 +538,63 @@ int kswapd(void *unused)
priorities. */
init_swap_timer();
-
+ add_wait_queue(&kswapd_wait, &wait);
while (1) {
- int fail;
+ int tries;
kswapd_awake = 0;
flush_signals(current);
run_task_queue(&tq_disk);
- interruptible_sleep_on(&kswapd_wait);
+ schedule();
+ current->state = TASK_INTERRUPTIBLE;
kswapd_awake = 1;
swapstats.wakeups++;
/* Do the background pageout:
- * We now only swap out as many pages as needed.
- * When we are truly low on memory, we swap out
- * synchronously (WAIT == 1). -- Rik.
- * If we've had too many consecutive failures,
- * go back to sleep to let other tasks run.
+ * When we've got loads of memory, we try
+ * (free_pages_high - nr_free_pages) times to
+ * free memory. As memory gets tighter, kswapd
+ * gets more and more aggressive. -- Rik.
*/
- for (fail = 0; fail++ < MAX_SWAP_FAIL;) {
- int pages, wait;
+ tries = free_pages_high - nr_free_pages;
+ if (tries < min_free_pages) {
+ tries = min_free_pages;
+ }
+ else if (nr_free_pages < (free_pages_high + free_pages_low) / 2) {
+ tries <<= 1;
+ if (nr_free_pages < free_pages_low) {
+ tries <<= 1;
+ if (nr_free_pages <= min_free_pages) {
+ tries <<= 1;
+ }
+ }
+ }
+ while (tries--) {
+ int gfp_mask;
- pages = nr_free_pages;
- if (nr_free_pages >= min_free_pages)
- pages += atomic_read(&nr_async_pages);
- if (pages >= free_pages_high)
+ if (free_memory_available())
break;
- wait = (pages < free_pages_low);
- if (try_to_free_page(GFP_KERNEL, 0, wait))
- fail = 0;
+ gfp_mask = __GFP_IO;
+ try_to_free_page(gfp_mask);
+ /*
+ * Syncing large chunks is faster than swapping
+ * synchronously (less head movement). -- Rik.
+ */
+ if (atomic_read(&nr_async_pages) >= SWAP_CLUSTER_MAX)
+ run_task_queue(&tq_disk);
+
}
- /*
- * Report failure if we couldn't reach the minimum goal.
- */
- if (nr_free_pages < min_free_pages)
- printk("kswapd: failed, got %d of %d\n",
- nr_free_pages, min_free_pages);
+#if 0
+ /*
+ * Report failure if we couldn't even reach min_free_pages.
+ */
+ if (nr_free_pages < min_free_pages)
+ printk("kswapd: failed, got %d of %d\n",
+ nr_free_pages, min_free_pages);
+#endif
}
+ /* As if we could ever get here - maybe we want to make this killable */
+ remove_wait_queue(&kswapd_wait, &wait);
+ return 0;
}
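
The new sizing of kswapd's work loop reads as: start from the free-page deficit and double the effort at each watermark crossed. A userspace sketch of just that calculation, with made-up watermark values:

/* Userspace model of the tries calculation in the new kswapd() loop. */
#include <stdio.h>

static int kswapd_tries(int nr_free, int min_free, int low, int high)
{
	int tries = high - nr_free;

	if (tries < min_free) {
		tries = min_free;
	} else if (nr_free < (high + low) / 2) {
		tries <<= 1;
		if (nr_free < low) {
			tries <<= 1;
			if (nr_free <= min_free)
				tries <<= 1;
		}
	}
	return tries;
}

int main(void)
{
	/* min_free=20, low=40, high=80 are illustrative thresholds */
	printf("plenty free: %d\n", kswapd_tries(70, 20, 40, 80));
	printf("getting low: %d\n", kswapd_tries(35, 20, 40, 80));
	printf("critical:    %d\n", kswapd_tries(10, 20, 40, 80));
	return 0;
}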
/*