author     Ralf Baechle <ralf@linux-mips.org>    1998-08-25 09:12:35 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-08-25 09:12:35 +0000
commit     c7fc24dc4420057f103afe8fc64524ebc25c5d37 (patch)
tree       3682407a599b8f9f03fc096298134cafba1c9b2f /mm
parent     1d793fade8b063fde3cf275bf1a5c2d381292cd9 (diff)
o Merge with Linux 2.1.116.
o New Newport console code.
o New G364 console code.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c      433
-rw-r--r--  mm/memory.c        90
-rw-r--r--  mm/mlock.c         17
-rw-r--r--  mm/mmap.c          96
-rw-r--r--  mm/mprotect.c      15
-rw-r--r--  mm/mremap.c        11
-rw-r--r--  mm/page_alloc.c   150
-rw-r--r--  mm/page_io.c       24
-rw-r--r--  mm/simp.c         435
-rw-r--r--  mm/slab.c          97
-rw-r--r--  mm/swap.c           8
-rw-r--r--  mm/swap_state.c    27
-rw-r--r--  mm/swapfile.c      10
-rw-r--r--  mm/vmalloc.c       39
-rw-r--r--  mm/vmscan.c       101
15 files changed, 680 insertions, 873 deletions
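
One visible addition in this merge is the sendfile(2) system call in mm/filemap.c (sys_sendfile(), built on do_generic_file_read() and file_send_actor()). The sketch below is illustrative only and not part of the commit: a minimal user-space caller, assuming a libc that exposes a sendfile() wrapper via <sys/sendfile.h>; the file arguments and error handling are invented for the example.

/* Illustrative user-space sketch of the new sendfile() call (not part of
 * this patch).  Assumes a libc sendfile() wrapper in <sys/sendfile.h>. */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int in_fd, out_fd;
	struct stat st;
	off_t offset = 0;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	in_fd = open(argv[1], O_RDONLY);
	out_fd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in_fd < 0 || out_fd < 0 || fstat(in_fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}
	/* The kernel reads the source through the page cache and hands each
	 * chunk to file_send_actor(), which writes it to the output file;
	 * "offset" is updated to the position reached in the source. */
	if (sendfile(out_fd, in_fd, &offset, st.st_size) < 0) {
		perror("sendfile");
		return 1;
	}
	close(in_fd);
	close(out_fd);
	return 0;
}
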
diff --git a/mm/filemap.c b/mm/filemap.c
index 3f2632a15..d0bf1270f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -117,12 +117,100 @@ repeat:
}
}
+/*
+ * Remove a page from the page cache and free it.
+ */
+void remove_inode_page(struct page *page)
+{
+ remove_page_from_hash_queue(page);
+ remove_page_from_inode_queue(page);
+ __free_page(page);
+}
+
+/*
+ * Check whether we can free this page.
+ */
+static inline int shrink_one_page(struct page *page, int gfp_mask)
+{
+ struct buffer_head *tmp, *bh;
+
+ if (PageLocked(page))
+ goto next;
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ goto next;
+ /* First of all, regenerate the page's referenced bit
+ * from any buffers in the page
+ */
+ bh = page->buffers;
+ if (bh) {
+ tmp = bh;
+ do {
+ if (buffer_touched(tmp)) {
+ clear_bit(BH_Touched, &tmp->b_state);
+ set_bit(PG_referenced, &page->flags);
+ }
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
+
+ /* Refuse to swap out all buffer pages */
+ if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
+ goto next;
+ }
+
+ /* We can't throw away shared pages, but we do mark
+ them as referenced. This relies on the fact that
+ no page is currently in both the page cache and the
+ buffer cache; we'd have to modify the following
+ test to allow for that case. */
+
+ switch (atomic_read(&page->count)) {
+ case 1:
+ /* is it a swap-cache or page-cache page? */
+ if (page->inode) {
+ if (test_and_clear_bit(PG_referenced, &page->flags)) {
+ touch_page(page);
+ break;
+ }
+ age_page(page);
+#if 0
+ if (page->age)
+ break;
+ if (page_cache_size * 100 < (page_cache.min_percent * num_physpages))
+ break;
+#endif
+ if (PageSwapCache(page)) {
+ delete_from_swap_cache(page);
+ return 1;
+ }
+ remove_inode_page(page);
+ return 1;
+ }
+ /* It's not a cache page, so we don't do aging.
+ * If it has been referenced recently, don't free it */
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ break;
+
+ /* is it a buffer cache page? */
+ if (bh && try_to_free_buffer(bh, &bh, 6))
+ return 1;
+ break;
+
+ default:
+ /* more than one user: we can't throw it away */
+ set_bit(PG_referenced, &page->flags);
+ /* fall through */
+ case 0:
+ /* nothing */
+ }
+next:
+ return 0;
+}
+
int shrink_mmap(int priority, int gfp_mask)
{
static unsigned long clock = 0;
- struct page * page;
unsigned long limit = num_physpages;
- struct buffer_head *tmp, *bh;
+ struct page * page;
int count_max, count_min;
count_max = (limit<<1) >> (priority>>1);
@@ -130,79 +218,20 @@ int shrink_mmap(int priority, int gfp_mask)
page = mem_map + clock;
do {
+ if (PageSkip(page)) {
+ /* next_hash is overloaded for PageSkip */
+ page = page->next_hash;
+ clock = page->map_nr;
+ }
+
+ if (shrink_one_page(page, gfp_mask))
+ return 1;
count_max--;
if (page->inode || page->buffers)
count_min--;
-
- if (PageLocked(page))
- goto next;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
- goto next;
- /* First of all, regenerate the page's referenced bit
- from any buffers in the page */
- bh = page->buffers;
- if (bh) {
- tmp = bh;
- do {
- if (buffer_touched(tmp)) {
- clear_bit(BH_Touched, &tmp->b_state);
- set_bit(PG_referenced, &page->flags);
- }
- tmp = tmp->b_this_page;
- } while (tmp != bh);
-
- /* Refuse to swap out all buffer pages */
- if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
- goto next;
- }
-
- /* We can't throw away shared pages, but we do mark
- them as referenced. This relies on the fact that
- no page is currently in both the page cache and the
- buffer cache; we'd have to modify the following
- test to allow for that case. */
-
- switch (atomic_read(&page->count)) {
- case 1:
- /* is it a swap-cache or page-cache page? */
- if (page->inode) {
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- touch_page(page);
- break;
- }
- age_page(page);
- if (page->age || page_cache_size * 100 < (page_cache.min_percent * num_physpages))
- break;
- if (PageSwapCache(page)) {
- delete_from_swap_cache(page);
- return 1;
- }
- remove_page_from_hash_queue(page);
- remove_page_from_inode_queue(page);
- __free_page(page);
- return 1;
- }
- /* It's not a cache page, so we don't do aging.
- * If it has been referenced recently, don't free it */
- if (test_and_clear_bit(PG_referenced, &page->flags))
- break;
-
- /* is it a buffer cache page? */
- if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
- return 1;
- break;
-
- default:
- /* more than one users: we can't throw it away */
- set_bit(PG_referenced, &page->flags);
- /* fall through */
- case 0:
- /* nothing */
- }
-next:
page++;
clock++;
- if (clock >= limit) {
+ if (clock >= max_mapnr) {
clock = 0;
page = mem_map;
}
@@ -216,20 +245,17 @@ next:
* free it from the page hash-queues etc, as we don't want to keep it
* in-core unnecessarily.
*/
-unsigned long page_unuse(unsigned long page)
+unsigned long page_unuse(struct page * page)
{
- struct page * p = mem_map + MAP_NR(page);
- int count = atomic_read(&p->count);
+ int count = atomic_read(&page->count);
if (count != 2)
return count;
- if (!p->inode)
+ if (!page->inode)
return count;
- if (PageSwapCache(p))
+ if (PageSwapCache(page))
panic ("Doing a normal page_unuse of a swap cache page");
- remove_page_from_hash_queue(p);
- remove_page_from_inode_queue(p);
- free_page(page);
+ remove_inode_page(page);
return 1;
}
@@ -303,6 +329,7 @@ static unsigned long try_to_read_ahead(struct file * file,
*/
page = mem_map + MAP_NR(page_cache);
add_to_page_cache(page, inode, offset, hash);
+ set_bit(PG_referenced, &page->flags);
inode->i_op->readpage(file, page);
page_cache = 0;
}
@@ -568,6 +595,23 @@ static inline unsigned long generic_file_readahead(int reada_ok,
return page_cache;
}
+/*
+ * "descriptor" for what we're up to with a read.
+ * This allows us to use the same read code yet
+ * have multiple different users of the data that
+ * we read from a file.
+ *
+ * The simplest case just copies the data to user
+ * mode.
+ */
+typedef struct {
+ size_t written;
+ size_t count;
+ char * buf;
+ int error;
+} read_descriptor_t;
+
+typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
* This is a generic file read routine, and uses the
@@ -577,23 +621,14 @@ static inline unsigned long generic_file_readahead(int reada_ok,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-
-ssize_t generic_file_read(struct file * filp, char * buf,
- size_t count, loff_t *ppos)
+static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
- ssize_t error, read;
size_t pos, pgpos, page_cache;
int reada_ok;
int max_readahead = get_max_readahead(inode);
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
- if (!count)
- return 0;
- error = 0;
- read = 0;
page_cache = 0;
pos = *ppos;
@@ -621,12 +656,12 @@ ssize_t generic_file_read(struct file * filp, char * buf,
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
- if (pos + count <= (PAGE_SIZE >> 1)) {
+ if (pos + desc->count <= (PAGE_SIZE >> 1)) {
filp->f_ramax = 0;
} else {
unsigned long needed;
- needed = ((pos + count) & PAGE_MASK) - pgpos;
+ needed = ((pos + desc->count) & PAGE_MASK) - pgpos;
if (filp->f_ramax < needed)
filp->f_ramax = needed;
@@ -679,20 +714,20 @@ success:
offset = pos & ~PAGE_MASK;
nr = PAGE_SIZE - offset;
- if (nr > count)
- nr = count;
if (nr > inode->i_size - pos)
nr = inode->i_size - pos;
- nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
- release_page(page);
- error = -EFAULT;
- if (!nr)
- break;
- buf += nr;
+
+ /*
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ nr = actor(desc, (const char *) (page_address(page) + offset), nr);
pos += nr;
- read += nr;
- count -= nr;
- if (count)
+ release_page(page);
+ if (nr && desc->count)
continue;
break;
}
@@ -710,7 +745,7 @@ no_cached_page:
*/
if (page_cache)
continue;
- error = -ENOMEM;
+ desc->error = -ENOMEM;
break;
}
@@ -739,11 +774,14 @@ no_cached_page:
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;
- error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- release_page(page);
- break;
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error)
+ goto found_page;
+ desc->error = error;
+ release_page(page);
+ break;
+ }
page_read_error:
/*
@@ -751,15 +789,18 @@ page_read_error:
* Try to re-read it _once_. We do this synchronously,
* because this happens only if there were errors.
*/
- error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error) {
+ wait_on_page(page);
+ if (PageUptodate(page) && !PageError(page))
+ goto success;
+ error = -EIO; /* Some unspecified error occurred.. */
+ }
+ desc->error = error;
+ release_page(page);
+ break;
}
- release_page(page);
- break;
}
*ppos = pos;
@@ -767,9 +808,159 @@ page_read_error:
if (page_cache)
free_page(page_cache);
UPDATE_ATIME(inode);
- if (!read)
- read = error;
- return read;
+}
+
+static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ unsigned long left;
+ unsigned long count = desc->count;
+
+ if (size > count)
+ size = count;
+ left = __copy_to_user(desc->buf, area, size);
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+ desc->count = count - size;
+ desc->written += size;
+ desc->buf += size;
+ return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+ ssize_t retval;
+
+ retval = -EFAULT;
+ if (access_ok(VERIFY_WRITE, buf, count)) {
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = buf;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ }
+ }
+ return retval;
+}
+
+static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ ssize_t written;
+ unsigned long count = desc->count;
+ struct file *file = (struct file *) desc->buf;
+ struct inode *inode = file->f_dentry->d_inode;
+ mm_segment_t old_fs;
+
+ if (size > count)
+ size = count;
+ down(&inode->i_sem);
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ written = file->f_op->write(file, area, size, &file->f_pos);
+ set_fs(old_fs);
+ up(&inode->i_sem);
+ if (written < 0) {
+ desc->error = written;
+ written = 0;
+ }
+ desc->count = count - written;
+ desc->written += written;
+ return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+ ssize_t retval;
+ struct file * in_file, * out_file;
+ struct inode * in_inode, * out_inode;
+
+ lock_kernel();
+
+ /*
+ * Get input file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ in_file = fget(in_fd);
+ if (!in_file)
+ goto out;
+ if (!(in_file->f_mode & FMODE_READ))
+ goto fput_in;
+ retval = -EINVAL;
+ in_inode = in_file->f_dentry->d_inode;
+ if (!in_inode)
+ goto fput_in;
+ if (!in_inode->i_op || !in_inode->i_op->readpage)
+ goto fput_in;
+ retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+ if (retval)
+ goto fput_in;
+
+ /*
+ * Get output file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ out_file = fget(out_fd);
+ if (!out_file)
+ goto fput_in;
+ if (!(out_file->f_mode & FMODE_WRITE))
+ goto fput_out;
+ retval = -EINVAL;
+ if (!out_file->f_op || !out_file->f_op->write)
+ goto fput_out;
+ out_inode = out_file->f_dentry->d_inode;
+ if (!out_inode)
+ goto fput_out;
+ retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
+ if (retval)
+ goto fput_out;
+
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+ loff_t pos = 0, *ppos;
+
+ retval = -EFAULT;
+ ppos = &in_file->f_pos;
+ if (offset) {
+ if (get_user(pos, offset))
+ goto fput_out;
+ ppos = &pos;
+ }
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = (char *) out_file;
+ desc.error = 0;
+ do_generic_file_read(in_file, ppos, &desc, file_send_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ if (offset)
+ put_user(pos, offset);
+ }
+
+
+fput_out:
+ fput(out_file);
+fput_in:
+ fput(in_file);
+out:
+ unlock_kernel();
+ return retval;
}
/*
@@ -903,7 +1094,7 @@ page_read_error:
goto success;
/*
- * Uhhuh.. Things didn't work out. Return zero to tell the
+ * Things didn't work out. Return zero to tell the
* mm layer so, possibly freeing the page cache page first.
*/
failure:
@@ -1257,6 +1448,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
struct vm_area_struct * vma;
int unmapped_error, error = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (start & ~PAGE_MASK)
goto out;
@@ -1304,6 +1496,7 @@ asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
}
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
@@ -1412,7 +1605,7 @@ page_wait:
set_bit(PG_uptodate, &page->flags);
do_update_page:
- /* Alright, the page is there. Now update it. */
+ /* All right, the page is there. Now update it. */
status = inode->i_op->updatepage(file, page, buf,
offset, bytes, sync);
done_with_page:
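
The read_descriptor_t / read_actor_t split above lets do_generic_file_read() feed page-cache data to different consumers (file_read_actor() for read(2), file_send_actor() for sendfile(2)). Below is a small user-space model of that pattern, not kernel code and not part of the patch: only the two typedefs mirror the diff, while copy_actor(), generic_read() and the driver in main() are invented to show how an actor consumes chunks and updates the descriptor.

/* User-space model of the descriptor/actor pattern (illustration only). */
#include <stdio.h>
#include <string.h>

typedef struct {
	size_t written;
	size_t count;
	char *buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);

/* Actor that copies into desc->buf, like file_read_actor() minus the
 * user-space copy and fault handling. */
static int copy_actor(read_descriptor_t *desc, const char *area, unsigned long size)
{
	if (size > desc->count)
		size = desc->count;
	memcpy(desc->buf, area, size);
	desc->count -= size;
	desc->written += size;
	desc->buf += size;
	return size;		/* bytes actually consumed */
}

/* Generic loop: hand fixed-size chunks to the actor until it is done. */
static void generic_read(const char *src, size_t len,
			 read_descriptor_t *desc, read_actor_t actor)
{
	const unsigned long chunk = 8;	/* stands in for PAGE_SIZE */
	size_t pos = 0;

	while (pos < len) {
		unsigned long nr = (len - pos < chunk) ? len - pos : chunk;
		nr = actor(desc, src + pos, nr);
		pos += nr;
		if (!nr || !desc->count)
			break;
	}
}

int main(void)
{
	const char data[] = "data flowing through the actor";
	char out[64];
	read_descriptor_t desc = { 0, sizeof(out) - 1, out, 0 };

	generic_read(data, strlen(data), &desc, copy_actor);
	out[desc.written] = '\0';
	printf("%zu bytes: %s\n", desc.written, out);
	return 0;
}
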
diff --git a/mm/memory.c b/mm/memory.c
index af4297702..77a814f07 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -44,6 +44,8 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -68,8 +70,6 @@ static inline void copy_cow_page(unsigned long from, unsigned long to)
copy_page(to, from);
}
-#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-
mem_map_t * mem_map = NULL;
/*
@@ -121,22 +121,41 @@ static inline void free_one_pgd(pgd_t * dir)
pmd_free(pmd);
}
+/* Low and high watermarks for page table cache.
+ The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
+ */
+int pgt_cache_water[2] = { 25, 50 };
+
+/* Returns the number of pages freed */
+int check_pgt_cache(void)
+{
+ return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
+}
+
+
/*
* This function clears all user-level page tables of a process - this
* is needed by execve(), so that old pages aren't in the way.
*/
void clear_page_tables(struct task_struct * tsk)
{
+ pgd_t * page_dir = tsk->mm->pgd;
int i;
- pgd_t * page_dir;
- page_dir = tsk->mm->pgd;
- if (!page_dir || page_dir == swapper_pg_dir) {
- printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
- return;
- }
+ if (!page_dir || page_dir == swapper_pg_dir)
+ goto out_bad;
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
free_one_pgd(page_dir + i);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+ return;
+
+out_bad:
+ printk(KERN_ERR
+ "clear_page_tables: %s trying to clear kernel pgd\n",
+ tsk->comm);
+ return;
}
/*
@@ -146,30 +165,34 @@ void clear_page_tables(struct task_struct * tsk)
*/
void free_page_tables(struct mm_struct * mm)
{
+ pgd_t * page_dir = mm->pgd;
int i;
- pgd_t * page_dir;
- page_dir = mm->pgd;
- if (page_dir) {
- if (page_dir == swapper_pg_dir) {
- printk("free_page_tables: Trying to free kernel pgd\n");
- return;
- }
- for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
- pgd_free(page_dir);
- }
+ if (!page_dir)
+ goto out;
+ if (page_dir == swapper_pg_dir)
+ goto out_bad;
+ for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
+ free_one_pgd(page_dir + i);
+ pgd_free(page_dir);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+out:
+ return;
+
+out_bad:
+ printk(KERN_ERR
+ "free_page_tables: Trying to free kernel pgd\n");
+ return;
}
int new_page_tables(struct task_struct * tsk)
{
- pgd_t * page_dir, * new_pg;
+ pgd_t * new_pg;
if (!(new_pg = pgd_alloc()))
return -ENOMEM;
- page_dir = pgd_offset(&init_mm, 0);
- memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD,
- (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t));
SET_PAGE_DIR(tsk, new_pg);
tsk->mm->pgd = new_pg;
return 0;
@@ -898,6 +921,9 @@ static inline void handle_pte_fault(struct task_struct *tsk,
do_wp_page(tsk, vma, address, write_access, pte);
}
+/*
+ * By the time we get here, we already hold the mm semaphore
+ */
void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
@@ -912,9 +938,27 @@ void handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
pte = pte_alloc(pmd, address);
if (!pte)
goto no_memory;
+ lock_kernel();
handle_pte_fault(tsk, vma, address, write_access, pte);
+ unlock_kernel();
update_mmu_cache(vma, address, *pte);
return;
no_memory:
oom(tsk);
}
+
+/*
+ * Simplistic page force-in..
+ */
+void make_pages_present(unsigned long addr, unsigned long end)
+{
+ int write;
+ struct vm_area_struct * vma;
+
+ vma = find_vma(current->mm, addr);
+ write = (vma->vm_flags & VM_WRITE) != 0;
+ while (addr < end) {
+ handle_mm_fault(current, vma, addr, write);
+ addr += PAGE_SIZE;
+ }
+}
diff --git a/mm/mlock.c b/mm/mlock.c
index 3a322f8a5..527443946 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -126,14 +126,7 @@ static int mlock_fixup(struct vm_area_struct * vma,
if (!(newflags & VM_LOCKED))
pages = -pages;
vma->vm_mm->locked_vm += pages;
-
- if (newflags & VM_LOCKED)
- while (start < end) {
- char c;
- get_user(c,(char *) start);
- __asm__ __volatile__("": :"r" (c));
- start += PAGE_SIZE;
- }
+ make_pages_present(start, end);
}
return retval;
}
@@ -192,6 +185,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len)
unsigned long lock_limit;
int error = -ENOMEM;
+ down(&current->mm->mmap_sem);
lock_kernel();
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
@@ -214,6 +208,7 @@ asmlinkage int sys_mlock(unsigned long start, size_t len)
error = do_mlock(start, len, 1);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
@@ -221,11 +216,13 @@ asmlinkage int sys_munlock(unsigned long start, size_t len)
{
int ret;
+ down(&current->mm->mmap_sem);
lock_kernel();
len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK;
start &= PAGE_MASK;
ret = do_mlock(start, len, 0);
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
@@ -263,6 +260,7 @@ asmlinkage int sys_mlockall(int flags)
unsigned long lock_limit;
int ret = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
goto out;
@@ -282,6 +280,7 @@ asmlinkage int sys_mlockall(int flags)
ret = do_mlockall(flags);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
@@ -289,8 +288,10 @@ asmlinkage int sys_munlockall(void)
{
int ret;
+ down(&current->mm->mmap_sem);
lock_kernel();
ret = do_mlockall(0);
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 52c185e85..172bcd8f1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -57,19 +57,19 @@ int vm_enough_memory(long pages)
* simple, it hopefully works in most obvious cases.. Easy to
* fool it, but this should catch most mistakes.
*/
- long freepages;
+ long free;
/* Sometimes we want to use more memory than we have. */
if (sysctl_overcommit_memory)
return 1;
- freepages = buffermem >> PAGE_SHIFT;
- freepages += page_cache_size;
- freepages >>= 1;
- freepages += nr_free_pages;
- freepages += nr_swap_pages;
- freepages -= num_physpages >> 4;
- return freepages > pages;
+ free = buffermem >> PAGE_SHIFT;
+ free += page_cache_size;
+ free >>= 1;
+ free += nr_free_pages;
+ free += nr_swap_pages;
+ free -= num_physpages >> 4;
+ return free > pages;
}
/* Remove one vm structure from the inode's i_mmap ring. */
@@ -92,6 +92,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ down(&mm->mmap_sem);
lock_kernel();
if (brk < mm->end_code)
goto out;
@@ -109,9 +110,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
/* Check against rlimit and stack.. */
rlim = current->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (brk - mm->end_code > rlim)
+ if (rlim < RLIM_INFINITY && brk - mm->end_code > rlim)
goto out;
/* Check against existing mmap mappings. */
@@ -132,6 +131,7 @@ set_brk:
out:
retval = mm->brk;
unlock_kernel();
+ up(&mm->mmap_sem);
return retval;
}
@@ -196,9 +196,14 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if ((prot & PROT_WRITE) && !(file->f_mode & 2))
return -EACCES;
+ /* Make sure we don't allow writing to an append-only file.. */
+ if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & 2))
+ return -EACCES;
+
/* make sure there are no mandatory locks on the file. */
if (locks_verify_locked(file->f_dentry->d_inode))
return -EAGAIN;
+
/* fall through */
case MAP_PRIVATE:
if (!(file->f_mode & 1))
@@ -316,16 +321,9 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
merge_segments(mm, vma->vm_start, vma->vm_end);
mm->total_vm += len >> PAGE_SHIFT;
- if ((flags & VM_LOCKED) && !(flags & VM_IO)) {
- unsigned long start = addr;
+ if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
- do {
- char c;
- get_user(c,(char *) start);
- len -= PAGE_SIZE;
- start += PAGE_SIZE;
- __asm__ __volatile__("": :"r" (c));
- } while (len > 0);
+ make_pages_present(addr, addr + len);
}
return addr;
@@ -428,30 +426,10 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr,
insert_vm_struct(current->mm, mpnt);
}
- /* Close the current area ... */
- if (area->vm_ops && area->vm_ops->close) {
- end = area->vm_end; /* save new end */
- area->vm_end = area->vm_start;
- area->vm_ops->close(area);
- area->vm_end = end;
- }
- /* ... then reopen and reinsert. */
- if (area->vm_ops && area->vm_ops->open)
- area->vm_ops->open(area);
insert_vm_struct(current->mm, area);
return 1;
}
-asmlinkage int sys_munmap(unsigned long addr, size_t len)
-{
- int ret;
-
- lock_kernel();
- ret = do_munmap(addr, len);
- unlock_kernel();
- return ret;
-}
-
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -460,7 +438,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len)
int do_munmap(unsigned long addr, size_t len)
{
struct mm_struct * mm;
- struct vm_area_struct *mpnt, *next, *free, *extra;
+ struct vm_area_struct *mpnt, *free, *extra;
int freed;
if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
@@ -481,6 +459,11 @@ int do_munmap(unsigned long addr, size_t len)
if (!mpnt)
return 0;
+ /* If we'll make "hole", check the vm areas limit */
+ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) &&
+ mm->map_count > MAX_MAP_COUNT)
+ return -ENOMEM;
+
/*
* We may need one additional vma to fix up the mappings ...
* and this is the last chance for an easy error exit.
@@ -489,9 +472,7 @@ int do_munmap(unsigned long addr, size_t len)
if (!extra)
return -ENOMEM;
- next = mpnt->vm_next;
-
- /* we have mpnt->vm_next = next and addr < mpnt->vm_end */
+ /* we have addr < mpnt->vm_end */
free = NULL;
for ( ; mpnt && mpnt->vm_start < addr+len; ) {
struct vm_area_struct *next = mpnt->vm_next;
@@ -505,13 +486,6 @@ int do_munmap(unsigned long addr, size_t len)
mpnt = next;
}
- if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) {
- if (mm->map_count > MAX_MAP_COUNT) {
- kmem_cache_free(vm_area_cachep, extra);
- return -ENOMEM;
- }
- }
-
/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
* If the one of the segments is only being partially unmapped,
@@ -555,6 +529,18 @@ int do_munmap(unsigned long addr, size_t len)
return 0;
}
+asmlinkage int sys_munmap(unsigned long addr, size_t len)
+{
+ int ret;
+
+ down(&current->mm->mmap_sem);
+ lock_kernel();
+ ret = do_munmap(addr, len);
+ unlock_kernel();
+ up(&current->mm->mmap_sem);
+ return ret;
+}
+
/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
@@ -630,13 +616,13 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
* This assumes that the list is ordered by address.
* We don't need to traverse the entire list, only those segments
* which intersect or are adjacent to a given interval.
+ *
+ * We must already hold the mm semaphore when we get here..
*/
void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
struct vm_area_struct *prev, *mpnt, *next;
- down(&mm->mmap_sem);
-
prev = NULL;
mpnt = mm->mmap;
while(mpnt && mpnt->vm_end <= start_addr) {
@@ -644,7 +630,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
mpnt = mpnt->vm_next;
}
if (!mpnt)
- goto no_vma;
+ return;
next = mpnt->vm_next;
@@ -700,8 +686,6 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
mpnt = prev;
}
mm->mmap_cache = NULL; /* Kill the cache. */
-no_vma:
- up(&mm->mmap_sem);
}
__initfunc(void vma_init(void))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5dac4cd..cc78e10ab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -208,18 +208,20 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
struct vm_area_struct * vma, * next;
int error = -EINVAL;
- lock_kernel();
if (start & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
- goto out;
+ return -EINVAL;
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
- goto out;
- error = 0;
+ return -EINVAL;
if (end == start)
- goto out;
+ return 0;
+
+ down(&current->mm->mmap_sem);
+ lock_kernel();
+
vma = find_vma(current->mm, start);
error = -EFAULT;
if (!vma || vma->vm_start > start)
@@ -256,5 +258,6 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot)
merge_segments(current->mm, start, end);
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index a31a0ae14..cd7a7eb4a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,6 +21,8 @@
#include <asm/system.h>
#include <asm/pgtable.h>
+extern int vm_enough_memory(long pages);
+
static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t * pgd;
@@ -167,6 +169,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
+ down(&current->mm->mmap_sem);
lock_kernel();
if (addr & ~PAGE_MASK)
goto out;
@@ -178,7 +181,7 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
* the unnecessary pages..
*/
ret = addr;
- if (old_len > new_len) {
+ if (old_len >= new_len) {
do_munmap(addr+new_len, old_len - new_len);
goto out;
}
@@ -204,6 +207,11 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
> current->rlim[RLIMIT_AS].rlim_cur)
goto out;
+ /* Private writable mapping? Check memory availability.. */
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
+ !(flags & MAP_NORESERVE) &&
+ !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT))
+ goto out;
/* old_len exactly to the end of the area.. */
if (old_len == vma->vm_end - addr &&
@@ -233,5 +241,6 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
ret = -ENOMEM;
out:
unlock_kernel();
+ up(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d61d74f44..c51db59d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -98,53 +98,33 @@ static inline void remove_mem_queue(struct page * entry)
*
* Hint: -mask = 1+~mask
*/
-static spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
/*
- * This routine is used by the kernel swap deamon to determine
+ * This routine is used by the kernel swap daemon to determine
* whether we have "enough" free pages. It is fairly arbitrary,
- * but this had better return false if any reasonable "get_free_page()"
- * allocation could currently fail..
+ * having a low-water and high-water mark.
*
- * This will return zero if no list was found, non-zero
- * if there was memory (the bigger, the better).
+ * This returns:
+ * 0 - urgent need for memory
+ * 1 - need some memory, but do it slowly in the background
+ * 2 - no need to even think about it.
*/
-int free_memory_available(int nr)
+int free_memory_available(void)
{
- int retval = 0;
- unsigned long flags;
- struct free_area_struct * list;
+ static int available = 1;
- /*
- * If we have more than about 3% to 5% of all memory free,
- * consider it to be good enough for anything.
- * It may not be, due to fragmentation, but we
- * don't want to keep on forever trying to find
- * free unfragmented memory.
- * Added low/high water marks to avoid thrashing -- Rik.
- */
- if (nr_free_pages > (nr ? freepages.low : freepages.high))
- return nr+1;
+ if (nr_free_pages < freepages.low) {
+ available = 0;
+ return 0;
+ }
- list = free_area + NR_MEM_LISTS;
- spin_lock_irqsave(&page_alloc_lock, flags);
- /* We fall through the loop if the list contains one
- * item. -- thanks to Colin Plumb <colin@nyx.net>
- */
- do {
- list--;
- /* Empty list? Bad - we need more memory */
- if (list->next == memory_head(list))
- break;
- /* One item on the list? Look further */
- if (list->next->next == memory_head(list))
- continue;
- /* More than one item? We're ok */
- retval = nr + 1;
- break;
- } while (--nr >= 0);
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- return retval;
+ if (nr_free_pages > freepages.high) {
+ available = 1;
+ return 2;
+ }
+
+ return available;
}
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
@@ -182,9 +162,11 @@ void __free_page(struct page *page)
if (PageSwapCache(page))
panic ("Freeing swap cache page");
free_pages_ok(page->map_nr, 0);
+ return;
}
if (PageSwapCache(page) && atomic_read(&page->count) == 1)
- panic ("Releasing swap cache page");
+ printk(KERN_WARNING "VM: Releasing swap cache page at %p",
+ __builtin_return_address(0));
}
void free_pages(unsigned long addr, unsigned long order)
@@ -202,8 +184,9 @@ void free_pages(unsigned long addr, unsigned long order)
return;
}
if (PageSwapCache(map) && atomic_read(&map->count) == 1)
- panic ("Releasing swap cache pages at %p",
- __builtin_return_address(0));
+ printk(KERN_WARNING
+ "VM: Releasing swap cache pages at %p",
+ __builtin_return_address(0));
}
}
@@ -214,13 +197,11 @@ void free_pages(unsigned long addr, unsigned long order)
change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-#define RMQUEUE(order, maxorder, dma) \
+#define RMQUEUE(order, dma) \
do { struct free_area_struct * area = free_area+order; \
unsigned long new_order = order; \
do { struct page *prev = memory_head(area), *ret = prev->next; \
while (memory_head(area) != ret) { \
- if (new_order >= maxorder && ret->next == prev) \
- break; \
if (!dma || CAN_DMA(ret)) { \
unsigned long map_nr = ret->map_nr; \
(prev->next = ret->next)->prev = prev; \
@@ -252,39 +233,46 @@ do { unsigned long size = 1 << high; \
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
- unsigned long flags, maxorder;
+ unsigned long flags;
if (order >= NR_MEM_LISTS)
goto nopage;
- /*
- * "maxorder" is the highest order number that we're allowed
- * to empty in order to find a free page..
- */
- maxorder = NR_MEM_LISTS-1;
- if (gfp_mask & __GFP_HIGH)
- maxorder = NR_MEM_LISTS;
-
- if (in_interrupt() && (gfp_mask & __GFP_WAIT)) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- return_address());
- gfp_mask &= ~__GFP_WAIT;
+ if (gfp_mask & __GFP_WAIT) {
+ if (in_interrupt()) {
+ static int count = 0;
+ if (++count < 5) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
+ }
+ goto nopage;
}
- }
- for (;;) {
- spin_lock_irqsave(&page_alloc_lock, flags);
- RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA));
- spin_unlock_irqrestore(&page_alloc_lock, flags);
- if (!(gfp_mask & __GFP_WAIT))
- break;
- if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
- break;
- gfp_mask &= ~__GFP_WAIT; /* go through this only once */
- maxorder = NR_MEM_LISTS; /* Allow anything this time */
+ if (freepages.min > nr_free_pages) {
+ int freed;
+ freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
+ /*
+ * Low priority (user) allocations must not
+ * succeed if we didn't have enough memory
+ * and we couldn't get more..
+ */
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ goto nopage;
+ }
}
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ RMQUEUE(order, (gfp_mask & GFP_DMA));
+ spin_unlock_irqrestore(&page_alloc_lock, flags);
+
+ /*
+ * If we failed to find anything, we'll return NULL, but we'll
+ * wake up kswapd _now_ ad even wait for it synchronously if
+ * we can.. This way we'll at least make some forward progress
+ * over time.
+ */
+ wake_up(&kswapd_wait);
+ if (gfp_mask & __GFP_WAIT)
+ schedule();
nopage:
return 0;
}
@@ -300,6 +288,11 @@ void show_free_areas(void)
unsigned long total = 0;
printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
+ printk("Free: %d (%d %d %d)\n",
+ nr_free_pages,
+ freepages.min,
+ freepages.low,
+ freepages.high);
spin_lock_irqsave(&page_alloc_lock, flags);
for (order=0 ; order < NR_MEM_LISTS; order++) {
struct page * tmp;
@@ -329,22 +322,23 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e
{
mem_map_t * p;
unsigned long mask = PAGE_MASK;
- int i;
+ unsigned long i;
/*
* Select nr of pages we try to keep free for important stuff
- * with a minimum of 48 pages and a maximum of 256 pages, so
+ * with a minimum of 10 pages and a maximum of 256 pages, so
* that we don't waste too much memory on large systems.
- * This is totally arbitrary.
+ * This is fairly arbitrary, but based on some behaviour
+ * analysis.
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 48)
- i = 48;
+ if (i < 10)
+ i = 10;
if (i > 256)
i = 256;
freepages.min = i;
- freepages.low = i << 1;
- freepages.high = freepages.low + i;
+ freepages.low = i * 2;
+ freepages.high = i * 3;
mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
p = mem_map + MAP_NR(end_mem);
start_mem = LONG_ALIGN((unsigned long) p);
diff --git a/mm/page_io.c b/mm/page_io.c
index eb436f7b7..7e5a35186 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -74,18 +74,19 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
return;
}
if (p->swap_map && !p->swap_map[offset]) {
- printk("Hmm.. Trying to %s unallocated swap (%08lx)\n",
- (rw == READ) ? "read" : "write",
- entry);
+ printk(KERN_ERR "rw_swap_page: "
+ "Trying to %s unallocated swap (%08lx)\n",
+ (rw == READ) ? "read" : "write", entry);
return;
}
if (!(p->flags & SWP_USED)) {
- printk("Trying to swap to unused swap-device\n");
+ printk(KERN_ERR "rw_swap_page: "
+ "Trying to swap to unused swap-device\n");
return;
}
if (!PageLocked(page)) {
- printk("VM: swap page is unlocked\n");
+ printk(KERN_ERR "VM: swap page is unlocked\n");
return;
}
@@ -111,11 +112,11 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
* hashing for locked pages.
*/
if (!PageSwapCache(page)) {
- printk("VM: swap page is not in swap cache\n");
+ printk(KERN_ERR "VM: swap page is not in swap cache\n");
return;
}
if (page->offset != entry) {
- printk ("swap entry mismatch");
+ printk (KERN_ERR "VM: swap entry mismatch\n");
return;
}
@@ -142,7 +143,7 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
if (swapf->i_op->bmap == NULL
&& swapf->i_op->smap != NULL){
/*
- With MsDOS, we use msdos_smap which return
+ With MS-DOS, we use msdos_smap which return
a sector number (not a cluster or block number).
It is a patch to enable the UMSDOS project.
Other people are working on better solution.
@@ -179,11 +180,14 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait)
clear_bit(PG_locked, &page->flags);
wake_up(&page->wait);
} else
- printk("rw_swap_page: no swap file or device\n");
+ printk(KERN_ERR "rw_swap_page: no swap file or device\n");
+ /* This shouldn't happen, but check to be sure. */
+ if (atomic_read(&page->count) == 1)
+ printk(KERN_ERR "rw_swap_page: page unused while waiting!\n");
atomic_dec(&page->count);
if (offset && !test_and_clear_bit(offset,p->swap_lockmap))
- printk("rw_swap_page: lock already cleared\n");
+ printk(KERN_ERR "rw_swap_page: lock already cleared\n");
wake_up(&lock_queue);
#ifdef DEBUG_SWAP
printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n",
diff --git a/mm/simp.c b/mm/simp.c
deleted file mode 100644
index 581cde3d7..000000000
--- a/mm/simp.c
+++ /dev/null
@@ -1,435 +0,0 @@
-#define NULL 0
-/*
- * mm/simp.c -- simple allocator for cached objects
- *
- * (C) 1997 Thomas Schoebel-Theuer
- */
-
-#include <linux/simp.h>
-#include <linux/tasks.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-#include <asm/spinlock.h>
-
-/* The next two defines can be independently enabled for debugging */
-/*#define DEBUG*/
-/*#define DEAD_BEEF*/
-
-#ifdef DEAD_BEEF
-#define DEBUG_BEEF 1
-#else
-#define DEBUG_BEEF 0
-#endif
-
-#ifdef __SMP__
-#define NR_PROCESSORS NR_CPUS
-#define GLOBAL_SIZE CHUNK_SIZE
-#else
-#define NR_PROCESSORS 1
-#define GLOBAL_SIZE PAGE_SIZE
-#endif
-
-#define POSTBUFFER_SIZE 63
-#define ORDER 2
-#define CHUNK_SIZE (PAGE_SIZE*(1<<ORDER))
-#define CHUNK_BASE(ptr) (struct header*)(((unsigned long)(ptr)) & ~(CHUNK_SIZE-1))
-#define CHUNK_END(hdr) (void**)((char*)(hdr) + CHUNK_SIZE)
-
-#define COLOR_INCREMENT (8*sizeof(void*)) /* should be 1 cache line */
-#define ALIGN_CACHE(adr) ((((((unsigned long)adr) - 1) / COLOR_INCREMENT) + 1) * COLOR_INCREMENT)
-#define HEADER_SIZE ALIGN_CACHE(sizeof(struct header))
-#define ELEM_SIZE ALIGN_CACHE(sizeof(struct elem))
-#define FILL_TYPE(name,wrongsize) char name[ALIGN_CACHE(wrongsize)-(wrongsize)]
-
-#define MAX_SIMPS ((GLOBAL_SIZE / sizeof(struct simp)) - 1)
-
-struct header { /* this is at the beginning of each memory region */
- /* 1st cache line */
- void ** index;
- void ** fresh;
- struct simp * father;
- void ** emptypos;
- struct header * next;
- structor again_ctor;
- structor first_ctor;
- void * fill[1];
-#ifdef DEBUG
- /* 2nd cache line */
- char magic[32];
-#endif
-};
-
-struct per_processor {
- void ** buffer_pos;
- void * postbuffer[POSTBUFFER_SIZE];
-};
-
-struct simp {
- /* 1st cache lines */
- struct per_processor private[NR_PROCESSORS];
- /* next cache line */
- struct header * usable_list;
- spinlock_t lock;
- /* This value is negative on Alpha SMP. */
- /* char fill[sizeof(void*) - sizeof(spinlock_t)]; */
- long real_size;
- long max_elems;
- structor again_ctor;
- structor first_ctor;
- structor dtor;
- long fill2;
- /* next cache line */
- long create_offset;
- long color;
- long max_color;
- long size;
- long fill3[4];
- /* next cache line */
- char name[32];
-};
-
-struct global_data {
- /* 1st cache line */
- long changed_flag;
- long nr_simps;
- spinlock_t lock;
- char fill[(6+8)*sizeof(void*)+sizeof(void*)-sizeof(spinlock_t)];
- /* rest */
- struct simp simps[MAX_SIMPS];
-};
-
-static struct global_data * global = NULL;
-
-#ifdef DEBUG
-static char global_magic[32] = "SIMP header SdC581oi9rY20051962\n";
-#endif
-
-struct simp * simp_create(char * name, long size,
- structor first_ctor,
- structor again_ctor,
- structor dtor)
-{
- struct simp * simp;
- long fraction;
- long real_size;
- int cpu;
-
- if(!global) {
-#ifdef __SMP__
- global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER);
- memset(global, 0, CHUNK_SIZE);
-#else
- global = (struct global_data*)get_free_page(GFP_KERNEL);
-#endif
- spin_lock_init(&global->lock);
- }
-
- spin_lock(&global->lock);
- simp = &global->simps[global->nr_simps++];
- spin_unlock(&global->lock);
-
- if(global->nr_simps >= MAX_SIMPS) {
- printk("SIMP: too many simps allocated\n");
- return NULL;
- }
- memset(simp, 0, sizeof(struct simp));
- spin_lock_init(&simp->lock);
- strncpy(simp->name, name, 15);
- simp->size = size;
- simp->real_size = real_size = ALIGN_CACHE(size);
- /* allow aggregation of very small objects in 2-power fractions of
- * cachelines */
- fraction = COLOR_INCREMENT / 2;
- while(size <= fraction && fraction >= sizeof(void*)) {
- simp->real_size = fraction;
- fraction >>= 1;
- }
- simp->first_ctor = first_ctor;
- simp->again_ctor = again_ctor;
- simp->dtor = dtor;
-
- real_size += sizeof(void*);
- simp->max_elems = (CHUNK_SIZE - HEADER_SIZE) / real_size;
- simp->max_color = (CHUNK_SIZE - HEADER_SIZE) % real_size;
- for(cpu = 0; cpu < NR_PROCESSORS; cpu++) {
- struct per_processor * private = &simp->private[cpu];
- private->buffer_pos = private->postbuffer;
- }
- return simp;
-}
-
-/* Do *not* inline this, it clobbers too many registers... */
-static void alloc_header(struct simp * simp)
-{
- struct header * hdr;
- char * ptr;
- void ** index;
- long count;
-
- spin_unlock(&simp->lock);
- for(;;) {
- hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER);
- if(hdr)
- break;
- if(!simp_garbage())
- return;
- }
-#ifdef DEBUG
- if(CHUNK_BASE(hdr) != hdr)
- panic("simp: bad kernel page alignment");
-#endif
-
- memset(hdr, 0, HEADER_SIZE);
-#ifdef DEBUG
- memcpy(hdr->magic, global_magic, sizeof(global_magic));
-#endif
- hdr->father = simp;
- hdr->again_ctor = simp->again_ctor;
- hdr->first_ctor = simp->first_ctor;
-
- /* note: races on simp->color don't produce any error :-) */
- ptr = ((char*)hdr) + HEADER_SIZE + simp->color;
- index = CHUNK_END(hdr);
- for(count = 0; count < simp->max_elems; count++) {
- *--index = ptr;
- ptr += simp->real_size;
- /* note: constructors are not called here in bunch but
- * instead at each single simp_alloc(), in order
- * to maximize chances that the cache will be
- * polluted after a simp_alloc() anyway,
- * and not here. */
- }
- hdr->index = hdr->fresh = hdr->emptypos = index;
-
- spin_lock(&simp->lock);
- simp->color += COLOR_INCREMENT;
- if(simp->color >= simp->max_color)
- simp->color = 0;
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
-}
-
-/* current x86 memcpy() is horribly moving around registers for nothing,
- * is doing unnecessary work if the size is dividable by a power-of-two,
- * and it clobbers way too many registers.
- * This results in nearly any other register being transfered to stack.
- * Fixing this would be a major win for the whole kernel!
- */
-static void ** bunch_alloc(struct simp * simp, void ** buffer)
-{
- struct header * hdr;
- void ** index;
- void ** to;
- void ** end;
- structor todo;
- long length;
-
- spin_lock(&simp->lock);
- hdr = simp->usable_list;
- if(!hdr) {
- alloc_header(simp);
- hdr = simp->usable_list;
- if(!hdr) {
- spin_unlock(&simp->lock);
- *buffer = NULL;
- return buffer+1;
- }
- }
-
- index = hdr->index;
- end = hdr->fresh;
- todo = hdr->again_ctor;
- if(index == end) {
- end = CHUNK_END(hdr);
- todo = hdr->first_ctor;
- }
- to = index + POSTBUFFER_SIZE/2;
- if(to >= end) {
- to = end;
- if(to == CHUNK_END(hdr)) {
- simp->usable_list = hdr->next;
- hdr->next = NULL;
- }
- }
- if(to > hdr->fresh)
- hdr->fresh = to;
- hdr->index = to;
- length = ((unsigned long)to) - (unsigned long)index;
- to = buffer + (length/sizeof(void**));
-
- memcpy(buffer, index, length);
-
- spin_unlock(&simp->lock);
-
- if(todo) {
- do {
- todo(*buffer++);
- } while(buffer < to);
- }
- return to;
-}
-
-void * simp_alloc(struct simp * simp)
-{
-#ifdef __SMP__
- const long cpu = smp_processor_id();
- struct per_processor * priv = &simp->private[cpu];
-#else
-#define priv (&simp->private[0]) /*fool gcc to use no extra register*/
-#endif
- void ** buffer_pos = priv->buffer_pos;
- void * res;
-
- if(buffer_pos == priv->postbuffer) {
- buffer_pos = bunch_alloc(simp, buffer_pos);
- }
- buffer_pos--;
- res = *buffer_pos;
- priv->buffer_pos = buffer_pos;
- return res;
-}
-
-#ifdef DEBUG
-long check_header(struct header * hdr, void * ptr)
-{
- void ** test;
-
- if(!hdr) {
- printk("SIMP: simp_free() with NULL pointer\n");
- return 1;
- }
- if(strncmp(hdr->magic, global_magic, 32)) {
- printk("SIMP: simpe_free() with bad ptr %p, or header corruption\n", ptr);
- return 1;
- }
- /* This is brute force, but I don't want to pay for any
- * overhead if debugging is not enabled, in particular
- * no space overhead for keeping hashtables etc. */
- test = hdr->index;
- while(test < CHUNK_END(hdr)) {
- if(*test++ == ptr) {
- printk("SIMP: trying to simp_free(%p) again\n", ptr);
- return 1;
- }
- }
- return 0;
-}
-#endif
-
-static void ** bunch_free(struct simp * simp, void ** buffer)
-{
- void ** stop;
-
- stop = buffer - POSTBUFFER_SIZE/3;
-
- spin_lock(&simp->lock);
- while(buffer > stop) {
- void * elem = buffer[-1];
- struct header * hdr = CHUNK_BASE(elem);
- void ** index = hdr->index;
- index--;
- hdr->index = index;
- *index = elem;
- if(!hdr->next) {
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
- }
-
- buffer -= 2;
- elem = *buffer;
- hdr = CHUNK_BASE(elem);
- index = hdr->index;
- index--;
- hdr->index = index;
- *index = elem;
- if(!hdr->next) {
- hdr->next = simp->usable_list;
- simp->usable_list = hdr;
- }
- }
- spin_unlock(&simp->lock);
- global->changed_flag = 1;
- return buffer;
-}
-
-void simp_free(void * objp)
-{
- struct header * hdr;
- void ** buffer_pos;
- struct per_processor * private;
-#ifdef __SMP__
- const long cpu = smp_processor_id();
-#else
- const long cpu = 0;
-#endif
-
- hdr = CHUNK_BASE(objp);
-#ifdef DEBUG
- if(check_header(hdr, objp))
- return;
-#endif
-
- private = &hdr->father->private[cpu];
- buffer_pos = private->buffer_pos;
- if(buffer_pos >= private->postbuffer+POSTBUFFER_SIZE) {
- buffer_pos = bunch_free(hdr->father, buffer_pos);
- }
-
- *buffer_pos++ = objp;
- private->buffer_pos = buffer_pos;
-
-#ifdef DEAD_BEEF
- {
- unsigned int * ptr = (unsigned int*)objp;
- int count = (hdr->father->real_size - ELEM_SIZE) / sizeof(unsigned int);
- while(count--)
- *ptr++ = 0xdeadbeef;
- }
-#endif
-}
-
-long simp_garbage(void)
-{
- int i;
- int res;
-
- if(!global->changed_flag)
- return 0; /* shortcut */
- /* Note: costs do not matter here. Any heavy thrashing of
- * simp chunks that could be caused by pools stealing each
- * other's memory has to be considered a BUG :-)
- * Simply avoid memory shortages by conservative allocating
- * policies.
- */
- global->changed_flag = 0;
- res = 0;
- for(i = 0; i < global->nr_simps; i++) {
- struct simp * simp = &global->simps[i];
- struct header ** base = &simp->usable_list;
- struct header * del;
-
- spin_lock(&simp->lock);
- del = *base;
- while(del) {
- if(del->index == del->emptypos) {
- if(simp->dtor) {
- void ** ptr = del->index;
- while(ptr < CHUNK_END(del)) {
- simp->dtor(*ptr++);
- }
- }
- *base = del->next;
-#ifdef DEBUG
- memset(del, 0, CHUNK_SIZE);
-#endif
- free_pages((unsigned long)del, ORDER);
- res++;
- } else
- base = &del->next;
- del = *base;
- }
- spin_unlock(&simp->lock);
- }
- return res;
-}
-
diff --git a/mm/slab.c b/mm/slab.c
index a2ed8c1c5..dc9dc05d2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -70,7 +70,7 @@
*
* Calls to printk() are not 100% safe (the function is not threaded). However,
* printk() is only used under an error condition, and the risk is v. small (not
- * sure if the console write functions 'enjoy' executing multiple contextes in
+ * sure if the console write functions 'enjoy' executing multiple contexts in
* parallel. I guess they don't...).
* Note, for most calls to printk() any held cache-lock is dropped. This is not
* always done for text size reasons - having *_unlock() everywhere is bloat.
@@ -92,11 +92,11 @@
* index to hold the bufctls. This allows the bufctl structure to
* be small (one word), but limits the number of objects a slab (not
* a cache) can contain when off-slab bufctls are used. The limit is the
- * size of the largest general-cache that does not use off-slab bufctls,
+ * size of the largest general cache that does not use off-slab bufctls,
* divided by the size of a bufctl. For 32bit archs, is this 256/4 = 64.
* This is not serious, as it is only for large objects, when it is unwise
* to have too many per slab.
- * Note: This limit can be raised by introducing a general-cache whose size
+ * Note: This limit can be raised by introducing a general cache whose size
* is less than 512 (PAGE_SIZE<<3), but greater than 256.
*/
@@ -109,7 +109,6 @@
#include <asm/system.h>
#include <asm/atomic.h>
-#include <asm/smp_lock.h>
#include <asm/spinlock.h>
#ifdef __mips__
#include <asm/pgtable.h>
@@ -128,12 +127,12 @@
*
* SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
* SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
- * 0 for faster, smaller, code (espically in the critical paths).
+ * 0 for faster, smaller, code (especially in the critical paths).
*
* SLAB_STATS - 1 to collect stats for /proc/slabinfo.
- * 0 for faster, smaller, code (espically in the critical paths).
+ * 0 for faster, smaller, code (especially in the critical paths).
*
- * SLAB_SELFTEST - 1 to perform a few tests, mainly for developement.
+ * SLAB_SELFTEST - 1 to perform a few tests, mainly for development.
*/
#define SLAB_MGMT_CHECKS 1
#define SLAB_DEBUG_SUPPORT 0
@@ -184,7 +183,7 @@ typedef struct kmem_slab_s {
s_dma:1;
} kmem_slab_t;
-/* When the slab mgmt is on-slab, this gives the size to use. */
+/* When the slab management is on-slab, this gives the size to use. */
#define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
/* Test for end of slab chain. */
@@ -192,7 +191,7 @@ typedef struct kmem_slab_s {
/* s_magic */
#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */
-#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destoryed */
+#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */
/* Bufctl's are used for linking objs within a slab, identifying what slab an obj
* is in, and the address of the associated obj (for sanity checking with off-slab
@@ -264,9 +263,9 @@ struct kmem_cache_s {
};
/* internal c_flags */
-#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab mgmt in own cache */
+#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
#define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */
-#define SLAB_CFLGS_GENERAL 0x080000UL /* a general-cache */
+#define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */
/* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
#define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */
@@ -311,13 +310,15 @@ static void kmem_self_test(void);
/* maximum num of pages for a slab (prevents large requests to the VM layer) */
#define SLAB_MAX_GFP_ORDER 5 /* 32 pages */
-/* the 'prefered' minimum num of objs per slab - maybe less for large objs */
+/* the 'preferred' minimum num of objs per slab - maybe less for large objs */
#define SLAB_MIN_OBJS_PER_SLAB 4
/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
* then the page order must be less than this before trying the next order.
*/
-#define SLAB_BREAK_GFP_ORDER 2
+#define SLAB_BREAK_GFP_ORDER_HI 2
+#define SLAB_BREAK_GFP_ORDER_LO 1
+static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
/* Macros for storing/retrieving the cachep and or slab from the
* global 'mem_map'. With off-slab bufctls, these are used to find the
@@ -329,7 +330,7 @@ static void kmem_self_test(void);
#define SLAB_SET_PAGE_SLAB(pg, x) ((pg)->prev = (struct page *)(x))
#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->prev)
-/* Size description struct for general-caches. */
+/* Size description struct for general caches. */
typedef struct cache_sizes {
size_t cs_size;
kmem_cache_t *cs_cachep;
@@ -354,7 +355,7 @@ static cache_sizes_t cache_sizes[] = {
{0, NULL}
};
-/* Names for the general-caches. Not placed into the sizes struct for
+/* Names for the general caches. Not placed into the sizes struct for
* a good reason; the string ptr is not needed while searching in kmalloc(),
* and would 'get-in-the-way' in the h/w cache.
*/
@@ -400,7 +401,7 @@ static struct semaphore cache_chain_sem;
/* Place maintainer for reaping. */
static kmem_cache_t *clock_searchp = &cache_cache;
-/* Internal slab mgmt cache, for when slab mgmt is off-slab. */
+/* Internal slab management cache, for when slab management is off-slab. */
static kmem_cache_t *cache_slabp = NULL;
/* Max number of objs-per-slab for caches which use bufctl's.
@@ -451,6 +452,12 @@ __initfunc(long kmem_cache_init(long start, long end))
cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
cache_cache.c_colour_next = cache_cache.c_colour;
+ /*
+ * Fragmentation resistance on low memory - only use bigger
+ * page orders on machines with more than 32MB of memory.
+ */
+ if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+ slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
return start;
}
@@ -467,9 +474,9 @@ __initfunc(void kmem_cache_sizes_init(void))
char **names = cache_sizes_name;
cache_sizes_t *sizes = cache_sizes;
do {
- /* For performance, all the general-caches are L1 aligned.
+ /* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
- * elimantes "false sharing".
+ * eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches. */
if (!(sizes->cs_cachep =
@@ -566,7 +573,7 @@ kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
}
#endif /* SLAB_DEBUG_SUPPORT */
-/* Three slab chain funcs - all called with ints disabled and the appropiate
+/* Three slab chain funcs - all called with ints disabled and the appropriate
* cache-lock held.
*/
static inline void
@@ -608,7 +615,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
{
if (cachep->c_dtor
#if SLAB_DEBUG_SUPPORT
- || cachep->c_flags & (SLAB_POISON || SLAB_RED_ZONE)
+ || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
#endif /*SLAB_DEBUG_SUPPORT*/
) {
/* Doesn't use the bufctl ptrs to find objs. */
@@ -634,7 +641,7 @@ kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
#if SLAB_DEBUG_SUPPORT
else if (cachep->c_flags & SLAB_POISON) {
if (kmem_check_poison_obj(cachep, objp))
- printk(KERN_ERR "kmem_slab_destory: "
+ printk(KERN_ERR "kmem_slab_destroy: "
"Bad poison - %s\n", cachep->c_name);
}
if (cachep->c_flags & SLAB_RED_ZONE)
@@ -718,7 +725,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
}
if (offset < 0 || offset > size) {
- printk("%sOffset weired %d - %s\n", func_nm, (int) offset, name);
+ printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
offset = 0;
}
@@ -785,11 +792,11 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
if (flags & SLAB_HWCACHE_ALIGN)
align = L1_CACHE_BYTES;
- /* Determine if the slab mgmt and/or bufclts are 'on' or 'off' slab. */
+ /* Determine if the slab management and/or bufclts are 'on' or 'off' slab. */
extra = sizeof(kmem_bufctl_t);
if (size < (PAGE_SIZE>>3)) {
/* Size is small(ish). Use packing where bufctl size per
- * obj is low, and slab mngmnt is on-slab.
+ * obj is low, and slab management is on-slab.
*/
#if 0
if ((flags & SLAB_HIGH_PACK)) {
@@ -806,7 +813,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
}
#endif
} else {
- /* Size is large, assume best to place the slab mngmnt obj
+ /* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= SLAB_CFLGS_OFF_SLAB;
@@ -815,7 +822,7 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
/* To avoid waste the bufctls are off-slab... */
flags |= SLAB_CFLGS_BUFCTL;
extra = 0;
- } /* else slab mngmnt is off-slab, but freelist ptrs are on. */
+ } /* else slab management is off-slab, but freelist pointers are on. */
}
size += extra;
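The rule of thumb in the hunk above: objects smaller than an eighth of a page keep their management data (and a per-object bufctl) inside the slab, while larger objects push the management structure off-slab so the pages hold nothing but objects. A compact sketch of that threshold decision; the flag value is invented, only the PAGE_SIZE>>3 test mirrors the patch.

#include <stdio.h>

#define PAGE_SIZE           4096UL
#define SLAB_CFLGS_OFF_SLAB 0x010UL   /* invented flag value for the example */

/* Decide where the slab management data lives for a given object size. */
static unsigned long place_slab_mgmt(unsigned long obj_size)
{
	unsigned long flags = 0;

	if (obj_size < (PAGE_SIZE >> 3)) {
		/* Small(ish) objects: keep management on-slab; the per-object
		 * bufctl overhead is low relative to the object size. */
	} else {
		/* Large objects: management goes off-slab so the slab's pages
		 * can be packed with objects only. */
		flags |= SLAB_CFLGS_OFF_SLAB;
	}
	return flags;
}

int main(void)
{
	printf("128-byte objects:  %s\n",
	       place_slab_mgmt(128) & SLAB_CFLGS_OFF_SLAB ? "off-slab" : "on-slab");
	printf("2048-byte objects: %s\n",
	       place_slab_mgmt(2048) & SLAB_CFLGS_OFF_SLAB ? "off-slab" : "on-slab");
	return 0;
}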
@@ -873,7 +880,7 @@ cal_wastage:
* bad for the gfp()s.
*/
if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
- if (cachep->c_gfporder < SLAB_BREAK_GFP_ORDER)
+ if (cachep->c_gfporder < slab_break_gfp_order)
goto next;
}
@@ -1022,8 +1029,8 @@ kmem_cache_shrink(kmem_cache_t *cachep)
printk(KERN_ERR "kmem_shrink: Invalid cache addr %p\n", cachep);
return 2;
found:
- /* Relase the sempahore before getting the cache-lock. This could
- * mean multiple engines are shrinking the cache, but so what...
+ /* Release the semaphore before getting the cache-lock. This could
+ * mean multiple engines are shrinking the cache, but so what.
*/
up(&cache_chain_sem);
spin_lock_irq(&cachep->c_spinlock);
@@ -1045,17 +1052,17 @@ found:
return ret;
}
-/* Get the mem for a slab mgmt obj. */
+/* Get the memory for a slab management obj. */
static inline kmem_slab_t *
kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
{
kmem_slab_t *slabp;
if (SLAB_OFF_SLAB(cachep->c_flags)) {
- /* Slab mgmt obj is off-slab. */
+ /* Slab management obj is off-slab. */
slabp = kmem_cache_alloc(cache_slabp, local_flags);
} else {
- /* Slab mgmnt at end of slab mem, placed so that
+ /* Slab management at end of slab memory, placed so that
* the position is 'coloured'.
*/
void *end;
@@ -1203,7 +1210,7 @@ re_try:
if (!(objp = kmem_getpages(cachep, flags, &dma)))
goto failed;
- /* Get slab mgmt. */
+ /* Get slab management. */
if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
goto opps1;
if (dma)
@@ -1257,7 +1264,7 @@ failed:
if (local_flags != SLAB_ATOMIC && cachep->c_gfporder) {
/* For large order (>0) slabs, we try again.
* Needed because the gfp() functions are not good at giving
- * out contigious pages unless pushed (but do not push too hard).
+ * out contiguous pages unless pushed (but do not push too hard).
*/
if (cachep->c_failures++ < 4 && cachep->c_freep == kmem_slab_end(cachep))
goto re_try;
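The retry in the hunk above exists because a higher-order (contiguous, multi-page) allocation can fail transiently, so the cache tries again a bounded number of times instead of hammering the page allocator. A toy version of that bounded-retry loop; try_alloc_contiguous() is a made-up stand-in that fails a couple of times before succeeding.

#include <stdio.h>

static int fail_first;   /* how many times the fake allocator still fails */

/* Made-up stand-in for a contiguous multi-page allocation. */
static void *try_alloc_contiguous(void)
{
	static char buffer[8192];

	if (fail_first > 0) {
		fail_first--;
		return NULL;            /* pretend no contiguous run was found */
	}
	return buffer;
}

int main(void)
{
	int failures = 0;
	void *objp;

	fail_first = 2;
	for (;;) {
		objp = try_alloc_contiguous();
		if (objp)
			break;
		/* Retry a handful of times, but do not push too hard. */
		if (failures++ >= 4) {
			printf("giving up after %d failures\n", failures);
			return 1;
		}
		printf("retrying (failure %d)\n", failures);
	}
	printf("got contiguous memory after %d failures\n", failures);
	return 0;
}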
@@ -1648,19 +1655,19 @@ kfree(const void *objp)
goto bad_ptr;
/* Assume we own the page structure - hence no locking.
- * If someone is misbehaving (eg. someone calling us with a bad
+ * If someone is misbehaving (for example, calling us with a bad
* address), then access to the page structure can race with the
- * kmem_slab_destory() code. Need to add a spin_lock to each page
+ * kmem_slab_destroy() code. Need to add a spin_lock to each page
* structure, which would be useful in threading the gfp() functions....
*/
page = &mem_map[nr];
if (PageSlab(page)) {
kmem_cache_t *cachep;
- /* Here, we (again) assume the obj address is good.
+ /* Here, we again assume the obj address is good.
* If it isn't, and happens to map onto another
- * general-cache page which has no active objs, then
- * we race....
+ * general cache page which has no active objs, then
+ * we race.
*/
cachep = SLAB_GET_PAGE_CACHE(page);
if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
@@ -1714,9 +1721,9 @@ kmem_find_general_cachep(size_t size)
{
cache_sizes_t *csizep = cache_sizes;
- /* This function could be moved to the header-file, and
+ /* This function could be moved to the header file, and
* made inline so consumers can quickly determine what
- * cache-ptr they require.
+ * cache pointer they require.
*/
for (; csizep->cs_size; csizep++) {
if (size > csizep->cs_size)
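kmem_find_general_cachep() is a linear scan of the ascending cache_sizes[] table: skip every class smaller than the request and return the first one big enough. A user-space sketch of the same lookup; the size classes and names here are chosen for illustration and stand in for the real cache pointers.

#include <stdio.h>
#include <stddef.h>

struct size_class {
	size_t      cs_size;
	const char *cs_name;        /* stands in for the cache pointer */
};

/* Ascending table terminated by a zero size - same shape as cache_sizes[]. */
static struct size_class classes[] = {
	{   32, "size-32"   },
	{   64, "size-64"   },
	{  128, "size-128"  },
	{  256, "size-256"  },
	{  512, "size-512"  },
	{ 1024, "size-1024" },
	{    0, NULL        },
};

/* Return the first class whose size is >= the request, or NULL. */
static const char *find_general_cache(size_t size)
{
	struct size_class *csizep;

	for (csizep = classes; csizep->cs_size; csizep++) {
		if (size > csizep->cs_size)
			continue;
		return csizep->cs_name;
	}
	return NULL;                /* too big for any general cache */
}

int main(void)
{
	printf("100 bytes -> %s\n", find_general_cache(100));
	printf("600 bytes -> %s\n", find_general_cache(600));
	return 0;
}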
@@ -1745,7 +1752,7 @@ kmem_cache_reap(int gfp_mask)
return;
}
- /* We really need a test semphore op so we can avoid sleeping when
+ /* We really need a test semaphore op so we can avoid sleeping when
* !wait is true.
*/
down(&cache_chain_sem);
@@ -1778,8 +1785,8 @@ kmem_cache_reap(int gfp_mask)
dma_flag = 0;
full_free = 0;
- /* Count num of fully free slabs. Hopefully there are not many,
- * we are holding the cache lock....
+ /* Count the fully free slabs. There should not be many,
+ * since we are holding the cache lock.
*/
slabp = searchp->c_lastp;
while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
@@ -1819,7 +1826,7 @@ next:
up(&cache_chain_sem);
if (!best_cachep) {
- /* couldn't find anthying to reap */
+ /* couldn't find anything to reap */
return;
}
diff --git a/mm/swap.c b/mm/swap.c
index c760208da..3cedb215c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -6,7 +6,7 @@
/*
* This file contains the default values for the operation of the
- * Linux VM subsystem. Finetuning documentation can be found in
+ * Linux VM subsystem. Fine-tuning documentation can be found in
* linux/Documentation/sysctl/vm.txt.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
@@ -67,9 +67,9 @@ swap_control_t swap_control = {
swapstat_t swapstats = {0};
buffer_mem_t buffer_mem = {
- 3, /* minimum percent buffer */
- 10, /* borrow percent buffer */
- 30 /* maximum percent buffer */
+ 5, /* minimum percent buffer */
+ 25, /* borrow percent buffer */
+ 50 /* maximum percent buffer */
};
buffer_mem_t page_cache = {
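The three fields are percentages of physical memory that balance the buffer cache against other memory users: roughly, buffers below min_percent are left alone, growth past borrow_percent triggers extra shrinking, and max_percent acts as a ceiling. A stand-alone sketch of how such a threshold test is written without division; the page counts and total memory are made-up values, not kernel state.

#include <stdio.h>

struct buffer_mem_t {
	int min_percent;
	int borrow_percent;
	int max_percent;
};

int main(void)
{
	struct buffer_mem_t buffer_mem = { 5, 25, 50 };
	unsigned long num_physpages = 16384;   /* pretend 64MB of 4KB pages */
	unsigned long buffer_pages  = 5000;    /* pretend buffer-cache size in pages */

	/* Same shape as the kernel tests: compare a page count scaled by 100
	 * against a percentage of total memory, avoiding any division. */
	if (buffer_pages * 100 > (unsigned long) buffer_mem.borrow_percent * num_physpages)
		printf("over borrow_percent: shrink the buffer cache harder\n");
	else if (buffer_pages * 100 < (unsigned long) buffer_mem.min_percent * num_physpages)
		printf("under min_percent: leave buffer pages alone\n");
	else
		printf("between min and borrow: normal aging\n");
	return 0;
}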
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b91583340..401c7a1fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -146,42 +146,23 @@ void remove_from_swap_cache(struct page *page)
"on page %08lx\n", page_address(page));
}
/*
- * This will be a legal case once we have a more mature swap cache.
+ * This is a legal case, but warn about it.
*/
if (atomic_read(&page->count) == 1) {
- printk ("VM: Removing page cache on unshared page %08lx\n",
+ printk (KERN_WARNING
+ "VM: Removing page cache on unshared page %08lx\n",
page_address(page));
- return;
}
-
#ifdef DEBUG_SWAP
printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n",
page_address(page), atomic_read(&page->count));
#endif
- remove_page_from_hash_queue (page);
- remove_page_from_inode_queue (page);
PageClearSwapCache (page);
- __free_page (page);
+ remove_inode_page(page);
}
-long find_in_swap_cache(struct page *page)
-{
-#ifdef SWAP_CACHE_INFO
- swap_cache_find_total++;
-#endif
- if (PageSwapCache (page)) {
- long entry = page->offset;
-#ifdef SWAP_CACHE_INFO
- swap_cache_find_success++;
-#endif
- remove_from_swap_cache (page);
- return entry;
- }
- return 0;
-}
-
int delete_from_swap_cache(struct page *page)
{
#ifdef SWAP_CACHE_INFO
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d935433bb..45f73de02 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -28,10 +28,7 @@
unsigned int nr_swapfiles = 0;
-static struct {
- int head; /* head of priority-ordered swapfile list */
- int next; /* swapfile to be used next */
-} swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct swap_info[MAX_SWAPFILES];
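As the removed comments note, swap_list keeps two indices into swap_info[]: head is the highest-priority swap area and next is the area to allocate from next, with each area chaining to the next lower-priority one through an index field and -1 ending the list. An illustrative user-space walk of such a structure; the swap_info_struct here is a reduced, hypothetical version, not the kernel's full definition.

#include <stdio.h>

#define MAX_SWAPFILES 8

struct swap_list_t {
	int head;   /* index of the highest-priority swap area */
	int next;   /* index of the area to allocate from next */
};

struct swap_info_struct {
	int prio;   /* higher value = preferred */
	int next;   /* index of the next area in priority order, -1 ends the list */
	int pages;  /* free pages left (toy value) */
};

static struct swap_list_t swap_list = { -1, -1 };
static struct swap_info_struct swap_info[MAX_SWAPFILES];

int main(void)
{
	int i;

	/* Two swap areas: slot 0 at priority 1, slot 1 at priority 5. */
	swap_info[0].prio = 1;  swap_info[0].next = -1;  swap_info[0].pages = 100;
	swap_info[1].prio = 5;  swap_info[1].next = 0;   swap_info[1].pages = 200;
	swap_list.head = swap_list.next = 1;    /* highest priority goes first */

	/* Walk the areas in priority order, the way the allocator would. */
	for (i = swap_list.head; i >= 0; i = swap_info[i].next)
		printf("swap area %d: prio %d, %d pages free\n",
		       i, swap_info[i].prio, swap_info[i].pages);
	return 0;
}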
@@ -180,7 +177,7 @@ bad_free:
* that the page has been used or is no longer needed.
*
* Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many ptes will
+ * after one process has exited). We don't know just how many PTEs will
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
@@ -535,6 +532,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
error = blkdev_open(swap_dentry->d_inode, &filp);
if (error)
goto bad_swap_2;
+ set_blocksize(p->swap_device, PAGE_SIZE);
error = -ENODEV;
if (!p->swap_device ||
(blk_size[MAJOR(p->swap_device)] &&
@@ -595,7 +593,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
p->flags = SWP_WRITEOK;
p->pages = j;
nr_swap_pages += j;
- printk("Adding Swap: %dk swap-space (priority %d)\n",
+ printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
j<<(PAGE_SHIFT-10), p->prio);
/* insert swap space into swap_list: */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b87beaa2..e7711c23c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -135,12 +135,16 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot)
dir = pgd_offset_k(address);
flush_cache_all();
while (address < end) {
- pmd_t *pmd = pmd_alloc_kernel(dir, address);
+ pmd_t *pmd;
+ pgd_t olddir = *dir;
+
+ pmd = pmd_alloc_kernel(dir, address);
if (!pmd)
return -ENOMEM;
if (alloc_area_pmd(pmd, address, end - address, prot))
return -ENOMEM;
- set_pgdir(address, *dir);
+ if (pgd_val(olddir) != pgd_val(*dir))
+ set_pgdir(address, *dir);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
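The change above records the pgd entry before pmd_alloc_kernel() and calls set_pgdir() only when the entry actually changed, i.e. when a new pmd page was installed. A tiny sketch of that "save, maybe update, propagate only on change" pattern with plain integers; update_all_copies() and maybe_allocate() are hypothetical stand-ins for set_pgdir() and pmd_alloc_kernel().

#include <stdio.h>

static unsigned long dir_entry;   /* stands in for a pgd entry */

/* Hypothetical stand-in for set_pgdir(): would copy the entry
 * into every other page directory in the system. */
static void update_all_copies(unsigned long val)
{
	printf("propagating new entry %#lx\n", val);
}

/* Stands in for pmd_alloc_kernel(): fills the entry only if it is empty. */
static void maybe_allocate(unsigned long *entry)
{
	if (*entry == 0)
		*entry = 0x1000;      /* pretend a new pmd page was installed */
}

int main(void)
{
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long old = dir_entry;    /* like: pgd_t olddir = *dir; */

		maybe_allocate(&dir_entry);
		if (old != dir_entry)             /* only propagate real changes */
			update_all_copies(dir_entry);
		else
			printf("entry unchanged, skipping propagation\n");
	}
	return 0;
}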
@@ -150,21 +154,22 @@ int vmalloc_area_pages(unsigned long address, unsigned long size, pgprot_t prot)
struct vm_struct * get_vm_area(unsigned long size)
{
- void *addr;
+ unsigned long addr;
struct vm_struct **p, *tmp, *area;
area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
if (!area)
return NULL;
- addr = (void *) VMALLOC_START;
- area->size = size + PAGE_SIZE;
- area->next = NULL;
+ addr = VMALLOC_START;
for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
- if (size + (unsigned long) addr < (unsigned long) tmp->addr)
+ if (size + addr < (unsigned long) tmp->addr)
break;
- addr = (void *) (tmp->size + (unsigned long) tmp->addr);
+ addr = tmp->size + (unsigned long) tmp->addr;
+ if (addr > VMALLOC_END-size)
+ return NULL;
}
- area->addr = addr;
+ area->addr = (void *)addr;
+ area->size = size + PAGE_SIZE;
area->next = *p;
*p = area;
return area;
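get_vm_area() now does a first-fit scan of the sorted vmlist starting at VMALLOC_START, stopping at the first hole large enough and failing if the candidate address would run past VMALLOC_END; the stored size also includes one guard page. The following stand-alone sketch applies the same first-fit scan to a small list of fake regions; the bounds, the region struct, and the sizes are invented for illustration.

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define VMALLOC_START 0x10000000UL   /* invented bounds for the example */
#define VMALLOC_END   0x10020000UL

struct region {
	unsigned long addr;
	unsigned long size;          /* includes the guard page */
	struct region *next;
};

/* First fit: walk the sorted list and return the first gap that holds
 * 'size' bytes, or 0 if the search would run past VMALLOC_END. */
static unsigned long find_gap(struct region *list, unsigned long size)
{
	unsigned long addr = VMALLOC_START;
	struct region *tmp;

	for (tmp = list; tmp; tmp = tmp->next) {
		if (addr + size < tmp->addr)
			break;                        /* hole before this region */
		addr = tmp->addr + tmp->size;         /* skip past it */
		if (addr > VMALLOC_END - size)
			return 0;                     /* out of address space */
	}
	return addr;
}

int main(void)
{
	struct region b = { VMALLOC_START + 0x8000, 0x3000, NULL };
	struct region a = { VMALLOC_START,          0x2000, &b };
	unsigned long got = find_gap(&a, 0x1000 + PAGE_SIZE);

	if (got)
		printf("allocating at %#lx\n", got);
	else
		printf("no room left\n");
	return 0;
}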
@@ -217,16 +222,18 @@ void * vmalloc(unsigned long size)
long vread(char *buf, char *addr, unsigned long count)
{
- struct vm_struct **p, *tmp;
+ struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
- int n;
+ unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
+ if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ continue;
while (addr < vaddr) {
if (count == 0)
goto finished;
@@ -235,17 +242,15 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = tmp->size - PAGE_SIZE;
- if (addr > vaddr)
- n -= addr - vaddr;
- while (--n >= 0) {
+ n = vaddr + tmp->size - PAGE_SIZE - addr;
+ do {
if (count == 0)
goto finished;
put_user(*addr, buf);
buf++;
addr++;
count--;
- }
+ } while (--n > 0);
}
finished:
return buf - buf_start;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 919b97244..b586bce72 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,7 +42,7 @@ int swapout_interval = HZ / 4;
/*
* The wait queue for waking up the pageout daemon:
*/
-static struct wait_queue * kswapd_wait = NULL;
+struct wait_queue * kswapd_wait = NULL;
static void init_swap_timer(void);
@@ -88,7 +88,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* pages, then delete the swap cache. We can only do this if
* the swap page's reference count is one: ie. there are no
* other references to it beyond the swap cache (as there must
- * still be pte's pointing to it if count > 1).
+ * still be PTEs pointing to it if count > 1).
*
* If the page has NOT been touched, and its age reaches zero,
* then we are swapping it out:
@@ -107,7 +107,17 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
if (PageSwapCache(page_map)) {
if (pte_write(pte)) {
+ struct page *found;
printk ("VM: Found a writable swap-cached page!\n");
+ /* Try to diagnose the problem ... */
+ found = find_page(&swapper_inode, page_map->offset);
+ if (found) {
+ printk("page=%p@%08lx, found=%p, count=%d\n",
+ page_map, page_map->offset,
+ found, atomic_read(&found->count));
+ __free_page(found);
+ } else
+ printk ("Spurious, page not in cache\n");
return 0;
}
}
@@ -144,9 +154,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
* we have the swap cache set up to associate the
* page with that swap entry.
*/
- if (PageSwapCache(page_map)) {
- entry = page_map->offset;
- } else {
+ entry = in_swap_cache(page_map);
+ if (!entry) {
entry = get_swap_page();
if (!entry)
return 0; /* No swap space left */
@@ -219,8 +228,8 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
flush_cache_page(vma, address);
pte_clear(page_table);
flush_tlb_page(vma, address);
- entry = page_unuse(page);
- free_page(page);
+ entry = page_unuse(page_map);
+ __free_page(page_map);
return entry;
}
@@ -435,7 +444,7 @@ out:
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
-static inline int do_try_to_free_page(int gfp_mask)
+static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
int i=6;
@@ -448,9 +457,10 @@ static inline int do_try_to_free_page(int gfp_mask)
stop = 3;
if (gfp_mask & __GFP_WAIT)
stop = 0;
+
if (((buffermem >> PAGE_SHIFT) * 100 > buffer_mem.borrow_percent * num_physpages)
|| (page_cache_size * 100 > page_cache.borrow_percent * num_physpages))
- state = 0;
+ shrink_mmap(i, gfp_mask);
switch (state) {
do {
@@ -459,7 +469,7 @@ static inline int do_try_to_free_page(int gfp_mask)
return 1;
state = 1;
case 1:
- if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
+ if (shm_swap(i, gfp_mask))
return 1;
state = 2;
case 2:
@@ -476,23 +486,6 @@ static inline int do_try_to_free_page(int gfp_mask)
}
/*
- * This is REALLY ugly.
- *
- * We need to make the locks finer granularity, but right
- * now we need this so that we can do page allocations
- * without holding the kernel lock etc.
- */
-int try_to_free_page(int gfp_mask)
-{
- int retval;
-
- lock_kernel();
- retval = do_try_to_free_page(gfp_mask);
- unlock_kernel();
- return retval;
-}
-
-/*
* Before we start the kernel thread, print out the
* kswapd initialization message (otherwise the init message
* may be printed in the middle of another driver's init
@@ -532,7 +525,7 @@ int kswapd(void *unused)
/* Give kswapd a realtime priority. */
current->policy = SCHED_FIFO;
- current->priority = 32; /* Fixme --- we need to standardise our
+ current->rt_priority = 32; /* Fixme --- we need to standardise our
namings for POSIX.4 realtime scheduling
priorities. */
@@ -540,7 +533,6 @@ int kswapd(void *unused)
add_wait_queue(&kswapd_wait, &wait);
while (1) {
int tries;
- int tried = 0;
current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
@@ -564,29 +556,56 @@ int kswapd(void *unused)
* woken up more often and the rate will be even
* higher).
*/
- tries = pager_daemon.tries_base >> free_memory_available(3);
-
- while (tries--) {
- int gfp_mask;
+ tries = pager_daemon.tries_base;
+ tries >>= 4*free_memory_available();
- if (++tried > pager_daemon.tries_min && free_memory_available(0))
- break;
- gfp_mask = __GFP_IO;
- try_to_free_page(gfp_mask);
+ do {
+ do_try_to_free_page(0);
/*
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
*/
if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
run_task_queue(&tq_disk);
-
- }
+ if (free_memory_available() > 1)
+ break;
+ } while (--tries > 0);
}
/* As if we could ever get here - maybe we want to make this killable */
remove_wait_queue(&kswapd_wait, &wait);
+ unlock_kernel();
return 0;
}
+/*
+ * We need to make the locks finer granularity, but right
+ * now we need this so that we can do page allocations
+ * without holding the kernel lock etc.
+ *
+ * The "PF_MEMALLOC" flag protects us against recursion:
+ * if we need more memory as part of a swap-out effort we
+ * will just silently return "success" to tell the page
+ * allocator to accept the allocation.
+ */
+int try_to_free_pages(unsigned int gfp_mask, int count)
+{
+ int retval = 1;
+
+ lock_kernel();
+ if (!(current->flags & PF_MEMALLOC)) {
+ current->flags |= PF_MEMALLOC;
+ do {
+ retval = do_try_to_free_page(gfp_mask);
+ if (!retval)
+ break;
+ count--;
+ } while (count > 0);
+ current->flags &= ~PF_MEMALLOC;
+ }
+ unlock_kernel();
+ return retval;
+}
+
/*
* The swap_tick function gets called on every clock tick.
*/
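The PF_MEMALLOC comment above describes a recursion guard for try_to_free_pages(): if the flag is already set, the caller is in the middle of its own swap-out work, so the function reports success without recursing; otherwise it sets the flag, frees up to count pages, and clears the flag again. A user-space sketch of that guard; the flag value and free_one_page() are stand-ins, not kernel objects.

#include <stdio.h>

#define PF_MEMALLOC 0x00000800   /* stand-in flag value for the example */

static unsigned long flags;      /* stands in for current->flags */

/* Pretend to free one page; in the kernel this work might itself
 * need memory and call back into the same function. */
static int free_one_page(void)
{
	return 1;
}

static int try_to_free_pages(int count)
{
	int retval = 1;

	if (!(flags & PF_MEMALLOC)) {
		flags |= PF_MEMALLOC;        /* mark: we are the reclaimer */
		do {
			retval = free_one_page();
			if (!retval)
				break;
			count--;
		} while (count > 0);
		flags &= ~PF_MEMALLOC;       /* done, allow reclaim again */
	}
	/* If the flag was already set we fall through and return success,
	 * so a recursive caller is not blocked by its own reclaim. */
	return retval;
}

int main(void)
{
	printf("first call:  %d\n", try_to_free_pages(4));
	flags |= PF_MEMALLOC;                /* simulate a recursive call */
	printf("nested call: %d\n", try_to_free_pages(4));
	return 0;
}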
@@ -606,11 +625,11 @@ void swap_tick(void)
* Schedule for wakeup if there isn't lots
* of free memory.
*/
- switch (free_memory_available(3)) {
+ switch (free_memory_available()) {
case 0:
want = now;
/* Fall through */
- case 1 ... 3:
+ case 1:
want_wakeup = 1;
default:
}