Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile         2
-rw-r--r--  mm/bootmem.c       69
-rw-r--r--  mm/filemap.c      237
-rw-r--r--  mm/highmem.c        7
-rw-r--r--  mm/memory.c        36
-rw-r--r--  mm/mlock.c         24
-rw-r--r--  mm/mmap.c          28
-rw-r--r--  mm/mprotect.c      31
-rw-r--r--  mm/mremap.c        22
-rw-r--r--  mm/numa.c          50
-rw-r--r--  mm/oom_kill.c     210
-rw-r--r--  mm/page_alloc.c    91
-rw-r--r--  mm/swap.c           1
-rw-r--r--  mm/swapfile.c       8
-rw-r--r--  mm/vmalloc.c       14
-rw-r--r--  mm/vmscan.c        70
16 files changed, 598 insertions, 302 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 56e93693b..d74cdec48 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@
O_TARGET := mm.o
O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
- page_alloc.o swap_state.o swapfile.o numa.o
+ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o
ifeq ($(CONFIG_HIGHMEM),y)
O_OBJS += highmem.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0a8d37ba2..e9e9ef7bc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -41,11 +41,15 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
/*
* Called once to set up the allocator itself.
*/
-static unsigned long __init init_bootmem_core (bootmem_data_t *bdata,
+static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
+ bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize = ((end - start)+7)/8;
+ pgdat->node_next = pgdat_list;
+ pgdat_list = pgdat;
+
mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
bdata->node_boot_start = (start << PAGE_SHIFT);
@@ -172,10 +176,6 @@ restart_scan:
preferred = 0;
goto restart_scan;
}
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- BUG();
found:
if (start >= eidx)
BUG();
@@ -221,15 +221,15 @@ found:
return ret;
}
-static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata)
+static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
- struct page * page;
+ struct page *page = pgdat->node_mem_map;
+ bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
if (!bdata->node_bootmem_map) BUG();
- page = NODE_MEM_MAP(nid);
count = 0;
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
for (i = 0; i < idx; i++, page++) {
@@ -260,59 +260,78 @@ static unsigned long __init free_all_bootmem_core(int nid, bootmem_data_t *bdata
return total;
}
-unsigned long __init init_bootmem_node (int nid, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
{
- return(init_bootmem_core(NODE_DATA(nid)->bdata, freepfn, startpfn, endpfn));
+ return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
}
-void __init reserve_bootmem_node (int nid, unsigned long physaddr, unsigned long size)
+void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
- reserve_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size);
+ reserve_bootmem_core(pgdat->bdata, physaddr, size);
}
-void __init free_bootmem_node (int nid, unsigned long physaddr, unsigned long size)
+void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
- return(free_bootmem_core(NODE_DATA(nid)->bdata, physaddr, size));
+ return(free_bootmem_core(pgdat->bdata, physaddr, size));
}
-unsigned long __init free_all_bootmem_node (int nid)
+unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
{
- return(free_all_bootmem_core(nid, NODE_DATA(nid)->bdata));
+ return(free_all_bootmem_core(pgdat));
}
unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
{
max_low_pfn = pages;
min_low_pfn = start;
- return(init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages));
+ return(init_bootmem_core(&contig_page_data, start, 0, pages));
}
void __init reserve_bootmem (unsigned long addr, unsigned long size)
{
- reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ reserve_bootmem_core(contig_page_data.bdata, addr, size);
}
void __init free_bootmem (unsigned long addr, unsigned long size)
{
- return(free_bootmem_core(NODE_DATA(0)->bdata, addr, size));
+ return(free_bootmem_core(contig_page_data.bdata, addr, size));
}
unsigned long __init free_all_bootmem (void)
{
- return(free_all_bootmem_core(0, NODE_DATA(0)->bdata));
+ return(free_all_bootmem_core(&contig_page_data));
}
void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
{
+ pg_data_t *pgdat = pgdat_list;
+ void *ptr;
+
+ while (pgdat) {
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, goal)))
+ return(ptr);
+ pgdat = pgdat->node_next;
+ }
/*
- * In the discontigmem case, all non-node specific allocations come
- * from the first node, node 0.
+ * Whoops, we cannot satisfy the allocation request.
*/
- return(__alloc_bootmem_core(NODE_DATA(0)->bdata, size, align, goal));
+ BUG();
+ return NULL;
}
-void * __init __alloc_bootmem_node (int nid, unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
{
- return(__alloc_bootmem_core(NODE_DATA(nid)->bdata, size, align, goal));
+ void *ptr;
+
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+ if (ptr)
+ return (ptr);
+
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ BUG();
+ return NULL;
}
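
The bootmem interface above now takes pg_data_t pointers, and __alloc_bootmem() walks the global pgdat_list so a node-agnostic allocation can fall back to any node that has room, hitting BUG() only after every node has been tried. A minimal user-space sketch of that fallback walk (hypothetical struct node / node_alloc names, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for pg_data_t/bootmem_data_t: each "node" owns a
 * fixed pool, and nodes are chained the way init_bootmem_core() now chains
 * pgdat_list. */
struct node {
	struct node *next;
	size_t avail;               /* bytes still free in this node's pool */
};

static void *node_alloc(struct node *n, size_t size)
{
	if (size > n->avail)
		return NULL;        /* this node cannot satisfy the request */
	n->avail -= size;
	return malloc(size);        /* stand-in for carving out boot memory */
}

/* Mirrors the shape of the new __alloc_bootmem(): try every node in turn,
 * and treat total failure as fatal (the kernel BUG()s here). */
static void *alloc_any_node(struct node *list, size_t size)
{
	for (struct node *n = list; n; n = n->next) {
		void *p = node_alloc(n, size);
		if (p)
			return p;
	}
	fprintf(stderr, "cannot satisfy allocation of %zu bytes\n", size);
	abort();
}

int main(void)
{
	struct node n1 = { NULL, 64 };
	struct node n0 = { &n1, 16 };            /* head of the "pgdat_list" */

	void *p = alloc_any_node(&n0, 32);       /* falls through to the second node */
	printf("allocated 32 bytes from the fallback node: %p\n", p);
	free(p);
	return 0;
}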
diff --git a/mm/filemap.c b/mm/filemap.c
index 6aca16409..b19f4c5b3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,12 @@ void invalidate_inode_pages(struct inode * inode)
if (TryLockPage(page))
continue;
+ /* Neither can we invalidate something in use.. */
+ if (page_count(page) != 1) {
+ UnlockPage(page);
+ continue;
+ }
+
__lru_cache_del(page);
__remove_inode_page(page);
UnlockPage(page);
@@ -156,6 +162,7 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
static inline void truncate_complete_page(struct page *page)
{
+ /* Leave it on the LRU if it gets converted into anonymous buffers */
if (!page->buffers || block_flushpage(page, 0))
lru_cache_del(page);
@@ -167,6 +174,7 @@ static inline void truncate_complete_page(struct page *page)
* all sorts of fun problems ...
*/
ClearPageDirty(page);
+ ClearPageUptodate(page);
remove_inode_page(page);
page_cache_release(page);
}
@@ -495,20 +503,46 @@ void ___wait_on_page(struct page *page)
}
/*
- * Get an exclusive lock on the page..
+ * Get a lock on the page, assuming we need to sleep
+ * to get it..
*/
-void lock_page(struct page *page)
+static void __lock_page(struct page *page)
{
- while (TryLockPage(page))
- ___wait_on_page(page);
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+
+ add_wait_queue_exclusive(&page->wait, &wait);
+ for (;;) {
+ sync_page(page);
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (PageLocked(page)) {
+ run_task_queue(&tq_disk);
+ schedule();
+ continue;
+ }
+ if (!TryLockPage(page))
+ break;
+ }
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(&page->wait, &wait);
}
+
+/*
+ * Get an exclusive lock on the page, optimistically
+ * assuming it's not locked..
+ */
+void lock_page(struct page *page)
+{
+ if (TryLockPage(page))
+ __lock_page(page);
+}
/*
* a rather lightweight function, finding and getting a reference to a
* hashed page atomically, waiting for it if it's locked.
*/
-struct page * __find_get_page (struct address_space *mapping,
+static struct page * __find_get_page(struct address_space *mapping,
unsigned long offset, struct page **hash)
{
struct page *page;
@@ -517,41 +551,11 @@ struct page * __find_get_page (struct address_space *mapping,
* We scan the hash list read-only. Addition to and removal from
* the hash-list needs a held write-lock.
*/
-repeat:
spin_lock(&pagecache_lock);
page = __find_page_nolock(mapping, offset, *hash);
if (page)
page_cache_get(page);
spin_unlock(&pagecache_lock);
-
- /* Found the page, sleep if locked. */
- if (page && PageLocked(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
-
- sync_page(page);
-
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- add_wait_queue(&page->wait, &wait);
-
- if (PageLocked(page))
- schedule();
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&page->wait, &wait);
-
- /*
- * The page might have been unhashed meanwhile. It's
- * not freed though because we hold a reference to it.
- * If this is the case then it will be freed _here_,
- * and we recheck the hash anyway.
- */
- page_cache_release(page);
- goto repeat;
- }
- /*
- * It's not locked so we can return the page and we hold
- * a reference to it.
- */
return page;
}
@@ -570,39 +574,23 @@ struct page * __find_lock_page (struct address_space *mapping,
repeat:
spin_lock(&pagecache_lock);
page = __find_page_nolock(mapping, offset, *hash);
- if (page)
+ if (page) {
page_cache_get(page);
- spin_unlock(&pagecache_lock);
-
- /* Found the page, sleep if locked. */
- if (page && TryLockPage(page)) {
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
-
- sync_page(page);
+ spin_unlock(&pagecache_lock);
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- add_wait_queue(&page->wait, &wait);
+ lock_page(page);
- if (PageLocked(page))
- schedule();
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&page->wait, &wait);
+ /* Is the page still hashed? Ok, good.. */
+ if (page->mapping)
+ return page;
- /*
- * The page might have been unhashed meanwhile. It's
- * not freed though because we hold a reference to it.
- * If this is the case then it will be freed _here_,
- * and we recheck the hash anyway.
- */
+ /* Nope: we raced. Release and try again.. */
+ UnlockPage(page);
page_cache_release(page);
goto repeat;
}
- /*
- * It's not locked so we can return the page and we hold
- * a reference to it.
- */
- return page;
+ spin_unlock(&pagecache_lock);
+ return NULL;
}
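
__find_lock_page() above, and the new "did it get unhashed?" checks added throughout this file, all follow one pattern: take a reference under pagecache_lock, sleep for the page lock, then re-check page->mapping and retry from scratch if the page was truncated while we slept. A single-threaded sketch of that lookup/lock/revalidate loop, with a toy entry type standing in for struct page:

#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for a page-cache entry: "mapping" is non-NULL while the
 * entry is still hashed; truncate clears it.  Purely illustrative. */
struct entry {
	const char *mapping;
	int refcount;
};

static struct entry stale = { NULL, 0 };        /* already truncated        */
static struct entry fresh = { "inode", 0 };     /* what a re-lookup finds   */
static int lookups;

static struct entry *lookup(void)               /* "hash lookup" + get ref  */
{
	struct entry *e = (lookups++ == 0) ? &stale : &fresh;
	e->refcount++;
	return e;
}

static void lock_entry(struct entry *e)   { (void)e; /* may sleep */ }
static void unlock_entry(struct entry *e) { (void)e; }
static void put_entry(struct entry *e)    { e->refcount--; }

/* The find-lock-revalidate loop from __find_lock_page(): if the entry lost
 * its mapping while we slept for the lock, drop it and retry. */
static struct entry *find_lock(void)
{
	for (;;) {
		struct entry *e = lookup();
		lock_entry(e);
		if (e->mapping)
			return e;          /* still hashed: ours to use */
		unlock_entry(e);           /* raced with truncate...    */
		put_entry(e);              /* ...release and try again  */
	}
}

int main(void)
{
	struct entry *e = find_lock();
	printf("got entry with mapping=%s (after %d lookups)\n",
	       e->mapping, lookups);
	return 0;
}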
#if 0
@@ -993,7 +981,7 @@ page_ok:
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (page->mapping->i_mmap_shared != NULL)
+ if (mapping->i_mmap_shared != NULL)
flush_dcache_page(page);
/*
@@ -1027,6 +1015,15 @@ page_not_up_to_date:
/* Get exclusive access to the page ... */
lock_page(page);
+
+ /* Did it get unhashed before we got the lock? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto page_ok;
@@ -1323,16 +1320,16 @@ struct page * filemap_nopage(struct vm_area_struct * area,
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
- unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long size, pgoff;
- unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+retry_all:
/*
- * Semantics for shared and private memory areas are different
- * past the end of the file. A shared mapping past the last page
- * of the file is an error and results in a SIGBUS, while a
- * private mapping just maps in a zero page.
+ * An external ptracer can access pages that normally aren't
+ * accessible..
*/
+ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if ((pgoff >= size) && (area->vm_mm == current->mm))
return NULL;
@@ -1411,6 +1408,15 @@ no_cached_page:
page_not_uptodate:
lock_page(page);
+
+ /* Did it get unhashed while we waited for it? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Did somebody else get it up-to-date? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
@@ -1429,6 +1435,15 @@ page_not_uptodate:
* and we need to check for errors.
*/
lock_page(page);
+
+ /* Somebody truncated the page on us? */
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry_all;
+ }
+
+ /* Somebody else successfully read it in? */
if (Page_Uptodate(page)) {
UnlockPage(page);
goto success;
@@ -1448,17 +1463,25 @@ page_not_uptodate:
return NULL;
}
+/*
+ * If a task terminates while we're swapping the page, the vma and
+ * file could be released: try_to_swap_out has done a get_file.
+ * vma/file is guaranteed to exist in the unmap/sync cases because
+ * mmap_sem is held.
+ *
+ * The "mapping" test takes care of somebody having truncated the
+ * page and thus made this write-page a no-op..
+ */
static int filemap_write_page(struct file *file,
struct page * page,
int wait)
{
- /*
- * If a task terminates while we're swapping the page, the vma and
- * and file could be released: try_to_swap_out has done a get_file.
- * vma/file is guaranteed to exist in the unmap/sync cases because
- * mmap_sem is held.
- */
- return page->mapping->a_ops->writepage(file, page);
+ struct address_space * mapping = page->mapping;
+ int error = 0;
+
+ if (mapping)
+ error = mapping->a_ops->writepage(file, page);
+ return error;
}
@@ -1475,39 +1498,47 @@ int filemap_swapout(struct page * page, struct file * file)
return retval;
}
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
unsigned long pgoff;
- pte_t pte = *ptep;
+ pte_t pte;
struct page *page;
int error;
+ pte = *ptep;
+
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
- return 0;
- if (!pte_dirty(pte))
- return 0;
+ goto out;
+ if (!ptep_test_and_clear_dirty(ptep))
+ goto out;
flush_page_to_ram(pte_page(pte));
flush_cache_page(vma, address);
- set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
page = pte_page(pte);
page_cache_get(page);
} else {
if (pte_none(pte))
- return 0;
+ goto out;
flush_cache_page(vma, address);
- pte_clear(ptep);
+
+ pte = ptep_get_and_clear(ptep);
flush_tlb_page(vma, address);
+
if (!pte_present(pte)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
swap_free(pte_to_swp_entry(pte));
- return 0;
+ spin_lock(&vma->vm_mm->page_table_lock);
+ goto out;
}
page = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
page_cache_free(page);
- return 0;
+ goto out;
}
}
pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
@@ -1516,11 +1547,20 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
}
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
lock_page(page);
+
error = filemap_write_page(vma->vm_file, page, 1);
+
UnlockPage(page);
page_cache_free(page);
+
+ spin_lock(&vma->vm_mm->page_table_lock);
return error;
+
+out:
+ return 0;
}
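
filemap_sync_pte() now runs with mm->page_table_lock held, so the operations that may sleep or take other locks — swap_free(), lock_page() and the filemap_write_page() call — are bracketed by an explicit unlock/relock of that spinlock. The shape of that "drop the lock around the blocking part" idiom, as a small pthread sketch (hypothetical names, a mutex standing in for the spinlock):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for lock_page()/writepage(): anything that can sleep and must
 * therefore not run with the (spin)lock held. */
static void blocking_writeback(int id)
{
	usleep(1000);
	printf("wrote back entry %d\n", id);
}

/* Mirrors the shape of the new filemap_sync_pte(): called with the lock
 * held, drops it around the blocking part, retakes it before returning. */
static void sync_one_entry(int id)
{
	/* ... examine and clear the entry under table_lock ... */
	pthread_mutex_unlock(&table_lock);
	blocking_writeback(id);
	pthread_mutex_lock(&table_lock);
	/* ... caller continues the walk with the lock held again ... */
}

int main(void)
{
	pthread_mutex_lock(&table_lock);
	sync_one_entry(1);
	sync_one_entry(2);
	pthread_mutex_unlock(&table_lock);
	return 0;
}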
static inline int filemap_sync_pte_range(pmd_t * pmd,
@@ -1590,6 +1630,11 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address,
unsigned long end = address + size;
int error = 0;
+ /* Acquire the lock early; it may be possible to avoid dropping
+ * and reacquiring it repeatedly.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma->vm_mm, end - size, end);
if (address >= end)
@@ -1600,6 +1645,9 @@ int filemap_sync(struct vm_area_struct * vma, unsigned long address,
dir++;
} while (address && (address < end));
flush_tlb_range(vma->vm_mm, end - size, end);
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
return error;
}
@@ -1766,11 +1814,11 @@ static long madvise_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -1790,10 +1838,10 @@ static long madvise_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -1823,7 +1871,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -1831,7 +1879,7 @@ static long madvise_fixup_middle(struct vm_area_struct * vma,
vma->vm_raend = 0;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -2270,13 +2318,20 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page = __read_cache_page(mapping, index, filler, data);
+ struct page *page;
int err;
+retry:
+ page = __read_cache_page(mapping, index, filler, data);
if (IS_ERR(page) || Page_Uptodate(page))
goto out;
lock_page(page);
+ if (!page->mapping) {
+ UnlockPage(page);
+ page_cache_release(page);
+ goto retry;
+ }
if (Page_Uptodate(page)) {
UnlockPage(page);
goto out;
diff --git a/mm/highmem.c b/mm/highmem.c
index 3be601c6f..d83d9bb87 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -130,10 +130,9 @@ static void flush_all_zero_pkmaps(void)
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
- pte = pkmap_page_table[i];
+ pte = ptep_get_and_clear(pkmap_page_table+i);
if (pte_none(pte))
BUG();
- pte_clear(pkmap_page_table+i);
page = pte_page(pte);
page->virtual = NULL;
}
@@ -310,7 +309,7 @@ struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
repeat_bh:
bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
if (!bh) {
- wakeup_bdflush(1);
+ wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
current->policy |= SCHED_YIELD;
schedule();
goto repeat_bh;
@@ -324,7 +323,7 @@ repeat_bh:
repeat_page:
page = alloc_page(GFP_BUFFER);
if (!page) {
- wakeup_bdflush(1);
+ wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
current->policy |= SCHED_YIELD;
schedule();
goto repeat_page;
diff --git a/mm/memory.c b/mm/memory.c
index 6b047821d..11048ddce 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -215,30 +215,30 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
/* copy_one_pte */
if (pte_none(pte))
- goto cont_copy_pte_range;
+ goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
- PageReserved(ptepage)) {
- set_pte(dst_pte, pte);
+ PageReserved(ptepage))
goto cont_copy_pte_range;
- }
+
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- pte = pte_wrprotect(pte);
- set_pte(src_pte, pte);
+ ptep_clear_wrprotect(src_pte);
+ pte = *src_pte;
}
+
/* If it's a shared mapping, mark it clean in the child */
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
- set_pte(dst_pte, pte_mkold(pte));
+ pte = pte_mkold(pte);
get_page(ptepage);
-
-cont_copy_pte_range: address += PAGE_SIZE;
+
+cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out;
src_pte++;
@@ -306,10 +306,9 @@ static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long
pte_t page;
if (!size)
break;
- page = *pte;
+ page = ptep_get_and_clear(pte);
pte++;
size--;
- pte_clear(pte-1);
if (pte_none(page))
continue;
freed += free_pte(page);
@@ -642,7 +641,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
end = PMD_SIZE;
do {
pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
- pte_t oldpage = *pte;
+ pte_t oldpage = ptep_get_and_clear(pte);
set_pte(pte, zero_pte);
forget_pte(oldpage);
address += PAGE_SIZE;
@@ -712,8 +711,8 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned
end = PMD_SIZE;
do {
struct page *page;
- pte_t oldpage = *pte;
- pte_clear(pte);
+ pte_t oldpage;
+ oldpage = ptep_get_and_clear(pte);
page = virt_to_page(__va(phys_addr));
if ((!VALID_PAGE(page)) || PageReserved(page))
@@ -746,6 +745,7 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l
return 0;
}
+/* Note: this is only safe if the mm semaphore is held when called. */
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
int error = 0;
@@ -781,8 +781,8 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
*/
static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
{
- flush_tlb_page(vma, address);
set_pte(page_table, entry);
+ flush_tlb_page(vma, address);
update_mmu_cache(vma, address, entry);
}
@@ -867,7 +867,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
/*
* Re-check the pte - we dropped the lock
*/
- if (pte_val(*page_table) == pte_val(pte)) {
+ if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
break_cow(vma, old_page, new_page, address, page_table);
@@ -1214,7 +1214,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* didn't change from under us..
*/
spin_lock(&mm->page_table_lock);
- if (pte_val(entry) == pte_val(*pte)) {
+ if (pte_same(entry, *pte)) {
if (write_access) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address, pte, entry);
diff --git a/mm/mlock.c b/mm/mlock.c
index a3d10ff99..f684a3c60 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,9 +14,9 @@
static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
{
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_flags = newflags;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -36,11 +36,11 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -61,10 +61,10 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -96,7 +96,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -104,7 +104,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_raend = 0;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -183,9 +183,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
break;
}
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, start, end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
return error;
}
@@ -257,9 +257,9 @@ static int do_mlockall(int flags)
if (error)
break;
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, 0, TASK_SIZE);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
return error;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c0027563..c50de6ed8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -317,12 +317,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
*/
flags = vma->vm_flags;
addr = vma->vm_start; /* can addr have changed?? */
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, vma);
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
merge_segments(mm, vma->vm_start, vma->vm_end);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -534,11 +534,11 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
/* Work out to one of the ends. */
if (end == area->vm_end) {
area->vm_end = addr;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
} else if (addr == area->vm_start) {
area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
area->vm_start = end;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
} else {
/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
/* Add end mapping -- leave beginning for below */
@@ -560,12 +560,12 @@ static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
if (mpnt->vm_ops && mpnt->vm_ops->open)
mpnt->vm_ops->open(mpnt);
area->vm_end = addr; /* Truncate area */
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, mpnt);
}
insert_vm_struct(mm, area);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
return extra;
}
@@ -670,7 +670,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
npp = (prev ? &prev->vm_next : &mm->mmap);
free = NULL;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
*npp = mpnt->vm_next;
mpnt->vm_next = free;
@@ -679,7 +679,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
avl_remove(mpnt, &mm->mmap_avl);
}
mm->mmap_cache = NULL; /* Kill the cache. */
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* Ok - we have the memory areas we should free on the 'free' list,
* so release them, and unmap the page range..
@@ -811,10 +811,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
flags = vma->vm_flags;
addr = vma->vm_start;
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
insert_vm_struct(mm, vma);
merge_segments(mm, vma->vm_start, vma->vm_end);
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
@@ -840,10 +840,10 @@ void exit_mmap(struct mm_struct * mm)
struct vm_area_struct * mpnt;
release_segments(mm);
+ spin_lock(&mm->page_table_lock);
mpnt = mm->mmap;
- vmlist_modify_lock(mm);
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mm->rss = 0;
mm->total_vm = 0;
mm->locked_vm = 0;
@@ -985,9 +985,9 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
if (mpnt->vm_ops && mpnt->vm_ops->close) {
mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
mpnt->vm_start = mpnt->vm_end;
- vmlist_modify_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
mpnt->vm_ops->close(mpnt);
- vmlist_modify_lock(mm);
+ spin_lock(&mm->page_table_lock);
}
mm->map_count--;
remove_shared_vm_struct(mpnt);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 53fc53acb..7b61abb3e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -30,9 +30,16 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address,
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t entry = *pte;
- if (pte_present(entry))
+ if (pte_present(*pte)) {
+ pte_t entry;
+
+ /* Avoid an SMP race with hardware updated dirty/clean
+ * bits by wiping the pte and then setting the new pte
+ * into place.
+ */
+ entry = ptep_get_and_clear(pte);
set_pte(pte, pte_modify(entry, newprot));
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
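
The comment above is the heart of this change: on CPUs that set the accessed/dirty bits in hardware, a plain "read pte, modify, write back" can overwrite a dirty bit the MMU set in between, so the pte is first wiped with ptep_get_and_clear() (the entry becomes not-present, so the hardware faults instead of updating it) and only then is the new value installed. A loose user-space analogue with C11 atomics, just to show which ordering can lose the bit (hypothetical flag values; the real guarantee comes from the not-present window, which a user-space atomic cannot reproduce):

#include <stdatomic.h>
#include <stdio.h>

#define PTE_DIRTY 0x1u
#define PTE_RW    0x2u

static _Atomic unsigned int pte = PTE_RW;        /* clean, writable "pte" */

/* Racy version: if another agent (the MMU) ORs in PTE_DIRTY after the load
 * but before the store, that dirty bit is silently lost. */
static void change_prot_racy(unsigned int newprot)
{
	unsigned int old = atomic_load(&pte);
	atomic_store(&pte, (old & PTE_DIRTY) | newprot);
}

/* The diff's approach: atomically take the old value out, capturing any
 * dirty bit already set, then install the new value carrying that state.
 * In the kernel the cleared entry additionally forces the MMU to fault
 * rather than update a stale copy. */
static void change_prot_safe(unsigned int newprot)
{
	unsigned int old = atomic_exchange(&pte, 0);
	atomic_store(&pte, (old & PTE_DIRTY) | newprot);
}

int main(void)
{
	change_prot_racy(PTE_RW);    /* harmless single-threaded...           */
	change_prot_safe(0);         /* ...but only this form is MMU-safe     */
	printf("pte is now %#x\n", atomic_load(&pte));
	return 0;
}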
@@ -86,10 +93,10 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n
static inline int mprotect_fixup_all(struct vm_area_struct * vma,
int newflags, pgprot_t prot)
{
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_flags = newflags;
vma->vm_page_prot = prot;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -111,11 +118,11 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -138,10 +145,10 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
get_file(n->vm_file);
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = start;
insert_vm_struct(current->mm, n);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -172,7 +179,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(left);
vma->vm_ops->open(right);
}
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
@@ -181,7 +188,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_page_prot = prot;
insert_vm_struct(current->mm, left);
insert_vm_struct(current->mm, right);
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -263,9 +270,9 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot
break;
}
}
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
merge_segments(current->mm, start, end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
out:
up(&current->mm->mmap_sem);
return error;
diff --git a/mm/mremap.c b/mm/mremap.c
index d1f6a7b8b..719ca1ec1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -63,14 +63,14 @@ static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst)
pte_t pte;
spin_lock(&mm->page_table_lock);
- pte = *src;
- if (!pte_none(pte)) {
- error++;
- if (dst) {
- pte_clear(src);
- set_pte(dst, pte);
- error--;
+ if (!pte_none(*src)) {
+ pte = ptep_get_and_clear(src);
+ if (!dst) {
+ /* No dest? We must put it back. */
+ dst = src;
+ error++;
}
+ set_pte(dst, pte);
}
spin_unlock(&mm->page_table_lock);
return error;
@@ -141,10 +141,10 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vmlist_modify_lock(current->mm);
+ spin_lock(&current->mm->page_table_lock);
insert_vm_struct(current->mm, new_vma);
merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
- vmlist_modify_unlock(current->mm);
+ spin_unlock(&current->mm->page_table_lock);
do_munmap(current->mm, addr, old_len);
current->mm->total_vm += new_len >> PAGE_SHIFT;
if (new_vma->vm_flags & VM_LOCKED) {
@@ -258,9 +258,9 @@ unsigned long do_mremap(unsigned long addr,
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
- vmlist_modify_lock(vma->vm_mm);
+ spin_lock(&vma->vm_mm->page_table_lock);
vma->vm_end = addr + new_len;
- vmlist_modify_unlock(vma->vm_mm);
+ spin_unlock(&vma->vm_mm->page_table_lock);
current->mm->total_vm += pages;
if (vma->vm_flags & VM_LOCKED) {
current->mm->locked_vm += pages;
diff --git a/mm/numa.c b/mm/numa.c
index 06ad9ec63..47cb72ec6 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -11,11 +11,11 @@
int numnodes = 1; /* Initialized for UMA platforms */
-#ifndef CONFIG_DISCONTIGMEM
-
static bootmem_data_t contig_bootmem_data;
pg_data_t contig_page_data = { bdata: &contig_bootmem_data };
+#ifndef CONFIG_DISCONTIGMEM
+
/*
* This is meant to be invoked by platforms whose physical memory starts
* at a considerably higher value than 0. Examples are Super-H, ARM, m68k.
@@ -25,7 +25,7 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_paddr,
unsigned long *zholes_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size,
+ free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_paddr, zholes_size, pmap);
}
@@ -33,7 +33,11 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
{
+#ifdef CONFIG_NUMA
return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order);
+#else
+ return alloc_pages(gfp_mask, order);
+#endif
}
#ifdef CONFIG_DISCONTIGMEM
@@ -42,13 +46,12 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order)
static spinlock_t node_lock = SPIN_LOCK_UNLOCKED;
-void show_free_areas_node(int nid)
+void show_free_areas_node(pg_data_t *pgdat)
{
unsigned long flags;
spin_lock_irqsave(&node_lock, flags);
- printk("Memory information for node %d:\n", nid);
- show_free_areas_core(nid);
+ show_free_areas_core(pgdat);
spin_unlock_irqrestore(&node_lock, flags);
}
@@ -75,10 +78,16 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
for (i = 0; i < MAX_NR_ZONES; i++)
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3);
- pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(nid, size);
+ pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size);
memset(pgdat->valid_addr_bitmap, 0, size);
}
+static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
+ unsigned long order)
+{
+ return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
+}
+
/*
* This can be refined. Currently, tries to do round robin, instead
* should do concentric circle search, starting from the current node.
@@ -86,33 +95,34 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
struct page * alloc_pages(int gfp_mask, unsigned long order)
{
struct page *ret = 0;
- int startnode, tnode;
+ pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
unsigned long flags;
- static int nextnid = 0;
+ static pg_data_t *next = 0;
#endif
if (order >= MAX_ORDER)
return NULL;
#ifdef CONFIG_NUMA
- tnode = numa_node_id();
+ temp = NODE_DATA(numa_node_id());
#else
spin_lock_irqsave(&node_lock, flags);
- tnode = nextnid;
- nextnid++;
- if (nextnid == numnodes)
- nextnid = 0;
+ if (!next) next = pgdat_list;
+ temp = next;
+ next = next->node_next;
spin_unlock_irqrestore(&node_lock, flags);
#endif
- startnode = tnode;
- while (tnode < numnodes) {
- if ((ret = alloc_pages_node(tnode++, gfp_mask, order)))
+ start = temp;
+ while (temp) {
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
+ temp = temp->node_next;
}
- tnode = 0;
- while (tnode != startnode) {
- if ((ret = alloc_pages_node(tnode++, gfp_mask, order)))
+ temp = pgdat_list;
+ while (temp != start) {
+ if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
+ temp = temp->node_next;
}
return(0);
}
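
alloc_pages() now does its round robin over the pgdat_list chain rather than over node numbers: start at a chosen node, walk forward to the end of the list, then wrap around from the head until the walk is back at the starting node. The two-loop search reduced to a user-space sketch over a singly linked list (hypothetical struct node standing in for pg_data_t):

#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
	int free;                /* 1 if this node can satisfy the request */
};

/* Same shape as the new alloc_pages(): forward from "start" to the end of
 * the list, then from the head back up to (but not including) start. */
static struct node *search_from(struct node *head, struct node *start)
{
	struct node *n;

	for (n = start; n; n = n->next)
		if (n->free)
			return n;
	for (n = head; n != start; n = n->next)
		if (n->free)
			return n;
	return NULL;
}

int main(void)
{
	struct node n2 = { NULL, 2, 0 };
	struct node n1 = { &n2, 1, 0 };
	struct node n0 = { &n1, 0, 1 };           /* only node 0 has memory */

	struct node *hit = search_from(&n0, &n1); /* start at node 1, wrap to 0 */
	printf("allocated from node %d\n", hit ? hit->id : -1);
	return 0;
}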
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000..9882fe7cd
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,210 @@
+/*
+ * linux/mm/oom_kill.c
+ *
+ * Copyright (C) 1998,2000 Rik van Riel
+ * Thanks go out to Claus Fischer for some serious inspiration and
+ * for goading me into coding this file...
+ *
+ * The routines in this file are used to kill a process when
+ * we're seriously out of memory. This gets called from kswapd()
+ * in linux/mm/vmscan.c when we really run out of memory.
+ *
+ * Since we won't call these routines often (on a well-configured
+ * machine) this file will double as a 'coding guide' and a signpost
+ * for newbie kernel hackers. It features several pointers to major
+ * kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/timex.h>
+
+/* #define DEBUG */
+
+/**
+ * int_sqrt - oom_kill.c internal function, rough approximation to sqrt
+ * @x: integer of which to calculate the sqrt
+ *
+ * A very rough approximation to the sqrt() function.
+ */
+static unsigned int int_sqrt(unsigned int x)
+{
+ unsigned int out = x;
+ while (x & ~(unsigned int)1) x >>=2, out >>=1;
+ if (x) out -= out >> 2;
+ return (out ? out : 1);
+}
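
To get a feel for how rough this is: feeding 100 through the loop shifts x down 100 -> 25 -> 6 -> 1 while out is halved 100 -> 50 -> 25 -> 12, and the final correction subtracts a quarter to give 9 — close enough to the exact square root of 10 for the heuristic below.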
+
+/**
+ * oom_badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of the task whose badness we should calculate
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum number of processes (one)
+ * 5) we try to kill the process the user expects us to kill; this
+ * algorithm has been meticulously tuned to meet the principle
+ * of least surprise ... (be careful when you change it)
+ */
+
+static int badness(struct task_struct *p)
+{
+ int points, cpu_time, run_time;
+
+ if (!p->mm)
+ return 0;
+ /*
+ * The memory size of the process is the basis for the badness.
+ */
+ points = p->mm->total_vm;
+
+ /*
+ * CPU time is in seconds and run time is in minutes. There is no
+ * particular reason for this other than that it turned out to work
+ * very well in practice. This is not safe against jiffie wraps
+ * but we don't care _that_ much...
+ */
+ cpu_time = (p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3);
+ run_time = (jiffies - p->start_time) >> (SHIFT_HZ + 10);
+
+ points /= int_sqrt(cpu_time);
+ points /= int_sqrt(int_sqrt(run_time));
+
+ /*
+ * Niced processes are most likely less important, so double
+ * their badness points.
+ */
+ if (p->nice > 0)
+ points *= 2;
+
+ /*
+ * Superuser processes are usually more important, so we make it
+ * less likely that we kill those.
+ */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
+ p->uid == 0 || p->euid == 0)
+ points /= 4;
+
+ /*
+ * We don't want to kill a process with direct hardware access.
+ * Not only could that mess up the hardware, but usually users
+ * tend to only have this flag set on applications they think
+ * of as important.
+ */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+ points /= 4;
+#ifdef DEBUG
+ printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
+ p->pid, p->comm, points);
+#endif
+ return points;
+}
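
The scoring above boils down to points = total_vm / int_sqrt(cpu_time) / int_sqrt(int_sqrt(run_time)), then doubled for niced tasks and quartered for each of the root/CAP_SYS_ADMIN and CAP_SYS_RAWIO cases. A stand-alone sketch of that arithmetic on made-up processes (user-space C; the struct fields mimic, but are not, the kernel's task_struct, and the time values are assumed to be pre-scaled to the seconds/minutes units badness() derives from jiffies):

#include <stdbool.h>
#include <stdio.h>

/* Same rough square root as oom_kill.c uses. */
static unsigned int int_sqrt(unsigned int x)
{
	unsigned int out = x;
	while (x & ~(unsigned int)1) x >>= 2, out >>= 1;
	if (x) out -= out >> 2;
	return out ? out : 1;
}

/* Hypothetical snapshot of a task. */
struct task_snapshot {
	unsigned int total_vm;      /* pages mapped                   */
	unsigned int cpu_seconds;
	unsigned int run_minutes;
	int          nice;
	bool         is_root;       /* uid 0 / CAP_SYS_ADMIN          */
	bool         raw_io;        /* CAP_SYS_RAWIO                  */
};

static int score(const struct task_snapshot *t)
{
	int points = t->total_vm;

	points /= int_sqrt(t->cpu_seconds);
	points /= int_sqrt(int_sqrt(t->run_minutes));
	if (t->nice > 0)
		points *= 2;
	if (t->is_root)
		points /= 4;
	if (t->raw_io)
		points /= 4;
	return points;
}

int main(void)
{
	/* A large, young, unprivileged memory hog vs. a long-running root
	 * daemon of the same size: the hog scores far higher. */
	struct task_snapshot hog     = { 50000, 1,   1,    0, false, false };
	struct task_snapshot rootd   = { 50000, 400, 3000, 0, true,  false };

	printf("hog scores %d, root daemon scores %d\n",
	       score(&hog), score(&rootd));
	return 0;
}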
+
+/*
+ * Simple selection loop. We chose the process with the highest
+ * number of 'points'. We need the locks to make sure that the
+ * list of task structs doesn't change while we look the other way.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+static struct task_struct * select_bad_process(void)
+{
+ int points = 0, maxpoints = 0;
+ struct task_struct *p = NULL;
+ struct task_struct *chosen = NULL;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ {
+ if (p->pid)
+ points = badness(p);
+ if (points > maxpoints) {
+ chosen = p;
+ maxpoints = points;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return chosen;
+}
+
+/**
+ * oom_kill - kill the "best" process when we run out of memory
+ *
+ * If we run out of memory, we have the choice of either
+ * killing a random task (bad), letting the system crash (worse),
+ * or trying to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ *
+ * We must be careful, though, never to send SIGKILL to a process with
+ * CAP_SYS_RAWIO set; send SIGTERM instead (but it's unlikely that
+ * we select a process with CAP_SYS_RAWIO set).
+ */
+void oom_kill(void)
+{
+
+ struct task_struct *p = select_bad_process();
+
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (p == NULL)
+ panic("Out of memory and no killable processes...\n");
+
+ printk(KERN_ERR "Out of Memory: Killed process %d (%s).", p->pid, p->comm);
+
+ /*
+ * We give our sacrificial lamb high priority and access to
+ * all the memory it needs. That way it should be able to
+ * exit() and clear out its resources quickly...
+ */
+ p->counter = 5 * HZ;
+ p->flags |= PF_MEMALLOC;
+
+ /* This process has hardware access, be more careful. */
+ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
+ force_sig(SIGTERM, p);
+ } else {
+ force_sig(SIGKILL, p);
+ }
+
+ /*
+ * Make kswapd go out of the way, so "p" has a good chance of
+ * killing itself before someone else gets the chance to ask
+ * for more memory.
+ */
+ current->policy |= SCHED_YIELD;
+ schedule();
+ return;
+}
+
+/**
+ * out_of_memory - is the system out of memory?
+ *
+ * Returns 0 if there is still enough memory left,
+ * 1 otherwise (we are out of memory).
+ */
+int out_of_memory(void)
+{
+ struct sysinfo swp_info;
+
+ /* Enough free memory? Not OOM. */
+ if (nr_free_pages() > freepages.min)
+ return 0;
+
+ if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low)
+ return 0;
+
+ /* Enough swap space left? Not OOM. */
+ si_swapinfo(&swp_info);
+ if (swp_info.freeswap > 0)
+ return 0;
+
+ /* Else... */
+ return 1;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b5990a11..90c077439 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -17,13 +17,6 @@
#include <linux/pagemap.h>
#include <linux/bootmem.h>
-/* Use NUMNODES instead of numnodes for better code inside kernel APIs */
-#ifndef CONFIG_DISCONTIGMEM
-#define NUMNODES 1
-#else
-#define NUMNODES numnodes
-#endif
-
int nr_swap_pages;
int nr_active_pages;
int nr_inactive_dirty_pages;
@@ -294,7 +287,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
zone_t **zone;
int direct_reclaim = 0;
unsigned int gfp_mask = zonelist->gfp_mask;
- struct page * page = NULL;
+ struct page * page;
/*
* Allocations put pressure on the VM subsystem.
@@ -329,7 +322,7 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
* wake up bdflush.
*/
else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
- && nr_inactive_dirty_pages > freepages.high)
+ && nr_inactive_dirty_pages >= freepages.high)
wakeup_bdflush(0);
try_again:
@@ -347,7 +340,7 @@ try_again:
if (!z->size)
BUG();
- if (z->free_pages > z->pages_low) {
+ if (z->free_pages >= z->pages_low) {
page = rmqueue(z, order);
if (page)
return page;
@@ -517,17 +510,17 @@ try_again:
* happen when the OOM killer selects this task for
* instant execution...
*/
- if (direct_reclaim)
+ if (direct_reclaim) {
page = reclaim_page(z);
- if (page)
- return page;
+ if (page)
+ return page;
+ }
/* XXX: is pages_min/4 a good amount to reserve for this? */
if (z->free_pages < z->pages_min / 4 &&
!(current->flags & PF_MEMALLOC))
continue;
- if (!page)
- page = rmqueue(z, order);
+ page = rmqueue(z, order);
if (page)
return page;
}
@@ -588,12 +581,14 @@ unsigned int nr_free_pages (void)
{
unsigned int sum;
zone_t *zone;
- int i;
+ pg_data_t *pgdat = pgdat_list;
sum = 0;
- for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ while (pgdat) {
+ for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
sum += zone->free_pages;
+ pgdat = pgdat->node_next;
+ }
return sum;
}
@@ -604,12 +599,14 @@ unsigned int nr_inactive_clean_pages (void)
{
unsigned int sum;
zone_t *zone;
- int i;
+ pg_data_t *pgdat = pgdat_list;
sum = 0;
- for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ while (pgdat) {
+ for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
sum += zone->inactive_clean_pages;
+ pgdat = pgdat->node_next;
+ }
return sum;
}
@@ -644,11 +641,13 @@ unsigned int nr_free_buffer_pages (void)
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
- int i;
+ pg_data_t *pgdat = pgdat_list;
unsigned int pages = 0;
- for (i = 0; i < NUMNODES; i++)
- pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages;
+ while (pgdat) {
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ pgdat = pgdat->node_next;
+ }
return pages;
}
#endif
@@ -658,7 +657,7 @@ unsigned int nr_free_highpages (void)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
*/
-void show_free_areas_core(int nid)
+void show_free_areas_core(pg_data_t *pgdat)
{
unsigned long order;
unsigned type;
@@ -678,7 +677,7 @@ void show_free_areas_core(int nid)
for (type = 0; type < MAX_NR_ZONES; type++) {
struct list_head *head, *curr;
- zone_t *zone = NODE_DATA(nid)->node_zones + type;
+ zone_t *zone = pgdat->node_zones + type;
unsigned long nr, total, flags;
total = 0;
@@ -710,7 +709,7 @@ void show_free_areas_core(int nid)
void show_free_areas(void)
{
- show_free_areas_core(0);
+ show_free_areas_core(pgdat_list);
}
/*
@@ -780,9 +779,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long totalpages, offset, realtotalpages;
unsigned int cumulative = 0;
- pgdat->node_next = pgdat_list;
- pgdat_list = pgdat;
-
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
unsigned long size = zones_size[i];
@@ -795,21 +791,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
- /*
- * Select nr of pages we try to keep free for important stuff
- * with a minimum of 10 pages and a maximum of 256 pages, so
- * that we don't waste too much memory on large systems.
- * This is fairly arbitrary, but based on some behaviour
- * analysis.
- */
- i = realtotalpages >> 7;
- if (i < 10)
- i = 10;
- if (i > 256)
- i = 256;
- freepages.min += i;
- freepages.low += i * 2;
- freepages.high += i * 3;
memlist_init(&active_list);
memlist_init(&inactive_dirty_list);
@@ -822,7 +803,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
*/
map_size = (totalpages + 1)*sizeof(struct page);
if (lmem_map == (struct page *)0) {
- lmem_map = (struct page *) alloc_bootmem_node(nid, map_size);
+ lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
}
@@ -875,6 +856,20 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
+ /*
+ * Add these free targets to the global free target;
+ * we have to be SURE that freepages.high is higher
+ * than SUM [zone->pages_min] for all zones, otherwise
+ * we may have bad bad problems.
+ *
+ * This means we cannot make the freepages array writable
+ * in /proc, but have to add a separate extra_free_target
+ * for people who require it to catch load spikes in eg.
+ * gigabit ethernet routing...
+ */
+ freepages.min += mask;
+ freepages.low += mask*2;
+ freepages.high += mask*3;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
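
Concretely: each zone sets pages_min = mask and adds mask, 2*mask and 3*mask to freepages.min, .low and .high, so across all zones freepages.high grows by 3 * SUM[zone->pages_min] over its initial value — comfortably above the SUM[zone->pages_min] floor the comment above insists on.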
@@ -900,7 +895,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
bitmap_size = (bitmap_size + 7) >> 3;
bitmap_size = LONG_ALIGN(bitmap_size);
zone->free_area[i].map =
- (unsigned int *) alloc_bootmem_node(nid, bitmap_size);
+ (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
@@ -908,7 +903,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_core(0, NODE_DATA(0), &mem_map, zones_size, 0, 0, 0);
+ free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
diff --git a/mm/swap.c b/mm/swap.c
index 8cb160b81..b4b9f76be 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,7 @@ void deactivate_page_nolock(struct page * page)
*/
int maxcount = (page->buffers ? 3 : 2);
page->age = 0;
+ ClearPageReferenced(page);
/*
* Don't touch it if it's not on the active list.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fa4cb133e..688e2fcdd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -223,10 +223,10 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
if (pte_page(pte) != page)
return;
/* We will be removing the swap cache in a moment, so... */
- set_pte(dir, pte_mkdirty(pte));
+ ptep_mkdirty(dir);
return;
}
- if (pte_val(pte) != entry.val)
+ if (pte_to_swp_entry(pte).val != entry.val)
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
@@ -315,12 +315,12 @@ static void unuse_process(struct mm_struct * mm,
*/
if (!mm)
return;
- vmlist_access_lock(mm);
+ spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
unuse_vma(vma, pgd, entry, page);
}
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
return;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e8c557e04..15261612e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,8 +34,8 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t page = *pte;
- pte_clear(pte);
+ pte_t page;
+ page = ptep_get_and_clear(pte);
address += PAGE_SIZE;
pte++;
if (pte_none(page))
@@ -142,15 +142,14 @@ inline int vmalloc_area_pages (unsigned long address, unsigned long size,
flush_cache_all();
do {
pmd_t *pmd;
- pgd_t olddir = *dir;
pmd = pmd_alloc_kernel(dir, address);
if (!pmd)
return -ENOMEM;
+
if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot))
return -ENOMEM;
- if (pgd_val(olddir) != pgd_val(*dir))
- set_pgdir(address, *dir);
+
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
@@ -222,14 +221,11 @@ void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
return NULL;
}
area = get_vm_area(size, VM_ALLOC);
- if (!area) {
- BUG();
+ if (!area)
return NULL;
- }
addr = area->addr;
if (vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, prot)) {
vfree(addr);
- BUG();
return NULL;
}
return addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index aacd9a5b0..d7fd0aca8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,22 +55,8 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
- if (pte_young(pte)) {
- set_pte(page_table, pte_mkold(pte));
- if (onlist) {
- /*
- * Transfer the "accessed" bit from the page
- * tables to the global page map. Page aging
- * will be done by refill_inactive_scan().
- */
- SetPageReferenced(page);
- } else {
- /*
- * The page is not on the active list, so
- * we have to do the page aging ourselves.
- */
- age_page_up(page);
- }
+ if (ptep_test_and_clear_young(page_table)) {
+ age_page_up(page);
goto out_failed;
}
if (!onlist)
@@ -88,6 +74,13 @@ static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, un
if (TryLockPage(page))
goto out_failed;
+ /* From this point on, the odds are that we're going to
+ * nuke this pte, so read and clear the pte. This hook
+ * is needed on CPUs which update the accessed and dirty
+ * bits in hardware.
+ */
+ pte = ptep_get_and_clear(page_table);
+
/*
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
@@ -124,7 +117,6 @@ drop_pte:
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
- pte_clear(page_table);
goto drop_pte;
}
@@ -134,7 +126,7 @@ drop_pte:
* locks etc.
*/
if (!(gfp_mask & __GFP_IO))
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Don't do any of the expensive stuff if
@@ -143,7 +135,7 @@ drop_pte:
if (page->zone->free_pages + page->zone->inactive_clean_pages
+ page->zone->inactive_dirty_pages
> page->zone->pages_high + inactive_target)
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Ok, it's really dirty. That means that
@@ -169,10 +161,10 @@ drop_pte:
int error;
struct file *file = vma->vm_file;
if (file) get_file(file);
- pte_clear(page_table);
+
mm->rss--;
flush_tlb_page(vma, address);
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
error = swapout(page, file);
UnlockPage(page);
if (file) fput(file);
@@ -191,7 +183,7 @@ drop_pte:
*/
entry = get_swap_page();
if (!entry.val)
- goto out_unlock; /* No swap space left */
+ goto out_unlock_restore; /* No swap space left */
if (!(page = prepare_highmem_swapout(page)))
goto out_swap_free;
@@ -205,7 +197,7 @@ drop_pte:
mm->rss--;
set_pte(page_table, swp_entry_to_pte(entry));
flush_tlb_page(vma, address);
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, page, 0);
@@ -215,10 +207,12 @@ out_free_success:
page_cache_release(page);
return 1;
out_swap_free:
+ set_pte(page_table, pte);
swap_free(entry);
out_failed:
return 0;
-out_unlock:
+out_unlock_restore:
+ set_pte(page_table, pte);
UnlockPage(page);
return 0;
}
@@ -307,7 +301,7 @@ static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsi
unsigned long end;
/* Don't swap out areas which are locked down */
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
return 0;
pgdir = pgd_offset(mm, address);
@@ -341,7 +335,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
- vmlist_access_lock(mm);
+ spin_lock(&mm->page_table_lock);
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
@@ -364,7 +358,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
mm->swap_cnt = 0;
out_unlock:
- vmlist_access_unlock(mm);
+ spin_unlock(&mm->page_table_lock);
/* We didn't find anything for the process */
return 0;
@@ -790,7 +784,8 @@ int refill_inactive_scan(unsigned int priority, int oneshot)
*
* SUBTLE: we can have buffer pages with count 1.
*/
- if (page_count(page) <= (page->buffers ? 2 : 1)) {
+ if (page->age == 0 && page_count(page) <=
+ (page->buffers ? 2 : 1)) {
deactivate_page_nolock(page);
page_active = 0;
} else {
@@ -837,8 +832,9 @@ int free_shortage(void)
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
if (zone->size && (zone->inactive_clean_pages +
- zone->free_pages < zone->pages_min)) {
- sum += zone->pages_min;
+ zone->free_pages < zone->pages_min+1)) {
+ /* + 1 to have overlap with alloc_pages() !! */
+ sum += zone->pages_min + 1;
sum -= zone->free_pages;
sum -= zone->inactive_clean_pages;
}
@@ -1095,12 +1091,20 @@ int kswapd(void *unused)
* We go to sleep for one second, but if it's needed
* we'll be woken up earlier...
*/
- if (!free_shortage() || !inactive_shortage())
+ if (!free_shortage() || !inactive_shortage()) {
interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
- * TODO: insert out of memory check & oom killer
- * invocation in an else branch here.
+ * If we couldn't free enough memory, we see if it was
+ * due to the system just not having enough memory.
+ * If that is the case, the only solution is to kill
+ * a process (the alternative is eternal deadlock).
+ *
+ * If there still is enough memory around, we just loop
+ * and try to free some more memory...
*/
+ } else if (out_of_memory()) {
+ oom_kill();
+ }
}
}