Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  | 269
1 file changed, 198 insertions(+), 71 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 242981f72..c6287eae8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,11 +42,11 @@
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
#include <linux/iobuf.h>
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
unsigned long max_mapnr;
unsigned long num_physpages;
@@ -160,6 +160,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
+ spin_lock(&dst->page_table_lock);
for (;;) {
pmd_t * src_pmd, * dst_pmd;
@@ -177,13 +178,11 @@ skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
goto out;
continue;
}
- if (pgd_none(*dst_pgd)) {
- if (!pmd_alloc(dst_pgd, 0))
- goto nomem;
- }
-
+
src_pmd = pmd_offset(src_pgd, address);
- dst_pmd = pmd_offset(dst_pgd, address);
+ dst_pmd = pmd_alloc(dst, dst_pgd, address);
+ if (!dst_pmd)
+ goto nomem;
do {
pte_t * src_pte, * dst_pte;
@@ -200,13 +199,11 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
goto out;
goto cont_copy_pmd_range;
}
- if (pmd_none(*dst_pmd)) {
- if (!pte_alloc(dst_pmd, 0))
- goto nomem;
- }
-
+
src_pte = pte_offset(src_pmd, address);
- dst_pte = pte_offset(dst_pmd, address);
+ dst_pte = pte_alloc(dst, dst_pmd, address);
+ if (!dst_pte)
+ goto nomem;
spin_lock(&src->page_table_lock);
do {
@@ -251,14 +248,14 @@ cont_copy_pmd_range: src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
-out:
- return 0;
-
out_unlock:
spin_unlock(&src->page_table_lock);
+out:
+ spin_unlock(&dst->page_table_lock);
return 0;
nomem:
+ spin_unlock(&dst->page_table_lock);
return -ENOMEM;
}
@@ -377,7 +374,6 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
- spin_unlock(&mm->page_table_lock);
/*
* Update rss for the mm_struct (not necessarily current->mm)
* Notice that rss is an unsigned long.
@@ -386,6 +382,7 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s
mm->rss -= freed;
else
mm->rss = 0;
+ spin_unlock(&mm->page_table_lock);
}
@@ -450,7 +447,7 @@ int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
if (err)
return err;
- down(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
err = -EFAULT;
iobuf->locked = 0;
@@ -501,12 +498,12 @@ int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
ptr += PAGE_SIZE;
}
- up(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
dprintk ("map_user_kiobuf: end OK\n");
return 0;
out_unlock:
- up(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
unmap_kiobuf(iobuf);
dprintk ("map_user_kiobuf: end %d\n", err);
return err;
@@ -658,7 +655,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
} while (address && (address < end));
}
-static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
+static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
unsigned long size, pgprot_t prot)
{
unsigned long end;
@@ -668,7 +665,7 @@ static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- pte_t * pte = pte_alloc(pmd, address);
+ pte_t * pte = pte_alloc(mm, pmd, address);
if (!pte)
return -ENOMEM;
zeromap_pte_range(pte, address, end - address, prot);
@@ -684,23 +681,27 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
pgd_t * dir;
unsigned long beg = address;
unsigned long end = address + size;
+ struct mm_struct *mm = current->mm;
- dir = pgd_offset(current->mm, address);
- flush_cache_range(current->mm, beg, end);
+ dir = pgd_offset(mm, address);
+ flush_cache_range(mm, beg, end);
if (address >= end)
BUG();
+
+ spin_lock(&mm->page_table_lock);
do {
- pmd_t *pmd = pmd_alloc(dir, address);
+ pmd_t *pmd = pmd_alloc(mm, dir, address);
error = -ENOMEM;
if (!pmd)
break;
- error = zeromap_pmd_range(pmd, address, end - address, prot);
+ error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
if (error)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
- flush_tlb_range(current->mm, beg, end);
+ spin_unlock(&mm->page_table_lock);
+ flush_tlb_range(mm, beg, end);
return error;
}
@@ -733,7 +734,7 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned
} while (address && (address < end));
}
-static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
+static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
unsigned long phys_addr, pgprot_t prot)
{
unsigned long end;
@@ -744,7 +745,7 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l
end = PGDIR_SIZE;
phys_addr -= address;
do {
- pte_t * pte = pte_alloc(pmd, address);
+ pte_t * pte = pte_alloc(mm, pmd, address);
if (!pte)
return -ENOMEM;
remap_pte_range(pte, address, end - address, address + phys_addr, prot);
@@ -761,24 +762,28 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
pgd_t * dir;
unsigned long beg = from;
unsigned long end = from + size;
+ struct mm_struct *mm = current->mm;
phys_addr -= from;
- dir = pgd_offset(current->mm, from);
- flush_cache_range(current->mm, beg, end);
+ dir = pgd_offset(mm, from);
+ flush_cache_range(mm, beg, end);
if (from >= end)
BUG();
+
+ spin_lock(&mm->page_table_lock);
do {
- pmd_t *pmd = pmd_alloc(dir, from);
+ pmd_t *pmd = pmd_alloc(mm, dir, from);
error = -ENOMEM;
if (!pmd)
break;
- error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
+ error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
if (error)
break;
from = (from + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (from && (from < end));
- flush_tlb_range(current->mm, beg, end);
+ spin_unlock(&mm->page_table_lock);
+ flush_tlb_range(mm, beg, end);
return error;
}
@@ -787,6 +792,8 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long
* - flush the old one
* - update the page tables
* - inform the TLB about the new one
+ *
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
*/
static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
{
@@ -795,6 +802,9 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
update_mmu_cache(vma, address, entry);
}
+/*
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ */
static inline void break_cow(struct vm_area_struct * vma, struct page * old_page, struct page * new_page, unsigned long address,
pte_t *page_table)
{
@@ -862,7 +872,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
break;
flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
- spin_unlock(&mm->page_table_lock);
return 1; /* Minor fault */
}
@@ -870,10 +879,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
* Ok, we need to copy. Oh, well..
*/
spin_unlock(&mm->page_table_lock);
- new_page = page_cache_alloc();
+ new_page = alloc_page(GFP_HIGHUSER);
+ spin_lock(&mm->page_table_lock);
if (!new_page)
return -1;
- spin_lock(&mm->page_table_lock);
/*
* Re-check the pte - we dropped the lock
@@ -886,12 +895,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
/* Free the old page.. */
new_page = old_page;
}
- spin_unlock(&mm->page_table_lock);
page_cache_release(new_page);
return 1; /* Minor fault */
bad_wp_page:
- spin_unlock(&mm->page_table_lock);
printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
return -1;
}
@@ -1021,63 +1028,100 @@ void swapin_readahead(swp_entry_t entry)
return;
}
+/*
+ * We hold the mm semaphore and the page_table_lock on entry and exit.
+ */
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, swp_entry_t entry, int write_access)
{
- struct page *page = lookup_swap_cache(entry);
+ struct page *page;
pte_t pte;
+ spin_unlock(&mm->page_table_lock);
+ page = lookup_swap_cache(entry);
if (!page) {
lock_kernel();
swapin_readahead(entry);
page = read_swap_cache(entry);
unlock_kernel();
- if (!page)
+ if (!page) {
+ spin_lock(&mm->page_table_lock);
return -1;
+ }
flush_page_to_ram(page);
flush_icache_page(vma, page);
}
- mm->rss++;
-
- pte = mk_pte(page, vma->vm_page_prot);
-
/*
* Freeze the "shared"ness of the page, ie page_count + swap_count.
* Must lock page before transferring our swap count to already
* obtained page count.
*/
lock_page(page);
+
+ /*
+ * Back out if somebody else faulted in this pte while we
+ * released the page table lock.
+ */
+ spin_lock(&mm->page_table_lock);
+ if (pte_present(*page_table)) {
+ UnlockPage(page);
+ page_cache_release(page);
+ return 1;
+ }
+
+ /* The page isn't present yet, go ahead with the fault. */
+ mm->rss++;
+ pte = mk_pte(page, vma->vm_page_prot);
+
swap_free(entry);
if (write_access && !is_page_shared(page))
pte = pte_mkwrite(pte_mkdirty(pte));
UnlockPage(page);
set_pte(page_table, pte);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
return 1; /* Minor fault */
}
/*
- * This only needs the MM semaphore
+ * We are called with the MM semaphore and page_table_lock
+ * spinlock held to protect against concurrent faults in
+ * multithreaded programs.
*/
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
- struct page *page = NULL;
- pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+ pte_t entry;
+
+ /* Read-only mapping of ZERO_PAGE. */
+ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+
+ /* ..except if it's a write access */
if (write_access) {
+ struct page *page;
+
+ /* Allocate our own private page. */
+ spin_unlock(&mm->page_table_lock);
page = alloc_page(GFP_HIGHUSER);
+ spin_lock(&mm->page_table_lock);
if (!page)
return -1;
- clear_user_highpage(page, addr);
- entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ if (!pte_none(*page_table)) {
+ page_cache_release(page);
+ return 1;
+ }
mm->rss++;
+ clear_user_highpage(page, addr);
flush_page_to_ram(page);
+ entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
}
+
set_pte(page_table, entry);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
return 1; /* Minor fault */
@@ -1092,7 +1136,8 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held.
+ * This is called with the MM semaphore held and the page table
+ * spinlock held.
*/
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
@@ -1102,6 +1147,7 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
+ spin_unlock(&mm->page_table_lock);
/*
* The third argument is "no_share", which tells the low-level code
@@ -1109,11 +1155,12 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
* essentially an early COW detection.
*/
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
+
+ spin_lock(&mm->page_table_lock);
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
- ++mm->rss;
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
@@ -1124,15 +1171,24 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
- flush_page_to_ram(new_page);
- flush_icache_page(vma, new_page);
- entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access) {
- entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (page_count(new_page) > 1 &&
- !(vma->vm_flags & VM_SHARED))
- entry = pte_wrprotect(entry);
- set_pte(page_table, entry);
+ /* Only go through if we didn't race with anybody else... */
+ if (pte_none(*page_table)) {
+ ++mm->rss;
+ flush_page_to_ram(new_page);
+ flush_icache_page(vma, new_page);
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ if (write_access) {
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ } else if (page_count(new_page) > 1 &&
+ !(vma->vm_flags & VM_SHARED))
+ entry = pte_wrprotect(entry);
+ set_pte(page_table, entry);
+ } else {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(new_page);
+ return 1;
+ }
+
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
return 2; /* Major fault */
@@ -1162,11 +1218,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
{
pte_t entry;
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
- spin_lock(&mm->page_table_lock);
entry = *pte;
if (!pte_present(entry)) {
/*
@@ -1174,7 +1225,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* and the PTE updates will not touch it later. So
* drop the lock.
*/
- spin_unlock(&mm->page_table_lock);
if (pte_none(entry))
return do_no_page(mm, vma, address, write_access, pte);
return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
@@ -1188,7 +1238,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
}
entry = pte_mkyoung(entry);
establish_pte(vma, address, pte, entry);
- spin_unlock(&mm->page_table_lock);
return 1;
}
@@ -1204,17 +1253,95 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
current->state = TASK_RUNNING;
pgd = pgd_offset(mm, address);
- pmd = pmd_alloc(pgd, address);
+
+ /*
+ * We need the page table lock to synchronize with kswapd
+ * and the SMP-safe atomic PTE updates.
+ */
+ spin_lock(&mm->page_table_lock);
+ pmd = pmd_alloc(mm, pgd, address);
if (pmd) {
- pte_t * pte = pte_alloc(pmd, address);
+ pte_t * pte = pte_alloc(mm, pmd, address);
if (pte)
ret = handle_pte_fault(mm, vma, address, write_access, pte);
}
+ spin_unlock(&mm->page_table_lock);
return ret;
}
/*
+ * Allocate page middle directory.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
+ *
+ * On a two-level page table, this ends up actually being entirely
+ * optimized away.
+ */
+pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+ pmd_t *new;
+
+ /* "fast" allocation can happen without dropping the lock.. */
+ new = pmd_alloc_one_fast(mm, address);
+ if (!new) {
+ spin_unlock(&mm->page_table_lock);
+ new = pmd_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new)
+ return NULL;
+
+ /*
+ * Because we dropped the lock, we should re-check the
+ * entry, as somebody else could have populated it..
+ */
+ if (pgd_present(*pgd)) {
+ pmd_free(new);
+ goto out;
+ }
+ }
+ pgd_populate(mm, pgd, new);
+out:
+ return pmd_offset(pgd, address);
+}
+
+/*
+ * Allocate the page table directory.
+ *
+ * We've already handled the fast-path in-line, and we own the
+ * page table lock.
+ */
+pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ if (!pmd_present(*pmd)) {
+ pte_t *new;
+
+ /* "fast" allocation can happen without dropping the lock.. */
+ new = pte_alloc_one_fast(mm, address);
+ if (!new) {
+ spin_unlock(&mm->page_table_lock);
+ new = pte_alloc_one(mm, address);
+ spin_lock(&mm->page_table_lock);
+ if (!new)
+ return NULL;
+
+ /*
+ * Because we dropped the lock, we should re-check the
+ * entry, as somebody else could have populated it..
+ */
+ if (pmd_present(*pmd)) {
+ pte_free(new);
+ goto out;
+ }
+ }
+ pmd_populate(mm, pmd, new);
+ }
+out:
+ return pte_offset(pmd, address);
+}
+
+/*
* Simplistic page force-in..
*/
int make_pages_present(unsigned long addr, unsigned long end)