Diffstat (limited to 'mm/highmem.c')
-rw-r--r--  mm/highmem.c  296
1 file changed, 287 insertions(+), 9 deletions(-)
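
The diff below converts callers from the old kmap(page, KM_READ/KM_WRITE) interface to the new schedulable kmap(page)/kunmap(page) pair backed by kmap_high()/kunmap_high(). As a minimal caller sketch (illustrative only, not part of the patch): kmap() and kunmap() are assumed to be the usual arch-header wrappers that return page_address() for lowmem pages and fall back to kmap_high()/kunmap_high() for highmem pages; the helper name copy_page_to_lowmem() is made up here.

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Illustrative only -- not part of the patch.  Copies a possibly-highmem
 * page into a lowmem buffer via the schedulable kmap API.  kmap() may
 * sleep, so this must not be called from interrupt context.
 */
static void copy_page_to_lowmem(struct page *page, void *buf)
{
	unsigned long vaddr;

	vaddr = kmap(page);	/* pins a pkmap slot if PageHighMem(page) */
	memcpy(buf, (void *)vaddr, PAGE_SIZE);
	kunmap(page);		/* drops the reference; see kunmap_high() below */
}

This mirrors the updated call sites in prepare_highmem_swapout() and replace_with_highmem() in the diff.
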
diff --git a/mm/highmem.c b/mm/highmem.c
index 7665393cf..248688c23 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -4,19 +4,25 @@
* (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
* Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
*
+ *
* Redesigned the x86 32-bit VM architecture to deal with
* 64-bit physical space. With current x86 CPUs this
* means up to 64 Gigabytes physical RAM.
*
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*/
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
unsigned long highmem_mapnr;
-unsigned long nr_free_highpages = 0;
struct page * prepare_highmem_swapout(struct page * page)
{
@@ -34,9 +40,9 @@ struct page * prepare_highmem_swapout(struct page * page)
if (!regular_page)
return NULL;
- vaddr = kmap(page, KM_READ);
+ vaddr = kmap(page);
copy_page((void *)regular_page, (void *)vaddr);
- kunmap(vaddr, KM_READ);
+ kunmap(page);
/*
* ok, we can just forget about our highmem page since
@@ -52,10 +58,10 @@ struct page * replace_with_highmem(struct page * page)
struct page *highpage;
unsigned long vaddr;
- if (PageHighMem(page) || !nr_free_highpages)
+ if (PageHighMem(page) || !nr_free_highpages())
return page;
- highpage = get_free_highpage(GFP_ATOMIC|__GFP_HIGHMEM);
+ highpage = alloc_page(GFP_ATOMIC|__GFP_HIGHMEM);
if (!highpage)
return page;
if (!PageHighMem(highpage)) {
@@ -63,13 +69,13 @@ struct page * replace_with_highmem(struct page * page)
return page;
}
- vaddr = kmap(highpage, KM_WRITE);
+ vaddr = kmap(highpage);
copy_page((void *)vaddr, (void *)page_address(page));
- kunmap(vaddr, KM_WRITE);
+ kunmap(highpage);
/* Preserve the caching of the swap_entry. */
- highpage->offset = page->offset;
- highpage->inode = page->inode;
+ highpage->index = page->index;
+ highpage->mapping = page->mapping;
/*
* We can just forget the old page since
@@ -79,3 +85,275 @@ struct page * replace_with_highmem(struct page * page)
return highpage;
}
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 2048
+#else
+#define LAST_PKMAP 4096
+#endif
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+/*
+ * Virtual_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr = 0;
+static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED;
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+ pte_t pte;
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+ pte = pkmap_page_table[i];
+ if (pte_none(pte))
+ continue;
+ pte_clear(pkmap_page_table+i);
+ page = pte_page(pte);
+ page->virtual = 0;
+ }
+ flush_tlb_all();
+}
+
+static unsigned long map_new_virtual(struct page *page)
+{
+ unsigned long vaddr;
+ int count = LAST_PKMAP;
+
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ if (!last_pkmap_nr)
+ flush_all_zero_pkmaps();
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ /*
+ * Sleep for somebody else to unmap their entries
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue(&pkmap_map_wait, &wait);
+ spin_unlock(&kmap_lock);
+ // it's not quite possible to saturate the
+ // pkmap pool right now.
+ BUG();
+ schedule();
+ remove_wait_queue(&pkmap_map_wait, &wait);
+ spin_lock(&kmap_lock);
+ }
+
+ /* Somebody else might have mapped it while we slept */
+ if (page->virtual)
+ return page->virtual;
+
+ /* Re-start */
+ count = LAST_PKMAP;
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ pkmap_page_table[last_pkmap_nr] = mk_pte(page, kmap_prot);
+
+ /*
+ * Subtle! For some reason, if we don't do this TLB flush then
+ * we get data corruption and weird behavior in dbench runs.
+ * But this invlpg should not be necessary ... Any ideas?
+ */
+ __flush_tlb_one(vaddr);
+ pkmap_count[last_pkmap_nr] = 1;
+ page->virtual = vaddr;
+
+ return vaddr;
+}
+
+unsigned long kmap_high(struct page *page)
+{
+ unsigned long vaddr;
+
+ if (!PageHighMem(page))
+ BUG();
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ *
+ * We cannot call this from interrupts, as it may block
+ */
+ spin_lock(&kmap_lock);
+ vaddr = page->virtual;
+ if (!vaddr)
+ vaddr = map_new_virtual(page);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ if (pkmap_count[PKMAP_NR(vaddr)] < 2)
+ BUG();
+ spin_unlock(&kmap_lock);
+ return vaddr;
+}
+
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+
+ spin_lock(&kmap_lock);
+ vaddr = page->virtual;
+ if (!vaddr)
+ BUG();
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ wake_up(&pkmap_map_wait);
+ }
+ spin_unlock(&kmap_lock);
+}
+
+/*
+ * Simple bounce buffer support for highmem pages.
+ * This will be moved to the block layer in 2.5.
+ */
+
+extern kmem_cache_t *bh_cachep;
+
+static inline void copy_from_high_bh (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_from;
+ unsigned long vfrom;
+
+ p_from = from->b_page;
+ vfrom = kmap_atomic(p_from, KM_BOUNCE_WRITE);
+ memcpy(to->b_data, (char *)vfrom + bh_offset(from), to->b_size);
+ kunmap_atomic(vfrom, KM_BOUNCE_WRITE);
+}
+
+static inline void copy_to_high_bh_irq (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_to;
+ unsigned long vto;
+
+ p_to = to->b_page;
+ vto = kmap_atomic(p_to, KM_BOUNCE_WRITE);
+ memcpy((char *)vto + bh_offset(to), from->b_data, to->b_size);
+ kunmap_atomic(vto, KM_BOUNCE_WRITE);
+}
+
+static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id);
+
+ bh_orig->b_end_io(bh_orig, uptodate);
+ __free_page(bh->b_page);
+ kmem_cache_free(bh_cachep, bh);
+}
+
+static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
+{
+ bounce_end_io(bh, uptodate);
+}
+
+static void bounce_end_io_read (struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id);
+
+ if (uptodate)
+ copy_to_high_bh_irq(bh_orig, bh);
+ bounce_end_io(bh, uptodate);
+}
+
+struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
+{
+ struct page *page;
+ struct buffer_head *bh;
+
+ if (!PageHighMem(bh_orig->b_page))
+ return bh_orig;
+
+repeat_bh:
+ bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
+ if (!bh) {
+ wakeup_bdflush(1);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ goto repeat_bh;
+ }
+ /*
+ * This is wasteful for 1k buffers, but this is a stopgap measure
+ * and we are being ineffective anyway. This approach simplifies
+ * things immensely. On boxes with more than 4GB RAM this should
+ * not be an issue anyway.
+ */
+repeat_page:
+ page = alloc_page(GFP_BUFFER);
+ if (!page) {
+ wakeup_bdflush(1);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ goto repeat_page;
+ }
+ set_bh_page(bh, page, 0);
+
+ bh->b_next = NULL;
+ bh->b_blocknr = bh_orig->b_blocknr;
+ bh->b_size = bh_orig->b_size;
+ bh->b_list = -1;
+ bh->b_dev = bh_orig->b_dev;
+ bh->b_count = bh_orig->b_count;
+ bh->b_rdev = bh_orig->b_rdev;
+ bh->b_state = bh_orig->b_state;
+ bh->b_flushtime = 0;
+ bh->b_next_free = NULL;
+ bh->b_prev_free = NULL;
+ /* bh->b_this_page */
+ bh->b_reqnext = NULL;
+ bh->b_pprev = NULL;
+ /* bh->b_page */
+ if (rw == WRITE) {
+ bh->b_end_io = bounce_end_io_write;
+ copy_from_high_bh(bh, bh_orig);
+ } else
+ bh->b_end_io = bounce_end_io_read;
+ bh->b_dev_id = (void *)bh_orig;
+ bh->b_rsector = -1;
+ memset(&bh->b_wait, -1, sizeof(bh->b_wait));
+ bh->b_kiobuf = NULL;
+
+ return bh;
+}
+
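
For reference, a toy model of the pkmap_count convention documented in the new code above (0 = slot unmapped and TLB-flushed, 1 = mapped but idle since the last flush, n = n-1 active users). This is a standalone illustration, not kernel code; NSLOTS and slot_count are invented names standing in for LAST_PKMAP and pkmap_count.

#define NSLOTS 4096			/* stands in for LAST_PKMAP */

static int slot_count[NSLOTS];		/* all 0: free and TLB-flushed */

static void slot_map(int i)		/* map_new_virtual() + kmap_high() */
{
	slot_count[i] = 2;		/* 1 for "mapped", +1 for the first user */
}

static void slot_get(int i)		/* kmap_high() on an already-mapped page */
{
	slot_count[i]++;
}

static void slot_put(int i)		/* kunmap_high() */
{
	slot_count[i]--;		/* may drop to 1, never to 0 directly */
}

static void slots_flush(void)		/* flush_all_zero_pkmaps() */
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		if (slot_count[i] == 1)		/* mapped but idle */
			slot_count[i] = 0;	/* real code also clears the pte */
	/* real code finishes with flush_tlb_all() */
}

The intermediate value 1 exists because a slot whose last user has gone away may still be cached in some CPU's TLB; it only becomes reusable once flush_all_zero_pkmaps() has cleared the pte and flushed the TLB, which is why kunmap_high() never lets the count reach zero on its own.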