Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c      29
-rw-r--r--  mm/filemap.c     308
-rw-r--r--  mm/highmem.c     296
-rw-r--r--  mm/memory.c       45
-rw-r--r--  mm/mlock.c         9
-rw-r--r--  mm/mmap.c         20
-rw-r--r--  mm/mprotect.c     10
-rw-r--r--  mm/mremap.c        5
-rw-r--r--  mm/page_alloc.c  390
-rw-r--r--  mm/page_io.c      10
-rw-r--r--  mm/slab.c          2
-rw-r--r--  mm/swap_state.c   44
-rw-r--r--  mm/swapfile.c     65
-rw-r--r--  mm/vmalloc.c       6
-rw-r--r--  mm/vmscan.c       13
15 files changed, 833 insertions, 419 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e790acc4f..edc69e6b3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -36,9 +36,7 @@ unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
{
unsigned long mapsize = (pages+7)/8;
- if (bootmem_map)
- BUG();
- bootmem_map = __va(start << PAGE_SHIFT);
+ bootmem_map = phys_to_virt(start << PAGE_SHIFT);
max_low_pfn = pages;
/*
@@ -64,7 +62,6 @@ void __init reserve_bootmem (unsigned long addr, unsigned long size)
*/
unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
- if (!bootmem_map) BUG();
if (!size) BUG();
if (end > max_low_pfn)
@@ -77,18 +74,23 @@ void __init reserve_bootmem (unsigned long addr, unsigned long size)
void __init free_bootmem (unsigned long addr, unsigned long size)
{
unsigned long i;
+ unsigned long start;
/*
* round down end of usable mem, partially free pages are
* considered reserved.
*/
unsigned long end = (addr + size)/PAGE_SIZE;
- if (!bootmem_map) BUG();
if (!size) BUG();
-
if (end > max_low_pfn)
BUG();
- for (i = addr/PAGE_SIZE; i < end; i++) {
+
+ /*
+ * Round up the beginning of the address.
+ */
+ start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
+
+ for (i = start; i < end; i++) {
if (!test_and_clear_bit(i, bootmem_map))
BUG();
}
@@ -117,7 +119,6 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned
unsigned long offset, remaining_size;
unsigned long areasize, preferred;
- if (!bootmem_map) BUG();
if (!size) BUG();
/*
@@ -152,6 +153,9 @@ restart_scan:
preferred = 0;
goto restart_scan;
}
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
BUG();
found:
if (start >= max_low_pfn)
@@ -173,11 +177,11 @@ found:
areasize = 0;
// last_pos unchanged
last_offset = offset+size;
- ret = __va(last_pos*PAGE_SIZE + offset);
+ ret = phys_to_virt(last_pos*PAGE_SIZE + offset);
} else {
size -= remaining_size;
areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
- ret = __va(last_pos*PAGE_SIZE + offset);
+ ret = phys_to_virt(last_pos*PAGE_SIZE + offset);
last_pos = start+areasize-1;
last_offset = size;
}
@@ -185,7 +189,7 @@ found:
} else {
last_pos = start + areasize - 1;
last_offset = size & ~PAGE_MASK;
- ret = __va(start * PAGE_SIZE);
+ ret = phys_to_virt(start * PAGE_SIZE);
}
/*
* Reserve the area now:
@@ -211,12 +215,13 @@ unsigned long __init free_all_bootmem (void)
count++;
ClearPageReserved(page);
set_page_count(page, 1);
- if (i >= (__pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT))
+ if (i >= (virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT))
clear_bit(PG_DMA, &page->flags);
__free_page(page);
}
}
total += count;
+
/*
* Now free the allocator bitmap itself, it's not
* needed anymore:
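
The free_bootmem() change above rounds the start of the range up and the
end down, so a page that is only partially covered by the freed region
stays reserved. A minimal standalone sketch of that rounding (hypothetical
helper names, assuming 4096-byte pages):

/*
 * Not kernel code: only pages wholly inside [addr, addr+size) get their
 * bits cleared in the bootmem bitmap; partial pages remain reserved.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static void bootmem_range_to_pfns(unsigned long addr, unsigned long size,
                                  unsigned long *start_pfn, unsigned long *end_pfn)
{
        *start_pfn = (addr + PAGE_SIZE - 1) / PAGE_SIZE;  /* round start up */
        *end_pfn = (addr + size) / PAGE_SIZE;             /* round end down */
}

int main(void)
{
        unsigned long start, end;

        /* 0x1800..0x4800: only pages 2 and 3 (0x2000..0x4000) are wholly inside */
        bootmem_range_to_pfns(0x1800, 0x3000, &start, &end);
        printf("free pfns %lu..%lu\n", start, end);       /* prints "free pfns 2..4" */
        return 0;
}
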
diff --git a/mm/filemap.c b/mm/filemap.c
index 887d7b6f8..3bb4d89de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,11 +21,13 @@
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>
-#include <linux/highmem.h>
+#include <linux/mm.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
+#include <linux/highmem.h>
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -50,9 +52,7 @@ spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES (1 << page_cluster)
-#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
-#define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
-#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
+#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
@@ -127,20 +127,22 @@ void invalidate_inode_pages(struct inode * inode)
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
struct list_head *head, *curr;
- unsigned long offset;
struct page * page;
- int partial = 0;
+ unsigned partial = start & (PAGE_CACHE_SIZE - 1);
+
+ start = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
repeat:
head = &inode->i_data.pages;
spin_lock(&pagecache_lock);
curr = head->next;
while (curr != head) {
+ unsigned long offset;
page = list_entry(curr, struct page, list);
curr = curr->next;
- offset = page->offset;
+ offset = page->index;
/* page wholly truncated - free it */
if (offset >= start) {
@@ -179,30 +181,32 @@ repeat:
/*
* there is only one partial page possible.
*/
- if (partial)
+ if (!partial)
+ continue;
+
+ /* and it's the one preceding the first wholly truncated page */
+ if ((offset + 1) != start)
continue;
- offset = start - offset;
/* partial truncate, clear end of page */
- if (offset < PAGE_CACHE_SIZE) {
- get_page(page);
- spin_unlock(&pagecache_lock);
+ get_page(page);
+ spin_unlock(&pagecache_lock);
- lock_page(page);
- partial = 1;
+ lock_page(page);
- memclear_highpage_flush(page, offset,
- PAGE_CACHE_SIZE-offset);
- if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, offset);
- /*
- * we have dropped the spinlock so we have to
- * restart.
- */
- UnlockPage(page);
- page_cache_release(page);
- goto repeat;
- }
+ memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+ if (inode->i_op->flushpage)
+ inode->i_op->flushpage(inode, page, partial);
+
+ partial = 0;
+
+ /*
+ * we have dropped the spinlock so we have to
+ * restart.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+ goto repeat;
}
spin_unlock(&pagecache_lock);
}
@@ -367,7 +371,7 @@ inside:
goto not_found;
if (page->mapping != mapping)
continue;
- if (page->offset == offset)
+ if (page->index == offset)
break;
}
set_bit(PG_referenced, &page->flags);
@@ -417,7 +421,6 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne
int retval = 0;
head = &inode->i_data.pages;
- start &= PAGE_MASK;
spin_lock(&pagecache_lock);
curr = head->next;
@@ -426,9 +429,9 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne
curr = curr->next;
if (!page->buffers)
continue;
- if (page->offset >= end)
+ if (page->index >= end)
continue;
- if (page->offset < start)
+ if (page->index < start)
continue;
get_page(page);
@@ -455,10 +458,12 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne
*/
int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
{
+ unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
int retval;
- retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
- retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
+ retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
return retval;
}
@@ -476,7 +481,7 @@ static inline void __add_to_page_cache(struct page * page,
flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
page->flags = flags | (1 << PG_locked);
get_page(page);
- page->offset = offset;
+ page->index = offset;
add_page_to_inode_queue(mapping, page);
__add_page_to_hash_queue(page, hash);
lru_cache_add(page);
@@ -516,7 +521,7 @@ int add_to_page_cache_unique(struct page * page,
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static inline void page_cache_read(struct file * file, unsigned long offset)
+static inline int page_cache_read(struct file * file, unsigned long offset)
{
struct inode *inode = file->f_dentry->d_inode;
struct page **hash = page_hash(&inode->i_data, offset);
@@ -526,42 +531,45 @@ static inline void page_cache_read(struct file * file, unsigned long offset)
page = __find_page_nolock(&inode->i_data, offset, *hash);
spin_unlock(&pagecache_lock);
if (page)
- return;
+ return 0;
page = page_cache_alloc();
if (!page)
- return;
+ return -ENOMEM;
if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
- inode->i_op->readpage(file, page);
+ int error = inode->i_op->readpage(file, page);
page_cache_release(page);
- return;
+ return error;
}
/*
* We arrive here in the unlikely event that someone
* raced with us and added our page to the cache first.
*/
page_cache_free(page);
- return;
+ return 0;
}
/*
* Read in an entire cluster at once. A cluster is usually a 64k-
* aligned block that includes the address requested in "offset."
*/
-static void read_cluster_nonblocking(struct file * file,
- unsigned long offset)
+static int read_cluster_nonblocking(struct file * file, unsigned long offset)
{
- off_t filesize = file->f_dentry->d_inode->i_size;
+ int error = 0;
+ unsigned long filesize = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
unsigned long pages = CLUSTER_PAGES;
offset = CLUSTER_OFFSET(offset);
while ((pages-- > 0) && (offset < filesize)) {
- page_cache_read(file, offset);
- offset += PAGE_CACHE_SIZE;
+ error = page_cache_read(file, offset);
+ if (error >= 0)
+ offset ++;
+ else
+ break;
}
- return;
+ return error;
}
/*
@@ -751,7 +759,7 @@ static void profile_readahead(int async, struct file *filp)
total_rawin/total_reada,
(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
- printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
+ printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif
@@ -831,13 +839,15 @@ static inline int get_max_readahead(struct inode * inode)
static void generic_file_readahead(int reada_ok,
struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page)
+ struct page * page)
{
+ unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long index = page->index;
unsigned long max_ahead, ahead;
unsigned long raend;
int max_readahead = get_max_readahead(inode);
- raend = filp->f_raend & PAGE_CACHE_MASK;
+ raend = filp->f_raend;
max_ahead = 0;
/*
@@ -849,14 +859,14 @@ static void generic_file_readahead(int reada_ok,
* page only.
*/
if (PageLocked(page)) {
- if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
- raend = ppos;
- if (raend < inode->i_size)
+ if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
+ raend = index;
+ if (raend < end_index)
max_ahead = filp->f_ramax;
filp->f_rawin = 0;
- filp->f_ralen = PAGE_CACHE_SIZE;
+ filp->f_ralen = 1;
if (!max_ahead) {
- filp->f_raend = ppos + filp->f_ralen;
+ filp->f_raend = index + filp->f_ralen;
filp->f_rawin += filp->f_ralen;
}
}
@@ -869,17 +879,17 @@ static void generic_file_readahead(int reada_ok,
* it is the moment to try to read ahead asynchronously.
* We will later force unplug device in order to force asynchronous read IO.
*/
- else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
- ppos <= raend && ppos + filp->f_ralen >= raend) {
+ else if (reada_ok && filp->f_ramax && raend >= 1 &&
+ index <= raend && index + filp->f_ralen >= raend) {
/*
* Add ONE page to max_ahead in order to try to have about the same IO max size
* as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
* Compute the position of the last page we have tried to read in order to
* begin to read ahead just at the next page.
*/
- raend -= PAGE_CACHE_SIZE;
- if (raend < inode->i_size)
- max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
+ raend -= 1;
+ if (raend < end_index)
+ max_ahead = filp->f_ramax + 1;
if (max_ahead) {
filp->f_rawin = filp->f_ralen;
@@ -894,10 +904,11 @@ static void generic_file_readahead(int reada_ok,
*/
ahead = 0;
while (ahead < max_ahead) {
- ahead += PAGE_CACHE_SIZE;
- if ((raend + ahead) >= inode->i_size)
+ ahead ++;
+ if ((raend + ahead) >= end_index)
+ break;
+ if (page_cache_read(filp, raend + ahead) < 0)
break;
- page_cache_read(filp, raend + ahead);
}
/*
* If we tried to read ahead some pages,
@@ -917,7 +928,7 @@ static void generic_file_readahead(int reada_ok,
filp->f_ralen += ahead;
filp->f_rawin += filp->f_ralen;
- filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
+ filp->f_raend = raend + ahead + 1;
filp->f_ramax += filp->f_ramax;
@@ -945,15 +956,16 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
- size_t pos, pgpos;
+ unsigned long index, offset;
struct page *cached_page;
int reada_ok;
int error;
int max_readahead = get_max_readahead(inode);
cached_page = NULL;
- pos = *ppos;
- pgpos = pos & PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+
/*
* If the current position is outside the previous read-ahead window,
* we reset the current read-ahead context and set read ahead max to zero
@@ -961,7 +973,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
* otherwise, we assume that the file accesses are sequential enough to
* continue read-ahead.
*/
- if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
+ if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
reada_ok = 0;
filp->f_raend = 0;
filp->f_ralen = 0;
@@ -977,12 +989,12 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
- if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
+ if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
filp->f_ramax = 0;
} else {
unsigned long needed;
- needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
+ needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
if (filp->f_ramax < needed)
filp->f_ramax = needed;
@@ -995,17 +1007,27 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
for (;;) {
struct page *page, **hash;
+ unsigned long end_index, nr;
- if (pos >= inode->i_size)
+ end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ if (index > end_index)
break;
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = inode->i_size & ~PAGE_CACHE_MASK;
+ if (nr <= offset)
+ break;
+ }
+
+ nr = nr - offset;
/*
* Try to find the data in the page cache..
*/
- hash = page_hash(&inode->i_data, pos & PAGE_CACHE_MASK);
+ hash = page_hash(&inode->i_data, index);
spin_lock(&pagecache_lock);
- page = __find_page_nolock(&inode->i_data, pos & PAGE_CACHE_MASK, *hash);
+ page = __find_page_nolock(&inode->i_data, index, *hash);
if (!page)
goto no_cached_page;
found_page:
@@ -1015,19 +1037,10 @@ found_page:
if (!Page_Uptodate(page))
goto page_not_up_to_date;
page_ok:
- /*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
- */
- {
- unsigned long offset, nr;
-
- offset = pos & ~PAGE_CACHE_MASK;
- nr = PAGE_CACHE_SIZE - offset;
- if (nr > inode->i_size - pos)
- nr = inode->i_size - pos;
-
/*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ *
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
@@ -1035,19 +1048,20 @@ page_ok:
* pointers and the remaining count).
*/
nr = actor(desc, page, offset, nr);
- pos += nr;
+ offset += nr;
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+
page_cache_release(page);
if (nr && desc->count)
continue;
break;
- }
/*
* Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
*/
page_not_up_to_date:
- generic_file_readahead(reada_ok, filp, inode,
- pos & PAGE_CACHE_MASK, page);
+ generic_file_readahead(reada_ok, filp, inode, page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1068,8 +1082,7 @@ readpage:
goto page_ok;
/* Again, try some read-ahead while waiting for the page to finish.. */
- generic_file_readahead(reada_ok, filp, inode,
- pos & PAGE_CACHE_MASK, page);
+ generic_file_readahead(reada_ok, filp, inode, page);
wait_on_page(page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1101,7 +1114,7 @@ no_cached_page:
* dropped the page cache lock. Check for that.
*/
spin_lock(&pagecache_lock);
- page = __find_page_nolock(&inode->i_data, pos & PAGE_CACHE_MASK, *hash);
+ page = __find_page_nolock(&inode->i_data, index, *hash);
if (page)
goto found_page;
}
@@ -1110,14 +1123,14 @@ no_cached_page:
* Ok, add the new page to the hash-queues...
*/
page = cached_page;
- __add_to_page_cache(page, &inode->i_data, pos & PAGE_CACHE_MASK, hash);
+ __add_to_page_cache(page, &inode->i_data, index, hash);
spin_unlock(&pagecache_lock);
cached_page = NULL;
goto readpage;
}
- *ppos = pos;
+ *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
filp->f_reada = 1;
if (cached_page)
page_cache_free(cached_page);
@@ -1131,12 +1144,10 @@ static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned
if (size > count)
size = count;
- /*
- * FIXME: We cannot yet sleep with kmaps held.
- */
- kaddr = kmap(page, KM_READ);
- left = __copy_to_user(desc->buf, (void *)(kaddr+offset), size);
- kunmap(kaddr, KM_READ);
+
+ kaddr = kmap(page);
+ left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
+ kunmap(page);
if (left) {
size -= left;
@@ -1159,6 +1170,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
retval = 0;
+
if (count) {
read_descriptor_t desc;
@@ -1188,9 +1200,11 @@ static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned
size = count;
old_fs = get_fs();
set_fs(KERNEL_DS);
- kaddr = kmap(page, KM_READ);
- written = file->f_op->write(file, (char *)kaddr + offset, size, &file->f_pos);
- kunmap(kaddr, KM_READ);
+
+ kaddr = kmap(page);
+ written = file->f_op->write(file, (char *)kaddr + offset,
+ size, &file->f_pos);
+ kunmap(page);
set_fs(old_fs);
if (written < 0) {
desc->error = written;
@@ -1286,19 +1300,18 @@ out:
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
- *
- * XXX - at some point, this should return unique values to indicate to
- * the caller whether this is EIO, OOM, or SIGBUS.
*/
static struct page * filemap_nopage(struct vm_area_struct * area,
unsigned long address, int no_share)
{
+ int error;
struct file *file = area->vm_file;
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
struct page *page, **hash, *old_page;
+ unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- unsigned long offset = address - area->vm_start + area->vm_offset;
+ unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
/*
* Semantics for shared and private memory areas are different
@@ -1306,16 +1319,16 @@ static struct page * filemap_nopage(struct vm_area_struct * area,
* of the file is an error and results in a SIGBUS, while a
* private mapping just maps in a zero page.
*/
- if ((offset >= inode->i_size) &&
+ if ((pgoff >= size) &&
(area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
return NULL;
/*
* Do we have something in the page cache already?
*/
- hash = page_hash(&inode->i_data, offset);
+ hash = page_hash(&inode->i_data, pgoff);
retry_find:
- page = __find_get_page(&inode->i_data, offset, hash);
+ page = __find_get_page(&inode->i_data, pgoff, hash);
if (!page)
goto no_cached_page;
@@ -1336,11 +1349,10 @@ success:
struct page *new_page = page_cache_alloc();
if (new_page) {
- if (PageHighMem(new_page) || PageHighMem(old_page))
- BUG();
copy_highpage(new_page, old_page);
flush_page_to_ram(new_page);
- }
+ } else
+ new_page = NOPAGE_OOM;
page_cache_release(page);
return new_page;
}
@@ -1356,17 +1368,27 @@ no_cached_page:
* Otherwise, we're off the end of a privately mapped file,
* so we need to map a zero page.
*/
- if (offset < inode->i_size)
- read_cluster_nonblocking(file, offset);
+ if (pgoff < size)
+ error = read_cluster_nonblocking(file, pgoff);
else
- page_cache_read(file, offset);
+ error = page_cache_read(file, pgoff);
/*
* The page we want has now been added to the page cache.
* In the unlikely event that someone removed it in the
* meantime, we'll just come back here and read it again.
*/
- goto retry_find;
+ if (error >= 0)
+ goto retry_find;
+
+ /*
+ * An error return from page_cache_read can result if the
+ * system is low on memory, or a problem occurs while trying
+ * to schedule I/O.
+ */
+ if (error == -ENOMEM)
+ return NOPAGE_OOM;
+ return NULL;
page_not_uptodate:
lock_page(page);
@@ -1418,7 +1440,7 @@ static inline int do_write_page(struct inode * inode, struct file * file,
unsigned long size;
int (*writepage) (struct file *, struct page *);
- size = offset + PAGE_SIZE;
+ size = (offset << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE;
/* refuse to extend file size.. */
if (S_ISREG(inode->i_mode)) {
if (size > inode->i_size)
@@ -1427,7 +1449,6 @@ static inline int do_write_page(struct inode * inode, struct file * file,
if (size < offset)
return -EIO;
}
- size -= offset;
retval = -EIO;
writepage = inode->i_op->writepage;
lock_page(page);
@@ -1469,7 +1490,7 @@ static int filemap_write_page(struct file *file,
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
- int retval = filemap_write_page(file, page->offset, page, 0);
+ int retval = filemap_write_page(file, page->index, page, 0);
wakeup_bdflush(0);
return retval;
}
@@ -1477,6 +1498,7 @@ int filemap_swapout(struct page * page, struct file * file)
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
+ unsigned long pgoff;
pte_t pte = *ptep;
struct page *page;
int error;
@@ -1499,7 +1521,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
pte_clear(ptep);
flush_tlb_page(vma, address);
if (!pte_present(pte)) {
- swap_free(pte);
+ swap_free(pte_to_swp_entry(pte));
return 0;
}
page = pte_page(pte);
@@ -1508,9 +1530,13 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
return 0;
}
}
- if (PageHighMem(page))
- BUG();
- error = filemap_write_page(vma->vm_file, address - vma->vm_start + vma->vm_offset, page, 1);
+ pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (page->index != pgoff) {
+ printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
+ pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
+ }
+ error = filemap_write_page(vma->vm_file, pgoff, page, 1);
page_cache_free(page);
return error;
}
@@ -1764,13 +1790,16 @@ generic_file_write(struct file *file, const char *buf,
{
struct dentry *dentry = file->f_dentry;
struct inode *inode = dentry->d_inode;
- unsigned long pos = *ppos;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ loff_t pos = *ppos;
struct page *page, **hash, *cached_page;
unsigned long written;
long status;
int err;
+ if (pos < 0)
+ return -EINVAL;
+
cached_page = NULL;
down(&inode->i_sem);
@@ -1789,36 +1818,35 @@ generic_file_write(struct file *file, const char *buf,
* Check whether we've reached the file size limit.
*/
err = -EFBIG;
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- goto out;
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ goto out;
+ }
+ if (count > limit - pos) {
+ send_sig(SIGXFSZ, current, 0);
+ count = limit - pos;
+ }
}
status = 0;
- /*
- * Check whether to truncate the write,
- * and send the signal if we do.
- */
- if (count > limit - pos) {
- send_sig(SIGXFSZ, current, 0);
- count = limit - pos;
- }
while (count) {
- unsigned long bytes, pgpos, offset;
+ unsigned long bytes, index, offset;
+
/*
* Try to find the page in the cache. If it isn't there,
* allocate a free page.
*/
- offset = (pos & ~PAGE_CACHE_MASK);
- pgpos = pos & PAGE_CACHE_MASK;
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
- hash = page_hash(&inode->i_data, pgpos);
+ hash = page_hash(&inode->i_data, index);
repeat_find:
- page = __find_lock_page(&inode->i_data, pgpos, hash);
+ page = __find_lock_page(&inode->i_data, index, hash);
if (!page) {
if (!cached_page) {
cached_page = page_cache_alloc();
@@ -1828,7 +1856,7 @@ repeat_find:
break;
}
page = cached_page;
- if (add_to_page_cache_unique(page,&inode->i_data,pgpos,hash))
+ if (add_to_page_cache_unique(page, &inode->i_data, index, hash))
goto repeat_find;
cached_page = NULL;
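
With this patch filemap.c keys the page cache by page index (page->index)
rather than by byte offset, so do_generic_file_read() and
generic_file_write() above split a file position into an index and an
in-page offset. A small standalone illustration of that split, assuming
the usual 4096-byte PAGE_CACHE_SIZE (not kernel code):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
        unsigned long long pos = 9000;                   /* arbitrary file position */
        unsigned long index = pos >> PAGE_CACHE_SHIFT;   /* which page: 2 */
        unsigned long offset = pos & ~PAGE_CACHE_MASK;   /* where inside it: 808 */

        printf("pos %llu -> index %lu, offset %lu\n", pos, index, offset);
        return 0;
}
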
diff --git a/mm/highmem.c b/mm/highmem.c
index 7665393cf..248688c23 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -4,19 +4,25 @@
* (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
* Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
*
+ *
* Redesigned the x86 32-bit VM architecture to deal with
* 64-bit physical space. With current x86 CPUs this
* means up to 64 Gigabytes physical RAM.
*
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*/
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
unsigned long highmem_mapnr;
-unsigned long nr_free_highpages = 0;
struct page * prepare_highmem_swapout(struct page * page)
{
@@ -34,9 +40,9 @@ struct page * prepare_highmem_swapout(struct page * page)
if (!regular_page)
return NULL;
- vaddr = kmap(page, KM_READ);
+ vaddr = kmap(page);
copy_page((void *)regular_page, (void *)vaddr);
- kunmap(vaddr, KM_READ);
+ kunmap(page);
/*
* ok, we can just forget about our highmem page since
@@ -52,10 +58,10 @@ struct page * replace_with_highmem(struct page * page)
struct page *highpage;
unsigned long vaddr;
- if (PageHighMem(page) || !nr_free_highpages)
+ if (PageHighMem(page) || !nr_free_highpages())
return page;
- highpage = get_free_highpage(GFP_ATOMIC|__GFP_HIGHMEM);
+ highpage = alloc_page(GFP_ATOMIC|__GFP_HIGHMEM);
if (!highpage)
return page;
if (!PageHighMem(highpage)) {
@@ -63,13 +69,13 @@ struct page * replace_with_highmem(struct page * page)
return page;
}
- vaddr = kmap(highpage, KM_WRITE);
+ vaddr = kmap(page);
copy_page((void *)vaddr, (void *)page_address(page));
- kunmap(vaddr, KM_WRITE);
+ kunmap(page);
/* Preserve the caching of the swap_entry. */
- highpage->offset = page->offset;
- highpage->inode = page->inode;
+ highpage->index = page->index;
+ highpage->mapping = page->mapping;
/*
* We can just forget the old page since
@@ -79,3 +85,275 @@ struct page * replace_with_highmem(struct page * page)
return highpage;
}
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 2048
+#else
+#define LAST_PKMAP 4096
+#endif
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+/*
+ * Virtual_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr = 0;
+static spinlock_t kmap_lock;
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+ pte_t pte;
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+ pte = pkmap_page_table[i];
+ if (pte_none(pte))
+ continue;
+ pte_clear(pkmap_page_table+i);
+ page = pte_page(pte);
+ page->virtual = 0;
+ }
+ flush_tlb_all();
+}
+
+static unsigned long map_new_virtual(struct page *page)
+{
+ unsigned long vaddr;
+ int count = LAST_PKMAP;
+
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ if (!last_pkmap_nr)
+ flush_all_zero_pkmaps();
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ /*
+ * Sleep for somebody else to unmap their entries
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue(&pkmap_map_wait, &wait);
+ spin_unlock(&kmap_lock);
+ // it's not quite possible to saturate the
+ // pkmap pool right now.
+ BUG();
+ schedule();
+ remove_wait_queue(&pkmap_map_wait, &wait);
+ spin_lock(&kmap_lock);
+ }
+
+ /* Somebody else might have mapped it while we slept */
+ if (page->virtual)
+ return page->virtual;
+
+ /* Re-start */
+ count = LAST_PKMAP;
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ pkmap_page_table[last_pkmap_nr] = mk_pte(page, kmap_prot);
+
+ /*
+ * Subtle! For some reason if we don't do this TLB flush then
+ * we get data corruption and weird behavior in dbench runs.
+ * But with invlpg this should not be necessary ... Any ideas?
+ */
+ __flush_tlb_one(vaddr);
+ pkmap_count[last_pkmap_nr] = 1;
+ page->virtual = vaddr;
+
+ return vaddr;
+}
+
+unsigned long kmap_high(struct page *page)
+{
+ unsigned long vaddr;
+
+ if (!PageHighMem(page))
+ BUG();
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ *
+ * We cannot call this from interrupts, as it may block
+ */
+ spin_lock(&kmap_lock);
+ vaddr = page->virtual;
+ if (!vaddr)
+ vaddr = map_new_virtual(page);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ if (pkmap_count[PKMAP_NR(vaddr)] < 2)
+ BUG();
+ spin_unlock(&kmap_lock);
+ return vaddr;
+}
+
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+
+ spin_lock(&kmap_lock);
+ vaddr = page->virtual;
+ if (!vaddr)
+ BUG();
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ wake_up(&pkmap_map_wait);
+ }
+ spin_unlock(&kmap_lock);
+}
+
+/*
+ * Simple bounce buffer support for highmem pages.
+ * This will be moved to the block layer in 2.5.
+ */
+
+extern kmem_cache_t *bh_cachep;
+
+static inline void copy_from_high_bh (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_from;
+ unsigned long vfrom;
+
+ p_from = from->b_page;
+ vfrom = kmap_atomic(p_from, KM_BOUNCE_WRITE);
+ memcpy(to->b_data, (char *)vfrom + bh_offset(from), to->b_size);
+ kunmap_atomic(vfrom, KM_BOUNCE_WRITE);
+}
+
+static inline void copy_to_high_bh_irq (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_to;
+ unsigned long vto;
+
+ p_to = to->b_page;
+ vto = kmap_atomic(p_to, KM_BOUNCE_WRITE);
+ memcpy((char *)vto + bh_offset(to), from->b_data, to->b_size);
+ kunmap_atomic(vto, KM_BOUNCE_WRITE);
+}
+
+static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id);
+
+ bh_orig->b_end_io(bh_orig, uptodate);
+ __free_page(bh->b_page);
+ kmem_cache_free(bh_cachep, bh);
+}
+
+static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
+{
+ bounce_end_io(bh, uptodate);
+}
+
+static void bounce_end_io_read (struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id);
+
+ if (uptodate)
+ copy_to_high_bh_irq(bh_orig, bh);
+ bounce_end_io(bh, uptodate);
+}
+
+struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
+{
+ struct page *page;
+ struct buffer_head *bh;
+
+ if (!PageHighMem(bh_orig->b_page))
+ return bh_orig;
+
+repeat_bh:
+ bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
+ if (!bh) {
+ wakeup_bdflush(1);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ goto repeat_bh;
+ }
+ /*
+ * This is wasteful for 1k buffers, but this is a stopgap measure
+ * and we are being ineffective anyway. This approach simplifies
+ * things immensely. On boxes with more than 4GB RAM this should
+ * not be an issue anyway.
+ */
+repeat_page:
+ page = alloc_page(GFP_BUFFER);
+ if (!page) {
+ wakeup_bdflush(1);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ goto repeat_page;
+ }
+ set_bh_page(bh, page, 0);
+
+ bh->b_next = NULL;
+ bh->b_blocknr = bh_orig->b_blocknr;
+ bh->b_size = bh_orig->b_size;
+ bh->b_list = -1;
+ bh->b_dev = bh_orig->b_dev;
+ bh->b_count = bh_orig->b_count;
+ bh->b_rdev = bh_orig->b_rdev;
+ bh->b_state = bh_orig->b_state;
+ bh->b_flushtime = 0;
+ bh->b_next_free = NULL;
+ bh->b_prev_free = NULL;
+ /* bh->b_this_page */
+ bh->b_reqnext = NULL;
+ bh->b_pprev = NULL;
+ /* bh->b_page */
+ if (rw == WRITE) {
+ bh->b_end_io = bounce_end_io_write;
+ copy_from_high_bh(bh, bh_orig);
+ } else
+ bh->b_end_io = bounce_end_io_read;
+ bh->b_dev_id = (void *)bh_orig;
+ bh->b_rsector = -1;
+ memset(&bh->b_wait, -1, sizeof(bh->b_wait));
+ bh->b_kiobuf = NULL;
+
+ return bh;
+}
+
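
The pkmap_count convention documented above (0 = slot unused and flushed,
1 = unused but stale until a TLB flush, n = n-1 active users) is what lets
kunmap_high() defer the flush to flush_all_zero_pkmaps(). A toy model of
that counting, mirroring the BUG() checks in kmap_high()/kunmap_high()
(illustrative only, not kernel code):

#include <assert.h>

static int pkmap_count_slot;            /* one hypothetical pkmap slot */

static void toy_kmap(void)
{
        pkmap_count_slot++;
        assert(pkmap_count_slot >= 2);  /* mirrors the check in kmap_high() */
}

static void toy_kunmap(void)
{
        pkmap_count_slot--;
        assert(pkmap_count_slot >= 1);  /* a count must never hit 0 without a flush */
}

int main(void)
{
        pkmap_count_slot = 1;   /* map_new_virtual() leaves a fresh mapping at 1 */
        toy_kmap();             /* first user: 2 */
        toy_kmap();             /* second user: 3 */
        toy_kunmap();           /* back to 2 */
        toy_kunmap();           /* 1: unused, but stale until flush_all_zero_pkmaps() */
        return 0;
}
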
diff --git a/mm/memory.c b/mm/memory.c
index 87611db8c..a4eb69717 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -227,7 +227,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
if (pte_none(pte))
goto cont_copy_pte_range;
if (!pte_present(pte)) {
- swap_duplicate(pte);
+ swap_duplicate(pte_to_swp_entry(pte));
set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
@@ -282,7 +282,7 @@ static inline int free_pte(pte_t page)
free_page_and_swap_cache(mem_map+nr);
return 1;
}
- swap_free(page);
+ swap_free(pte_to_swp_entry(page));
return 0;
}
@@ -743,7 +743,7 @@ struct page * put_dirty_page(struct task_struct * tsk, struct page *page,
return 0;
}
flush_page_to_ram(page);
- set_pte(pte, pte_mkwrite(page_pte_prot(page, PAGE_COPY)));
+ set_pte(pte, pte_mkwrite(mk_pte(page, PAGE_COPY)));
/* no need for flush_tlb */
return page;
}
@@ -808,7 +808,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
* Ok, we need to copy. Oh, well..
*/
spin_unlock(&tsk->mm->page_table_lock);
- new_page = get_free_highpage(GFP_HIGHUSER);
+ new_page = alloc_page(GFP_HIGHUSER);
if (!new_page)
return -1;
spin_lock(&tsk->mm->page_table_lock);
@@ -887,12 +887,19 @@ static void partial_clear(struct vm_area_struct *vma, unsigned long address)
*/
void vmtruncate(struct inode * inode, unsigned long offset)
{
+ unsigned long partial, pgoff;
struct vm_area_struct * mpnt;
truncate_inode_pages(inode, offset);
spin_lock(&inode->i_shared_lock);
if (!inode->i_mmap)
goto out_unlock;
+
+ partial = offset & (PAGE_CACHE_SIZE - 1);
+ pgoff = offset >> PAGE_CACHE_SHIFT;
+ if (partial)
+ pgoff ++;
+
mpnt = inode->i_mmap;
do {
struct mm_struct *mm = mpnt->vm_mm;
@@ -902,19 +909,22 @@ void vmtruncate(struct inode * inode, unsigned long offset)
unsigned long diff;
/* mapping wholly truncated? */
- if (mpnt->vm_offset >= offset) {
+ if (mpnt->vm_pgoff >= pgoff) {
flush_cache_range(mm, start, end);
zap_page_range(mm, start, len);
flush_tlb_range(mm, start, end);
continue;
}
+
/* mapping wholly unaffected? */
- diff = offset - mpnt->vm_offset;
+ len = len >> PAGE_SHIFT;
+ diff = pgoff - mpnt->vm_pgoff;
if (diff >= len)
continue;
+
/* Ok, partially affected.. */
- start += diff;
- len = (len - diff) & PAGE_MASK;
+ start += diff << PAGE_SHIFT;
+ len = (len - diff) << PAGE_SHIFT;
if (start & ~PAGE_MASK) {
partial_clear(mpnt, start);
start = (start + ~PAGE_MASK) & PAGE_MASK;
@@ -935,7 +945,7 @@ out_unlock:
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*/
-void swapin_readahead(pte_t entry)
+void swapin_readahead(swp_entry_t entry)
{
int i;
struct page *new_page;
@@ -969,7 +979,7 @@ void swapin_readahead(pte_t entry)
static int do_swap_page(struct task_struct * tsk,
struct vm_area_struct * vma, unsigned long address,
- pte_t * page_table, pte_t entry, int write_access)
+ pte_t * page_table, swp_entry_t entry, int write_access)
{
struct page *page = lookup_swap_cache(entry);
pte_t pte;
@@ -1015,7 +1025,7 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
struct page *page = NULL;
pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
if (write_access) {
- page = get_free_highpage(GFP_HIGHUSER);
+ page = alloc_page(GFP_HIGHUSER);
if (!page)
return -1;
if (PageHighMem(page))
@@ -1041,8 +1051,7 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore and the kernel lock held.
- * We need to release the kernel lock as soon as possible..
+ * This is called with the MM semaphore held.
*/
static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
@@ -1059,10 +1068,10 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
* essentially an early COW detection.
*/
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
- if (!new_page)
- return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
- if (new_page == (struct page *)-1)
- return -1; /* OOM */
+ if (new_page == NULL) /* no page was available -- SIGBUS */
+ return 0;
+ if (new_page == NOPAGE_OOM)
+ return -1;
++tsk->maj_flt;
++vma->vm_mm->rss;
/*
@@ -1116,7 +1125,7 @@ static inline int handle_pte_fault(struct task_struct *tsk,
if (!pte_present(entry)) {
if (pte_none(entry))
return do_no_page(tsk, vma, address, write_access, pte);
- return do_swap_page(tsk, vma, address, pte, entry, write_access);
+ return do_swap_page(tsk, vma, address, pte, pte_to_swp_entry(entry), write_access);
}
/*
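
memory.c now hands swp_entry_t values to the swap code instead of raw
ptes (swap_free(pte_to_swp_entry(pte)), do_swap_page() taking a
swp_entry_t). A rough sketch of the idea behind that conversion; the
names and layout here are illustrative, not the kernel's actual
definitions:

#include <stdio.h>

/* A distinct type for "this pte really encodes a swap location". */
typedef struct { unsigned long val; } swp_entry_t;

struct toy_pte { unsigned long bits; };         /* stand-in for pte_t */

static swp_entry_t toy_pte_to_swp_entry(struct toy_pte pte)
{
        swp_entry_t entry;

        entry.val = pte.bits;   /* how the swap type/offset are packed is arch-specific */
        return entry;
}

int main(void)
{
        struct toy_pte pte = { 0x1234 };        /* some non-present pte bits */
        swp_entry_t entry = toy_pte_to_swp_entry(pte);

        printf("swap entry value: %#lx\n", entry.val);
        return 0;
}
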
diff --git a/mm/mlock.c b/mm/mlock.c
index 9709d1a04..59d11b922 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -7,6 +7,7 @@
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -35,7 +36,7 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma,
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
vmlist_modify_lock(vma->vm_mm);
- vma->vm_offset += end - vma->vm_start;
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
vmlist_modify_unlock(vma->vm_mm);
@@ -52,7 +53,7 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma,
return -EAGAIN;
*n = *vma;
n->vm_start = start;
- n->vm_offset += n->vm_start - vma->vm_start;
+ n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
n->vm_flags = newflags;
if (n->vm_file)
get_file(n->vm_file);
@@ -82,7 +83,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
*right = *vma;
left->vm_end = start;
right->vm_start = end;
- right->vm_offset += right->vm_start - left->vm_start;
+ right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
vma->vm_flags = newflags;
if (vma->vm_file)
atomic_add(2, &vma->vm_file->f_count);
@@ -92,7 +93,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(right);
}
vmlist_modify_lock(vma->vm_mm);
- vma->vm_offset += start - vma->vm_start;
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
vma->vm_flags = newflags;
diff --git a/mm/mmap.c b/mm/mmap.c
index db47ad266..99e86653d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -64,7 +64,7 @@ int vm_enough_memory(long pages)
free = atomic_read(&buffermem_pages);
free += atomic_read(&page_cache_size);
- free += nr_free_pages;
+ free += nr_free_pages();
free += nr_swap_pages;
return free > pages;
}
@@ -183,6 +183,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
if (off + len < off)
return -EINVAL;
+ off = off >> PAGE_SHIFT;
+
/* Too many mappings? */
if (mm->map_count > MAX_MAP_COUNT)
return -ENOMEM;
@@ -272,7 +274,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
- vma->vm_offset = off;
+ vma->vm_pgoff = off;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
@@ -533,7 +535,7 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
area->vm_end = addr;
vmlist_modify_lock(current->mm);
} else if (addr == area->vm_start) {
- area->vm_offset += (end - area->vm_start);
+ area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
area->vm_start = end;
vmlist_modify_lock(current->mm);
} else {
@@ -548,7 +550,8 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
mpnt->vm_page_prot = area->vm_page_prot;
mpnt->vm_flags = area->vm_flags;
mpnt->vm_ops = area->vm_ops;
- mpnt->vm_offset = area->vm_offset + (end - area->vm_start);
+ mpnt->vm_pgoff = area->vm_pgoff;
+ area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
mpnt->vm_file = area->vm_file;
mpnt->vm_private_data = area->vm_private_data;
if (mpnt->vm_file)
@@ -783,7 +786,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
- vma->vm_offset = 0;
+ vma->vm_pgoff = 0;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
@@ -943,8 +946,9 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
* the offsets must be contiguous..
*/
if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) {
- unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start;
- if (off != mpnt->vm_offset)
+ unsigned long off = prev->vm_pgoff;
+ off += (prev->vm_end - prev->vm_start) >> PAGE_SHIFT;
+ if (off != mpnt->vm_pgoff)
continue;
}
@@ -957,7 +961,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l
prev->vm_end = mpnt->vm_end;
prev->vm_next = mpnt->vm_next;
if (mpnt->vm_ops && mpnt->vm_ops->close) {
- mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start;
+ mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
mpnt->vm_start = mpnt->vm_end;
vmlist_modify_unlock(mm);
mpnt->vm_ops->close(mpnt);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 56454fc07..4752806de 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -72,11 +72,13 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n
flush_cache_range(current->mm, beg, end);
if (start >= end)
BUG();
+ spin_lock(&current->mm->page_table_lock);
do {
change_pmd_range(dir, start, end - start, newprot);
start = (start + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (start && (start < end));
+ spin_unlock(&current->mm->page_table_lock);
flush_tlb_range(current->mm, beg, end);
return;
}
@@ -109,7 +111,7 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma,
if (n->vm_ops && n->vm_ops->open)
n->vm_ops->open(n);
vmlist_modify_lock(vma->vm_mm);
- vma->vm_offset += end - vma->vm_start;
+ vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = end;
insert_vm_struct(current->mm, n);
vmlist_modify_unlock(vma->vm_mm);
@@ -127,7 +129,7 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma,
return -ENOMEM;
*n = *vma;
n->vm_start = start;
- n->vm_offset += n->vm_start - vma->vm_start;
+ n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
n->vm_flags = newflags;
n->vm_page_prot = prot;
if (n->vm_file)
@@ -159,7 +161,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
*right = *vma;
left->vm_end = start;
right->vm_start = end;
- right->vm_offset += right->vm_start - left->vm_start;
+ right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
if (vma->vm_file)
atomic_add(2,&vma->vm_file->f_count);
if (vma->vm_ops && vma->vm_ops->open) {
@@ -167,7 +169,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma,
vma->vm_ops->open(right);
}
vmlist_modify_lock(vma->vm_mm);
- vma->vm_offset += start - vma->vm_start;
+ vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = end;
vma->vm_flags = newflags;
diff --git a/mm/mremap.c b/mm/mremap.c
index b73996dc2..012ab7912 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -93,7 +93,6 @@ static int move_page_tables(struct mm_struct * mm,
unsigned long offset = len;
flush_cache_range(mm, old_addr, old_addr + len);
- flush_tlb_range(mm, old_addr, old_addr + len);
/*
* This is not the clever way to do this, but we're taking the
@@ -105,6 +104,7 @@ static int move_page_tables(struct mm_struct * mm,
if (move_one_page(mm, old_addr + offset, new_addr + offset))
goto oops_we_failed;
}
+ flush_tlb_range(mm, old_addr, old_addr + len);
return 0;
/*
@@ -136,7 +136,8 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
*new_vma = *vma;
new_vma->vm_start = new_addr;
new_vma->vm_end = new_addr+new_len;
- new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start);
+ new_vma->vm_pgoff = vma->vm_pgoff;
+ new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
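
The mlock.c, mmap.c, mprotect.c and mremap.c hunks above all make the
same conversion: vm_offset (a byte offset into the file) becomes
vm_pgoff (an offset in pages), so every fixup that moves vm_start now
adjusts the offset by a page count. A toy illustration of that
adjustment, assuming 4096-byte pages (names are illustrative, not
kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12

struct toy_vma {
        unsigned long vm_start, vm_end;         /* byte addresses */
        unsigned long vm_pgoff;                 /* file offset in pages */
};

/* Trim [vm_start, split) off the front, as mlock_fixup_start() does. */
static void toy_trim_front(struct toy_vma *vma, unsigned long split)
{
        vma->vm_pgoff += (split - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = split;
}

int main(void)
{
        struct toy_vma vma = { 0x40000000, 0x40010000, 5 };     /* maps file pages 5..20 */

        toy_trim_front(&vma, 0x40004000);       /* drop the first 4 pages */
        printf("vm_pgoff is now %lu\n", vma.vm_pgoff);          /* prints 9 */
        return 0;
}
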
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27aa58468..95a2bc436 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4,6 +4,7 @@
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
*/
#include <linux/config.h>
@@ -22,7 +23,6 @@
#include <asm/pgtable.h>
int nr_swap_pages = 0;
-int nr_free_pages = 0;
int nr_lru_pages;
LIST_HEAD(lru_cache);
@@ -36,30 +36,46 @@ LIST_HEAD(lru_cache);
#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
for the ring buffers */
-#define NR_MEM_LISTS 12
+#define MAX_ORDER 12
#else
-#define NR_MEM_LISTS 10
+#define MAX_ORDER 10
#endif
-struct free_area_struct {
+typedef struct free_area_struct {
struct list_head free_list;
unsigned int * map;
- unsigned long count;
-};
+} free_area_t;
-#define MEM_TYPE_DMA 0
-#define MEM_TYPE_NORMAL 1
-#define MEM_TYPE_HIGH 2
-
-static const char *mem_type_strs[] = {"DMA", "Normal", "High"};
+#define ZONE_DMA 0
+#define ZONE_NORMAL 1
#ifdef CONFIG_HIGHMEM
-#define NR_MEM_TYPES 3
+# define ZONE_HIGHMEM 2
+# define NR_ZONES 3
#else
-#define NR_MEM_TYPES 2
+# define NR_ZONES 2
#endif
-static struct free_area_struct free_area[NR_MEM_TYPES][NR_MEM_LISTS];
+typedef struct zone_struct {
+ spinlock_t lock;
+ unsigned long offset;
+ unsigned long size;
+ free_area_t free_area[MAX_ORDER];
+
+ unsigned long free_pages;
+ unsigned long pages_low, pages_high;
+ int low_on_memory;
+ char * name;
+} zone_t;
+
+static zone_t zones[NR_ZONES] =
+ {
+ { name: "DMA" },
+ { name: "Normal" },
+#ifdef CONFIG_HIGHMEM
+ { name: "HighMem" }
+#endif
+ };
/*
* Free_page() adds the page to the free lists. This is optimized for
@@ -73,13 +89,6 @@ static struct free_area_struct free_area[NR_MEM_TYPES][NR_MEM_LISTS];
* for the normal case, giving better asm-code.
*/
-/*
- * Buddy system. Hairy. You really aren't expected to understand this
- *
- * Hint: -mask = 1+~mask
- */
-spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
-
#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
@@ -88,35 +97,54 @@ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)
-static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsigned long order)
+/*
+ * Temporary debugging check.
+ */
+#define BAD_RANGE(zone,x) ((((x)-mem_map) < zone->offset) || (((x)-mem_map) >= zone->offset+zone->size))
+
+/*
+ * Buddy system. Hairy. You really aren't expected to understand this
+ *
+ * Hint: -mask = 1+~mask
+ */
+
+static inline void free_pages_ok (struct page *page, unsigned long map_nr, unsigned long order)
{
struct free_area_struct *area;
- unsigned long index = map_nr >> (1 + order);
- unsigned long mask = (~0UL) << order;
+ unsigned long index, page_idx, mask, offset;
unsigned long flags;
struct page *buddy;
+ zone_t *zone;
+ int i;
- spin_lock_irqsave(&page_alloc_lock, flags);
-
-#define list(x) (mem_map+(x))
-
-#ifdef CONFIG_HIGHMEM
- if (map_nr >= highmem_mapnr) {
- area = free_area[MEM_TYPE_HIGH];
- nr_free_highpages -= mask;
- } else
-#endif
- if (PageDMA(page))
- area = free_area[MEM_TYPE_DMA];
- else
- area = free_area[MEM_TYPE_NORMAL];
+ /*
+ * Which zone does this page belong to?
+ *
+ * (NR_ZONES is low, and we do not (yet) want to introduce a
+ * page->zone field, as it increases the size of mem_map[]
+ * unnecessarily. This small loop is basically equivalent
+ * to the previous #ifdef jungle, speed-wise.)
+ */
+ i = NR_ZONES-1;
+ zone = zones + i;
+ for ( ; i >= 0; i--, zone--)
+ if (map_nr >= zone->offset)
+ break;
+ mask = (~0UL) << order;
+ offset = zone->offset;
+ area = zone->free_area;
area += order;
+ page_idx = map_nr - zone->offset;
+ page_idx &= mask;
+ index = page_idx >> (1 + order);
+ mask = (~0UL) << order;
- map_nr &= mask;
- nr_free_pages -= mask;
+ spin_lock_irqsave(&zone->lock, flags);
- while (mask + (1 << (NR_MEM_LISTS-1))) {
+ zone->free_pages -= mask;
+
+ while (mask + (1 << (MAX_ORDER-1))) {
if (!test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
@@ -125,21 +153,22 @@ static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsign
/*
* Move the buddy up one level.
*/
- buddy = list(map_nr ^ -mask);
- page = list(map_nr);
+ buddy = mem_map + offset + (page_idx ^ -mask);
+ page = mem_map + offset + page_idx;
+ if (BAD_RANGE(zone,buddy))
+ BUG();
+ if (BAD_RANGE(zone,page))
+ BUG();
- area->count--;
memlist_del(&buddy->list);
mask <<= 1;
area++;
index >>= 1;
- map_nr &= mask;
+ page_idx &= mask;
}
- area->count++;
- memlist_add_head(&(list(map_nr))->list, &area->free_list);
-#undef list
+ memlist_add_head(&mem_map[offset + page_idx].list, &area->free_list);
- spin_unlock_irqrestore(&page_alloc_lock, flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
/*
@@ -147,10 +176,9 @@ static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsign
*/
#define MARK_USED(index, order, area) \
change_bit((index) >> (1+(order)), (area)->map)
-#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
-int __free_page(struct page *page)
+int __free_page (struct page *page)
{
if (!PageReserved(page) && put_page_testzero(page)) {
if (PageSwapCache(page))
@@ -164,7 +192,7 @@ int __free_page(struct page *page)
return 0;
}
-int free_pages(unsigned long addr, unsigned long order)
+int free_pages (unsigned long addr, unsigned long order)
{
unsigned long map_nr = MAP_NR(addr);
@@ -182,16 +210,17 @@ int free_pages(unsigned long addr, unsigned long order)
return 0;
}
-static inline unsigned long EXPAND (struct page *map, unsigned long index,
+static inline unsigned long EXPAND (zone_t *zone, struct page *map, unsigned long index,
int low, int high, struct free_area_struct * area)
{
unsigned long size = 1 << high;
while (high > low) {
+ if (BAD_RANGE(zone,map))
+ BUG();
area--;
high--;
size >>= 1;
- area->count++;
memlist_add_head(&(map)->list, &(area)->free_list);
MARK_USED(index, high, area);
index += size;
@@ -201,79 +230,62 @@ static inline unsigned long EXPAND (struct page *map, unsigned long index,
return index;
}
-static inline struct page * rmqueue (int order, unsigned type)
+static inline struct page * rmqueue (zone_t *zone, unsigned long order)
{
- struct free_area_struct * area = free_area[type]+order;
+ struct free_area_struct * area = zone->free_area + order;
unsigned long curr_order = order, map_nr;
- struct page *page;
struct list_head *head, *curr;
+ unsigned long flags;
+ struct page *page;
+ spin_lock_irqsave(&zone->lock, flags);
do {
head = &area->free_list;
curr = memlist_next(head);
if (curr != head) {
+ unsigned int index;
+
page = memlist_entry(curr, struct page, list);
memlist_del(curr);
- area->count--;
- map_nr = page - mem_map;
- MARK_USED(map_nr, curr_order, area);
- nr_free_pages -= 1 << order;
- map_nr = EXPAND(page, map_nr, order, curr_order, area);
+ map_nr = page - mem_map;
+ index = map_nr - zone->offset;
+ MARK_USED(index, curr_order, area);
+ zone->free_pages -= 1 << order;
+ map_nr = zone->offset + EXPAND(zone, page, index, order, curr_order, area);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
page = mem_map + map_nr;
+ if (BAD_RANGE(zone,page))
+ BUG();
return page;
}
curr_order++;
area++;
- } while (curr_order < NR_MEM_LISTS);
+ } while (curr_order < MAX_ORDER);
+ spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
-static inline int balance_lowmemory (int gfp_mask)
+static inline int balance_memory (zone_t *zone, int gfp_mask)
{
int freed;
- static int low_on_memory = 0;
-#ifndef CONFIG_HIGHMEM
- if (nr_free_pages > freepages.min) {
- if (!low_on_memory)
+ if (zone->free_pages > zone->pages_low) {
+ if (!zone->low_on_memory)
return 1;
- if (nr_free_pages >= freepages.high) {
- low_on_memory = 0;
+ /*
+ * Simple hysteresis: exit 'low memory mode' if
+ * the upper limit has been reached:
+ */
+ if (zone->free_pages >= zone->pages_high) {
+ zone->low_on_memory = 0;
return 1;
}
}
+ zone->low_on_memory = 1;
- low_on_memory = 1;
-#else
- static int low_on_highmemory = 0;
-
- if (gfp_mask & __GFP_HIGHMEM)
- {
- if (nr_free_pages > freepages.min) {
- if (!low_on_highmemory) {
- return 1;
- }
- if (nr_free_pages >= freepages.high) {
- low_on_highmemory = 0;
- return 1;
- }
- }
- low_on_highmemory = 1;
- } else {
- if (nr_free_pages+nr_free_highpages > freepages.min) {
- if (!low_on_memory) {
- return 1;
- }
- if (nr_free_pages+nr_free_highpages >= freepages.high) {
- low_on_memory = 0;
- return 1;
- }
- }
- low_on_memory = 1;
- }
-#endif
current->flags |= PF_MEMALLOC;
freed = try_to_free_pages(gfp_mask);
current->flags &= ~PF_MEMALLOC;
@@ -283,13 +295,12 @@ static inline int balance_lowmemory (int gfp_mask)
return 1;
}
-struct page * __get_pages(int gfp_mask, unsigned long order)
+static inline struct page * __get_pages (zone_t *zone, unsigned int gfp_mask,
+ unsigned long order)
{
- unsigned long flags;
struct page *page;
- unsigned type;
- if (order >= NR_MEM_LISTS)
+ if (order >= MAX_ORDER)
goto nopage;
/*
@@ -303,28 +314,20 @@ struct page * __get_pages(int gfp_mask, unsigned long order)
* further thought.
*/
if (!(current->flags & PF_MEMALLOC))
- goto lowmemory;
-
-ok_to_allocate:
-#ifdef CONFIG_HIGHMEM
- if (gfp_mask & __GFP_HIGHMEM)
- type = MEM_TYPE_HIGH;
- else
-#endif
- if (gfp_mask & __GFP_DMA)
- type = MEM_TYPE_DMA;
- else
- type = MEM_TYPE_NORMAL;
-
- spin_lock_irqsave(&page_alloc_lock, flags);
+ if (!balance_memory(zone, gfp_mask))
+ goto nopage;
+ /*
+ * We are falling back to lower-level zones if allocation
+ * in a higher zone fails. This assumes a hierarchical
+ * dependency between zones, which is true currently. If
+ * you need something else then move this loop outside
+ * this function, into the zone-specific allocator.
+ */
do {
- page = rmqueue(order, type);
- if (page) {
- spin_unlock_irqrestore(&page_alloc_lock, flags);
+ page = rmqueue(zone, order);
+ if (page)
return page;
- }
- } while (type-- > 0) ;
- spin_unlock_irqrestore(&page_alloc_lock, flags);
+ } while (zone-- != zones) ;
/*
* If we can schedule, do so, and make sure to yield.
@@ -338,60 +341,114 @@ ok_to_allocate:
nopage:
return NULL;
+}
-lowmemory:
- if (balance_lowmemory(gfp_mask))
- goto ok_to_allocate;
- goto nopage;
+static inline zone_t * gfp_mask_to_zone (int gfp_mask)
+{
+ zone_t *zone;
+
+#if CONFIG_HIGHMEM
+ if (gfp_mask & __GFP_HIGHMEM)
+ zone = zones + ZONE_HIGHMEM;
+ else
+#endif
+ if (gfp_mask & __GFP_DMA)
+ zone = zones + ZONE_DMA;
+ else
+ zone = zones + ZONE_NORMAL;
+ return zone;
}
-unsigned long __get_free_pages(int gfp_mask, unsigned long order)
+unsigned long __get_free_pages (int gfp_mask, unsigned long order)
{
struct page *page;
- page = __get_pages(gfp_mask, order);
+
+ page = __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order);
if (!page)
return 0;
return page_address(page);
}
-struct page * get_free_highpage(int gfp_mask)
+struct page * alloc_pages (int gfp_mask, unsigned long order)
{
- return __get_pages(gfp_mask, 0);
+ return __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order);
}
/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages (void)
+{
+ unsigned int sum;
+ zone_t *zone;
+
+ sum = 0;
+ for (zone = zones; zone < zones+NR_ZONES; zone++)
+ sum += zone->free_pages;
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+ unsigned int sum;
+ zone_t *zone;
+
+ sum = nr_lru_pages;
+ for (zone = zones; zone <= zones+ZONE_NORMAL; zone++)
+ sum += zone->free_pages;
+ return sum;
+}
+
+#if CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+ return zones[ZONE_HIGHMEM].free_pages;
+}
+#endif
+
+/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
*/
void show_free_areas(void)
{
- unsigned long order, flags;
+ unsigned long order;
unsigned type;
- spin_lock_irqsave(&page_alloc_lock, flags);
- printk("Free pages: %6dkB (%6ldkB HighMem)\n",
- nr_free_pages<<(PAGE_SHIFT-10),
- nr_free_highpages<<(PAGE_SHIFT-10));
+ printk("Free pages: %6dkB (%6dkB HighMem)\n",
+ nr_free_pages()<<(PAGE_SHIFT-10),
+ nr_free_highpages()<<(PAGE_SHIFT-10));
printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
- nr_free_pages,
+ nr_free_pages(),
nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
- for (type = 0; type < NR_MEM_TYPES; type++) {
+ for (type = 0; type < NR_ZONES; type++) {
+ zone_t *zone = zones + type;
unsigned long total = 0;
- printk(" %s: ", mem_type_strs[type]);
- for (order = 0; order < NR_MEM_LISTS; order++) {
- unsigned long nr = free_area[type][order].count;
+ printk(" %s: ", zone->name);
+ for (order = 0; order < MAX_ORDER; order++) {
+ unsigned long i, nr;
+
+ nr = 0;
+ for (i = 0; i < zone->size; i += 1<<order) {
+ struct page * page;
+ page = mem_map + zone->offset + i;
+ if (!page_count(page))
+ nr++;
+ }
total += nr * ((PAGE_SIZE>>10) << order);
printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
}
printk("= %lukB)\n", total);
}
- spin_unlock_irqrestore(&page_alloc_lock, flags);
#ifdef SWAP_CACHE_INFO
show_swap_cache_info();
@@ -401,18 +458,24 @@ void show_free_areas(void)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
- * set up the free-area data structures:
+ * Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*/
-volatile int data;
-void __init free_area_init(unsigned long end_mem_pages)
+void __init free_area_init(unsigned int *zones_size)
{
mem_map_t * p;
unsigned long i, j;
unsigned long map_size;
+ unsigned int totalpages, offset;
+
+ totalpages = 0;
+ for (i = 0; i < NR_ZONES; i++)
+ totalpages += zones_size[i];
+ printk("totalpages: %08x\n", totalpages);
+ i = totalpages >> 7;
/*
* Select nr of pages we try to keep free for important stuff
* with a minimum of 10 pages and a maximum of 256 pages, so
@@ -420,7 +483,7 @@ void __init free_area_init(unsigned long end_mem_pages)
* This is fairly arbitrary, but based on some behaviour
* analysis.
*/
- i = end_mem_pages >> 7;
+ i = totalpages >> 7;
if (i < 10)
i = 10;
if (i > 256)
@@ -430,11 +493,10 @@ void __init free_area_init(unsigned long end_mem_pages)
freepages.high = i * 3;
/*
- * Most architectures just pick 'start_mem'. Some architectures
-	 * (with lots of mem and discontinuous memory maps) have to search
-	 * for a good area.
+	 * Some architectures (with lots of mem and discontinuous memory
+ * maps) have to search for a good mem_map area:
*/
- map_size = end_mem_pages*sizeof(struct page);
+ map_size = totalpages*sizeof(struct page);
mem_map = (struct page *) alloc_bootmem(map_size);
memset(mem_map, 0, map_size);
@@ -443,27 +505,39 @@ void __init free_area_init(unsigned long end_mem_pages)
* up by free_all_bootmem() once the early boot process is
* done.
*/
- for (p = mem_map; p < mem_map + end_mem_pages; p++) {
+ for (p = mem_map; p < mem_map + totalpages; p++) {
set_page_count(p, 0);
p->flags = (1 << PG_DMA);
SetPageReserved(p);
init_waitqueue_head(&p->wait);
memlist_init(&p->list);
}
-
- for (j = 0 ; j < NR_MEM_TYPES ; j++) {
+
+ offset = 0;
+ for (j = 0; j < NR_ZONES; j++) {
+ zone_t *zone = zones + j;
unsigned long mask = -1;
- for (i = 0 ; i < NR_MEM_LISTS ; i++) {
+ unsigned long size;
+
+ size = zones_size[j];
+ zone->size = size;
+ zone->offset = offset;
+ zone->pages_low = freepages.low;
+ zone->pages_high = freepages.high;
+ zone->low_on_memory = 0;
+
+ offset += size;
+ for (i = 0; i < MAX_ORDER; i++) {
unsigned long bitmap_size;
unsigned int * map;
- memlist_init(&free_area[j][i].free_list);
+ memlist_init(&zone->free_area[i].free_list);
mask += mask;
- end_mem_pages = (end_mem_pages + ~mask) & mask;
- bitmap_size = end_mem_pages >> i;
+ size = (size + ~mask) & mask;
+ bitmap_size = size >> i;
bitmap_size = (bitmap_size + 7) >> 3;
bitmap_size = LONG_ALIGN(bitmap_size);
map = (unsigned int *) alloc_bootmem(bitmap_size);
- free_area[j][i].map = map;
+ zone->free_area[i].map = map;
memset((void *) map, 0, bitmap_size);
}
}
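
The page_alloc.c changes above replace the single global free list and the global freepages watermarks with per-zone free areas, per-zone pages_low/pages_high limits with hysteresis, and a fallback walk from the requested zone back toward ZONE_DMA. The user-space sketch below only models that control flow, assuming invented zone values and stand-ins for rmqueue() and try_to_free_pages(); it is not the kernel code itself.

/*
 * Sketch only: models the per-zone watermark hysteresis and the zone
 * fallback walk of __get_pages()/balance_memory() above.  Names and
 * numbers are illustrative.
 */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, NR_ZONES };

struct zone {
	const char *name;
	unsigned long free_pages;
	unsigned long pages_low;	/* enter "low memory" mode below this */
	unsigned long pages_high;	/* leave it again only above this */
	int low_on_memory;
};

static struct zone zones[NR_ZONES] = {
	{ "DMA",      64, 16, 48, 0 },
	{ "Normal",  512, 32, 96, 0 },
	{ "HighMem",   0, 32, 96, 0 },
};

/* Hysteresis check, as in balance_memory(): once a zone dips below
 * pages_low it stays "low" until it climbs back above pages_high. */
static int zone_has_headroom(struct zone *zone)
{
	if (zone->free_pages > zone->pages_low) {
		if (!zone->low_on_memory)
			return 1;
		if (zone->free_pages >= zone->pages_high) {
			zone->low_on_memory = 0;
			return 1;
		}
	}
	zone->low_on_memory = 1;
	return 0;	/* the kernel would try to free pages here */
}

/* Fallback walk, as in __get_pages(): try the requested zone first,
 * then fall back through the lower, more precious zones. */
static struct zone *alloc_one_page(struct zone *zone)
{
	if (!zone_has_headroom(zone))
		return NULL;
	do {
		if (zone->free_pages) {
			zone->free_pages--;	/* stand-in for rmqueue() */
			return zone;
		}
	} while (zone-- != zones);
	return NULL;
}

int main(void)
{
	/* Normal zone has headroom: the request succeeds directly. */
	struct zone *a = alloc_one_page(&zones[ZONE_NORMAL]);
	/* HighMem is empty and below pages_low: the model gives up here,
	 * where the kernel would first call try_to_free_pages(). */
	struct zone *b = alloc_one_page(&zones[ZONE_HIGHMEM]);

	printf("first:  %s\n", a ? a->name : "nowhere");
	printf("second: %s\n", b ? b->name : "nowhere");
	return 0;
}

The backwards do/while mirrors `} while (zone-- != zones)` in __get_pages(): the zones array is ordered DMA, normal, highmem, so walking it downwards always falls back to a more constrained but more widely usable zone.
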
diff --git a/mm/page_io.c b/mm/page_io.c
index c5ed3ed74..0012fe234 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -33,7 +33,7 @@
* that shared pages stay shared while being swapped.
*/
-static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait)
+static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int wait)
{
unsigned long type, offset;
struct swap_info_struct * p;
@@ -59,7 +59,7 @@ static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait)
return 0;
}
if (p->swap_map && !p->swap_map[offset]) {
- pte_ERROR(entry);
+ printk("VM: Bad swap entry %08lx\n", entry.val);
return 0;
}
if (!(p->flags & SWP_USED)) {
@@ -130,7 +130,9 @@ static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait)
*/
void rw_swap_page(int rw, struct page *page, int wait)
{
- pte_t entry = get_pagecache_pte(page);
+ swp_entry_t entry;
+
+ entry.val = page->index;
if (!PageLocked(page))
PAGE_BUG(page);
@@ -147,7 +149,7 @@ void rw_swap_page(int rw, struct page *page, int wait)
* Therefore we can't use it. Later when we can remove the need for the
* lock map and we can reduce the number of functions exported.
*/
-void rw_swap_page_nolock(int rw, pte_t entry, char *buf, int wait)
+void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf, int wait)
{
struct page *page = mem_map + MAP_NR(buf);
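
The page_io.c hunks above (and the swap_state.c ones below) stop carrying swap locations in a pte_t and switch to swp_entry_t, a plain wrapper around an unsigned long whose value packs a swap-file type and a page offset; page->index holds that value for pages in the swap cache. The real SWP_TYPE()/SWP_OFFSET()/SWP_ENTRY() macros are architecture-defined, so the sketch below invents an 8/24-bit split purely to illustrate the round trip.

/*
 * Illustrative swp_entry_t packing.  Hedged: the real SWP_* macros are
 * architecture-specific; this bit split and these helper names are
 * examples only.
 */
#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

#define SWP_TYPE_SHIFT	24
#define SWP_OFFSET_MASK	((1UL << SWP_TYPE_SHIFT) - 1)

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e;

	e.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK);
	return e;
}

static unsigned long swp_type(swp_entry_t e)   { return e.val >> SWP_TYPE_SHIFT; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & SWP_OFFSET_MASK; }

int main(void)
{
	swp_entry_t e = swp_entry(1, 4096);

	/* value 0 stays reserved as "no swap entry", as in the patch */
	assert(e.val != 0);
	printf("type=%lu offset=%lu raw=%08lx\n",
	       swp_type(e), swp_offset(e), e.val);
	return 0;
}

Reserving the value 0 matches the `if (!entry.val)` checks throughout the patch: a zero entry means no swap location has been assigned.
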
diff --git a/mm/slab.c b/mm/slab.c
index eabddab23..405cf8f88 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,7 @@
/* If there is a different PAGE_SIZE around, and it works with this allocator,
* then change the following.
*/
-#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096)
+#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096 && PAGE_SIZE != 32768)
#error Your page size is probably not correctly supported - please check
#endif
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d79b7bffb..f63eca66a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -40,7 +40,7 @@ void show_swap_cache_info(void)
}
#endif
-void add_to_swap_cache(struct page *page, pte_t entry)
+void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
#ifdef SWAP_CACHE_INFO
swap_cache_add_total++;
@@ -49,7 +49,7 @@ void add_to_swap_cache(struct page *page, pte_t entry)
BUG();
if (page->mapping)
BUG();
- add_to_page_cache(page, &swapper_space, pte_val(entry));
+ add_to_page_cache(page, &swapper_space, entry.val);
}
/*
@@ -58,17 +58,16 @@ void add_to_swap_cache(struct page *page, pte_t entry)
* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
* "permanent", but will be reclaimed by the next swapoff.
*/
-int swap_duplicate(pte_t entry)
+int swap_duplicate(swp_entry_t entry)
{
struct swap_info_struct * p;
unsigned long offset, type;
int result = 0;
- if (!pte_val(entry))
+ /* Swap entry 0 is illegal */
+ if (!entry.val)
goto out;
type = SWP_TYPE(entry);
- if (type & SHM_SWP_TYPE)
- goto out;
if (type >= nr_swapfiles)
goto bad_file;
p = type + swap_info;
@@ -85,7 +84,7 @@ int swap_duplicate(pte_t entry)
else {
static int overflow = 0;
if (overflow++ < 5)
- pte_ERROR(entry);
+ printk("VM: swap entry overflow\n");
p->swap_map[offset] = SWAP_MAP_MAX;
}
result = 1;
@@ -93,13 +92,13 @@ out:
return result;
bad_file:
- pte_ERROR(entry);
+ printk("Bad swap file entry %08lx\n", entry.val);
goto out;
bad_offset:
- pte_ERROR(entry);
+ printk("Bad swap offset entry %08lx\n", entry.val);
goto out;
bad_unused:
- pte_ERROR(entry);
+ printk("Unused swap offset entry %08lx\n", entry.val);
goto out;
}
@@ -107,14 +106,13 @@ int swap_count(struct page *page)
{
struct swap_info_struct * p;
unsigned long offset, type;
- pte_t entry = get_pagecache_pte(page);
+ swp_entry_t entry;
int retval = 0;
- if (!pte_val(entry))
+ entry.val = page->index;
+ if (!entry.val)
goto bad_entry;
type = SWP_TYPE(entry);
- if (type & SHM_SWP_TYPE)
- goto out;
if (type >= nr_swapfiles)
goto bad_file;
p = type + swap_info;
@@ -131,13 +129,13 @@ bad_entry:
printk(KERN_ERR "swap_count: null entry!\n");
goto out;
bad_file:
- pte_ERROR(entry);
+ printk("Bad swap file entry %08lx\n", entry.val);
goto out;
bad_offset:
- pte_ERROR(entry);
+ printk("Bad swap offset entry %08lx\n", entry.val);
goto out;
bad_unused:
- pte_ERROR(entry);
+ printk("Unused swap offset entry %08lx\n", entry.val);
goto out;
}
@@ -160,7 +158,9 @@ static inline void remove_from_swap_cache(struct page *page)
*/
void __delete_from_swap_cache(struct page *page)
{
- pte_t entry = get_pagecache_pte(page);
+ swp_entry_t entry;
+
+ entry.val = page->index;
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
@@ -223,7 +223,7 @@ void free_page_and_swap_cache(struct page *page)
* lock before returning.
*/
-struct page * lookup_swap_cache(pte_t entry)
+struct page * lookup_swap_cache(swp_entry_t entry)
{
struct page *found;
@@ -232,9 +232,9 @@ struct page * lookup_swap_cache(pte_t entry)
#endif
while (1) {
/*
- * Right now the pagecache is 32-bit only.
+ * Right now the pagecache is 32-bit only. But it's a 32 bit index. =)
*/
- found = find_lock_page(&swapper_space, pte_val(entry));
+ found = find_lock_page(&swapper_space, entry.val);
if (!found)
return 0;
if (found->mapping != &swapper_space || !PageSwapCache(found))
@@ -262,7 +262,7 @@ out_bad:
* the swap entry is no longer in use.
*/
-struct page * read_swap_cache_async(pte_t entry, int wait)
+struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
struct page *found_page = 0, *new_page;
unsigned long new_page_addr;
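
With the conversion above, swap_state.c keys the swap cache purely by entry.val: add_to_page_cache(page, &swapper_space, entry.val) on insert, find_lock_page(&swapper_space, entry.val) on lookup, and page->index carrying the entry for pages already cached. The toy below models only that index-by-value idea; the names and the flat table are invented, and the real swapper_space goes through the page-cache hash.

/*
 * Toy model of a swap cache keyed by swp_entry_t.val.  A flat table
 * stands in for the page-cache hash used by the real swapper_space.
 */
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

struct toy_page {
	unsigned long index;	/* carries entry.val once in the swap cache */
};

#define CACHE_SLOTS 16
static struct toy_page *cache[CACHE_SLOTS];

static void add_to_swap_cache(struct toy_page *page, swp_entry_t entry)
{
	page->index = entry.val;
	cache[entry.val % CACHE_SLOTS] = page;
}

static struct toy_page *lookup_swap_cache(swp_entry_t entry)
{
	struct toy_page *page = cache[entry.val % CACHE_SLOTS];

	return (page && page->index == entry.val) ? page : NULL;
}

int main(void)
{
	struct toy_page page = { 0 };
	swp_entry_t entry = { 0x01001000UL };

	add_to_swap_cache(&page, entry);
	printf("cache hit: %s\n", lookup_swap_cache(entry) ? "yes" : "no");
	return 0;
}
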
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bcd7b4587..c34a5316a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,7 +25,7 @@ struct swap_info_struct swap_info[MAX_SWAPFILES];
#define SWAPFILE_CLUSTER 256
-static inline int scan_swap_map(struct swap_info_struct *si)
+static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
{
unsigned long offset;
/*
@@ -73,7 +73,7 @@ static inline int scan_swap_map(struct swap_info_struct *si)
si->lowest_bit++;
if (offset == si->highest_bit)
si->highest_bit--;
- si->swap_map[offset] = 1;
+ si->swap_map[offset] = count;
nr_swap_pages--;
si->cluster_next = offset+1;
return offset;
@@ -81,23 +81,26 @@ static inline int scan_swap_map(struct swap_info_struct *si)
return 0;
}
-pte_t get_swap_page(void)
+swp_entry_t __get_swap_page(unsigned short count)
{
struct swap_info_struct * p;
unsigned long offset;
- pte_t entry = __pte(0);
+ swp_entry_t entry;
int type, wrapped = 0;
+ entry.val = 0; /* Out of memory */
type = swap_list.next;
if (type < 0)
goto out;
if (nr_swap_pages == 0)
goto out;
+ if (count >= SWAP_MAP_MAX)
+ goto bad_count;
while (1) {
p = &swap_info[type];
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- offset = scan_swap_map(p);
+ offset = scan_swap_map(p, count);
if (offset) {
entry = SWP_ENTRY(type,offset);
type = swap_info[type].next;
@@ -122,20 +125,23 @@ pte_t get_swap_page(void)
}
out:
return entry;
+
+bad_count:
+ printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
+ count, __builtin_return_address(0));
+ goto out;
}
-void swap_free(pte_t entry)
+void __swap_free(swp_entry_t entry, unsigned short count)
{
struct swap_info_struct * p;
unsigned long offset, type;
- if (!pte_val(entry))
+ if (!entry.val)
goto out;
type = SWP_TYPE(entry);
- if (type & SHM_SWP_TYPE)
- goto out;
if (type >= nr_swapfiles)
goto bad_nofile;
p = & swap_info[type];
@@ -149,7 +155,9 @@ void swap_free(pte_t entry)
if (!p->swap_map[offset])
goto bad_free;
if (p->swap_map[offset] < SWAP_MAP_MAX) {
- if (!--p->swap_map[offset]) {
+ if (p->swap_map[offset] < count)
+ goto bad_count;
+ if (!(p->swap_map[offset] -= count)) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -170,27 +178,28 @@ bad_offset:
printk("swap_free: offset exceeds max\n");
goto out;
bad_free:
- pte_ERROR(entry);
+ printk("VM: Bad swap entry %08lx\n", entry.val);
+ goto out;
+bad_count:
+ printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]);
goto out;
}
/* needs the big kernel lock */
-pte_t acquire_swap_entry(struct page *page)
+swp_entry_t acquire_swap_entry(struct page *page)
{
struct swap_info_struct * p;
unsigned long offset, type;
- pte_t entry;
+ swp_entry_t entry;
if (!test_bit(PG_swap_entry, &page->flags))
goto new_swap_entry;
/* We have the old entry in the page offset still */
- if (!page->offset)
+ if (!page->index)
goto new_swap_entry;
- entry = get_pagecache_pte(page);
+ entry.val = page->index;
type = SWP_TYPE(entry);
- if (type & SHM_SWP_TYPE)
- goto new_swap_entry;
if (type >= nr_swapfiles)
goto new_swap_entry;
p = type + swap_info;
@@ -222,7 +231,7 @@ new_swap_entry:
* what to do if a write is requested later.
*/
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
- pte_t *dir, pte_t entry, struct page* page)
+ pte_t *dir, swp_entry_t entry, struct page* page)
{
pte_t pte = *dir;
@@ -238,17 +247,17 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
set_pte(dir, pte_mkdirty(pte));
return;
}
- if (pte_val(pte) != pte_val(entry))
+ if (pte_val(pte) != entry.val)
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
- get_page(mem_map + MAP_NR(page));
+ get_page(page);
++vma->vm_mm->rss;
}
static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
- pte_t entry, struct page* page)
+ swp_entry_t entry, struct page* page)
{
pte_t * pte;
unsigned long end;
@@ -275,7 +284,7 @@ static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
- pte_t entry, struct page* page)
+ swp_entry_t entry, struct page* page)
{
pmd_t * pmd;
unsigned long offset, end;
@@ -304,7 +313,7 @@ static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
}
static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- pte_t entry, struct page* page)
+ swp_entry_t entry, struct page* page)
{
unsigned long start = vma->vm_start, end = vma->vm_end;
@@ -318,7 +327,7 @@ static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
}
static void unuse_process(struct mm_struct * mm,
- pte_t entry, struct page* page)
+ swp_entry_t entry, struct page* page)
{
struct vm_area_struct* vma;
@@ -344,7 +353,7 @@ static int try_to_unuse(unsigned int type)
struct swap_info_struct * si = &swap_info[type];
struct task_struct *p;
struct page *page;
- pte_t entry;
+ swp_entry_t entry;
int i;
while (1) {
@@ -388,7 +397,7 @@ static int try_to_unuse(unsigned int type)
*/
if (si->swap_map[i] != 0) {
if (si->swap_map[i] != SWAP_MAP_MAX)
- pte_ERROR(entry);
+ printk("VM: Undead swap entry %08lx\n", entry.val);
si->swap_map[i] = 0;
nr_swap_pages++;
}
@@ -616,7 +625,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
swapfilesize = 0;
if (blk_size[MAJOR(dev)])
swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
- / (PAGE_SIZE / 1024);
+ >> (PAGE_SHIFT - 10);
} else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
error = -EBUSY;
for (i = 0 ; i < nr_swapfiles ; i++) {
@@ -625,7 +634,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
goto bad_swap;
}
- swapfilesize = swap_dentry->d_inode->i_size / PAGE_SIZE;
+ swapfilesize = swap_dentry->d_inode->i_size >> PAGE_SHIFT;
} else
goto bad_swap;
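
The swapfile.c changes generalise the swap map from take-one/drop-one reference counting to __get_swap_page(count) and __swap_free(entry, count), so callers can add or drop several references on a slot at once while SWAP_MAP_MAX still marks a slot as effectively pinned. (The swapfilesize hunks are the same size arithmetic rewritten as shifts: dividing by PAGE_SIZE/1024 and by PAGE_SIZE becomes >> (PAGE_SHIFT - 10) and >> PAGE_SHIFT.) The snippet below models only the per-slot reference arithmetic; the function names and the SWAP_MAP_MAX value are illustrative, not the kernel's.

/*
 * Model of the counted swap-map reference arithmetic.  Hedged: values
 * and error handling are simplified relative to the real swap_map[].
 */
#include <stdio.h>

#define SWAP_MAP_MAX	0x7fff	/* illustrative "permanently pinned" value */

static unsigned short swap_map_slot;	/* reference count of one swap slot */

/* scan_swap_map() now seeds the slot with 'count' instead of 1. */
static void get_swap_slot(unsigned short count)
{
	swap_map_slot = count;
}

/* __swap_free() drops 'count' references and frees the slot at zero. */
static int put_swap_slot(unsigned short count)
{
	if (swap_map_slot >= SWAP_MAP_MAX)
		return 0;		/* pinned, never freed here */
	if (swap_map_slot < count) {
		printf("VM: Bad count %hu current count %hu\n",
		       count, swap_map_slot);
		return -1;
	}
	swap_map_slot -= count;
	return swap_map_slot == 0;	/* 1 means the slot is free again */
}

int main(void)
{
	get_swap_slot(2);		/* e.g. swap cache plus one pte */
	printf("freed after first put:  %d\n", put_swap_slot(1));
	printf("freed after second put: %d\n", put_swap_slot(1));
	return 0;
}
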
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0978f544c..d7908df16 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -10,7 +10,7 @@
#include <asm/uaccess.h>
-static struct vm_struct * vmlist = NULL;
+struct vm_struct * vmlist = NULL;
static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
{
@@ -97,7 +97,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo
struct page * page;
if (!pte_none(*pte))
printk(KERN_ERR "alloc_area_pte: page already exists\n");
- page = get_free_highpage(GFP_KERNEL|__GFP_HIGHMEM);
+ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
if (!page)
return -ENOMEM;
set_pte(pte, mk_pte(page, prot));
@@ -204,7 +204,7 @@ void * vmalloc_prot(unsigned long size, pgprot_t prot)
struct vm_struct *area;
size = PAGE_ALIGN(size);
- if (!size || size > (max_mapnr << PAGE_SHIFT)) {
+ if (!size || (size >> PAGE_SHIFT) > max_mapnr) {
BUG();
return NULL;
}
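
The vmalloc_prot() hunk above swaps `size > (max_mapnr << PAGE_SHIFT)` for `(size >> PAGE_SHIFT) > max_mapnr`: with enough memory the left shift can wrap a 32-bit unsigned long and turn the limit into garbage, whereas shifting the request right can never overflow. A small stand-alone demonstration, with PAGE_SHIFT and the sample numbers made up for the demo:

/*
 * Why the rewritten check is safer: in 32-bit arithmetic the shifted
 * limit can wrap, while shifting the request right cannot.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t max_mapnr = UINT32_C(1) << 20;	/* 4 GB worth of 4 KB pages */
	uint32_t size = UINT32_C(16) << 20;	/* a 16 MB request */

	/* Old form: the limit wraps to 0 in 32 bits, so even this
	 * perfectly valid request compares as "too big". */
	uint32_t limit = max_mapnr << PAGE_SHIFT;
	printf("old check rejects: %d (limit wrapped to %u)\n",
	       size > limit, limit);

	/* New form: no intermediate value can overflow. */
	printf("new check rejects: %d\n", (size >> PAGE_SHIFT) > max_mapnr);
	return 0;
}
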
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9db9ce6f9..14f5dc444 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,8 @@
*/
static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
- pte_t pte, entry;
+ pte_t pte;
+ swp_entry_t entry;
struct page * page;
int (*swapout)(struct page *, struct file *);
@@ -72,9 +73,9 @@ static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pt
* memory, and we should just continue our scan.
*/
if (PageSwapCache(page)) {
- entry = get_pagecache_pte(page);
+ entry.val = page->index;
swap_duplicate(entry);
- set_pte(page_table, entry);
+ set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
vma->vm_mm->rss--;
flush_tlb_page(vma, address);
@@ -151,14 +152,14 @@ drop_pte:
* page with that swap entry.
*/
entry = acquire_swap_entry(page);
- if (!pte_val(entry))
+ if (!entry.val)
goto out_failed; /* No swap space left */
if (!(page = prepare_highmem_swapout(page)))
goto out_swap_free;
vma->vm_mm->rss--;
- set_pte(page_table, entry);
+ set_pte(page_table, swp_entry_to_pte(entry));
vmlist_access_unlock(vma->vm_mm);
flush_tlb_page(vma, address);
@@ -502,7 +503,7 @@ int kswapd(void *unused)
do {
/* kswapd is critical to provide GFP_ATOMIC
allocations (not GFP_HIGHMEM ones). */
- if (nr_free_pages - nr_free_highpages >= freepages.high)
+ if (nr_free_buffer_pages() >= freepages.high)
break;
if (!do_try_to_free_pages(GFP_KSWAPD))