Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c | 29
-rw-r--r-- | mm/filemap.c | 308
-rw-r--r-- | mm/highmem.c | 296
-rw-r--r-- | mm/memory.c | 45
-rw-r--r-- | mm/mlock.c | 9
-rw-r--r-- | mm/mmap.c | 20
-rw-r--r-- | mm/mprotect.c | 10
-rw-r--r-- | mm/mremap.c | 5
-rw-r--r-- | mm/page_alloc.c | 390
-rw-r--r-- | mm/page_io.c | 10
-rw-r--r-- | mm/slab.c | 2
-rw-r--r-- | mm/swap_state.c | 44
-rw-r--r-- | mm/swapfile.c | 65
-rw-r--r-- | mm/vmalloc.c | 6
-rw-r--r-- | mm/vmscan.c | 13
15 files changed, 833 insertions, 419 deletions
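A recurring change in the diff below is the switch from byte offsets (page->offset, vm_offset, file positions) to page indexes (page->index, vm_pgoff, index/offset pairs). The following standalone sketch is not part of the commit; it only illustrates the conversion arithmetic that the new do_generic_file_read() loop uses, with PAGE_CACHE_SHIFT assumed to be 12 (4K pages) for the example. The vm_offset to vm_pgoff conversions in the diff apply the same idea using PAGE_SHIFT.

/*
 * Illustrative only: byte-offset to page-index arithmetic as used in
 * this patch.  PAGE_CACHE_SHIFT is assumed to be 12 here; the kernel's
 * own definitions live in its pagemap headers.
 */
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long long pos = 0x12345;	/* arbitrary file position */

	/* page->index replaces page->offset: which page of the file */
	unsigned long index  = pos >> PAGE_CACHE_SHIFT;
	/* byte offset within that page, used by the new read/write loops */
	unsigned long offset = pos & ~PAGE_CACHE_MASK;

	printf("pos=%#llx -> index=%lu offset=%#lx\n", pos, index, offset);
	/* reassembling a file position, as done when updating *ppos */
	printf("reassembled pos=%#llx\n",
	       ((unsigned long long)index << PAGE_CACHE_SHIFT) + offset);
	return 0;
}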
diff --git a/mm/bootmem.c b/mm/bootmem.c index e790acc4f..edc69e6b3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -36,9 +36,7 @@ unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { unsigned long mapsize = (pages+7)/8; - if (bootmem_map) - BUG(); - bootmem_map = __va(start << PAGE_SHIFT); + bootmem_map = phys_to_virt(start << PAGE_SHIFT); max_low_pfn = pages; /* @@ -64,7 +62,6 @@ void __init reserve_bootmem (unsigned long addr, unsigned long size) */ unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; - if (!bootmem_map) BUG(); if (!size) BUG(); if (end > max_low_pfn) @@ -77,18 +74,23 @@ void __init reserve_bootmem (unsigned long addr, unsigned long size) void __init free_bootmem (unsigned long addr, unsigned long size) { unsigned long i; + unsigned long start; /* * round down end of usable mem, partially free pages are * considered reserved. */ unsigned long end = (addr + size)/PAGE_SIZE; - if (!bootmem_map) BUG(); if (!size) BUG(); - if (end > max_low_pfn) BUG(); - for (i = addr/PAGE_SIZE; i < end; i++) { + + /* + * Round up the beginning of the address. + */ + start = (addr + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = start; i < end; i++) { if (!test_and_clear_bit(i, bootmem_map)) BUG(); } @@ -117,7 +119,6 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned unsigned long offset, remaining_size; unsigned long areasize, preferred; - if (!bootmem_map) BUG(); if (!size) BUG(); /* @@ -152,6 +153,9 @@ restart_scan: preferred = 0; goto restart_scan; } + /* + * Whoops, we cannot satisfy the allocation request. + */ BUG(); found: if (start >= max_low_pfn) @@ -173,11 +177,11 @@ found: areasize = 0; // last_pos unchanged last_offset = offset+size; - ret = __va(last_pos*PAGE_SIZE + offset); + ret = phys_to_virt(last_pos*PAGE_SIZE + offset); } else { size -= remaining_size; areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - ret = __va(last_pos*PAGE_SIZE + offset); + ret = phys_to_virt(last_pos*PAGE_SIZE + offset); last_pos = start+areasize-1; last_offset = size; } @@ -185,7 +189,7 @@ found: } else { last_pos = start + areasize - 1; last_offset = size & ~PAGE_MASK; - ret = __va(start * PAGE_SIZE); + ret = phys_to_virt(start * PAGE_SIZE); } /* * Reserve the area now: @@ -211,12 +215,13 @@ unsigned long __init free_all_bootmem (void) count++; ClearPageReserved(page); set_page_count(page, 1); - if (i >= (__pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT)) + if (i >= (virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT)) clear_bit(PG_DMA, &page->flags); __free_page(page); } } total += count; + /* * Now free the allocator bitmap itself, it's not * needed anymore: diff --git a/mm/filemap.c b/mm/filemap.c index 887d7b6f8..3bb4d89de 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,11 +21,13 @@ #include <linux/swapctl.h> #include <linux/slab.h> #include <linux/init.h> -#include <linux/highmem.h> +#include <linux/mm.h> #include <asm/pgtable.h> #include <asm/uaccess.h> +#include <linux/highmem.h> + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. 
@@ -50,9 +52,7 @@ spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) -#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster) -#define CLUSTER_BYTES (1 << CLUSTER_SHIFT) -#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) void __add_page_to_hash_queue(struct page * page, struct page **p) { @@ -127,20 +127,22 @@ void invalidate_inode_pages(struct inode * inode) void truncate_inode_pages(struct inode * inode, unsigned long start) { struct list_head *head, *curr; - unsigned long offset; struct page * page; - int partial = 0; + unsigned partial = start & (PAGE_CACHE_SIZE - 1); + + start = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; repeat: head = &inode->i_data.pages; spin_lock(&pagecache_lock); curr = head->next; while (curr != head) { + unsigned long offset; page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->offset; + offset = page->index; /* page wholly truncated - free it */ if (offset >= start) { @@ -179,30 +181,32 @@ repeat: /* * there is only one partial page possible. */ - if (partial) + if (!partial) + continue; + + /* and it's the one preceeding the first wholly truncated page */ + if ((offset + 1) != start) continue; - offset = start - offset; /* partial truncate, clear end of page */ - if (offset < PAGE_CACHE_SIZE) { - get_page(page); - spin_unlock(&pagecache_lock); + get_page(page); + spin_unlock(&pagecache_lock); - lock_page(page); - partial = 1; + lock_page(page); - memclear_highpage_flush(page, offset, - PAGE_CACHE_SIZE-offset); - if (inode->i_op->flushpage) - inode->i_op->flushpage(inode, page, offset); - /* - * we have dropped the spinlock so we have to - * restart. - */ - UnlockPage(page); - page_cache_release(page); - goto repeat; - } + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (inode->i_op->flushpage) + inode->i_op->flushpage(inode, page, partial); + + partial = 0; + + /* + * we have dropped the spinlock so we have to + * restart. 
+ */ + UnlockPage(page); + page_cache_release(page); + goto repeat; } spin_unlock(&pagecache_lock); } @@ -367,7 +371,7 @@ inside: goto not_found; if (page->mapping != mapping) continue; - if (page->offset == offset) + if (page->index == offset) break; } set_bit(PG_referenced, &page->flags); @@ -417,7 +421,6 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne int retval = 0; head = &inode->i_data.pages; - start &= PAGE_MASK; spin_lock(&pagecache_lock); curr = head->next; @@ -426,9 +429,9 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne curr = curr->next; if (!page->buffers) continue; - if (page->offset >= end) + if (page->index >= end) continue; - if (page->offset < start) + if (page->index < start) continue; get_page(page); @@ -455,10 +458,12 @@ static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigne */ int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end) { + unsigned long start_idx = start >> PAGE_CACHE_SHIFT; + unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; int retval; - retval = do_buffer_fdatasync(inode, start, end, writeout_one_page); - retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page); + retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page); return retval; } @@ -476,7 +481,7 @@ static inline void __add_to_page_cache(struct page * page, flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); page->flags = flags | (1 << PG_locked); get_page(page); - page->offset = offset; + page->index = offset; add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, hash); lru_cache_add(page); @@ -516,7 +521,7 @@ int add_to_page_cache_unique(struct page * page, * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static inline void page_cache_read(struct file * file, unsigned long offset) +static inline int page_cache_read(struct file * file, unsigned long offset) { struct inode *inode = file->f_dentry->d_inode; struct page **hash = page_hash(&inode->i_data, offset); @@ -526,42 +531,45 @@ static inline void page_cache_read(struct file * file, unsigned long offset) page = __find_page_nolock(&inode->i_data, offset, *hash); spin_unlock(&pagecache_lock); if (page) - return; + return 0; page = page_cache_alloc(); if (!page) - return; + return -ENOMEM; if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) { - inode->i_op->readpage(file, page); + int error = inode->i_op->readpage(file, page); page_cache_release(page); - return; + return error; } /* * We arrive here in the unlikely event that someone * raced with us and added our page to the cache first. */ page_cache_free(page); - return; + return 0; } /* * Read in an entire cluster at once. A cluster is usually a 64k- * aligned block that includes the address requested in "offset." 
*/ -static void read_cluster_nonblocking(struct file * file, - unsigned long offset) +static int read_cluster_nonblocking(struct file * file, unsigned long offset) { - off_t filesize = file->f_dentry->d_inode->i_size; + int error = 0; + unsigned long filesize = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; unsigned long pages = CLUSTER_PAGES; offset = CLUSTER_OFFSET(offset); while ((pages-- > 0) && (offset < filesize)) { - page_cache_read(file, offset); - offset += PAGE_CACHE_SIZE; + error = page_cache_read(file, offset); + if (error >= 0) + offset ++; + else + break; } - return; + return error; } /* @@ -751,7 +759,7 @@ static void profile_readahead(int async, struct file *filp) total_rawin/total_reada, (total_async*100)/total_reada); #ifdef DEBUG_READAHEAD - printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n", + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); #endif @@ -831,13 +839,15 @@ static inline int get_max_readahead(struct inode * inode) static void generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - unsigned long ppos, struct page * page) + struct page * page) { + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned long index = page->index; unsigned long max_ahead, ahead; unsigned long raend; int max_readahead = get_max_readahead(inode); - raend = filp->f_raend & PAGE_CACHE_MASK; + raend = filp->f_raend; max_ahead = 0; /* @@ -849,14 +859,14 @@ static void generic_file_readahead(int reada_ok, * page only. */ if (PageLocked(page)) { - if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) { - raend = ppos; - if (raend < inode->i_size) + if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) { + raend = index; + if (raend < end_index) max_ahead = filp->f_ramax; filp->f_rawin = 0; - filp->f_ralen = PAGE_CACHE_SIZE; + filp->f_ralen = 1; if (!max_ahead) { - filp->f_raend = ppos + filp->f_ralen; + filp->f_raend = index + filp->f_ralen; filp->f_rawin += filp->f_ralen; } } @@ -869,17 +879,17 @@ static void generic_file_readahead(int reada_ok, * it is the moment to try to read ahead asynchronously. * We will later force unplug device in order to force asynchronous read IO. */ - else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE && - ppos <= raend && ppos + filp->f_ralen >= raend) { + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { /* * Add ONE page to max_ahead in order to try to have about the same IO max size * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. * Compute the position of the last page we have tried to read in order to * begin to read ahead just at the next page. 
*/ - raend -= PAGE_CACHE_SIZE; - if (raend < inode->i_size) - max_ahead = filp->f_ramax + PAGE_CACHE_SIZE; + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; if (max_ahead) { filp->f_rawin = filp->f_ralen; @@ -894,10 +904,11 @@ static void generic_file_readahead(int reada_ok, */ ahead = 0; while (ahead < max_ahead) { - ahead += PAGE_CACHE_SIZE; - if ((raend + ahead) >= inode->i_size) + ahead ++; + if ((raend + ahead) >= end_index) + break; + if (page_cache_read(filp, raend + ahead) < 0) break; - page_cache_read(filp, raend + ahead); } /* * If we tried to read ahead some pages, @@ -917,7 +928,7 @@ static void generic_file_readahead(int reada_ok, filp->f_ralen += ahead; filp->f_rawin += filp->f_ralen; - filp->f_raend = raend + ahead + PAGE_CACHE_SIZE; + filp->f_raend = raend + ahead + 1; filp->f_ramax += filp->f_ramax; @@ -945,15 +956,16 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * { struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - size_t pos, pgpos; + unsigned long index, offset; struct page *cached_page; int reada_ok; int error; int max_readahead = get_max_readahead(inode); cached_page = NULL; - pos = *ppos; - pgpos = pos & PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + /* * If the current position is outside the previous read-ahead window, * we reset the current read-ahead context and set read ahead max to zero @@ -961,7 +973,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * * otherwise, we assume that the file accesses are sequential enough to * continue read-ahead. */ - if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) { + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { reada_ok = 0; filp->f_raend = 0; filp->f_ralen = 0; @@ -977,12 +989,12 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * * Then, at least MIN_READAHEAD if read ahead is ok, * and at most MAX_READAHEAD in all cases. */ - if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { filp->f_ramax = 0; } else { unsigned long needed; - needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos; + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; if (filp->f_ramax < needed) filp->f_ramax = needed; @@ -995,17 +1007,27 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * for (;;) { struct page *page, **hash; + unsigned long end_index, nr; - if (pos >= inode->i_size) + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + if (index > end_index) break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; /* * Try to find the data in the page cache.. */ - hash = page_hash(&inode->i_data, pos & PAGE_CACHE_MASK); + hash = page_hash(&inode->i_data, index); spin_lock(&pagecache_lock); - page = __find_page_nolock(&inode->i_data, pos & PAGE_CACHE_MASK, *hash); + page = __find_page_nolock(&inode->i_data, index, *hash); if (!page) goto no_cached_page; found_page: @@ -1015,19 +1037,10 @@ found_page: if (!Page_Uptodate(page)) goto page_not_up_to_date; page_ok: - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... 
- */ - { - unsigned long offset, nr; - - offset = pos & ~PAGE_CACHE_MASK; - nr = PAGE_CACHE_SIZE - offset; - if (nr > inode->i_size - pos) - nr = inode->i_size - pos; - /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update @@ -1035,19 +1048,20 @@ page_ok: * pointers and the remaining count). */ nr = actor(desc, page, offset, nr); - pos += nr; + offset += nr; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + page_cache_release(page); if (nr && desc->count) continue; break; - } /* * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. */ page_not_up_to_date: - generic_file_readahead(reada_ok, filp, inode, - pos & PAGE_CACHE_MASK, page); + generic_file_readahead(reada_ok, filp, inode, page); if (Page_Uptodate(page)) goto page_ok; @@ -1068,8 +1082,7 @@ readpage: goto page_ok; /* Again, try some read-ahead while waiting for the page to finish.. */ - generic_file_readahead(reada_ok, filp, inode, - pos & PAGE_CACHE_MASK, page); + generic_file_readahead(reada_ok, filp, inode, page); wait_on_page(page); if (Page_Uptodate(page)) goto page_ok; @@ -1101,7 +1114,7 @@ no_cached_page: * dropped the page cache lock. Check for that. */ spin_lock(&pagecache_lock); - page = __find_page_nolock(&inode->i_data, pos & PAGE_CACHE_MASK, *hash); + page = __find_page_nolock(&inode->i_data, index, *hash); if (page) goto found_page; } @@ -1110,14 +1123,14 @@ no_cached_page: * Ok, add the new page to the hash-queues... */ page = cached_page; - __add_to_page_cache(page, &inode->i_data, pos & PAGE_CACHE_MASK, hash); + __add_to_page_cache(page, &inode->i_data, index, hash); spin_unlock(&pagecache_lock); cached_page = NULL; goto readpage; } - *ppos = pos; + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; filp->f_reada = 1; if (cached_page) page_cache_free(cached_page); @@ -1131,12 +1144,10 @@ static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned if (size > count) size = count; - /* - * FIXME: We cannot yet sleep with kmaps held. - */ - kaddr = kmap(page, KM_READ); - left = __copy_to_user(desc->buf, (void *)(kaddr+offset), size); - kunmap(kaddr, KM_READ); + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size); + kunmap(page); if (left) { size -= left; @@ -1159,6 +1170,7 @@ ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t * retval = -EFAULT; if (access_ok(VERIFY_WRITE, buf, count)) { retval = 0; + if (count) { read_descriptor_t desc; @@ -1188,9 +1200,11 @@ static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned size = count; old_fs = get_fs(); set_fs(KERNEL_DS); - kaddr = kmap(page, KM_READ); - written = file->f_op->write(file, (char *)kaddr + offset, size, &file->f_pos); - kunmap(kaddr, KM_READ); + + kaddr = kmap(page); + written = file->f_op->write(file, (char *)kaddr + offset, + size, &file->f_pos); + kunmap(page); set_fs(old_fs); if (written < 0) { desc->error = written; @@ -1286,19 +1300,18 @@ out: * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. - * - * XXX - at some point, this should return unique values to indicate to - * the caller whether this is EIO, OOM, or SIGBUS. 
*/ static struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) { + int error; struct file *file = area->vm_file; struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; struct page *page, **hash, *old_page; + unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned long offset = address - area->vm_start + area->vm_offset; + unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; /* * Semantics for shared and private memory areas are different @@ -1306,16 +1319,16 @@ static struct page * filemap_nopage(struct vm_area_struct * area, * of the file is an error and results in a SIGBUS, while a * private mapping just maps in a zero page. */ - if ((offset >= inode->i_size) && + if ((pgoff >= size) && (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm)) return NULL; /* * Do we have something in the page cache already? */ - hash = page_hash(&inode->i_data, offset); + hash = page_hash(&inode->i_data, pgoff); retry_find: - page = __find_get_page(&inode->i_data, offset, hash); + page = __find_get_page(&inode->i_data, pgoff, hash); if (!page) goto no_cached_page; @@ -1336,11 +1349,10 @@ success: struct page *new_page = page_cache_alloc(); if (new_page) { - if (PageHighMem(new_page) || PageHighMem(old_page)) - BUG(); copy_highpage(new_page, old_page); flush_page_to_ram(new_page); - } + } else + new_page = NOPAGE_OOM; page_cache_release(page); return new_page; } @@ -1356,17 +1368,27 @@ no_cached_page: * Otherwise, we're off the end of a privately mapped file, * so we need to map a zero page. */ - if (offset < inode->i_size) - read_cluster_nonblocking(file, offset); + if (pgoff < size) + error = read_cluster_nonblocking(file, pgoff); else - page_cache_read(file, offset); + error = page_cache_read(file, pgoff); /* * The page we want has now been added to the page cache. * In the unlikely event that someone removed it in the * meantime, we'll just come back here and read it again. */ - goto retry_find; + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; page_not_uptodate: lock_page(page); @@ -1418,7 +1440,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, unsigned long size; int (*writepage) (struct file *, struct page *); - size = offset + PAGE_SIZE; + size = (offset << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE; /* refuse to extend file size.. 
*/ if (S_ISREG(inode->i_mode)) { if (size > inode->i_size) @@ -1427,7 +1449,6 @@ static inline int do_write_page(struct inode * inode, struct file * file, if (size < offset) return -EIO; } - size -= offset; retval = -EIO; writepage = inode->i_op->writepage; lock_page(page); @@ -1469,7 +1490,7 @@ static int filemap_write_page(struct file *file, extern void wakeup_bdflush(int); int filemap_swapout(struct page * page, struct file * file) { - int retval = filemap_write_page(file, page->offset, page, 0); + int retval = filemap_write_page(file, page->index, page, 0); wakeup_bdflush(0); return retval; } @@ -1477,6 +1498,7 @@ int filemap_swapout(struct page * page, struct file * file) static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { + unsigned long pgoff; pte_t pte = *ptep; struct page *page; int error; @@ -1499,7 +1521,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, pte_clear(ptep); flush_tlb_page(vma, address); if (!pte_present(pte)) { - swap_free(pte); + swap_free(pte_to_swp_entry(pte)); return 0; } page = pte_page(pte); @@ -1508,9 +1530,13 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, return 0; } } - if (PageHighMem(page)) - BUG(); - error = filemap_write_page(vma->vm_file, address - vma->vm_start + vma->vm_offset, page, 1); + pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT; + pgoff += vma->vm_pgoff; + if (page->index != pgoff) { + printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n", + pgoff, page->index, address, vma->vm_start, vma->vm_pgoff); + } + error = filemap_write_page(vma->vm_file, pgoff, page, 1); page_cache_free(page); return error; } @@ -1764,13 +1790,16 @@ generic_file_write(struct file *file, const char *buf, { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - unsigned long pos = *ppos; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos = *ppos; struct page *page, **hash, *cached_page; unsigned long written; long status; int err; + if (pos < 0) + return -EINVAL; + cached_page = NULL; down(&inode->i_sem); @@ -1789,36 +1818,35 @@ generic_file_write(struct file *file, const char *buf, * Check whether we've reached the file size limit. */ err = -EFBIG; - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - goto out; + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > limit - pos) { + send_sig(SIGXFSZ, current, 0); + count = limit - pos; + } } status = 0; - /* - * Check whether to truncate the write, - * and send the signal if we do. - */ - if (count > limit - pos) { - send_sig(SIGXFSZ, current, 0); - count = limit - pos; - } while (count) { - unsigned long bytes, pgpos, offset; + unsigned long bytes, index, offset; + /* * Try to find the page in the cache. If it isn't there, * allocate a free page. 
*/ - offset = (pos & ~PAGE_CACHE_MASK); - pgpos = pos & PAGE_CACHE_MASK; + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; - hash = page_hash(&inode->i_data, pgpos); + hash = page_hash(&inode->i_data, index); repeat_find: - page = __find_lock_page(&inode->i_data, pgpos, hash); + page = __find_lock_page(&inode->i_data, index, hash); if (!page) { if (!cached_page) { cached_page = page_cache_alloc(); @@ -1828,7 +1856,7 @@ repeat_find: break; } page = cached_page; - if (add_to_page_cache_unique(page,&inode->i_data,pgpos,hash)) + if (add_to_page_cache_unique(page, &inode->i_data, index, hash)) goto repeat_find; cached_page = NULL; diff --git a/mm/highmem.c b/mm/highmem.c index 7665393cf..248688c23 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -4,19 +4,25 @@ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de * + * * Redesigned the x86 32-bit VM architecture to deal with * 64-bit physical space. With current x86 CPUs this * means up to 64 Gigabytes physical RAM. * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> */ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/swap.h> +#include <linux/slab.h> unsigned long highmem_mapnr; -unsigned long nr_free_highpages = 0; struct page * prepare_highmem_swapout(struct page * page) { @@ -34,9 +40,9 @@ struct page * prepare_highmem_swapout(struct page * page) if (!regular_page) return NULL; - vaddr = kmap(page, KM_READ); + vaddr = kmap(page); copy_page((void *)regular_page, (void *)vaddr); - kunmap(vaddr, KM_READ); + kunmap(page); /* * ok, we can just forget about our highmem page since @@ -52,10 +58,10 @@ struct page * replace_with_highmem(struct page * page) struct page *highpage; unsigned long vaddr; - if (PageHighMem(page) || !nr_free_highpages) + if (PageHighMem(page) || !nr_free_highpages()) return page; - highpage = get_free_highpage(GFP_ATOMIC|__GFP_HIGHMEM); + highpage = alloc_page(GFP_ATOMIC|__GFP_HIGHMEM); if (!highpage) return page; if (!PageHighMem(highpage)) { @@ -63,13 +69,13 @@ struct page * replace_with_highmem(struct page * page) return page; } - vaddr = kmap(highpage, KM_WRITE); + vaddr = kmap(page); copy_page((void *)vaddr, (void *)page_address(page)); - kunmap(vaddr, KM_WRITE); + kunmap(page); /* Preserve the caching of the swap_entry. */ - highpage->offset = page->offset; - highpage->inode = page->inode; + highpage->index = page->index; + highpage->mapping = page->mapping; /* * We can just forget the old page since @@ -79,3 +85,275 @@ struct page * replace_with_highmem(struct page * page) return highpage; } + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 2048 +#else +#define LAST_PKMAP 4096 +#endif +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. 
+ * n means that there are (n-1) current users of it. + */ +static int pkmap_count[LAST_PKMAP]; +static unsigned int last_pkmap_nr = 0; +static spinlock_t kmap_lock; + +pte_t * pkmap_page_table; + +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + +static void flush_all_zero_pkmaps(void) +{ + int i; + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + pte_t pte; + /* + * zero means we don't have anything to do, + * >1 means that it is still in use. Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + pte = pkmap_page_table[i]; + if (pte_none(pte)) + continue; + pte_clear(pkmap_page_table+i); + page = pte_page(pte); + page->virtual = 0; + } + flush_tlb_all(); +} + +static unsigned long map_new_virtual(struct page *page) +{ + unsigned long vaddr; + int count = LAST_PKMAP; + + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + if (!last_pkmap_nr) + flush_all_zero_pkmaps(); + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + + current->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&pkmap_map_wait, &wait); + spin_unlock(&kmap_lock); + // it's not quite possible to saturate the + // pkmap pool right now. + BUG(); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + spin_lock(&kmap_lock); + } + + /* Somebody else might have mapped it while we slept */ + if (page->virtual) + return page->virtual; + + /* Re-start */ + count = LAST_PKMAP; + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + pkmap_page_table[last_pkmap_nr] = mk_pte(page, kmap_prot); + + /* + * Subtle! For some reason if we dont do this TLB flush then + * we get data corruption and weird behavior in dbench runs. + * But invlpg this should not be necessery ... Any ideas? + */ + __flush_tlb_one(vaddr); + pkmap_count[last_pkmap_nr] = 1; + page->virtual = vaddr; + + return vaddr; +} + +unsigned long kmap_high(struct page *page) +{ + unsigned long vaddr; + + if (!PageHighMem(page)) + BUG(); + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. + * + * We cannot call this from interrupts, as it may block + */ + spin_lock(&kmap_lock); + vaddr = page->virtual; + if (!vaddr) + vaddr = map_new_virtual(page); + pkmap_count[PKMAP_NR(vaddr)]++; + if (pkmap_count[PKMAP_NR(vaddr)] < 2) + BUG(); + spin_unlock(&kmap_lock); + return vaddr; +} + +void kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + + spin_lock(&kmap_lock); + vaddr = page->virtual; + if (!vaddr) + BUG(); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + wake_up(&pkmap_map_wait); + } + spin_unlock(&kmap_lock); +} + +/* + * Simple bounce buffer support for highmem pages. + * This will be moved to the block layer in 2.5. 
+ */ + +extern kmem_cache_t *bh_cachep; + +static inline void copy_from_high_bh (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_from; + unsigned long vfrom; + + p_from = from->b_page; + vfrom = kmap_atomic(p_from, KM_BOUNCE_WRITE); + memcpy(to->b_data, (char *)vfrom + bh_offset(from), to->b_size); + kunmap_atomic(vfrom, KM_BOUNCE_WRITE); +} + +static inline void copy_to_high_bh_irq (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_to; + unsigned long vto; + + p_to = to->b_page; + vto = kmap_atomic(p_to, KM_BOUNCE_WRITE); + memcpy((char *)vto + bh_offset(to), from->b_data, to->b_size); + kunmap_atomic(vto, KM_BOUNCE_WRITE); +} + +static inline void bounce_end_io (struct buffer_head *bh, int uptodate) +{ + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id); + + bh_orig->b_end_io(bh_orig, uptodate); + __free_page(bh->b_page); + kmem_cache_free(bh_cachep, bh); +} + +static void bounce_end_io_write (struct buffer_head *bh, int uptodate) +{ + bounce_end_io(bh, uptodate); +} + +static void bounce_end_io_read (struct buffer_head *bh, int uptodate) +{ + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_dev_id); + + if (uptodate) + copy_to_high_bh_irq(bh_orig, bh); + bounce_end_io(bh, uptodate); +} + +struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig) +{ + struct page *page; + struct buffer_head *bh; + + if (!PageHighMem(bh_orig->b_page)) + return bh_orig; + +repeat_bh: + bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + if (!bh) { + wakeup_bdflush(1); + current->policy |= SCHED_YIELD; + schedule(); + goto repeat_bh; + } + /* + * This is wasteful for 1k buffers, but this is a stopgap measure + * and we are being ineffective anyway. This approach simplifies + * things immensly. On boxes with more than 4GB RAM this should + * not be an issue anyway. 
+ */ +repeat_page: + page = alloc_page(GFP_BUFFER); + if (!page) { + wakeup_bdflush(1); + current->policy |= SCHED_YIELD; + schedule(); + goto repeat_page; + } + set_bh_page(bh, page, 0); + + bh->b_next = NULL; + bh->b_blocknr = bh_orig->b_blocknr; + bh->b_size = bh_orig->b_size; + bh->b_list = -1; + bh->b_dev = bh_orig->b_dev; + bh->b_count = bh_orig->b_count; + bh->b_rdev = bh_orig->b_rdev; + bh->b_state = bh_orig->b_state; + bh->b_flushtime = 0; + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + /* bh->b_this_page */ + bh->b_reqnext = NULL; + bh->b_pprev = NULL; + /* bh->b_page */ + if (rw == WRITE) { + bh->b_end_io = bounce_end_io_write; + copy_from_high_bh(bh, bh_orig); + } else + bh->b_end_io = bounce_end_io_read; + bh->b_dev_id = (void *)bh_orig; + bh->b_rsector = -1; + memset(&bh->b_wait, -1, sizeof(bh->b_wait)); + bh->b_kiobuf = NULL; + + return bh; +} + diff --git a/mm/memory.c b/mm/memory.c index 87611db8c..a4eb69717 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -227,7 +227,7 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; if (pte_none(pte)) goto cont_copy_pte_range; if (!pte_present(pte)) { - swap_duplicate(pte); + swap_duplicate(pte_to_swp_entry(pte)); set_pte(dst_pte, pte); goto cont_copy_pte_range; } @@ -282,7 +282,7 @@ static inline int free_pte(pte_t page) free_page_and_swap_cache(mem_map+nr); return 1; } - swap_free(page); + swap_free(pte_to_swp_entry(page)); return 0; } @@ -743,7 +743,7 @@ struct page * put_dirty_page(struct task_struct * tsk, struct page *page, return 0; } flush_page_to_ram(page); - set_pte(pte, pte_mkwrite(page_pte_prot(page, PAGE_COPY))); + set_pte(pte, pte_mkwrite(mk_pte(page, PAGE_COPY))); /* no need for flush_tlb */ return page; } @@ -808,7 +808,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, * Ok, we need to copy. Oh, well.. */ spin_unlock(&tsk->mm->page_table_lock); - new_page = get_free_highpage(GFP_HIGHUSER); + new_page = alloc_page(GFP_HIGHUSER); if (!new_page) return -1; spin_lock(&tsk->mm->page_table_lock); @@ -887,12 +887,19 @@ static void partial_clear(struct vm_area_struct *vma, unsigned long address) */ void vmtruncate(struct inode * inode, unsigned long offset) { + unsigned long partial, pgoff; struct vm_area_struct * mpnt; truncate_inode_pages(inode, offset); spin_lock(&inode->i_shared_lock); if (!inode->i_mmap) goto out_unlock; + + partial = offset & (PAGE_CACHE_SIZE - 1); + pgoff = offset >> PAGE_CACHE_SHIFT; + if (partial) + pgoff ++; + mpnt = inode->i_mmap; do { struct mm_struct *mm = mpnt->vm_mm; @@ -902,19 +909,22 @@ void vmtruncate(struct inode * inode, unsigned long offset) unsigned long diff; /* mapping wholly truncated? */ - if (mpnt->vm_offset >= offset) { + if (mpnt->vm_pgoff >= pgoff) { flush_cache_range(mm, start, end); zap_page_range(mm, start, len); flush_tlb_range(mm, start, end); continue; } + /* mapping wholly unaffected? */ - diff = offset - mpnt->vm_offset; + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; if (diff >= len) continue; + /* Ok, partially affected.. */ - start += diff; - len = (len - diff) & PAGE_MASK; + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; if (start & ~PAGE_MASK) { partial_clear(mpnt, start); start = (start + ~PAGE_MASK) & PAGE_MASK; @@ -935,7 +945,7 @@ out_unlock: * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... 
*/ -void swapin_readahead(pte_t entry) +void swapin_readahead(swp_entry_t entry) { int i; struct page *new_page; @@ -969,7 +979,7 @@ void swapin_readahead(pte_t entry) static int do_swap_page(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long address, - pte_t * page_table, pte_t entry, int write_access) + pte_t * page_table, swp_entry_t entry, int write_access) { struct page *page = lookup_swap_cache(entry); pte_t pte; @@ -1015,7 +1025,7 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v struct page *page = NULL; pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { - page = get_free_highpage(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER); if (!page) return -1; if (PageHighMem(page)) @@ -1041,8 +1051,7 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * - * This is called with the MM semaphore and the kernel lock held. - * We need to release the kernel lock as soon as possible.. + * This is called with the MM semaphore held. */ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t *page_table) @@ -1059,10 +1068,10 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, * essentially an early COW detection. */ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); - if (!new_page) - return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */ - if (new_page == (struct page *)-1) - return -1; /* OOM */ + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; ++tsk->maj_flt; ++vma->vm_mm->rss; /* @@ -1116,7 +1125,7 @@ static inline int handle_pte_fault(struct task_struct *tsk, if (!pte_present(entry)) { if (pte_none(entry)) return do_no_page(tsk, vma, address, write_access, pte); - return do_swap_page(tsk, vma, address, pte, entry, write_access); + return do_swap_page(tsk, vma, address, pte, pte_to_swp_entry(entry), write_access); } /* diff --git a/mm/mlock.c b/mm/mlock.c index 9709d1a04..59d11b922 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -7,6 +7,7 @@ #include <linux/shm.h> #include <linux/mman.h> #include <linux/smp_lock.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -35,7 +36,7 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma, if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); vmlist_modify_lock(vma->vm_mm); - vma->vm_offset += end - vma->vm_start; + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = end; insert_vm_struct(current->mm, n); vmlist_modify_unlock(vma->vm_mm); @@ -52,7 +53,7 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma, return -EAGAIN; *n = *vma; n->vm_start = start; - n->vm_offset += n->vm_start - vma->vm_start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; n->vm_flags = newflags; if (n->vm_file) get_file(n->vm_file); @@ -82,7 +83,7 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma, *right = *vma; left->vm_end = start; right->vm_start = end; - right->vm_offset += right->vm_start - left->vm_start; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; vma->vm_flags = newflags; if (vma->vm_file) atomic_add(2, &vma->vm_file->f_count); @@ -92,7 +93,7 @@ static inline int 
mlock_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(right); } vmlist_modify_lock(vma->vm_mm); - vma->vm_offset += start - vma->vm_start; + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = start; vma->vm_end = end; vma->vm_flags = newflags; @@ -64,7 +64,7 @@ int vm_enough_memory(long pages) free = atomic_read(&buffermem_pages); free += atomic_read(&page_cache_size); - free += nr_free_pages; + free += nr_free_pages(); free += nr_swap_pages; return free > pages; } @@ -183,6 +183,8 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if (off + len < off) return -EINVAL; + off = off >> PAGE_SHIFT; + /* Too many mappings? */ if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; @@ -272,7 +274,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; - vma->vm_offset = off; + vma->vm_pgoff = off; vma->vm_file = NULL; vma->vm_private_data = NULL; @@ -533,7 +535,7 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area, area->vm_end = addr; vmlist_modify_lock(current->mm); } else if (addr == area->vm_start) { - area->vm_offset += (end - area->vm_start); + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; area->vm_start = end; vmlist_modify_lock(current->mm); } else { @@ -548,7 +550,8 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area, mpnt->vm_page_prot = area->vm_page_prot; mpnt->vm_flags = area->vm_flags; mpnt->vm_ops = area->vm_ops; - mpnt->vm_offset = area->vm_offset + (end - area->vm_start); + mpnt->vm_pgoff = area->vm_pgoff; + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; mpnt->vm_file = area->vm_file; mpnt->vm_private_data = area->vm_private_data; if (mpnt->vm_file) @@ -783,7 +786,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; - vma->vm_offset = 0; + vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; @@ -943,8 +946,9 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l * the offsets must be contiguous.. 
*/ if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) { - unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; - if (off != mpnt->vm_offset) + unsigned long off = prev->vm_pgoff; + off += (prev->vm_end - prev->vm_start) >> PAGE_SHIFT; + if (off != mpnt->vm_pgoff) continue; } @@ -957,7 +961,7 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l prev->vm_end = mpnt->vm_end; prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { - mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; + mpnt->vm_pgoff += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; mpnt->vm_start = mpnt->vm_end; vmlist_modify_unlock(mm); mpnt->vm_ops->close(mpnt); diff --git a/mm/mprotect.c b/mm/mprotect.c index 56454fc07..4752806de 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -72,11 +72,13 @@ static void change_protection(unsigned long start, unsigned long end, pgprot_t n flush_cache_range(current->mm, beg, end); if (start >= end) BUG(); + spin_lock(¤t->mm->page_table_lock); do { change_pmd_range(dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); + spin_unlock(¤t->mm->page_table_lock); flush_tlb_range(current->mm, beg, end); return; } @@ -109,7 +111,7 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma, if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); vmlist_modify_lock(vma->vm_mm); - vma->vm_offset += end - vma->vm_start; + vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = end; insert_vm_struct(current->mm, n); vmlist_modify_unlock(vma->vm_mm); @@ -127,7 +129,7 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma, return -ENOMEM; *n = *vma; n->vm_start = start; - n->vm_offset += n->vm_start - vma->vm_start; + n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; n->vm_flags = newflags; n->vm_page_prot = prot; if (n->vm_file) @@ -159,7 +161,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, *right = *vma; left->vm_end = start; right->vm_start = end; - right->vm_offset += right->vm_start - left->vm_start; + right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; if (vma->vm_file) atomic_add(2,&vma->vm_file->f_count); if (vma->vm_ops && vma->vm_ops->open) { @@ -167,7 +169,7 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(right); } vmlist_modify_lock(vma->vm_mm); - vma->vm_offset += start - vma->vm_start; + vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; vma->vm_start = start; vma->vm_end = end; vma->vm_flags = newflags; diff --git a/mm/mremap.c b/mm/mremap.c index b73996dc2..012ab7912 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -93,7 +93,6 @@ static int move_page_tables(struct mm_struct * mm, unsigned long offset = len; flush_cache_range(mm, old_addr, old_addr + len); - flush_tlb_range(mm, old_addr, old_addr + len); /* * This is not the clever way to do this, but we're taking the @@ -105,6 +104,7 @@ static int move_page_tables(struct mm_struct * mm, if (move_one_page(mm, old_addr + offset, new_addr + offset)) goto oops_we_failed; } + flush_tlb_range(mm, old_addr, old_addr + len); return 0; /* @@ -136,7 +136,8 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, *new_vma = *vma; new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; - new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); + new_vma->vm_pgoff = vma->vm_pgoff; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; if (new_vma->vm_file) 
get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27aa58468..95a2bc436 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4,6 +4,7 @@ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 */ #include <linux/config.h> @@ -22,7 +23,6 @@ #include <asm/pgtable.h> int nr_swap_pages = 0; -int nr_free_pages = 0; int nr_lru_pages; LIST_HEAD(lru_cache); @@ -36,30 +36,46 @@ LIST_HEAD(lru_cache); #if CONFIG_AP1000 /* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram for the ring buffers */ -#define NR_MEM_LISTS 12 +#define MAX_ORDER 12 #else -#define NR_MEM_LISTS 10 +#define MAX_ORDER 10 #endif -struct free_area_struct { +typedef struct free_area_struct { struct list_head free_list; unsigned int * map; - unsigned long count; -}; +} free_area_t; -#define MEM_TYPE_DMA 0 -#define MEM_TYPE_NORMAL 1 -#define MEM_TYPE_HIGH 2 - -static const char *mem_type_strs[] = {"DMA", "Normal", "High"}; +#define ZONE_DMA 0 +#define ZONE_NORMAL 1 #ifdef CONFIG_HIGHMEM -#define NR_MEM_TYPES 3 +# define ZONE_HIGHMEM 2 +# define NR_ZONES 3 #else -#define NR_MEM_TYPES 2 +# define NR_ZONES 2 #endif -static struct free_area_struct free_area[NR_MEM_TYPES][NR_MEM_LISTS]; +typedef struct zone_struct { + spinlock_t lock; + unsigned long offset; + unsigned long size; + free_area_t free_area[MAX_ORDER]; + + unsigned long free_pages; + unsigned long pages_low, pages_high; + int low_on_memory; + char * name; +} zone_t; + +static zone_t zones[NR_ZONES] = + { + { name: "DMA" }, + { name: "Normal" }, +#ifdef CONFIG_HIGHMEM + { name: "HighMem" } +#endif + }; /* * Free_page() adds the page to the free lists. This is optimized for @@ -73,13 +89,6 @@ static struct free_area_struct free_area[NR_MEM_TYPES][NR_MEM_LISTS]; * for the normal case, giving better asm-code. */ -/* - * Buddy system. Hairy. You really aren't expected to understand this - * - * Hint: -mask = 1+~mask - */ -spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; - #define memlist_init(x) INIT_LIST_HEAD(x) #define memlist_add_head list_add #define memlist_add_tail list_add_tail @@ -88,35 +97,54 @@ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; #define memlist_next(x) ((x)->next) #define memlist_prev(x) ((x)->prev) -static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsigned long order) +/* + * Temporary debugging check. + */ +#define BAD_RANGE(zone,x) ((((x)-mem_map) < zone->offset) || (((x)-mem_map) >= zone->offset+zone->size)) + +/* + * Buddy system. Hairy. You really aren't expected to understand this + * + * Hint: -mask = 1+~mask + */ + +static inline void free_pages_ok (struct page *page, unsigned long map_nr, unsigned long order) { struct free_area_struct *area; - unsigned long index = map_nr >> (1 + order); - unsigned long mask = (~0UL) << order; + unsigned long index, page_idx, mask, offset; unsigned long flags; struct page *buddy; + zone_t *zone; + int i; - spin_lock_irqsave(&page_alloc_lock, flags); - -#define list(x) (mem_map+(x)) - -#ifdef CONFIG_HIGHMEM - if (map_nr >= highmem_mapnr) { - area = free_area[MEM_TYPE_HIGH]; - nr_free_highpages -= mask; - } else -#endif - if (PageDMA(page)) - area = free_area[MEM_TYPE_DMA]; - else - area = free_area[MEM_TYPE_NORMAL]; + /* + * Which zone is this page belonging to. 
+ * + * (NR_ZONES is low, and we do not want (yet) to introduce + * put page->zone, it increases the size of mem_map[] + * unnecesserily. This small loop is basically equivalent + * to the previous #ifdef jungle, speed-wise.) + */ + i = NR_ZONES-1; + zone = zones + i; + for ( ; i >= 0; i--, zone--) + if (map_nr >= zone->offset) + break; + mask = (~0UL) << order; + offset = zone->offset; + area = zone->free_area; area += order; + page_idx = map_nr - zone->offset; + page_idx &= mask; + index = page_idx >> (1 + order); + mask = (~0UL) << order; - map_nr &= mask; - nr_free_pages -= mask; + spin_lock_irqsave(&zone->lock, flags); - while (mask + (1 << (NR_MEM_LISTS-1))) { + zone->free_pages -= mask; + + while (mask + (1 << (MAX_ORDER-1))) { if (!test_and_change_bit(index, area->map)) /* * the buddy page is still allocated. @@ -125,21 +153,22 @@ static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsign /* * Move the buddy up one level. */ - buddy = list(map_nr ^ -mask); - page = list(map_nr); + buddy = mem_map + offset + (page_idx ^ -mask); + page = mem_map + offset + page_idx; + if (BAD_RANGE(zone,buddy)) + BUG(); + if (BAD_RANGE(zone,page)) + BUG(); - area->count--; memlist_del(&buddy->list); mask <<= 1; area++; index >>= 1; - map_nr &= mask; + page_idx &= mask; } - area->count++; - memlist_add_head(&(list(map_nr))->list, &area->free_list); -#undef list + memlist_add_head(&mem_map[offset + page_idx].list, &area->free_list); - spin_unlock_irqrestore(&page_alloc_lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } /* @@ -147,10 +176,9 @@ static inline void free_pages_ok(struct page *page, unsigned long map_nr, unsign */ #define MARK_USED(index, order, area) \ change_bit((index) >> (1+(order)), (area)->map) -#define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -int __free_page(struct page *page) +int __free_page (struct page *page) { if (!PageReserved(page) && put_page_testzero(page)) { if (PageSwapCache(page)) @@ -164,7 +192,7 @@ int __free_page(struct page *page) return 0; } -int free_pages(unsigned long addr, unsigned long order) +int free_pages (unsigned long addr, unsigned long order) { unsigned long map_nr = MAP_NR(addr); @@ -182,16 +210,17 @@ int free_pages(unsigned long addr, unsigned long order) return 0; } -static inline unsigned long EXPAND (struct page *map, unsigned long index, +static inline unsigned long EXPAND (zone_t *zone, struct page *map, unsigned long index, int low, int high, struct free_area_struct * area) { unsigned long size = 1 << high; while (high > low) { + if (BAD_RANGE(zone,map)) + BUG(); area--; high--; size >>= 1; - area->count++; memlist_add_head(&(map)->list, &(area)->free_list); MARK_USED(index, high, area); index += size; @@ -201,79 +230,62 @@ static inline unsigned long EXPAND (struct page *map, unsigned long index, return index; } -static inline struct page * rmqueue (int order, unsigned type) +static inline struct page * rmqueue (zone_t *zone, unsigned long order) { - struct free_area_struct * area = free_area[type]+order; + struct free_area_struct * area = zone->free_area + order; unsigned long curr_order = order, map_nr; - struct page *page; struct list_head *head, *curr; + unsigned long flags; + struct page *page; + spin_lock_irqsave(&zone->lock, flags); do { head = &area->free_list; curr = memlist_next(head); if (curr != head) { + unsigned int index; + page = memlist_entry(curr, struct page, list); memlist_del(curr); - area->count--; - map_nr = page - mem_map; - MARK_USED(map_nr, 
curr_order, area); - nr_free_pages -= 1 << order; - map_nr = EXPAND(page, map_nr, order, curr_order, area); + map_nr = page - mem_map; + index = map_nr - zone->offset; + MARK_USED(index, curr_order, area); + zone->free_pages -= 1 << order; + map_nr = zone->offset + EXPAND(zone, page, index, order, curr_order, area); + spin_unlock_irqrestore(&zone->lock, flags); + page = mem_map + map_nr; + if (BAD_RANGE(zone,page)) + BUG(); return page; } curr_order++; area++; - } while (curr_order < NR_MEM_LISTS); + } while (curr_order < MAX_ORDER); + spin_unlock_irqrestore(&zone->lock, flags); return NULL; } -static inline int balance_lowmemory (int gfp_mask) +static inline int balance_memory (zone_t *zone, int gfp_mask) { int freed; - static int low_on_memory = 0; -#ifndef CONFIG_HIGHMEM - if (nr_free_pages > freepages.min) { - if (!low_on_memory) + if (zone->free_pages > zone->pages_low) { + if (!zone->low_on_memory) return 1; - if (nr_free_pages >= freepages.high) { - low_on_memory = 0; + /* + * Simple hysteresis: exit 'low memory mode' if + * the upper limit has been reached: + */ + if (zone->free_pages >= zone->pages_high) { + zone->low_on_memory = 0; return 1; } } + zone->low_on_memory = 1; - low_on_memory = 1; -#else - static int low_on_highmemory = 0; - - if (gfp_mask & __GFP_HIGHMEM) - { - if (nr_free_pages > freepages.min) { - if (!low_on_highmemory) { - return 1; - } - if (nr_free_pages >= freepages.high) { - low_on_highmemory = 0; - return 1; - } - } - low_on_highmemory = 1; - } else { - if (nr_free_pages+nr_free_highpages > freepages.min) { - if (!low_on_memory) { - return 1; - } - if (nr_free_pages+nr_free_highpages >= freepages.high) { - low_on_memory = 0; - return 1; - } - } - low_on_memory = 1; - } -#endif current->flags |= PF_MEMALLOC; freed = try_to_free_pages(gfp_mask); current->flags &= ~PF_MEMALLOC; @@ -283,13 +295,12 @@ static inline int balance_lowmemory (int gfp_mask) return 1; } -struct page * __get_pages(int gfp_mask, unsigned long order) +static inline struct page * __get_pages (zone_t *zone, unsigned int gfp_mask, + unsigned long order) { - unsigned long flags; struct page *page; - unsigned type; - if (order >= NR_MEM_LISTS) + if (order >= MAX_ORDER) goto nopage; /* @@ -303,28 +314,20 @@ struct page * __get_pages(int gfp_mask, unsigned long order) * further thought. */ if (!(current->flags & PF_MEMALLOC)) - goto lowmemory; - -ok_to_allocate: -#ifdef CONFIG_HIGHMEM - if (gfp_mask & __GFP_HIGHMEM) - type = MEM_TYPE_HIGH; - else -#endif - if (gfp_mask & __GFP_DMA) - type = MEM_TYPE_DMA; - else - type = MEM_TYPE_NORMAL; - - spin_lock_irqsave(&page_alloc_lock, flags); + if (!balance_memory(zone, gfp_mask)) + goto nopage; + /* + * We are falling back to lower-level zones if allocation + * in a higher zone fails. This assumes a hierarchical + * dependency between zones, which is true currently. If + * you need something else then move this loop outside + * this function, into the zone-specific allocator. + */ do { - page = rmqueue(order, type); - if (page) { - spin_unlock_irqrestore(&page_alloc_lock, flags); + page = rmqueue(zone, order); + if (page) return page; - } - } while (type-- > 0) ; - spin_unlock_irqrestore(&page_alloc_lock, flags); + } while (zone-- != zones) ; /* * If we can schedule, do so, and make sure to yield. 
@@ -338,60 +341,114 @@ ok_to_allocate: nopage: return NULL; +} -lowmemory: - if (balance_lowmemory(gfp_mask)) - goto ok_to_allocate; - goto nopage; +static inline zone_t * gfp_mask_to_zone (int gfp_mask) +{ + zone_t *zone; + +#if CONFIG_HIGHMEM + if (gfp_mask & __GFP_HIGHMEM) + zone = zones + ZONE_HIGHMEM; + else +#endif + if (gfp_mask & __GFP_DMA) + zone = zones + ZONE_DMA; + else + zone = zones + ZONE_NORMAL; + return zone; } -unsigned long __get_free_pages(int gfp_mask, unsigned long order) +unsigned long __get_free_pages (int gfp_mask, unsigned long order) { struct page *page; - page = __get_pages(gfp_mask, order); + + page = __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order); if (!page) return 0; return page_address(page); } -struct page * get_free_highpage(int gfp_mask) +struct page * alloc_pages (int gfp_mask, unsigned long order) { - return __get_pages(gfp_mask, 0); + return __get_pages(gfp_mask_to_zone(gfp_mask), gfp_mask, order); } /* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages (void) +{ + unsigned int sum; + zone_t *zone; + + sum = 0; + for (zone = zones; zone < zones+NR_ZONES; zone++) + sum += zone->free_pages; + return sum; +} + +/* + * Amount of free RAM allocatable as buffer memory: + */ +unsigned int nr_free_buffer_pages (void) +{ + unsigned int sum; + zone_t *zone; + + sum = nr_lru_pages; + for (zone = zones; zone <= zones+ZONE_NORMAL; zone++) + sum += zone->free_pages; + return sum; +} + +#if CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + return zones[ZONE_HIGHMEM].free_pages; +} +#endif + +/* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. 
*/ void show_free_areas(void) { - unsigned long order, flags; + unsigned long order; unsigned type; - spin_lock_irqsave(&page_alloc_lock, flags); - printk("Free pages: %6dkB (%6ldkB HighMem)\n", - nr_free_pages<<(PAGE_SHIFT-10), - nr_free_highpages<<(PAGE_SHIFT-10)); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages()<<(PAGE_SHIFT-10), + nr_free_highpages()<<(PAGE_SHIFT-10)); printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", - nr_free_pages, + nr_free_pages(), nr_lru_pages, freepages.min, freepages.low, freepages.high); - for (type = 0; type < NR_MEM_TYPES; type++) { + for (type = 0; type < NR_ZONES; type++) { + zone_t *zone = zones + type; unsigned long total = 0; - printk(" %s: ", mem_type_strs[type]); - for (order = 0; order < NR_MEM_LISTS; order++) { - unsigned long nr = free_area[type][order].count; + printk(" %s: ", zone->name); + for (order = 0; order < MAX_ORDER; order++) { + unsigned long i, nr; + + nr = 0; + for (i = 0; i < zone->size; i += 1<<order) { + struct page * page; + page = mem_map + zone->offset + i; + if (!page_count(page)) + nr++; + } total += nr * ((PAGE_SIZE>>10) << order); printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order)); } printk("= %lukB)\n", total); } - spin_unlock_irqrestore(&page_alloc_lock, flags); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -401,18 +458,24 @@ void show_free_areas(void) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* - * set up the free-area data structures: + * Set up the zone data structures: * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps */ -volatile int data; -void __init free_area_init(unsigned long end_mem_pages) +void __init free_area_init(unsigned int *zones_size) { mem_map_t * p; unsigned long i, j; unsigned long map_size; + unsigned int totalpages, offset; + + totalpages = 0; + for (i = 0; i < NR_ZONES; i++) + totalpages += zones_size[i]; + printk("totalpages: %08x\n", totalpages); + i = totalpages >> 7; /* * Select nr of pages we try to keep free for important stuff * with a minimum of 10 pages and a maximum of 256 pages, so @@ -420,7 +483,7 @@ void __init free_area_init(unsigned long end_mem_pages) * This is fairly arbitrary, but based on some behaviour * analysis. */ - i = end_mem_pages >> 7; + i = totalpages >> 7; if (i < 10) i = 10; if (i > 256) @@ -430,11 +493,10 @@ void __init free_area_init(unsigned long end_mem_pages) freepages.high = i * 3; /* - * Most architectures just pick 'start_mem'. Some architectures - * (with lots of mem and discontinous memory maps) have to search - * for a good area. + * Some architectures (with lots of mem and discontinous memory + * maps) have to search for a good mem_map area: */ - map_size = end_mem_pages*sizeof(struct page); + map_size = totalpages*sizeof(struct page); mem_map = (struct page *) alloc_bootmem(map_size); memset(mem_map, 0, map_size); @@ -443,27 +505,39 @@ void __init free_area_init(unsigned long end_mem_pages) * up by free_all_bootmem() once the early boot process is * done. 
*/ - for (p = mem_map; p < mem_map + end_mem_pages; p++) { + for (p = mem_map; p < mem_map + totalpages; p++) { set_page_count(p, 0); p->flags = (1 << PG_DMA); SetPageReserved(p); init_waitqueue_head(&p->wait); memlist_init(&p->list); } - - for (j = 0 ; j < NR_MEM_TYPES ; j++) { + + offset = 0; + for (j = 0; j < NR_ZONES; j++) { + zone_t *zone = zones + j; unsigned long mask = -1; - for (i = 0 ; i < NR_MEM_LISTS ; i++) { + unsigned long size; + + size = zones_size[j]; + zone->size = size; + zone->offset = offset; + zone->pages_low = freepages.low; + zone->pages_high = freepages.high; + zone->low_on_memory = 0; + + offset += size; + for (i = 0; i < MAX_ORDER; i++) { unsigned long bitmap_size; unsigned int * map; - memlist_init(&free_area[j][i].free_list); + memlist_init(&zone->free_area[i].free_list); mask += mask; - end_mem_pages = (end_mem_pages + ~mask) & mask; - bitmap_size = end_mem_pages >> i; + size = (size + ~mask) & mask; + bitmap_size = size >> i; bitmap_size = (bitmap_size + 7) >> 3; bitmap_size = LONG_ALIGN(bitmap_size); map = (unsigned int *) alloc_bootmem(bitmap_size); - free_area[j][i].map = map; + zone->free_area[i].map = map; memset((void *) map, 0, bitmap_size); } } diff --git a/mm/page_io.c b/mm/page_io.c index c5ed3ed74..0012fe234 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -33,7 +33,7 @@ * that shared pages stay shared while being swapped. */ -static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait) +static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page, int wait) { unsigned long type, offset; struct swap_info_struct * p; @@ -59,7 +59,7 @@ static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait) return 0; } if (p->swap_map && !p->swap_map[offset]) { - pte_ERROR(entry); + printk("VM: Bad swap entry %08lx\n", entry.val); return 0; } if (!(p->flags & SWP_USED)) { @@ -130,7 +130,9 @@ static int rw_swap_page_base(int rw, pte_t entry, struct page *page, int wait) */ void rw_swap_page(int rw, struct page *page, int wait) { - pte_t entry = get_pagecache_pte(page); + swp_entry_t entry; + + entry.val = page->index; if (!PageLocked(page)) PAGE_BUG(page); @@ -147,7 +149,7 @@ void rw_swap_page(int rw, struct page *page, int wait) * Therefore we can't use it. Later when we can remove the need for the * lock map and we can reduce the number of functions exported. */ -void rw_swap_page_nolock(int rw, pte_t entry, char *buf, int wait) +void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf, int wait) { struct page *page = mem_map + MAP_NR(buf); @@ -114,7 +114,7 @@ /* If there is a different PAGE_SIZE around, and it works with this allocator, * then change the following. 
*/ -#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096) +#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096 && PAGE_SIZE != 32768) #error Your page size is probably not correctly supported - please check #endif diff --git a/mm/swap_state.c b/mm/swap_state.c index d79b7bffb..f63eca66a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -40,7 +40,7 @@ void show_swap_cache_info(void) } #endif -void add_to_swap_cache(struct page *page, pte_t entry) +void add_to_swap_cache(struct page *page, swp_entry_t entry) { #ifdef SWAP_CACHE_INFO swap_cache_add_total++; @@ -49,7 +49,7 @@ void add_to_swap_cache(struct page *page, pte_t entry) BUG(); if (page->mapping) BUG(); - add_to_page_cache(page, &swapper_space, pte_val(entry)); + add_to_page_cache(page, &swapper_space, entry.val); } /* @@ -58,17 +58,16 @@ void add_to_swap_cache(struct page *page, pte_t entry) * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. */ -int swap_duplicate(pte_t entry) +int swap_duplicate(swp_entry_t entry) { struct swap_info_struct * p; unsigned long offset, type; int result = 0; - if (!pte_val(entry)) + /* Swap entry 0 is illegal */ + if (!entry.val) goto out; type = SWP_TYPE(entry); - if (type & SHM_SWP_TYPE) - goto out; if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; @@ -85,7 +84,7 @@ int swap_duplicate(pte_t entry) else { static int overflow = 0; if (overflow++ < 5) - pte_ERROR(entry); + printk("VM: swap entry overflow\n"); p->swap_map[offset] = SWAP_MAP_MAX; } result = 1; @@ -93,13 +92,13 @@ out: return result; bad_file: - pte_ERROR(entry); + printk("Bad swap file entry %08lx\n", entry.val); goto out; bad_offset: - pte_ERROR(entry); + printk("Bad swap offset entry %08lx\n", entry.val); goto out; bad_unused: - pte_ERROR(entry); + printk("Unused swap offset entry %08lx\n", entry.val); goto out; } @@ -107,14 +106,13 @@ int swap_count(struct page *page) { struct swap_info_struct * p; unsigned long offset, type; - pte_t entry = get_pagecache_pte(page); + swp_entry_t entry; int retval = 0; - if (!pte_val(entry)) + entry.val = page->index; + if (!entry.val) goto bad_entry; type = SWP_TYPE(entry); - if (type & SHM_SWP_TYPE) - goto out; if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; @@ -131,13 +129,13 @@ bad_entry: printk(KERN_ERR "swap_count: null entry!\n"); goto out; bad_file: - pte_ERROR(entry); + printk("Bad swap file entry %08lx\n", entry.val); goto out; bad_offset: - pte_ERROR(entry); + printk("Bad swap offset entry %08lx\n", entry.val); goto out; bad_unused: - pte_ERROR(entry); + printk("Unused swap offset entry %08lx\n", entry.val); goto out; } @@ -160,7 +158,9 @@ static inline void remove_from_swap_cache(struct page *page) */ void __delete_from_swap_cache(struct page *page) { - pte_t entry = get_pagecache_pte(page); + swp_entry_t entry; + + entry.val = page->index; #ifdef SWAP_CACHE_INFO swap_cache_del_total++; @@ -223,7 +223,7 @@ void free_page_and_swap_cache(struct page *page) * lock before returning. */ -struct page * lookup_swap_cache(pte_t entry) +struct page * lookup_swap_cache(swp_entry_t entry) { struct page *found; @@ -232,9 +232,9 @@ struct page * lookup_swap_cache(pte_t entry) #endif while (1) { /* - * Right now the pagecache is 32-bit only. + * Right now the pagecache is 32-bit only. But it's a 32 bit index. 
=) */ - found = find_lock_page(&swapper_space, pte_val(entry)); + found = find_lock_page(&swapper_space, entry.val); if (!found) return 0; if (found->mapping != &swapper_space || !PageSwapCache(found)) @@ -262,7 +262,7 @@ out_bad: * the swap entry is no longer in use. */ -struct page * read_swap_cache_async(pte_t entry, int wait) +struct page * read_swap_cache_async(swp_entry_t entry, int wait) { struct page *found_page = 0, *new_page; unsigned long new_page_addr; diff --git a/mm/swapfile.c b/mm/swapfile.c index bcd7b4587..c34a5316a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,7 +25,7 @@ struct swap_info_struct swap_info[MAX_SWAPFILES]; #define SWAPFILE_CLUSTER 256 -static inline int scan_swap_map(struct swap_info_struct *si) +static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count) { unsigned long offset; /* @@ -73,7 +73,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; - si->swap_map[offset] = 1; + si->swap_map[offset] = count; nr_swap_pages--; si->cluster_next = offset+1; return offset; @@ -81,23 +81,26 @@ static inline int scan_swap_map(struct swap_info_struct *si) return 0; } -pte_t get_swap_page(void) +swp_entry_t __get_swap_page(unsigned short count) { struct swap_info_struct * p; unsigned long offset; - pte_t entry = __pte(0); + swp_entry_t entry; int type, wrapped = 0; + entry.val = 0; /* Out of memory */ type = swap_list.next; if (type < 0) goto out; if (nr_swap_pages == 0) goto out; + if (count >= SWAP_MAP_MAX) + goto bad_count; while (1) { p = &swap_info[type]; if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - offset = scan_swap_map(p); + offset = scan_swap_map(p, count); if (offset) { entry = SWP_ENTRY(type,offset); type = swap_info[type].next; @@ -122,20 +125,23 @@ pte_t get_swap_page(void) } out: return entry; + +bad_count: + printk(KERN_ERR "get_swap_page: bad count %hd from %p\n", + count, __builtin_return_address(0)); + goto out; } -void swap_free(pte_t entry) +void __swap_free(swp_entry_t entry, unsigned short count) { struct swap_info_struct * p; unsigned long offset, type; - if (!pte_val(entry)) + if (!entry.val) goto out; type = SWP_TYPE(entry); - if (type & SHM_SWP_TYPE) - goto out; if (type >= nr_swapfiles) goto bad_nofile; p = & swap_info[type]; @@ -149,7 +155,9 @@ void swap_free(pte_t entry) if (!p->swap_map[offset]) goto bad_free; if (p->swap_map[offset] < SWAP_MAP_MAX) { - if (!--p->swap_map[offset]) { + if (p->swap_map[offset] < count) + goto bad_count; + if (!(p->swap_map[offset] -= count)) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -170,27 +178,28 @@ bad_offset: printk("swap_free: offset exceeds max\n"); goto out; bad_free: - pte_ERROR(entry); + printk("VM: Bad swap entry %08lx\n", entry.val); + goto out; +bad_count: + printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]); goto out; } /* needs the big kernel lock */ -pte_t acquire_swap_entry(struct page *page) +swp_entry_t acquire_swap_entry(struct page *page) { struct swap_info_struct * p; unsigned long offset, type; - pte_t entry; + swp_entry_t entry; if (!test_bit(PG_swap_entry, &page->flags)) goto new_swap_entry; /* We have the old entry in the page offset still */ - if (!page->offset) + if (!page->index) goto new_swap_entry; - entry = get_pagecache_pte(page); + entry.val = page->index; type = SWP_TYPE(entry); - if (type & SHM_SWP_TYPE) - goto new_swap_entry; if (type >= nr_swapfiles) goto new_swap_entry; p = type + 
swap_info; @@ -222,7 +231,7 @@ new_swap_entry: * what to do if a write is requested later. */ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, pte_t entry, struct page* page) + pte_t *dir, swp_entry_t entry, struct page* page) { pte_t pte = *dir; @@ -238,17 +247,17 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, set_pte(dir, pte_mkdirty(pte)); return; } - if (pte_val(pte) != pte_val(entry)) + if (pte_val(pte) != entry.val) return; set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); - get_page(mem_map + MAP_NR(page)); + get_page(page); ++vma->vm_mm->rss; } static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - pte_t entry, struct page* page) + swp_entry_t entry, struct page* page) { pte_t * pte; unsigned long end; @@ -275,7 +284,7 @@ static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - pte_t entry, struct page* page) + swp_entry_t entry, struct page* page) { pmd_t * pmd; unsigned long offset, end; @@ -304,7 +313,7 @@ static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, } static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - pte_t entry, struct page* page) + swp_entry_t entry, struct page* page) { unsigned long start = vma->vm_start, end = vma->vm_end; @@ -318,7 +327,7 @@ static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, } static void unuse_process(struct mm_struct * mm, - pte_t entry, struct page* page) + swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; @@ -344,7 +353,7 @@ static int try_to_unuse(unsigned int type) struct swap_info_struct * si = &swap_info[type]; struct task_struct *p; struct page *page; - pte_t entry; + swp_entry_t entry; int i; while (1) { @@ -388,7 +397,7 @@ static int try_to_unuse(unsigned int type) */ if (si->swap_map[i] != 0) { if (si->swap_map[i] != SWAP_MAP_MAX) - pte_ERROR(entry); + printk("VM: Undead swap entry %08lx\n", entry.val); si->swap_map[i] = 0; nr_swap_pages++; } @@ -616,7 +625,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) swapfilesize = 0; if (blk_size[MAJOR(dev)]) swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] - / (PAGE_SIZE / 1024); + >> (PAGE_SHIFT - 10); } else if (S_ISREG(swap_dentry->d_inode->i_mode)) { error = -EBUSY; for (i = 0 ; i < nr_swapfiles ; i++) { @@ -625,7 +634,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode) goto bad_swap; } - swapfilesize = swap_dentry->d_inode->i_size / PAGE_SIZE; + swapfilesize = swap_dentry->d_inode->i_size >> PAGE_SHIFT; } else goto bad_swap; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0978f544c..d7908df16 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -10,7 +10,7 @@ #include <asm/uaccess.h> -static struct vm_struct * vmlist = NULL; +struct vm_struct * vmlist = NULL; static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) { @@ -97,7 +97,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo struct page * page; if (!pte_none(*pte)) printk(KERN_ERR "alloc_area_pte: page already exists\n"); - page = get_free_highpage(GFP_KERNEL|__GFP_HIGHMEM); + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); if (!page) return -ENOMEM; set_pte(pte, mk_pte(page, prot)); @@ -204,7 
+204,7 @@ void * vmalloc_prot(unsigned long size, pgprot_t prot) struct vm_struct *area; size = PAGE_ALIGN(size); - if (!size || size > (max_mapnr << PAGE_SHIFT)) { + if (!size || (size >> PAGE_SHIFT) > max_mapnr) { BUG(); return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9db9ce6f9..14f5dc444 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,8 @@ */ static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { - pte_t pte, entry; + pte_t pte; + swp_entry_t entry; struct page * page; int (*swapout)(struct page *, struct file *); @@ -72,9 +73,9 @@ static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pt * memory, and we should just continue our scan. */ if (PageSwapCache(page)) { - entry = get_pagecache_pte(page); + entry.val = page->index; swap_duplicate(entry); - set_pte(page_table, entry); + set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: vma->vm_mm->rss--; flush_tlb_page(vma, address); @@ -151,14 +152,14 @@ drop_pte: * page with that swap entry. */ entry = acquire_swap_entry(page); - if (!pte_val(entry)) + if (!entry.val) goto out_failed; /* No swap space left */ if (!(page = prepare_highmem_swapout(page))) goto out_swap_free; vma->vm_mm->rss--; - set_pte(page_table, entry); + set_pte(page_table, swp_entry_to_pte(entry)); vmlist_access_unlock(vma->vm_mm); flush_tlb_page(vma, address); @@ -502,7 +503,7 @@ int kswapd(void *unused) do { /* kswapd is critical to provide GFP_ATOMIC allocations (not GFP_HIGHMEM ones). */ - if (nr_free_pages - nr_free_highpages >= freepages.high) + if (nr_free_buffer_pages() >= freepages.high) break; if (!do_try_to_free_pages(GFP_KSWAPD)) |
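
The __get_pages() hunk in mm/page_alloc.c above replaces the old "type--" retry with a walk down the zones[] array. The sketch below is a minimal userspace model of that fallback, not kernel code: it assumes the zones are ordered ZONE_DMA < ZONE_NORMAL < ZONE_HIGHMEM as in the patch, and the zone sizes, page budget and the helper name alloc_from() are invented for illustration.

/*
 * Model of the zone fallback in __get_pages(): a request aimed at a
 * higher zone walks back towards zones[0] until some zone can satisfy
 * it (the kernel expresses this as "do { ... } while (zone-- != zones)").
 */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, NR_ZONES };

typedef struct {
	const char *name;
	unsigned int free_pages;
} zone_t;

static zone_t zones[NR_ZONES] = {
	{ "DMA",      16 },
	{ "Normal",  128 },
	{ "HighMem",   0 },	/* pretend highmem is exhausted */
};

/* Try 'zone' first, then fall back through the lower zones. */
static zone_t *alloc_from(zone_t *zone, unsigned int pages)
{
	for (;;) {
		if (zone->free_pages >= pages) {
			zone->free_pages -= pages;
			return zone;
		}
		if (zone == zones)	/* already at ZONE_DMA, give up */
			return NULL;
		zone--;
	}
}

int main(void)
{
	zone_t *z = alloc_from(zones + ZONE_HIGHMEM, 4);

	printf("allocated from: %s\n", z ? z->name : "nowhere");
	return 0;
}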
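
balance_memory() above adds per-zone hysteresis around the pages_low/pages_high watermarks, so a zone does not flip in and out of reclaim at a single threshold. The sketch below models only that decision; the reclaim path is stubbed out and the watermark numbers are invented.

/*
 * Model of the balance_memory() hysteresis: once free pages drop to
 * pages_low the zone enters "low memory mode" and stays there until
 * free pages climb back above pages_high.
 */
#include <stdio.h>

typedef struct {
	unsigned int free_pages;
	unsigned int pages_low;
	unsigned int pages_high;
	int low_on_memory;
} zone_t;

/* Returns 1 if an allocation may proceed without reclaiming first. */
static int zone_ok_without_reclaim(zone_t *zone)
{
	if (zone->free_pages > zone->pages_low) {
		if (!zone->low_on_memory)
			return 1;
		/* leave low-memory mode only above the high watermark */
		if (zone->free_pages >= zone->pages_high) {
			zone->low_on_memory = 0;
			return 1;
		}
	}
	zone->low_on_memory = 1;
	return 0;		/* caller would try_to_free_pages() here */
}

int main(void)
{
	zone_t z = { .free_pages = 40, .pages_low = 32,
		     .pages_high = 96, .low_on_memory = 0 };

	printf("%d\n", zone_ok_without_reclaim(&z));	/* 1: above pages_low  */
	z.free_pages = 20;
	printf("%d\n", zone_ok_without_reclaim(&z));	/* 0: below pages_low  */
	z.free_pages = 40;
	printf("%d\n", zone_ok_without_reclaim(&z));	/* 0: still in low mode */
	z.free_pages = 100;
	printf("%d\n", zone_ok_without_reclaim(&z));	/* 1: above pages_high */
	return 0;
}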
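
Throughout mm/page_io.c, mm/swap_state.c and mm/swapfile.c the patch switches swap entries from pte_t to a plain swp_entry_t whose .val is what ends up in page->index. The sketch below models how such an entry might pack a swap-file type and an offset; the bit layout used here (type in bits 1-6, offset from bit 8 up) follows the i386 convention of the era but should be treated as an assumption, since the real SWP_TYPE/SWP_OFFSET/SWP_ENTRY macros are architecture specific.

/*
 * Model of swp_entry_t packing: entry.val is a single word that can be
 * stored directly in page->index and split back into (type, offset).
 */
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

#define SWP_TYPE(e)		(((e).val >> 1) & 0x3f)
#define SWP_OFFSET(e)		((e).val >> 8)
#define SWP_ENTRY(type, off)	((swp_entry_t){ ((type) << 1) | ((off) << 8) })

int main(void)
{
	swp_entry_t entry = SWP_ENTRY(2UL, 12345UL);

	/* entry.val is what would be kept in page->index */
	printf("val=%#lx type=%lu offset=%lu\n",
	       entry.val, SWP_TYPE(entry), SWP_OFFSET(entry));
	return 0;
}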
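
The mm/swapfile.c changes let __get_swap_page() seed a slot's swap_map count with an arbitrary value and let __swap_free() drop it by the same amount. The toy model below shows only that counting behaviour; the slot scan, cluster logic and lowest_bit/highest_bit window of the real scan_swap_map() are left out, and the table size is invented.

/*
 * Model of swap_map use counting: each slot carries a count, a slot is
 * free when the count is zero, and SWAP_MAP_MAX marks "permanent"
 * entries that only swapoff reclaims.
 */
#include <stdio.h>

#define NSLOTS       8
#define SWAP_MAP_MAX 0x7fff

static unsigned short swap_map[NSLOTS];	/* use count per swap slot */

/* Hand out a free slot preloaded with 'count' references, -1 if full. */
static int get_swap_slot(unsigned short count)
{
	int i;

	for (i = 0; i < NSLOTS; i++) {
		if (!swap_map[i]) {
			swap_map[i] = count;
			return i;
		}
	}
	return -1;
}

/* Drop 'count' references; the slot is reusable once the count hits zero. */
static void free_swap_slot(int slot, unsigned short count)
{
	if (slot < 0 || slot >= NSLOTS)
		return;
	if (swap_map[slot] == SWAP_MAP_MAX)
		return;				/* left for swapoff */
	if (swap_map[slot] < count) {
		printf("bad count on slot %d\n", slot);
		return;
	}
	swap_map[slot] -= count;
}

int main(void)
{
	int slot = get_swap_slot(2);		/* entry shared by two users */

	if (slot < 0)
		return 1;
	free_swap_slot(slot, 1);
	printf("slot %d now has count %d\n", slot, swap_map[slot]);
	free_swap_slot(slot, 1);
	printf("slot %d now has count %d (free again)\n", slot, swap_map[slot]);
	return 0;
}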