author | Ralf Baechle <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000
---|---|---
committer | <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000
commit | beb116954b9b7f3bb56412b2494b562f02b864b1 (patch) |
tree | 120e997879884e1b9d93b265221b939d2ef1ade1 /mm |
parent | 908d4681a1dc3792ecafbe64265783a86c4cccb6 (diff) |
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 27
-rw-r--r-- | mm/filemap.c | 1218
-rw-r--r-- | mm/kmalloc.c | 663
-rw-r--r-- | mm/memory.c | 920
-rw-r--r-- | mm/mlock.c | 272
-rw-r--r-- | mm/mmap.c | 360
-rw-r--r-- | mm/mprotect.c | 22
-rw-r--r-- | mm/mremap.c | 224
-rw-r--r-- | mm/page_alloc.c | 339
-rw-r--r-- | mm/page_io.c | 193
-rw-r--r-- | mm/swap.c | 1263
-rw-r--r-- | mm/swap_state.c | 111
-rw-r--r-- | mm/swapfile.c | 577
-rw-r--r-- | mm/vmalloc.c | 91
-rw-r--r-- | mm/vmscan.c | 453
15 files changed, 4351 insertions, 2382 deletions
diff --git a/mm/Makefile b/mm/Makefile index 35f51d45f..19552c98f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,26 +7,9 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := mm.o +O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + kmalloc.o vmalloc.o \ + swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o -OBJS = memory.o swap.o mmap.o filemap.o mprotect.o kmalloc.o vmalloc.o - -mm.o: $(OBJS) - $(LD) -r -o mm.o $(OBJS) - -modules: - -dep: - $(CPP) -M *.c > .depend - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif +include $(TOPDIR)/Rules.make diff --git a/mm/filemap.c b/mm/filemap.c index 5a1e99142..c0ce486df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1,7 +1,7 @@ /* - * linux/mm/filemmap.c + * linux/mm/filemap.c * - * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 1994, 1995 Linus Torvalds */ /* @@ -18,57 +18,921 @@ #include <linux/mman.h> #include <linux/string.h> #include <linux/malloc.h> +#include <linux/fs.h> +#include <linux/locks.h> +#include <linux/pagemap.h> +#include <linux/swap.h> -#include <asm/segment.h> #include <asm/system.h> #include <asm/pgtable.h> +#include <asm/uaccess.h> /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + */ + +unsigned long page_cache_size = 0; +struct page * page_hash_table[PAGE_HASH_SIZE]; + +/* + * Simple routines for both non-shared and shared mappings. */ -static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, - unsigned long page, int no_share) +/* + * This is a special fast page-free routine that _only_ works + * on page-cache pages that we are currently using. We can + * just decrement the page count, because we know that the page + * has a count > 1 (the page cache itself counts as one, and + * we're currently using it counts as one). So we don't need + * the full free_page() stuff.. + */ +static inline void release_page(struct page * page) { - struct inode * inode = area->vm_inode; - unsigned int block; - int nr[8]; - int i, *p; - - address &= PAGE_MASK; - block = address - area->vm_start + area->vm_offset; - block >>= inode->i_sb->s_blocksize_bits; - i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; - p = nr; + atomic_dec(&page->count); +} + +/* + * Invalidate the pages of an inode, removing all pages that aren't + * locked down (those are sure to be up-to-date anyway, so we shouldn't + * invalidate them). + */ +void invalidate_inode_pages(struct inode * inode) +{ + struct page ** p; + struct page * page; + + p = &inode->i_pages; + while ((page = *p) != NULL) { + if (PageLocked(page)) { + p = &page->next; + continue; + } + inode->i_nrpages--; + if ((*p = page->next) != NULL) + (*p)->prev = page->prev; + page->dirty = 0; + page->next = NULL; + page->prev = NULL; + remove_page_from_hash_queue(page); + page->inode = NULL; + __free_page(page); + continue; + } +} + +/* + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). 
+ */ +void truncate_inode_pages(struct inode * inode, unsigned long start) +{ + struct page ** p; + struct page * page; + +repeat: + p = &inode->i_pages; + while ((page = *p) != NULL) { + unsigned long offset = page->offset; + + /* page wholly truncated - free it */ + if (offset >= start) { + if (PageLocked(page)) { + wait_on_page(page); + goto repeat; + } + inode->i_nrpages--; + if ((*p = page->next) != NULL) + (*p)->prev = page->prev; + page->dirty = 0; + page->next = NULL; + page->prev = NULL; + remove_page_from_hash_queue(page); + page->inode = NULL; + __free_page(page); + continue; + } + p = &page->next; + offset = start - offset; + /* partial truncate, clear end of page */ + if (offset < PAGE_SIZE) { + unsigned long address = page_address(page); + memset((void *) (offset + address), 0, PAGE_SIZE - offset); + flush_page_to_ram(address); + } + } +} + +int shrink_mmap(int priority, int dma) +{ + static unsigned long clock = 0; + struct page * page; + unsigned long limit = max_mapnr; + struct buffer_head *tmp, *bh; + int count_max, count_min; + + count_max = (limit<<1) >> (priority>>1); + count_min = (limit<<1) >> (priority); + + page = mem_map + clock; + do { + count_max--; + if (page->inode || page->buffers) + count_min--; + + if (PageLocked(page)) + goto next; + if (dma && !PageDMA(page)) + goto next; + /* First of all, regenerate the page's referenced bit + from any buffers in the page */ + bh = page->buffers; + if (bh) { + tmp = bh; + do { + if (buffer_touched(tmp)) { + clear_bit(BH_Touched, &tmp->b_state); + set_bit(PG_referenced, &page->flags); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + } + + /* We can't throw away shared pages, but we do mark + them as referenced. This relies on the fact that + no page is currently in both the page cache and the + buffer cache; we'd have to modify the following + test to allow for that case. */ + + switch (page->count) { + case 1: + /* If it has been referenced recently, don't free it */ + if (clear_bit(PG_referenced, &page->flags)) + break; + + /* is it a page cache page? */ + if (page->inode) { + remove_page_from_hash_queue(page); + remove_page_from_inode_queue(page); + __free_page(page); + return 1; + } + + /* is it a buffer cache page? */ + if (bh && try_to_free_buffer(bh, &bh, 6)) + return 1; + break; + + default: + /* more than one users: we can't throw it away */ + set_bit(PG_referenced, &page->flags); + /* fall through */ + case 0: + /* nothing */ + } +next: + page++; + clock++; + if (clock >= limit) { + clock = 0; + page = mem_map; + } + } while (count_max > 0 && count_min > 0); + return 0; +} + +/* + * This is called from try_to_swap_out() when we try to get rid of some + * pages.. If we're unmapping the last occurrence of this page, we also + * free it from the page hash-queues etc, as we don't want to keep it + * in-core unnecessarily. + */ +unsigned long page_unuse(unsigned long page) +{ + struct page * p = mem_map + MAP_NR(page); + int count = p->count; + + if (count != 2) + return count; + if (!p->inode) + return count; + remove_page_from_hash_queue(p); + remove_page_from_inode_queue(p); + free_page(page); + return 1; +} + +/* + * Update a page cache copy, when we're doing a "write()" system call + * See also "update_vm_cache()". 
+ */ +void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count) +{ + unsigned long offset, len; + + offset = (pos & ~PAGE_MASK); + pos = pos & PAGE_MASK; + len = PAGE_SIZE - offset; do { - *p = bmap(inode,block); - i--; - block++; - p++; - } while (i > 0); - return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share); + struct page * page; + + if (len > count) + len = count; + page = find_page(inode, pos); + if (page) { + wait_on_page(page); + memcpy((void *) (offset + page_address(page)), buf, len); + release_page(page); + } + count -= len; + buf += len; + len = PAGE_SIZE; + offset = 0; + pos += PAGE_SIZE; + } while (count); +} + +static inline void add_to_page_cache(struct page * page, + struct inode * inode, unsigned long offset, + struct page **hash) +{ + page->count++; + page->flags &= ~((1 << PG_uptodate) | (1 << PG_error)); + page->offset = offset; + add_page_to_inode_queue(inode, page); + __add_page_to_hash_queue(page, hash); } /* - * NOTE! mmap sync doesn't really work yet. This is mainly a stub for it, - * which only works if the buffers and the page were already sharing the - * same physical page (that's actually pretty common, especially if the - * file has been mmap'ed before being read the normal way). + * Try to read ahead in the file. "page_cache" is a potentially free page + * that we could use for the cache (if it is 0 we can try to create one, + * this is all overlapped with the IO on the previous page finishing anyway) + */ +static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache) +{ + struct page * page; + struct page ** hash; + + offset &= PAGE_MASK; + switch (page_cache) { + case 0: + page_cache = __get_free_page(GFP_KERNEL); + if (!page_cache) + break; + default: + if (offset >= inode->i_size) + break; + hash = page_hash(inode, offset); + page = __find_page(inode, offset, *hash); + if (!page) { + /* + * Ok, add the new page to the hash-queues... + */ + page = mem_map + MAP_NR(page_cache); + add_to_page_cache(page, inode, offset, hash); + inode->i_op->readpage(inode, page); + page_cache = 0; + } + release_page(page); + } + return page_cache; +} + +/* + * Wait for IO to complete on a locked page. * - * Todo: - * - non-shared pages also need to be synced with the buffers. - * - the "swapout()" function needs to swap out the page to - * the shared file instead of using the swap device. + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. */ -static void filemap_sync_page(struct vm_area_struct * vma, +void __wait_on_page(struct page *page) +{ + struct wait_queue wait = { current, NULL }; + + add_wait_queue(&page->wait, &wait); +repeat: + run_task_queue(&tq_disk); + current->state = TASK_UNINTERRUPTIBLE; + if (PageLocked(page)) { + schedule(); + goto repeat; + } + remove_wait_queue(&page->wait, &wait); + current->state = TASK_RUNNING; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK) + +#if 0 /* small readahead */ +#define MAX_READAHEAD PageAlignSize(4096*7) +#define MIN_READAHEAD PageAlignSize(4096*2) +#else /* large readahead */ +#define MAX_READAHEAD PageAlignSize(4096*18) +#define MIN_READAHEAD PageAlignSize(4096*3) +#endif + +static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, + unsigned long ppos, struct page * page, + unsigned long page_cache) +{ + unsigned long max_ahead, ahead; + unsigned long raend; + + raend = filp->f_raend & PAGE_MASK; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) { + raend = ppos; + if (raend < inode->i_size) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = PAGE_SIZE; + if (!max_ahead) { + filp->f_raend = ppos + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE && + ppos <= raend && ppos + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= PAGE_SIZE; + if (raend < inode->i_size) + max_ahead = filp->f_ramax + PAGE_SIZE; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + ahead += PAGE_SIZE; + page_cache = try_to_read_ahead(inode, raend + ahead, page_cache); + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + if (reada_ok == 2) { + run_task_queue(&tq_disk); + } + + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + PAGE_SIZE; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > MAX_READAHEAD) + filp->f_ramax = MAX_READAHEAD; + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return page_cache; +} + + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. 
But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ + +long generic_file_read(struct inode * inode, struct file * filp, + char * buf, unsigned long count) +{ + int error, read; + unsigned long pos, ppos, page_cache; + int reada_ok; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + if (!count) + return 0; + error = 0; + read = 0; + page_cache = 0; + + pos = filp->f_pos; + ppos = pos & PAGE_MASK; +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (pos + count <= (PAGE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((pos + count) & PAGE_MASK) - ppos; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < MIN_READAHEAD) + filp->f_ramax = MIN_READAHEAD; + if (filp->f_ramax > MAX_READAHEAD) + filp->f_ramax = MAX_READAHEAD; + } + + for (;;) { + struct page *page, **hash; + + if (pos >= inode->i_size) + break; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(inode, pos & PAGE_MASK); + page = __find_page(inode, pos & PAGE_MASK, *hash); + if (!page) + goto no_cached_page; + +found_page: +/* + * Try to read ahead only if the current page is filled or being filled. + * Otherwise, if we were reading ahead, decrease max read ahead size to + * the minimum value. + * In this context, that seems to may happen only on some read error or if + * the page has been rewritten. + */ + if (PageUptodate(page) || PageLocked(page)) + page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache); + else if (reada_ok && filp->f_ramax > MIN_READAHEAD) + filp->f_ramax = MIN_READAHEAD; + + wait_on_page(page); + + if (!PageUptodate(page)) + goto page_read_error; + +success: + /* + * Ok, we have the page, it's up-to-date and ok, + * so now we can finally copy it to user space... + */ + { + unsigned long offset, nr; + + offset = pos & ~PAGE_MASK; + nr = PAGE_SIZE - offset; + if (nr > count) + nr = count; + if (nr > inode->i_size - pos) + nr = inode->i_size - pos; + nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr); + release_page(page); + error = -EFAULT; + if (!nr) + break; + buf += nr; + pos += nr; + read += nr; + count -= nr; + if (count) + continue; + break; + } + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + if (!page_cache) { + page_cache = __get_free_page(GFP_KERNEL); + /* + * That could have slept, so go around to the + * very beginning.. + */ + if (page_cache) + continue; + error = -ENOMEM; + break; + } + + /* + * Ok, add the new page to the hash-queues... 
+ */ + page = mem_map + MAP_NR(page_cache); + page_cache = 0; + add_to_page_cache(page, inode, pos & PAGE_MASK, hash); + + /* + * Error handling is tricky. If we get a read error, + * the cached page stays in the cache (but uptodate=0), + * and the next process that accesses it will try to + * re-read it. This is needed for NFS etc, where the + * identity of the reader can decide if we can read the + * page or not.. + */ +/* + * We have to read the page. + * If we were reading ahead, we had previously tried to read this page, + * That means that the page has probably been removed from the cache before + * the application process needs it, or has been rewritten. + * Decrease max readahead size to the minimum value in that situation. + */ + if (reada_ok && filp->f_ramax > MIN_READAHEAD) + filp->f_ramax = MIN_READAHEAD; + + error = inode->i_op->readpage(inode, page); + if (!error) + goto found_page; + release_page(page); + break; + +page_read_error: + /* + * We found the page, but it wasn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because this happens only if there were errors. + */ + error = inode->i_op->readpage(inode, page); + if (!error) { + wait_on_page(page); + if (PageUptodate(page) && !PageError(page)) + goto success; + error = -EIO; /* Some unspecified error occurred.. */ + } + release_page(page); + break; + } + + filp->f_pos = pos; + filp->f_reada = 1; + if (page_cache) + free_page(page_cache); + if (!IS_RDONLY(inode)) { + inode->i_atime = CURRENT_TIME; + inode->i_dirt = 1; + } + if (!read) + read = error; + return read; +} + +/* + * Semantics for shared and private memory areas are different past the end + * of the file. A shared mapping past the last page of the file is an error + * and results in a SIGBUS, while a private mapping just maps in a zero page. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) +{ +/* XXX: Check the flushes in this code. At least sometimes we do + duplicate flushes. ... */ + unsigned long offset; + struct page * page, **hash; + struct inode * inode = area->vm_inode; + unsigned long old_page, new_page; + + new_page = 0; + offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset; + if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm) + goto no_page; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(inode, offset); + page = __find_page(inode, offset, *hash); + if (!page) + goto no_cached_page; + +found_page: + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date + */ + if (PageLocked(page)) + goto page_locked_wait; + if (!PageUptodate(page)) + goto page_read_error; + +success: + /* + * Found the page, need to check sharing and possibly + * copy it over to another page.. + */ + old_page = page_address(page); + if (!no_share) { + /* + * Ok, we can share the cached page directly.. Get rid + * of any potential extra pages. + */ + if (new_page) + free_page(new_page); + + flush_page_to_ram(old_page); + return old_page; + } + + /* + * Check that we have another page to copy it over to.. 
+ */ + if (!new_page) { + new_page = __get_free_page(GFP_KERNEL); + if (!new_page) + goto failure; + } + copy_page(new_page, old_page); + flush_page_to_ram(new_page); + release_page(page); + return new_page; + +no_cached_page: + new_page = __get_free_page(GFP_KERNEL); + if (!new_page) + goto no_page; + + /* + * During getting the above page we might have slept, + * so we need to re-check the situation with the page + * cache.. The page we just got may be useful if we + * can't share, so don't get rid of it here. + */ + page = find_page(inode, offset); + if (page) + goto found_page; + + /* + * Now, create a new page-cache page from the page we got + */ + page = mem_map + MAP_NR(new_page); + new_page = 0; + add_to_page_cache(page, inode, offset, hash); + + if (inode->i_op->readpage(inode, page) != 0) + goto failure; + + /* + * Do a very limited read-ahead if appropriate + */ + if (PageLocked(page)) + new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0); + goto found_page; + +page_locked_wait: + __wait_on_page(page); + if (PageUptodate(page)) + goto success; + +page_read_error: + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + if (inode->i_op->readpage(inode, page) != 0) + goto failure; + wait_on_page(page); + if (PageError(page)) + goto failure; + if (PageUptodate(page)) + goto success; + + /* + * Uhhuh.. Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ +failure: + release_page(page); +no_page: + return 0; +} + +/* + * Tries to write a shared mapped page to its backing store. May return -EIO + * if the disk is full. + */ +static inline int do_write_page(struct inode * inode, struct file * file, + const char * page, unsigned long offset) +{ + int old_fs, retval; + unsigned long size; + + size = offset + PAGE_SIZE; + /* refuse to extend file size.. */ + if (S_ISREG(inode->i_mode)) { + if (size > inode->i_size) + size = inode->i_size; + /* Ho humm.. We should have tested for this earlier */ + if (size < offset) + return -EIO; + } + size -= offset; + old_fs = get_fs(); + set_fs(KERNEL_DS); + retval = -EIO; + if (size == file->f_op->write(inode, file, (const char *) page, size)) + retval = 0; + set_fs(old_fs); + return retval; +} + +static int filemap_write_page(struct vm_area_struct * vma, unsigned long offset, unsigned long page) { + int result; + struct file file; + struct inode * inode; struct buffer_head * bh; - printk("msync: %ld: [%08lx]\n", offset, page); - bh = buffer_pages[MAP_NR(page)]; + bh = mem_map[MAP_NR(page)].buffers; if (bh) { /* whee.. just mark the buffer heads dirty */ struct buffer_head * tmp = bh; @@ -76,45 +940,125 @@ static void filemap_sync_page(struct vm_area_struct * vma, mark_buffer_dirty(tmp, 0); tmp = tmp->b_this_page; } while (tmp != bh); - return; + return 0; } - /* we'll need to go fetch the buffer heads etc.. 
RSN */ - printk("Can't handle non-shared page yet\n"); - return; + + inode = vma->vm_inode; + file.f_op = inode->i_op->default_file_ops; + if (!file.f_op->write) + return -EIO; + file.f_mode = 3; + file.f_flags = 0; + file.f_count = 1; + file.f_inode = inode; + file.f_pos = offset; + file.f_reada = 0; + + down(&inode->i_sem); + result = do_write_page(inode, &file, (const char *) page, offset); + up(&inode->i_sem); + return result; +} + + +/* + * Swapping to a shared file: while we're busy writing out the page + * (and the page still exists in memory), we save the page information + * in the page table, so that "filemap_swapin()" can re-use the page + * immediately if it is called while we're busy swapping it out.. + * + * Once we've written it all out, we mark the page entry "empty", which + * will result in a normal page-in (instead of a swap-in) from the now + * up-to-date disk file. + */ +int filemap_swapout(struct vm_area_struct * vma, + unsigned long offset, + pte_t *page_table) +{ + int error; + unsigned long page = pte_page(*page_table); + unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page)); + + flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset)); + set_pte(page_table, __pte(entry)); + flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset)); + error = filemap_write_page(vma, offset, page); + if (pte_val(*page_table) == entry) + pte_clear(page_table); + return error; } -static inline void filemap_sync_pte(pte_t * pte, struct vm_area_struct *vma, +/* + * filemap_swapin() is called only if we have something in the page + * tables that is non-zero (but not present), which we know to be the + * page index of a page that is busy being swapped out (see above). + * So we just use it directly.. + */ +static pte_t filemap_swapin(struct vm_area_struct * vma, + unsigned long offset, + unsigned long entry) +{ + unsigned long page = SWP_OFFSET(entry); + + mem_map[page].count++; + page = (page << PAGE_SHIFT) + PAGE_OFFSET; + return mk_pte(page,vma->vm_page_prot); +} + + +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t page = *pte; - - if (!pte_present(page)) - return; - if (!pte_dirty(page)) - return; - if (flags & MS_INVALIDATE) { - pte_clear(pte); + pte_t pte = *ptep; + unsigned long page; + int error; + + if (!(flags & MS_INVALIDATE)) { + if (!pte_present(pte)) + return 0; + if (!pte_dirty(pte)) + return 0; + flush_page_to_ram(pte_page(pte)); + flush_cache_page(vma, address); + set_pte(ptep, pte_mkclean(pte)); + flush_tlb_page(vma, address); + page = pte_page(pte); + mem_map[MAP_NR(page)].count++; } else { - mem_map[MAP_NR(pte_page(page))]++; - *pte = pte_mkclean(page); + if (pte_none(pte)) + return 0; + flush_cache_page(vma, address); + pte_clear(ptep); + flush_tlb_page(vma, address); + if (!pte_present(pte)) { + swap_free(pte_val(pte)); + return 0; + } + page = pte_page(pte); + if (!pte_dirty(pte) || flags == MS_INVALIDATE) { + free_page(page); + return 0; + } } - filemap_sync_page(vma, address - vma->vm_start, pte_page(page)); - free_page(pte_page(page)); + error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page); + free_page(page); + return error; } -static inline void filemap_sync_pte_range(pmd_t * pmd, +static inline int filemap_sync_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned long offset, unsigned int flags) { pte_t * pte; unsigned long end; + int error; if (pmd_none(*pmd)) - return; + return 0; if 
(pmd_bad(*pmd)) { printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); pmd_clear(pmd); - return; + return 0; } pte = pte_offset(pmd, address); offset += address & PMD_MASK; @@ -122,58 +1066,65 @@ static inline void filemap_sync_pte_range(pmd_t * pmd, end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; + error = 0; do { - filemap_sync_pte(pte, vma, address + offset, flags); + error |= filemap_sync_pte(pte, vma, address + offset, flags); address += PAGE_SIZE; pte++; } while (address < end); + return error; } -static inline void filemap_sync_pmd_range(pgd_t * pgd, +static inline int filemap_sync_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned int flags) { pmd_t * pmd; unsigned long offset, end; + int error; if (pgd_none(*pgd)) - return; + return 0; if (pgd_bad(*pgd)) { printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd)); pgd_clear(pgd); - return; + return 0; } pmd = pmd_offset(pgd, address); - offset = address & PMD_MASK; - address &= ~PMD_MASK; + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; + error = 0; do { - filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + return error; } -static void filemap_sync(struct vm_area_struct * vma, unsigned long address, +static int filemap_sync(struct vm_area_struct * vma, unsigned long address, size_t size, unsigned int flags) { pgd_t * dir; unsigned long end = address + size; + int error = 0; - dir = pgd_offset(current, address); + dir = pgd_offset(vma->vm_mm, address); + flush_cache_range(vma->vm_mm, end - size, end); while (address < end) { - filemap_sync_pmd_range(dir, address, end - address, vma, flags); + error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); - return; + flush_tlb_range(vma->vm_mm, end - size, end); + return error; } /* - * This handles area unmaps.. + * This handles (potentially partial) area unmaps.. */ static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len) { @@ -181,50 +1132,27 @@ static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_ } /* - * This handles complete area closes.. - */ -static void filemap_close(struct vm_area_struct * vma) -{ - filemap_sync(vma, vma->vm_start, vma->vm_end - vma->vm_start, MS_ASYNC); -} - -/* - * This isn't implemented yet: you'll get a warning and incorrect behaviour. - * - * Note that the page is free'd by the higher-level after return, - * so we have to either write it out or just forget it. We currently - * forget it.. - */ -void filemap_swapout(struct vm_area_struct * vma, - unsigned long offset, - pte_t *page_table) -{ - printk("swapout not implemented on shared files..\n"); - pte_clear(page_table); -} - -/* * Shared mappings need to be able to do the right thing at * close/unmap/sync. They will also use the private file as * backing-store for swapping.. 
*/ static struct vm_operations_struct file_shared_mmap = { - NULL, /* open */ - filemap_close, /* close */ - filemap_unmap, /* unmap */ - NULL, /* protect */ + NULL, /* no special open */ + NULL, /* no special close */ + filemap_unmap, /* unmap - we need to sync the pages */ + NULL, /* no special protect */ filemap_sync, /* sync */ NULL, /* advise */ filemap_nopage, /* nopage */ NULL, /* wppage */ filemap_swapout, /* swapout */ - NULL, /* swapin */ + filemap_swapin, /* swapin */ }; /* - * Private mappings just need to be able to load in the map + * Private mappings just need to be able to load in the map. * - * (this is actually used for shared mappings as well, if we + * (This is actually used for shared mappings as well, if we * know they can't ever get write permissions..) */ static struct vm_operations_struct file_private_mmap = { @@ -241,28 +1169,25 @@ static struct vm_operations_struct file_private_mmap = { }; /* This is used for a general mmap of a disk file */ -int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma) { struct vm_operations_struct * ops; - if (vma->vm_offset & (inode->i_sb->s_blocksize - 1)) - return -EINVAL; + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + ops = &file_shared_mmap; + /* share_page() can only guarantee proper page sharing if + * the offsets are all page aligned. */ + if (vma->vm_offset & (PAGE_SIZE - 1)) + return -EINVAL; + } else { + ops = &file_private_mmap; + if (vma->vm_offset & (inode->i_sb->s_blocksize - 1)) + return -EINVAL; + } if (!inode->i_sb || !S_ISREG(inode->i_mode)) return -EACCES; - if (!inode->i_op || !inode->i_op->bmap) + if (!inode->i_op || !inode->i_op->readpage) return -ENOEXEC; - ops = &file_private_mmap; - if (vma->vm_flags & VM_SHARED) { - if (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) { - static int nr = 0; - ops = &file_shared_mmap; -#ifndef SHARED_MMAP_REALLY_WORKS /* it doesn't, yet */ - if (nr++ < 5) - printk("%s tried to do a shared writeable mapping\n", current->comm); - return -EINVAL; -#endif - } - } if (!IS_RDONLY(inode)) { inode->i_atime = CURRENT_TIME; inode->i_dirt = 1; @@ -272,3 +1197,74 @@ int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct vma->vm_ops = ops; return 0; } + + +/* + * The msync() system call. + */ + +static int msync_interval(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int flags) +{ + if (!vma->vm_inode) + return 0; + if (vma->vm_ops->sync) { + int error; + error = vma->vm_ops->sync(vma, start, end-start, flags); + if (error) + return error; + if (flags & MS_SYNC) + return file_fsync(vma->vm_inode, NULL); + return 0; + } + return 0; +} + +asmlinkage int sys_msync(unsigned long start, size_t len, int flags) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error, error; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + return -EINVAL; + if (end == start) + return 0; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -EFAULT at the end. + */ + vma = find_vma(current->mm, start); + unmapped_error = 0; + for (;;) { + /* Still start < end. */ + if (!vma) + return -EFAULT; + /* Here start < vma->vm_end. 
*/ + if (start < vma->vm_start) { + unmapped_error = -EFAULT; + start = vma->vm_start; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = msync_interval(vma, start, end, flags); + if (error) + return error; + } + return unmapped_error; + } + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags); + if (error) + return error; + start = vma->vm_end; + vma = vma->vm_next; + } +} diff --git a/mm/kmalloc.c b/mm/kmalloc.c index e288ecf2f..d0193f02b 100644 --- a/mm/kmalloc.c +++ b/mm/kmalloc.c @@ -10,44 +10,42 @@ /* * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple * pages. So for 'page' throughout, read 'area'. + * + * Largely rewritten.. Linus */ #include <linux/mm.h> -#include <asm/system.h> #include <linux/delay.h> +#include <linux/interrupt.h> -#define GFP_LEVEL_MASK 0xf - -/* I want this low enough for a while to catch errors. - I want this number to be increased in the near future: - loadable device drivers should use this function to get memory */ - -#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10) - - -/* This defines how many times we should try to allocate a free page before - giving up. Normally this shouldn't happen at all. */ -#define MAX_GET_FREE_PAGE_TRIES 4 +#include <asm/system.h> +#include <asm/dma.h> +#ifdef __mips__ +#include <asm/sgidefs.h> +#endif +/* Define this if you want slow routines that try to trip errors */ +#undef SADISTIC_KMALLOC /* Private flags. */ #define MF_USED 0xffaa0055 +#define MF_DMA 0xff00aa55 #define MF_FREE 0x0055ffaa -/* +/* * Much care has gone into making these routines in this file reentrant. * * The fancy bookkeeping of nbytesmalloced and the like are only used to - * report them to the user (oooohhhhh, aaaaahhhhh....) are not + * report them to the user (oooohhhhh, aaaaahhhhh....) are not * protected by cli(). (If that goes wrong. So what?) * * These routines restore the interrupt status to allow calling with ints - * off. + * off. */ -/* +/* * A block header. This is in front of every malloc-block, whether free or not. */ struct block_header { @@ -64,8 +62,8 @@ struct block_header { #define BH(p) ((struct block_header *)(p)) -/* - * The page descriptor is at the front of every page that malloc has in use. +/* + * The page descriptor is at the front of every page that malloc has in use. 
*/ struct page_descriptor { struct page_descriptor *next; @@ -84,324 +82,415 @@ struct page_descriptor { */ struct size_descriptor { struct page_descriptor *firstfree; - struct page_descriptor *dmafree; /* DMA-able memory */ - int size; + struct page_descriptor *dmafree; /* DMA-able memory */ int nblocks; int nmallocs; int nfrees; int nbytesmalloced; int npages; - unsigned long gfporder; /* number of pages in the area required */ + unsigned long gfporder; /* number of pages in the area required */ }; /* - * For now it is unsafe to allocate bucket sizes between n & n=16 where n is - * 4096 * any power of two + * For now it is unsafe to allocate bucket sizes between n and + * n-sizeof(page_descriptor) where n is PAGE_SIZE * any power of two */ -#if PAGE_SIZE == 4096 -struct size_descriptor sizes[] = { - { NULL, NULL, 32,127, 0,0,0,0, 0}, - { NULL, NULL, 64, 63, 0,0,0,0, 0 }, - { NULL, NULL, 128, 31, 0,0,0,0, 0 }, - { NULL, NULL, 252, 16, 0,0,0,0, 0 }, - { NULL, NULL, 508, 8, 0,0,0,0, 0 }, - { NULL, NULL,1020, 4, 0,0,0,0, 0 }, - { NULL, NULL,2040, 2, 0,0,0,0, 0 }, - { NULL, NULL,4096-16, 1, 0,0,0,0, 0 }, - { NULL, NULL,8192-16, 1, 0,0,0,0, 1 }, - { NULL, NULL,16384-16, 1, 0,0,0,0, 2 }, - { NULL, NULL,32768-16, 1, 0,0,0,0, 3 }, - { NULL, NULL,65536-16, 1, 0,0,0,0, 4 }, - { NULL, NULL,131072-16, 1, 0,0,0,0, 5 }, - { NULL, NULL, 0, 0, 0,0,0,0, 0 } +#if PAGE_SIZE == 4096 && defined (__mips__) && \ + ((_MIPS_ISA == _MIPS_ISA_MIPS2) || \ + (_MIPS_ISA == _MIPS_ISA_MIPS3) || \ + (_MIPS_ISA == _MIPS_ISA_MIPS4)) +static const unsigned int blocksize[] = { + /* + * For MIPS II we need this hacked descriptor table to get + * doubleword alignment. Otherwise the scheduler and other code + * that use doublewords will bomb. + */ + 32, + 64, + 128, + 248, + 504, + 1016, + 2040, + 4096 - 16, + 8192 - 16, + 16384 - 16, + 32768 - 16, + 65536 - 16, + 131072 - 16, + 0 +}; + +static struct size_descriptor sizes[] = +{ + {NULL, NULL, 127, 0, 0, 0, 0, 0}, + {NULL, NULL, 63, 0, 0, 0, 0, 0}, + {NULL, NULL, 31, 0, 0, 0, 0, 0}, + {NULL, NULL, 16, 0, 0, 0, 0, 0}, + {NULL, NULL, 8, 0, 0, 0, 0, 0}, + {NULL, NULL, 4, 0, 0, 0, 0, 0}, + {NULL, NULL, 2, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 1}, + {NULL, NULL, 1, 0, 0, 0, 0, 2}, + {NULL, NULL, 1, 0, 0, 0, 0, 3}, + {NULL, NULL, 1, 0, 0, 0, 0, 4}, + {NULL, NULL, 1, 0, 0, 0, 0, 5}, + {NULL, NULL, 0, 0, 0, 0, 0, 0} +}; +#elif PAGE_SIZE == 4096 +static const unsigned int blocksize[] = { + 32, + 64, + 128, + 252, + 508, + 1020, + 2040, + 4096 - 16, + 8192 - 16, + 16384 - 16, + 32768 - 16, + 65536 - 16, + 131072 - 16, + 0 +}; + +static struct size_descriptor sizes[] = +{ + {NULL, NULL, 127, 0, 0, 0, 0, 0}, + {NULL, NULL, 63, 0, 0, 0, 0, 0}, + {NULL, NULL, 31, 0, 0, 0, 0, 0}, + {NULL, NULL, 16, 0, 0, 0, 0, 0}, + {NULL, NULL, 8, 0, 0, 0, 0, 0}, + {NULL, NULL, 4, 0, 0, 0, 0, 0}, + {NULL, NULL, 2, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 1}, + {NULL, NULL, 1, 0, 0, 0, 0, 2}, + {NULL, NULL, 1, 0, 0, 0, 0, 3}, + {NULL, NULL, 1, 0, 0, 0, 0, 4}, + {NULL, NULL, 1, 0, 0, 0, 0, 5}, + {NULL, NULL, 0, 0, 0, 0, 0, 0} }; #elif PAGE_SIZE == 8192 -struct size_descriptor sizes[] = { - { NULL, NULL, 64,127, 0,0,0,0, 0}, - { NULL, NULL, 128, 63, 0,0,0,0, 0 }, - { NULL, NULL, 248, 31, 0,0,0,0, 0 }, - { NULL, NULL, 504, 16, 0,0,0,0, 0 }, - { NULL, NULL,1016, 8, 0,0,0,0, 0 }, - { NULL, NULL,2040, 4, 0,0,0,0, 0 }, - { NULL, NULL,4080, 2, 0,0,0,0, 0 }, - { NULL, NULL,8192-32, 1, 0,0,0,0, 0 }, - { NULL, NULL,16384-32, 1, 0,0,0,0, 1 
}, - { NULL, NULL,32768-32, 1, 0,0,0,0, 2 }, - { NULL, NULL,65536-32, 1, 0,0,0,0, 3 }, - { NULL, NULL,131072-32, 1, 0,0,0,0, 4 }, - { NULL, NULL,262144-32, 1, 0,0,0,0, 5 }, - { NULL, NULL, 0, 0, 0,0,0,0, 0 } +static const unsigned int blocksize[] = { + 64, + 128, + 248, + 504, + 1016, + 2040, + 4080, + 8192 - 32, + 16384 - 32, + 32768 - 32, + 65536 - 32, + 131072 - 32, + 262144 - 32, + 0 +}; + +struct size_descriptor sizes[] = +{ + {NULL, NULL, 127, 0, 0, 0, 0, 0}, + {NULL, NULL, 63, 0, 0, 0, 0, 0}, + {NULL, NULL, 31, 0, 0, 0, 0, 0}, + {NULL, NULL, 16, 0, 0, 0, 0, 0}, + {NULL, NULL, 8, 0, 0, 0, 0, 0}, + {NULL, NULL, 4, 0, 0, 0, 0, 0}, + {NULL, NULL, 2, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 0}, + {NULL, NULL, 1, 0, 0, 0, 0, 1}, + {NULL, NULL, 1, 0, 0, 0, 0, 2}, + {NULL, NULL, 1, 0, 0, 0, 0, 3}, + {NULL, NULL, 1, 0, 0, 0, 0, 4}, + {NULL, NULL, 1, 0, 0, 0, 0, 5}, + {NULL, NULL, 0, 0, 0, 0, 0, 0} }; #else #error you need to make a version for your pagesize #endif #define NBLOCKS(order) (sizes[order].nblocks) -#define BLOCKSIZE(order) (sizes[order].size) +#define BLOCKSIZE(order) (blocksize[order]) #define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) + - -long kmalloc_init (long start_mem,long end_mem) +long kmalloc_init(long start_mem, long end_mem) { int order; -/* +/* * Check the static info array. Things will blow up terribly if it's * incorrect. This is a late "compile time" check..... */ -for (order = 0;BLOCKSIZE(order);order++) - { - if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) > - AREASIZE(order)) - { - printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n", - (int) (NBLOCKS (order) * BLOCKSIZE(order) + - sizeof (struct page_descriptor)), - (int) AREASIZE(order), - BLOCKSIZE (order)); - panic ("This only happens if someone messes with kmalloc"); - } - } -return start_mem; + for (order = 0; BLOCKSIZE(order); order++) { + if ((NBLOCKS(order) * BLOCKSIZE(order) + sizeof(struct page_descriptor)) > + AREASIZE(order)) { + printk("Cannot use %d bytes out of %d in order = %d block mallocs\n", + (int) (NBLOCKS(order) * BLOCKSIZE(order) + + sizeof(struct page_descriptor)), + (int) AREASIZE(order), + BLOCKSIZE(order)); + panic("This only happens if someone messes with kmalloc"); + } + } + return start_mem; } +/* + * Create a small cache of page allocations: this helps a bit with + * those pesky 8kB+ allocations for NFS when we're temporarily + * out of memory.. + * + * This is a _truly_ small cache, we just cache one single page + * order (for orders 0, 1 and 2, that is 4, 8 and 16kB on x86). 
+ */ +#define MAX_CACHE_ORDER 3 +struct page_descriptor * kmalloc_cache[MAX_CACHE_ORDER]; - -int get_order (int size) +static inline struct page_descriptor * get_kmalloc_pages(unsigned long priority, + unsigned long order, int dma) { - int order; + return (struct page_descriptor *) __get_free_pages(priority, order, dma); +} - /* Add the size of the header */ - size += sizeof (struct block_header); - for (order = 0;BLOCKSIZE(order);order++) - if (size <= BLOCKSIZE (order)) - return order; - return -1; +static inline void free_kmalloc_pages(struct page_descriptor * page, + unsigned long order, int dma) +{ + if (!dma && order < MAX_CACHE_ORDER) { + page = xchg(kmalloc_cache+order, page); + if (!page) + return; + } + free_pages((unsigned long) page, order); } -void * kmalloc (size_t size, int priority) +/* + * Ugh, this is ugly, but we want the default case to run + * straight through, which is why we have the ugly goto's + */ +void *kmalloc(size_t size, int priority) { unsigned long flags; - int order,tries,i,sz; - int dma_flag; + unsigned long type; + int order, dma; struct block_header *p; - struct page_descriptor *page; + struct page_descriptor *page, **pg; + struct size_descriptor *bucket = sizes; + + /* Get order */ + order = 0; + { + unsigned int realsize = size + sizeof(struct block_header); + for (;;) { + int ordersize = BLOCKSIZE(order); + if (realsize <= ordersize) + break; + order++; + bucket++; + if (ordersize) + continue; + printk("kmalloc of too large a block (%d bytes).\n", (int) size); + return NULL; + } + } + + dma = 0; + type = MF_USED; + pg = &bucket->firstfree; + if (priority & GFP_DMA) { + dma = 1; + type = MF_DMA; + pg = &bucket->dmafree; + } - dma_flag = (priority & GFP_DMA); priority &= GFP_LEVEL_MASK; - + /* Sanity check... */ if (intr_count && priority != GFP_ATOMIC) { static int count = 0; if (++count < 5) { printk("kmalloc called nonatomically from interrupt %p\n", - __builtin_return_address(0)); + return_address()); priority = GFP_ATOMIC; } } -order = get_order (size); -if (order < 0) - { - printk ("kmalloc of too large a block (%d bytes).\n",(int) size); - return (NULL); - } - -save_flags(flags); - -/* It seems VERY unlikely to me that it would be possible that this - loop will get executed more than once. */ -tries = MAX_GET_FREE_PAGE_TRIES; -while (tries --) - { - /* Try to allocate a "recently" freed memory block */ - cli (); - if ((page = (dma_flag ? sizes[order].dmafree : sizes[order].firstfree)) && - (p = page->firstfree)) - { - if (p->bh_flags == MF_FREE) - { - page->firstfree = p->bh_next; - page->nfree--; - if (!page->nfree) - { - if(dma_flag) - sizes[order].dmafree = page->next; - else - sizes[order].firstfree = page->next; - page->next = NULL; - } - restore_flags(flags); - - sizes [order].nmallocs++; - sizes [order].nbytesmalloced += size; - p->bh_flags = MF_USED; /* As of now this block is officially in use */ - p->bh_length = size; - return p+1; /* Pointer arithmetic: increments past header */ - } - printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p); - return (NULL); - } - restore_flags(flags); - - - /* Now we're in trouble: We need to get a new free page..... 
*/ - - sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */ - - /* This can be done with ints on: This is private to this invocation */ - if (dma_flag) - page = (struct page_descriptor *) __get_dma_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); - else - page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); - - if (!page) { - static unsigned long last = 0; - if (last + 10*HZ < jiffies) { - last = jiffies; - printk ("Couldn't get a free page.....\n"); - } - return NULL; - } -#if 0 - printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz); -#endif - sizes[order].npages++; - - /* Loop for all but last block: */ - for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next) - { - p->bh_flags = MF_FREE; - p->bh_next = BH ( ((long)p)+sz); - } - /* Last block: */ - p->bh_flags = MF_FREE; - p->bh_next = NULL; - - page->order = order; - page->nfree = NBLOCKS(order); - page->firstfree = BH(page+1); -#if 0 - printk ("%d blocks per page\n",page->nfree); + save_flags(flags); + cli(); + page = *pg; + if (!page) + goto no_bucket_page; + + p = page->firstfree; + if (p->bh_flags != MF_FREE) + goto not_free_on_freelist; + +found_it: + page->firstfree = p->bh_next; + page->nfree--; + if (!page->nfree) + *pg = page->next; + restore_flags(flags); + bucket->nmallocs++; + bucket->nbytesmalloced += size; + p->bh_flags = type; /* As of now this block is officially in use */ + p->bh_length = size; +#ifdef SADISTIC_KMALLOC + memset(p+1, 0xf0, size); #endif - /* Now we're going to muck with the "global" freelist for this size: - this should be uninterruptible */ - cli (); - /* - * sizes[order].firstfree used to be NULL, otherwise we wouldn't be - * here, but you never know.... - */ - if (dma_flag) { - page->next = sizes[order].dmafree; - sizes[order].dmafree = page; - } else { - page->next = sizes[order].firstfree; - sizes[order].firstfree = page; - } - restore_flags(flags); - } - -/* Pray that printk won't cause this to happen again :-) */ - -printk ("Hey. This is very funny. I tried %d times to allocate a whole\n" - "new page for an object only %d bytes long, but some other process\n" - "beat me to actually allocating it. Also note that this 'error'\n" - "message is soooo very long to catch your attention. I'd appreciate\n" - "it if you'd be so kind as to report what conditions caused this to\n" - "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n" - "(Executive summary: This can't happen)\n", - MAX_GET_FREE_PAGE_TRIES, - (int) size); -return NULL; + return p + 1; /* Pointer arithmetic: increments past header */ + + +no_bucket_page: + /* + * If we didn't find a page already allocated for this + * bucket size, we need to get one.. 
+ * + * This can be done with ints on: it is private to this invocation + */ + restore_flags(flags); + + { + int i, sz; + + /* sz is the size of the blocks we're dealing with */ + sz = BLOCKSIZE(order); + + page = get_kmalloc_pages(priority, bucket->gfporder, dma); + if (!page) + goto no_free_page; +found_cached_page: + + bucket->npages++; + + page->order = order; + /* Loop for all but last block: */ + i = (page->nfree = bucket->nblocks) - 1; + p = BH(page + 1); + while (i > 0) { + i--; + p->bh_flags = MF_FREE; + p->bh_next = BH(((long) p) + sz); + p = p->bh_next; + } + /* Last block: */ + p->bh_flags = MF_FREE; + p->bh_next = NULL; + + p = BH(page+1); + } + + /* + * Now we're going to muck with the "global" freelist + * for this size: this should be uninterruptible + */ + cli(); + page->next = *pg; + *pg = page; + goto found_it; + + +no_free_page: + /* + * No free pages, check the kmalloc cache of + * pages to see if maybe we have something available + */ + if (!dma && order < MAX_CACHE_ORDER) { + page = xchg(kmalloc_cache+order, page); + if (page) + goto found_cached_page; + } + { + static unsigned long last = 0; + if (priority != GFP_BUFFER && (last + 10 * HZ < jiffies)) { + last = jiffies; + printk("Couldn't get a free page.....\n"); + } + return NULL; + } + +not_free_on_freelist: + restore_flags(flags); + printk("Problem: block on freelist at %08lx isn't free.\n", (long) p); + return NULL; } -void kfree_s (void *ptr,int size) +void kfree(void *__ptr) { -unsigned long flags; -int order; -register struct block_header *p=((struct block_header *)ptr) -1; -struct page_descriptor *page,*pg2; - -page = PAGE_DESC (p); -order = page->order; -if ((order < 0) || - (order > sizeof (sizes)/sizeof (sizes[0])) || - (((long)(page->next)) & ~PAGE_MASK) || - (p->bh_flags != MF_USED)) - { - printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", - p, page->next, page->order); - return; - } -if (size && - size != p->bh_length) - { - printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n", - p,size,p->bh_length); - return; - } -size = p->bh_length; -p->bh_flags = MF_FREE; /* As of now this block is officially free */ -save_flags(flags); -cli (); -p->bh_next = page->firstfree; -page->firstfree = p; -page->nfree ++; - -if (page->nfree == 1) - { /* Page went from full to one free block: put it on the freelist. Do not bother - trying to put it on the DMA list. 
*/ - if (page->next) - { - printk ("Page %p already on freelist dazed and confused....\n", page); - } - else - { - page->next = sizes[order].firstfree; - sizes[order].firstfree = page; - } - } - -/* If page is completely free, free it */ -if (page->nfree == NBLOCKS (page->order)) - { -#if 0 - printk ("Freeing page %08x.\n", (long)page); + int dma; + unsigned long flags; + unsigned int order; + struct page_descriptor *page, **pg; + struct size_descriptor *bucket; + + if (!__ptr) + goto null_kfree; +#define ptr ((struct block_header *) __ptr) + page = PAGE_DESC(ptr); + __ptr = ptr - 1; + if (~PAGE_MASK & (unsigned long)page->next) + goto bad_order; + order = page->order; + if (order >= sizeof(sizes) / sizeof(sizes[0])) + goto bad_order; + bucket = sizes + order; + dma = 0; + pg = &bucket->firstfree; + if (ptr->bh_flags == MF_DMA) { + dma = 1; + ptr->bh_flags = MF_USED; + pg = &bucket->dmafree; + } + if (ptr->bh_flags != MF_USED) + goto bad_order; + ptr->bh_flags = MF_FREE; /* As of now this block is officially free */ +#ifdef SADISTIC_KMALLOC + memset(ptr+1, 0x0e, ptr->bh_length); #endif - if (sizes[order].firstfree == page) - { - sizes[order].firstfree = page->next; - } - else if (sizes[order].dmafree == page) - { - sizes[order].dmafree = page->next; - } - else - { - for (pg2=sizes[order].firstfree; - (pg2 != NULL) && (pg2->next != page); - pg2=pg2->next) - /* Nothing */; - if (!pg2) - for (pg2=sizes[order].dmafree; - (pg2 != NULL) && (pg2->next != page); - pg2=pg2->next) - /* Nothing */; - if (pg2 != NULL) - pg2->next = page->next; - else - printk ("Ooops. page %p doesn't show on freelist.\n", page); - } -/* FIXME: I'm sure we should do something with npages here (like npages--) */ - free_pages ((long)page, sizes[order].gfporder); - } -restore_flags(flags); - -/* FIXME: ?? Are these increment & decrement operations guaranteed to be - * atomic? Could an IRQ not occur between the read & the write? - * Maybe yes on a x86 with GCC...?? - */ -sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */ -sizes[order].nbytesmalloced -= size; + save_flags(flags); + cli(); + + bucket->nfrees++; + bucket->nbytesmalloced -= ptr->bh_length; + + ptr->bh_next = page->firstfree; + page->firstfree = ptr; + if (!page->nfree++) { +/* Page went from full to one free block: put it on the freelist. */ + if (bucket->nblocks == 1) + goto free_page; + page->next = *pg; + *pg = page; + } +/* If page is completely free, free it */ + if (page->nfree == bucket->nblocks) { + for (;;) { + struct page_descriptor *tmp = *pg; + if (!tmp) + goto not_on_freelist; + if (tmp == page) + break; + pg = &tmp->next; + } + *pg = page->next; +free_page: + bucket->npages--; + free_kmalloc_pages(page, bucket->gfporder, dma); + } + restore_flags(flags); +null_kfree: + return; + +bad_order: + printk("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", + ptr+1, page->next, page->order); + return; + +not_on_freelist: + printk("Ooops. 
page %p doesn't show on freelist.\n", page); + restore_flags(flags); } diff --git a/mm/memory.c b/mm/memory.c index 4fba3a4c4..9fd243a67 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -33,7 +33,6 @@ * Idea by Alex Bligh (alex@cconcepts.co.uk) */ -#include <linux/config.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/head.h> @@ -44,28 +43,29 @@ #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/swap.h> #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/string.h> -unsigned long high_memory = 0; +unsigned long max_mapnr = 0; +void * high_memory = NULL; /* - * The free_area_list arrays point to the queue heads of the free areas - * of different sizes + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). */ -int nr_swap_pages = 0; -int nr_free_pages = 0; -struct mem_list free_area_list[NR_MEM_LISTS]; -unsigned char * free_area_map[NR_MEM_LISTS]; - -#if 0 -/* - * This now resides in include/asm/page.h - */ -#define copy_page(from,to) memcpy((void *) to, (void *) from, PAGE_SIZE) -#endif +static inline void copy_cow_page(unsigned long from, unsigned long to) +{ + if (from == ZERO_PAGE) { + clear_page(to); + return; + } + copy_page(to, from); +} #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) @@ -77,30 +77,18 @@ mem_map_t * mem_map = NULL; */ void oom(struct task_struct * task) { - printk("\nOut of memory for %s.\n", current->comm); - task->sigaction[SIGKILL-1].sa_handler = NULL; + printk("\nOut of memory for %s.\n", task->comm); + task->sig->action[SIGKILL-1].sa_handler = NULL; task->blocked &= ~(1<<(SIGKILL-1)); send_sig(SIGKILL,task,1); } -static inline void free_one_pte(pte_t * page_table) -{ - pte_t page = *page_table; - - if (pte_none(page)) - return; - pte_clear(page_table); - if (!pte_present(page)) { - swap_free(pte_val(page)); - return; - } - free_page(pte_page(page)); - return; -} - +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ static inline void free_one_pmd(pmd_t * dir) { - int j; pte_t * pte; if (pmd_none(*dir)) @@ -112,12 +100,6 @@ static inline void free_one_pmd(pmd_t * dir) } pte = pte_offset(dir, 0); pmd_clear(dir); - if (pte_inuse(pte)) { - pte_free(pte); - return; - } - for (j = 0; j < PTRS_PER_PTE ; j++) - free_one_pte(pte+j); pte_free(pte); } @@ -135,224 +117,195 @@ static inline void free_one_pgd(pgd_t * dir) } pmd = pmd_offset(dir, 0); pgd_clear(dir); - if (pmd_inuse(pmd)) { - pmd_free(pmd); - return; - } for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(pmd+j); pmd_free(pmd); } - /* * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. Note that - * unlike 'free_page_tables()', this function still leaves a valid - * page-table-tree in memory: it just removes the user pages. The two - * functions are similar, but there is a fundamental difference. + * is needed by execve(), so that old pages aren't in the way. 
*/ void clear_page_tables(struct task_struct * tsk) { int i; pgd_t * page_dir; - if (!tsk) - return; - if (tsk == task[0]) - panic("task[0] (swapper) doesn't support exec()\n"); - page_dir = pgd_offset(tsk, 0); + page_dir = tsk->mm->pgd; if (!page_dir || page_dir == swapper_pg_dir) { printk("%s trying to clear kernel page-directory: not good\n", tsk->comm); return; } - if (pgd_inuse(page_dir)) { - pgd_t * new_pg; - - if (!(new_pg = pgd_alloc())) { - oom(tsk); - return; - } - for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++) - new_pg[i] = page_dir[i]; - SET_PAGE_DIR(tsk, new_pg); - pgd_free(page_dir); - return; - } + flush_cache_mm(tsk->mm); for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) free_one_pgd(page_dir + i); - invalidate(); - return; + flush_tlb_mm(tsk->mm); } /* - * This function frees up all page tables of a process when it exits. + * This function frees up all page tables of a process when it exits. It + * is the same as "clear_page_tables()", except it also changes the process' + * page table directory to the kernel page tables and then frees the old + * page table directory. */ -void free_page_tables(struct task_struct * tsk) +void free_page_tables(struct mm_struct * mm) { int i; pgd_t * page_dir; - if (!tsk) - return; - if (tsk == task[0]) { - printk("task[0] (swapper) killed: unable to recover\n"); - panic("Trying to free up swapper memory space"); - } - page_dir = pgd_offset(tsk, 0); + page_dir = mm->pgd; if (!page_dir || page_dir == swapper_pg_dir) { - printk("%s trying to free kernel page-directory: not good\n", tsk->comm); - return; - } - SET_PAGE_DIR(tsk, swapper_pg_dir); - if (pgd_inuse(page_dir)) { - pgd_free(page_dir); + printk("Trying to free kernel page-directory: not good\n"); return; } - for (i = 0 ; i < PTRS_PER_PGD ; i++) + for (i = 0 ; i < USER_PTRS_PER_PGD ; i++) free_one_pgd(page_dir + i); pgd_free(page_dir); - invalidate(); } -/* - * clone_page_tables() clones the page table for a process - both - * processes will have the exact same pages in memory. There are - * probably races in the memory management with cloning, but we'll - * see.. 
- */ -int clone_page_tables(struct task_struct * tsk) +int new_page_tables(struct task_struct * tsk) { - pgd_t * pg_dir; + pgd_t * page_dir, * new_pg; - pg_dir = pgd_offset(current, 0); - pgd_reuse(pg_dir); - SET_PAGE_DIR(tsk, pg_dir); + if (!(new_pg = pgd_alloc())) + return -ENOMEM; + page_dir = pgd_offset(&init_mm, 0); + flush_cache_mm(tsk->mm); + memcpy(new_pg + USER_PTRS_PER_PGD, page_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof (pgd_t)); + flush_tlb_mm(tsk->mm); + SET_PAGE_DIR(tsk, new_pg); + tsk->mm->pgd = new_pg; return 0; } -static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte) +static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow) { pte_t pte = *old_pte; + unsigned long page_nr; if (pte_none(pte)) return; if (!pte_present(pte)) { swap_duplicate(pte_val(pte)); - *new_pte = pte; + set_pte(new_pte, pte); return; } - if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) { - *new_pte = pte; + page_nr = MAP_NR(pte_page(pte)); + if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) { + set_pte(new_pte, pte); return; } - if (pte_cow(pte)) + if (cow) pte = pte_wrprotect(pte); - if (delete_from_swap_cache(pte_page(pte))) + if (delete_from_swap_cache(page_nr)) pte = pte_mkdirty(pte); - *new_pte = pte_mkold(pte); - *old_pte = pte; - mem_map[MAP_NR(pte_page(pte))]++; + set_pte(new_pte, pte_mkold(pte)); + set_pte(old_pte, pte); + mem_map[page_nr].count++; } -static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd) +static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow) { - int j; - pte_t *old_pte, *new_pte; + pte_t * src_pte, * dst_pte; + unsigned long end; - if (pmd_none(*old_pmd)) + if (pmd_none(*src_pmd)) return 0; - if (pmd_bad(*old_pmd)) { - printk("copy_one_pmd: bad page table: probable memory corruption\n"); - pmd_clear(old_pmd); - return 0; - } - old_pte = pte_offset(old_pmd, 0); - if (pte_inuse(old_pte)) { - pte_reuse(old_pte); - *new_pmd = *old_pmd; + if (pmd_bad(*src_pmd)) { + printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd)); + pmd_clear(src_pmd); return 0; } - new_pte = pte_alloc(new_pmd, 0); - if (!new_pte) - return -ENOMEM; - for (j = 0 ; j < PTRS_PER_PTE ; j++) { - copy_one_pte(old_pte, new_pte); - old_pte++; - new_pte++; + src_pte = pte_offset(src_pmd, address); + if (pmd_none(*dst_pmd)) { + if (!pte_alloc(dst_pmd, 0)) + return -ENOMEM; } + dst_pte = pte_offset(dst_pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end >= PMD_SIZE) + end = PMD_SIZE; + do { + /* I would like to switch arguments here, to make it + * consistent with copy_xxx_range and memcpy syntax. 
+ */ + copy_one_pte(src_pte++, dst_pte++, cow); + address += PAGE_SIZE; + } while (address < end); return 0; } -static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd) +static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow) { - int j; - pmd_t *old_pmd, *new_pmd; + pmd_t * src_pmd, * dst_pmd; + unsigned long end; + int error = 0; - if (pgd_none(*old_pgd)) + if (pgd_none(*src_pgd)) return 0; - if (pgd_bad(*old_pgd)) { - printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd)); - pgd_clear(old_pgd); + if (pgd_bad(*src_pgd)) { + printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd)); + pgd_clear(src_pgd); return 0; } - old_pmd = pmd_offset(old_pgd, 0); - if (pmd_inuse(old_pmd)) { - pmd_reuse(old_pmd); - *new_pgd = *old_pgd; - return 0; + src_pmd = pmd_offset(src_pgd, address); + if (pgd_none(*dst_pgd)) { + if (!pmd_alloc(dst_pgd, 0)) + return -ENOMEM; } - new_pmd = pmd_alloc(new_pgd, 0); - if (!new_pmd) - return -ENOMEM; - for (j = 0 ; j < PTRS_PER_PMD ; j++) { - int error = copy_one_pmd(old_pmd, new_pmd); + dst_pmd = pmd_offset(dst_pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow); if (error) - return error; - old_pmd++; - new_pmd++; - } - return 0; + break; + address = (address + PMD_SIZE) & PMD_MASK; + } while (address < end); + return error; } /* - * copy_page_tables() just copies the whole process memory range: - * note the special handling of RESERVED (ie kernel) pages, which - * means that they are always shared by all processes. + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. */ -int copy_page_tables(struct task_struct * tsk) -{ - int i; - pgd_t *old_pgd; - pgd_t *new_pgd; - - new_pgd = pgd_alloc(); - if (!new_pgd) - return -ENOMEM; - SET_PAGE_DIR(tsk, new_pgd); - old_pgd = pgd_offset(current, 0); - for (i = 0 ; i < PTRS_PER_PGD ; i++) { - int errno = copy_one_pgd(old_pgd, new_pgd); - if (errno) { - free_page_tables(tsk); - invalidate(); - return errno; - } - old_pgd++; - new_pgd++; +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + int error = 0, cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + src_pgd = pgd_offset(src, address); + dst_pgd = pgd_offset(dst, address); + flush_cache_range(src, vma->vm_start, vma->vm_end); + flush_cache_range(dst, vma->vm_start, vma->vm_end); + while (address < end) { + error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; } - invalidate(); - return 0; + /* Note that the src ptes get c-o-w treatment, so they change too. 
*/ + flush_tlb_range(src, vma->vm_start, vma->vm_end); + flush_tlb_range(dst, vma->vm_start, vma->vm_end); + return error; } -static inline void forget_pte(pte_t page) +static inline void free_pte(pte_t page) { - if (pte_none(page)) - return; if (pte_present(page)) { - free_page(pte_page(page)); - if (mem_map[MAP_NR(pte_page(page))] & MAP_PAGE_RESERVED) + unsigned long addr = pte_page(page); + if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr))) return; + free_page(addr); if (current->mm->rss <= 0) return; current->mm->rss--; @@ -361,33 +314,45 @@ static inline void forget_pte(pte_t page) swap_free(pte_val(page)); } -static inline void unmap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size) +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + free_pte(page); + } +} + +static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size) { pte_t * pte; - unsigned long end; if (pmd_none(*pmd)) return; if (pmd_bad(*pmd)) { - printk("unmap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); + printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd)); pmd_clear(pmd); return; } pte = pte_offset(pmd, address); address &= ~PMD_MASK; - end = address + size; - if (end >= PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page = *pte; - pte_clear(pte); - forget_pte(page); - address += PAGE_SIZE; + if (address + size > PMD_SIZE) + size = PMD_SIZE - address; + size >>= PAGE_SHIFT; + for (;;) { + pte_t page; + if (!size) + break; + page = *pte; pte++; - } while (address < end); + size--; + if (pte_none(page)) + continue; + pte_clear(pte-1); + free_pte(page); + } } -static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size) +static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size) { pmd_t * pmd; unsigned long end; @@ -395,7 +360,7 @@ static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned if (pgd_none(*dir)) return; if (pgd_bad(*dir)) { - printk("unmap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir)); + printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir)); pgd_clear(dir); return; } @@ -405,28 +370,28 @@ static inline void unmap_pmd_range(pgd_t * dir, unsigned long address, unsigned if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - unmap_pte_range(pmd, address, end - address); + zap_pte_range(pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); } /* - * a more complete version of free_page_tables which performs with page - * granularity. + * remove user pages in a given range. 
*/ -int unmap_page_range(unsigned long address, unsigned long size) +int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { pgd_t * dir; unsigned long end = address + size; - dir = pgd_offset(current, address); + dir = pgd_offset(mm, address); + flush_cache_range(mm, end - size, end); while (address < end) { - unmap_pmd_range(dir, address, end - address); + zap_pmd_range(dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_range(mm, end - size, end); return 0; } @@ -440,7 +405,7 @@ static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigne end = PMD_SIZE; do { pte_t oldpage = *pte; - *pte = zero_pte; + set_pte(pte, zero_pte); forget_pte(oldpage); address += PAGE_SIZE; pte++; @@ -470,11 +435,13 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) { int error = 0; pgd_t * dir; + unsigned long beg = address; unsigned long end = address + size; pte_t zero_pte; zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot)); - dir = pgd_offset(current, address); + dir = pgd_offset(current->mm, address); + flush_cache_range(current->mm, beg, end); while (address < end) { pmd_t *pmd = pmd_alloc(dir, address); error = -ENOMEM; @@ -486,7 +453,7 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_range(current->mm, beg, end); return error; } @@ -496,7 +463,7 @@ int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) * in null mappings (currently treated as "copy-on-access") */ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long offset, pgprot_t prot) + unsigned long phys_addr, pgprot_t prot) { unsigned long end; @@ -505,23 +472,22 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned if (end > PMD_SIZE) end = PMD_SIZE; do { + unsigned long mapnr; pte_t oldpage = *pte; pte_clear(pte); - if (offset >= high_memory || (mem_map[MAP_NR(offset)] & MAP_PAGE_RESERVED)) - *pte = mk_pte(offset, prot); - else if (mem_map[MAP_NR(offset)]) { - mem_map[MAP_NR(offset)]++; - *pte = mk_pte(offset, prot); - } + + mapnr = MAP_NR(__va(phys_addr)); + if (mapnr >= max_mapnr || PageReserved(mem_map+mapnr)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); forget_pte(oldpage); address += PAGE_SIZE; - offset += PAGE_SIZE; + phys_addr += PAGE_SIZE; pte++; } while (address < end); } static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long offset, pgprot_t prot) + unsigned long phys_addr, pgprot_t prot) { unsigned long end; @@ -529,38 +495,40 @@ static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned l end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; - offset -= address; + phys_addr -= address; do { pte_t * pte = pte_alloc(pmd, address); if (!pte) return -ENOMEM; - remap_pte_range(pte, address, end - address, address + offset, prot); + remap_pte_range(pte, address, end - address, address + phys_addr, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); return 0; } -int remap_page_range(unsigned long from, unsigned long offset, unsigned long size, pgprot_t prot) +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) { int error = 0; pgd_t * dir; + unsigned long beg = from; unsigned long end = from + size; - offset -= from; - 
dir = pgd_offset(current, from); + phys_addr -= from; + dir = pgd_offset(current->mm, from); + flush_cache_range(current->mm, beg, end); while (from < end) { pmd_t *pmd = pmd_alloc(dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(pmd, from, end - from, offset + from, prot); + error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_range(current->mm, beg, end); return error; } @@ -570,12 +538,11 @@ int remap_page_range(unsigned long from, unsigned long offset, unsigned long siz static void put_page(pte_t * page_table, pte_t pte) { if (!pte_none(*page_table)) { - printk("put_page: page already exists %08lx\n", pte_val(*page_table)); free_page(pte_page(pte)); return; } -/* no need for invalidate */ - *page_table = pte; +/* no need for flush_tlb */ + set_pte(page_table, pte); } /* @@ -588,11 +555,11 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig pmd_t * pmd; pte_t * pte; - if (page >= high_memory) + if (MAP_NR(page) >= max_mapnr) printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); - if (mem_map[MAP_NR(page)] != 1) + if (mem_map[MAP_NR(page)].count != 1) printk("mem_map disagrees with %08lx at %08lx\n",page,address); - pgd = pgd_offset(tsk,address); + pgd = pgd_offset(tsk->mm,address); pmd = pmd_alloc(pgd, address); if (!pmd) { free_page(page); @@ -607,10 +574,11 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig } if (!pte_none(*pte)) { printk("put_dirty_page: page already exists\n"); - pte_clear(pte); - invalidate(); + free_page(page); + return 0; } - *pte = pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY))); + flush_page_to_ram(page); + set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page, PAGE_COPY)))); /* no need for invalidate */ return page; } @@ -632,8 +600,8 @@ unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsig * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. */ -void do_wp_page(struct vm_area_struct * vma, unsigned long address, - int write_access) +void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access) { pgd_t *page_dir; pmd_t *page_middle; @@ -641,7 +609,7 @@ void do_wp_page(struct vm_area_struct * vma, unsigned long address, unsigned long old_page, new_page; new_page = __get_free_page(GFP_KERNEL); - page_dir = pgd_offset(vma->vm_task,address); + page_dir = pgd_offset(vma->vm_mm, address); if (pgd_none(*page_dir)) goto end_wp_page; if (pgd_bad(*page_dir)) @@ -658,44 +626,49 @@ void do_wp_page(struct vm_area_struct * vma, unsigned long address, if (pte_write(pte)) goto end_wp_page; old_page = pte_page(pte); - if (old_page >= high_memory) + if (MAP_NR(old_page) >= max_mapnr) goto bad_wp_page; - vma->vm_task->mm->min_flt++; + tsk->min_flt++; /* * Do we need to copy? 
*/ - if (mem_map[MAP_NR(old_page)] != 1) { + if (mem_map[MAP_NR(old_page)].count != 1) { if (new_page) { - if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED) - ++vma->vm_task->mm->rss; - copy_page(old_page,new_page); - *page_table = pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))); + if (PageReserved(mem_map + MAP_NR(old_page))) + ++vma->vm_mm->rss; + copy_cow_page(old_page,new_page); + flush_page_to_ram(old_page); + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); free_page(old_page); - invalidate(); + flush_tlb_page(vma, address); return; } - *page_table = BAD_PAGE; + flush_cache_page(vma, address); + set_pte(page_table, BAD_PAGE); + flush_tlb_page(vma, address); free_page(old_page); - oom(vma->vm_task); - invalidate(); + oom(tsk); return; } - *page_table = pte_mkdirty(pte_mkwrite(pte)); - invalidate(); + flush_cache_page(vma, address); + set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); + flush_tlb_page(vma, address); if (new_page) free_page(new_page); return; bad_wp_page: printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); - send_sig(SIGKILL, vma->vm_task, 1); + send_sig(SIGKILL, tsk, 1); goto end_wp_page; bad_wp_pagemiddle: printk("do_wp_page: bogus page-middle at address %08lx (%08lx)\n", address, pmd_val(*page_middle)); - send_sig(SIGKILL, vma->vm_task, 1); + send_sig(SIGKILL, tsk, 1); goto end_wp_page; bad_wp_pagedir: printk("do_wp_page: bogus page-dir entry at address %08lx (%08lx)\n", address, pgd_val(*page_dir)); - send_sig(SIGKILL, vma->vm_task, 1); + send_sig(SIGKILL, tsk, 1); end_wp_page: if (new_page) free_page(new_page); @@ -703,301 +676,94 @@ end_wp_page: } /* - * Ugly, ugly, but the goto's result in better assembly.. + * This function zeroes out partial mmap'ed pages at truncation time.. */ -int verify_area(int type, const void * addr, unsigned long size) +static void partial_clear(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct * vma; - unsigned long start = (unsigned long) addr; - - /* If the current user space is mapped to kernel space (for the - * case where we use a fake user buffer with get_fs/set_fs()) we - * don't expect to find the address in the user vm map. 
- */ - if (get_fs() == get_ds()) - return 0; - - vma = find_vma(current, start); - if (!vma) - goto bad_area; - if (vma->vm_start <= start) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur) - goto bad_area; - -good_area: - if (type == VERIFY_WRITE) - goto check_write; - for (;;) { - struct vm_area_struct * next; - if (!(vma->vm_flags & VM_READ)) - goto bad_area; - if (vma->vm_end - start >= size) - return 0; - next = vma->vm_next; - if (!next || vma->vm_end != next->vm_start) - goto bad_area; - vma = next; - } - -check_write: - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - if (!wp_works_ok) - goto check_wp_fault_by_hand; - for (;;) { - if (vma->vm_end - start >= size) - break; - if (!vma->vm_next || vma->vm_end != vma->vm_next->vm_start) - goto bad_area; - vma = vma->vm_next; - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - } - return 0; - -check_wp_fault_by_hand: - size--; - size += start & ~PAGE_MASK; - size >>= PAGE_SHIFT; - start &= PAGE_MASK; - - for (;;) { - do_wp_page(vma, start, 1); - if (!size) - break; - size--; - start += PAGE_SIZE; - if (start < vma->vm_end) - continue; - vma = vma->vm_next; - if (!vma || vma->vm_start != start) - goto bad_area; - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area;; - } - return 0; - -bad_area: - return -EFAULT; -} - -static inline void get_empty_page(struct vm_area_struct * vma, pte_t * page_table) -{ - unsigned long tmp; + pgd_t *page_dir; + pmd_t *page_middle; + pte_t *page_table, pte; - if (!(tmp = get_free_page(GFP_KERNEL))) { - oom(vma->vm_task); - put_page(page_table, BAD_PAGE); + page_dir = pgd_offset(vma->vm_mm, address); + if (pgd_none(*page_dir)) + return; + if (pgd_bad(*page_dir)) { + printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir)); + pgd_clear(page_dir); return; } - put_page(page_table, pte_mkwrite(mk_pte(tmp, vma->vm_page_prot))); -} - -/* - * try_to_share() checks the page at address "address" in the task "p", - * to see if it exists, and if it is clean. If so, share it with the current - * task. - * - * NOTE! This assumes we have checked that p != current, and that they - * share the same inode and can generally otherwise be shared. - */ -static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area, - unsigned long from_address, struct vm_area_struct * from_area, - unsigned long newpage) -{ - pgd_t * from_dir, * to_dir; - pmd_t * from_middle, * to_middle; - pte_t * from_table, * to_table; - pte_t from, to; - - from_dir = pgd_offset(from_area->vm_task,from_address); -/* is there a page-directory at from? */ - if (pgd_none(*from_dir)) - return 0; - if (pgd_bad(*from_dir)) { - printk("try_to_share: bad page directory %08lx\n", pgd_val(*from_dir)); - pgd_clear(from_dir); - return 0; - } - from_middle = pmd_offset(from_dir, from_address); -/* is there a mid-directory at from? */ - if (pmd_none(*from_middle)) - return 0; - if (pmd_bad(*from_middle)) { - printk("try_to_share: bad mid directory %08lx\n", pmd_val(*from_middle)); - pmd_clear(from_middle); - return 0; - } - from_table = pte_offset(from_middle, from_address); - from = *from_table; -/* is the page present? */ - if (!pte_present(from)) - return 0; -/* if it is dirty it must be from a shared mapping to be shared */ - if (pte_dirty(from)) { - if (!(from_area->vm_flags & VM_SHARED)) - return 0; - if (pte_write(from)) { - printk("nonwritable, but dirty, shared page\n"); - return 0; - } - } -/* is the page reasonable at all? 
*/ - if (pte_page(from) >= high_memory) - return 0; - if (mem_map[MAP_NR(pte_page(from))] & MAP_PAGE_RESERVED) - return 0; -/* is the destination ok? */ - to_dir = pgd_offset(to_area->vm_task,to_address); -/* is there a page-directory at to? */ - if (pgd_none(*to_dir)) - return 0; - if (pgd_bad(*to_dir)) { - printk("try_to_share: bad page directory %08lx\n", pgd_val(*to_dir)); - return 0; - } - to_middle = pmd_offset(to_dir, to_address); -/* is there a mid-directory at to? */ - if (pmd_none(*to_middle)) - return 0; - if (pmd_bad(*to_middle)) { - printk("try_to_share: bad mid directory %08lx\n", pmd_val(*to_middle)); - return 0; - } - to_table = pte_offset(to_middle, to_address); - to = *to_table; - if (!pte_none(to)) - return 0; -/* do we copy? */ - if (newpage) { - /* if it's in the swap cache, it's dirty by implication */ - /* so we can't use it if it's not from a shared mapping */ - if (in_swap_cache(pte_page(from))) { - if (!(from_area->vm_flags & VM_SHARED)) - return 0; - if (!pte_write(from)) { - printk("nonwritable, but dirty, shared page\n"); - return 0; - } - } - copy_page(pte_page(from), newpage); - *to_table = mk_pte(newpage, to_area->vm_page_prot); - return 1; - } -/* - * do a final swap-cache test before sharing them: if it's in the swap - * cache, we have to remove it now, as we get two pointers to the same - * physical page and the cache can't handle it. Mark the original dirty. - * - * NOTE! Even if "from" is dirty, "to" will be clean: if we get here - * with a dirty "from", the from-mapping is a shared map, so we can trust - * the page contents to be up-to-date - */ - if (in_swap_cache(pte_page(from))) { - if (!(from_area->vm_flags & VM_SHARED)) - return 0; - *from_table = pte_mkdirty(from); - delete_from_swap_cache(pte_page(from)); + page_middle = pmd_offset(page_dir, address); + if (pmd_none(*page_middle)) + return; + if (pmd_bad(*page_middle)) { + printk("bad page table directory entry %p:[%lx]\n", page_dir, pgd_val(*page_dir)); + pmd_clear(page_middle); + return; } - mem_map[MAP_NR(pte_page(from))]++; - *to_table = mk_pte(pte_page(from), to_area->vm_page_prot); -/* Check if we need to do anything at all to the 'from' field */ - if (!pte_write(from)) - return 1; - if (from_area->vm_flags & VM_SHARED) - return 1; -/* ok, need to mark it read-only, so invalidate any possible old TB entry */ - *from_table = pte_wrprotect(from); - invalidate(); - return 1; + page_table = pte_offset(page_middle, address); + pte = *page_table; + if (!pte_present(pte)) + return; + flush_cache_page(vma, address); + address &= ~PAGE_MASK; + address += pte_page(pte); + if (MAP_NR(address) >= max_mapnr) + return; + memset((void *) address, 0, PAGE_SIZE - (address & ~PAGE_MASK)); + flush_page_to_ram(pte_page(pte)); } /* - * share_page() tries to find a process that could share a page with - * the current one. + * Handle all mappings that got truncated by a "truncate()" + * system call. * - * We first check if it is at all feasible by checking inode->i_count. - * It should be >1 if there are other tasks sharing this inode. + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. 
*/ -static int share_page(struct vm_area_struct * area, unsigned long address, - int write_access, unsigned long newpage) +void vmtruncate(struct inode * inode, unsigned long offset) { - struct inode * inode; - unsigned long offset; - unsigned long from_address; - unsigned long give_page; struct vm_area_struct * mpnt; - if (!area || !(inode = area->vm_inode) || inode->i_count < 2) - return 0; - /* do we need to copy or can we just share? */ - give_page = 0; - if (write_access && !(area->vm_flags & VM_SHARED)) { - if (!newpage) - return 0; - give_page = newpage; - } - offset = address - area->vm_start + area->vm_offset; - /* See if there is something in the VM we can share pages with. */ - /* Traverse the entire circular i_mmap list, except `area' itself. */ - for (mpnt = area->vm_next_share; mpnt != area; mpnt = mpnt->vm_next_share) { - /* must be same inode */ - if (mpnt->vm_inode != inode) { - printk("Aiee! Corrupt vm_area_struct i_mmap ring\n"); - break; - } - /* offsets must be mutually page-aligned */ - if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK) - continue; - /* the other area must actually cover the wanted page.. */ - from_address = offset + mpnt->vm_start - mpnt->vm_offset; - if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end) + truncate_inode_pages(inode, offset); + if (!inode->i_mmap) + return; + mpnt = inode->i_mmap; + do { + unsigned long start = mpnt->vm_start; + unsigned long len = mpnt->vm_end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_offset >= offset) { + zap_page_range(mpnt->vm_mm, start, len); continue; - /* .. NOW we can actually try to use the same physical page */ - if (!try_to_share(address, area, from_address, mpnt, give_page)) + } + /* mapping wholly unaffected? */ + diff = offset - mpnt->vm_offset; + if (diff >= len) continue; - /* free newpage if we never used it.. */ - if (give_page || !newpage) - return 1; - free_page(newpage); - return 1; - } - return 0; + /* Ok, partially affected.. */ + start += diff; + len = (len - diff) & PAGE_MASK; + if (start & ~PAGE_MASK) { + partial_clear(mpnt, start); + start = (start + ~PAGE_MASK) & PAGE_MASK; + } + zap_page_range(mpnt->vm_mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != inode->i_mmap); } -/* - * fill in an empty page-table if none exists. 
- */ -static inline pte_t * get_empty_pgtable(struct task_struct * tsk,unsigned long address) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - - pgd = pgd_offset(tsk, address); - pmd = pmd_alloc(pgd, address); - if (!pmd) { - oom(tsk); - return NULL; - } - pte = pte_alloc(pmd, address); - if (!pte) { - oom(tsk); - return NULL; - } - return pte; -} -static inline void do_swap_page(struct vm_area_struct * vma, unsigned long address, +static inline void do_swap_page(struct task_struct * tsk, + struct vm_area_struct * vma, unsigned long address, pte_t * page_table, pte_t entry, int write_access) { pte_t page; if (!vma->vm_ops || !vma->vm_ops->swapin) { - swap_in(vma, page_table, pte_val(entry), write_access); + swap_in(tsk, vma, page_table, pte_val(entry), write_access); + flush_page_to_ram(pte_page(*page_table)); return; } page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry)); @@ -1005,11 +771,12 @@ static inline void do_swap_page(struct vm_area_struct * vma, unsigned long addre free_page(pte_page(page)); return; } - if (mem_map[MAP_NR(pte_page(page))] > 1 && !(vma->vm_flags & VM_SHARED)) + if (mem_map[MAP_NR(pte_page(page))].count > 1 && !(vma->vm_flags & VM_SHARED)) page = pte_wrprotect(page); - ++vma->vm_task->mm->rss; - ++vma->vm_task->mm->maj_flt; - *page_table = page; + ++vma->vm_mm->rss; + ++tsk->maj_flt; + flush_page_to_ram(pte_page(page)); + set_pte(page_table, page); return; } @@ -1018,71 +785,94 @@ static inline void do_swap_page(struct vm_area_struct * vma, unsigned long addre * tries to share with existing pages, but makes a separate copy if * the "write_access" parameter is true in order to avoid the next * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. */ -void do_no_page(struct vm_area_struct * vma, unsigned long address, - int write_access) +void do_no_page(struct task_struct * tsk, struct vm_area_struct * vma, + unsigned long address, int write_access) { + pgd_t * pgd; + pmd_t * pmd; pte_t * page_table; pte_t entry; unsigned long page; - page_table = get_empty_pgtable(vma->vm_task,address); + pgd = pgd_offset(tsk->mm, address); + pmd = pmd_alloc(pgd, address); + if (!pmd) + goto no_memory; + page_table = pte_alloc(pmd, address); if (!page_table) - return; + goto no_memory; entry = *page_table; if (pte_present(entry)) - return; - if (!pte_none(entry)) { - do_swap_page(vma, address, page_table, entry, write_access); - return; - } + goto is_present; + if (!pte_none(entry)) + goto swap_page; address &= PAGE_MASK; - if (!vma->vm_ops || !vma->vm_ops->nopage) { - ++vma->vm_task->mm->rss; - ++vma->vm_task->mm->min_flt; - get_empty_page(vma, page_table); - return; - } - page = __get_free_page(GFP_KERNEL); - if (share_page(vma, address, write_access, page)) { - ++vma->vm_task->mm->min_flt; - ++vma->vm_task->mm->rss; - return; - } - if (!page) { - oom(current); - put_page(page_table, BAD_PAGE); - return; - } - ++vma->vm_task->mm->maj_flt; - ++vma->vm_task->mm->rss; + if (!vma->vm_ops || !vma->vm_ops->nopage) + goto anonymous_page; /* - * The fourth argument is "no_share", which tells the low-level code + * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. 
It's * essentially an early COW detection */ - page = vma->vm_ops->nopage(vma, address, page, - write_access && !(vma->vm_flags & VM_SHARED)); - if (share_page(vma, address, write_access, 0)) { - free_page(page); - return; - } + page = vma->vm_ops->nopage(vma, address, + (vma->vm_flags & VM_SHARED)?0:write_access); + if (!page) + goto sigbus; + ++tsk->maj_flt; + ++vma->vm_mm->rss; /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid * for other architectures too. * * Note that if write_access is true, we either now have - * a exclusive copy of the page, or this is a shared mapping, + * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. */ +/* do_no_page might already have flushed the page ... */ + flush_page_to_ram(page); entry = mk_pte(page, vma->vm_page_prot); if (write_access) { entry = pte_mkwrite(pte_mkdirty(entry)); - } else if (mem_map[MAP_NR(page)] > 1 && !(vma->vm_flags & VM_SHARED)) + } else if (mem_map[MAP_NR(page)].count > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); put_page(page_table, entry); + /* no need to invalidate: a not-present page shouldn't be cached */ + return; + +anonymous_page: + entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot)); + if (write_access) { + unsigned long page = get_free_page(GFP_KERNEL); + if (!page) + goto sigbus; + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + vma->vm_mm->rss++; + tsk->min_flt++; + flush_page_to_ram(page); + } + put_page(page_table, entry); + return; + +sigbus: + force_sig(SIGBUS, current); + put_page(page_table, BAD_PAGE); + /* no need to invalidate, wasn't present */ + return; + +swap_page: + do_swap_page(tsk, vma, address, page_table, entry, write_access); + return; + +no_memory: + oom(tsk); +is_present: + return; } /* @@ -1102,17 +892,19 @@ static inline void handle_pte_fault(struct vm_area_struct * vma, unsigned long a int write_access, pte_t * pte) { if (!pte_present(*pte)) { - do_no_page(vma, address, write_access); + do_no_page(current, vma, address, write_access); return; } - *pte = pte_mkyoung(*pte); + set_pte(pte, pte_mkyoung(*pte)); + flush_tlb_page(vma, address); if (!write_access) return; if (pte_write(*pte)) { - *pte = pte_mkdirty(*pte); + set_pte(pte, pte_mkdirty(*pte)); + flush_tlb_page(vma, address); return; } - do_wp_page(vma, address, write_access); + do_wp_page(current, vma, address, write_access); } void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, @@ -1122,7 +914,7 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, pmd_t *pmd; pte_t *pte; - pgd = pgd_offset(vma->vm_task, address); + pgd = pgd_offset(vma->vm_mm, address); pmd = pmd_alloc(pgd, address); if (!pmd) goto no_memory; @@ -1133,5 +925,5 @@ void handle_mm_fault(struct vm_area_struct * vma, unsigned long address, update_mmu_cache(vma, address, *pte); return; no_memory: - oom(vma->vm_task); + oom(current); } diff --git a/mm/mlock.c b/mm/mlock.c new file mode 100644 index 000000000..65b9e5407 --- /dev/null +++ b/mm/mlock.c @@ -0,0 +1,272 @@ +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/pgtable.h> 
+ +static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) +{ + vma->vm_flags = newflags; + return 0; +} + +static inline int mlock_fixup_start(struct vm_area_struct * vma, + unsigned long end, int newflags) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + vma->vm_start = end; + n->vm_end = end; + vma->vm_offset += vma->vm_start - n->vm_start; + n->vm_flags = newflags; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current->mm, n); + return 0; +} + +static inline int mlock_fixup_end(struct vm_area_struct * vma, + unsigned long start, int newflags) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -EAGAIN; + *n = *vma; + vma->vm_end = start; + n->vm_start = start; + n->vm_offset += n->vm_start - vma->vm_start; + n->vm_flags = newflags; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current->mm, n); + return 0; +} + +static inline int mlock_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int newflags) +{ + struct vm_area_struct * left, * right; + + left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!left) + return -EAGAIN; + right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!right) { + kfree(left); + return -EAGAIN; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + vma->vm_start = start; + vma->vm_end = end; + right->vm_start = end; + vma->vm_offset += vma->vm_start - left->vm_start; + right->vm_offset += right->vm_start - left->vm_start; + vma->vm_flags = newflags; + if (vma->vm_inode) + vma->vm_inode->i_count += 2; + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + insert_vm_struct(current->mm, left); + insert_vm_struct(current->mm, right); + return 0; +} + +static int mlock_fixup(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned int newflags) +{ + int pages, retval; + + if (newflags == vma->vm_flags) + return 0; + + if (start == vma->vm_start) { + if (end == vma->vm_end) + retval = mlock_fixup_all(vma, newflags); + else + retval = mlock_fixup_start(vma, end, newflags); + } else { + if (end == vma->vm_end) + retval = mlock_fixup_end(vma, start, newflags); + else + retval = mlock_fixup_middle(vma, start, end, newflags); + } + if (!retval) { + /* keep track of amount of locked VM */ + pages = (end - start) >> PAGE_SHIFT; + if (!(newflags & VM_LOCKED)) + pages = -pages; + vma->vm_mm->locked_vm += pages; + + if (newflags & VM_LOCKED) + while (start < end) { + char c; + get_user(c,(char *) start); + __asm__ __volatile__("": :"r" (c)); + start += PAGE_SIZE; + } + } + return retval; +} + +static int do_mlock(unsigned long start, size_t len, int on) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * next; + int error; + + if (!suser()) + return -EPERM; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start) + return -ENOMEM; + + for (nstart = start ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ + + newflags = vma->vm_flags | VM_LOCKED; + if (!on) + newflags &= ~VM_LOCKED; + + if (vma->vm_end >= end) { + error = mlock_fixup(vma, nstart, end, newflags); + break; + } + + tmp = vma->vm_end; + next = vma->vm_next; + error = mlock_fixup(vma, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + vma = next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + } + merge_segments(current->mm, start, end); + return error; +} + +asmlinkage int sys_mlock(unsigned long start, size_t len) +{ + unsigned long locked; + unsigned long lock_limit; + + len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK; + start &= PAGE_MASK; + + locked = len >> PAGE_SHIFT; + locked += current->mm->locked_vm; + + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + /* check against resource limits */ + if (locked > lock_limit) + return -ENOMEM; + + /* we may lock at most half of physical memory... */ + /* (this check is pretty bogus, but doesn't hurt) */ + if (locked > max_mapnr/2) + return -ENOMEM; + + return do_mlock(start, len, 1); +} + +asmlinkage int sys_munlock(unsigned long start, size_t len) +{ + len = (len + (start & ~PAGE_MASK) + ~PAGE_MASK) & PAGE_MASK; + start &= PAGE_MASK; + return do_mlock(start, len, 0); +} + +static int do_mlockall(int flags) +{ + int error; + unsigned int def_flags; + struct vm_area_struct * vma; + + if (!suser()) + return -EPERM; + + def_flags = 0; + if (flags & MCL_FUTURE) + def_flags = VM_LOCKED; + current->mm->def_flags = def_flags; + + error = 0; + for (vma = current->mm->mmap; vma ; vma = vma->vm_next) { + unsigned int newflags; + + newflags = vma->vm_flags | VM_LOCKED; + if (!(flags & MCL_CURRENT)) + newflags &= ~VM_LOCKED; + error = mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags); + if (error) + break; + } + merge_segments(current->mm, 0, TASK_SIZE); + return error; +} + +asmlinkage int sys_mlockall(int flags) +{ + unsigned long lock_limit; + + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + return -EINVAL; + + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if (current->mm->total_vm > lock_limit) + return -ENOMEM; + + /* we may lock at most half of physical memory... */ + /* (this check is pretty bogus, but doesn't hurt) */ + if (current->mm->total_vm > max_mapnr/2) + return -ENOMEM; + + return do_mlockall(flags); +} + +asmlinkage int sys_munlockall(void) +{ + return do_mlockall(0); +} @@ -12,13 +12,13 @@ #include <linux/mman.h> #include <linux/string.h> #include <linux/malloc.h> +#include <linux/pagemap.h> +#include <linux/swap.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/pgtable.h> -static int anon_map(struct inode *, struct file *, struct vm_area_struct *); - /* * description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected @@ -41,22 +41,126 @@ pgprot_t protection_map[16] = { __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; +/* + * Check that a process has enough memory to allocate a + * new virtual mapping. + */ +static inline int vm_enough_memory(long pages) +{ + /* + * stupid algorithm to decide if we have enough memory: while + * simple, it hopefully works in most obvious cases.. Easy to + * fool it, but this should catch most mistakes. 
+ */ + long freepages; + freepages = buffermem >> PAGE_SHIFT; + freepages += page_cache_size; + freepages >>= 1; + freepages += nr_free_pages; + freepages += nr_swap_pages; + freepages -= max_mapnr >> 4; + return freepages > pages; +} + +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + if (brk < mm->end_code) + return mm->brk; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + return mm->brk = brk; + + /* + * Always allow shrinking brk + */ + if (brk <= mm->brk) { + mm->brk = brk; + do_munmap(newbrk, oldbrk-newbrk); + return brk; + } + /* + * Check against rlimit and stack.. + */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (brk - mm->end_code > rlim) + return mm->brk; + + /* + * Check against existing mmap mappings. + */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + return mm->brk; + + /* + * Check if we have enough memory.. + */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + return mm->brk; + + /* + * Ok, looks good - let it rip. + */ + if(do_mmap(NULL, oldbrk, newbrk-oldbrk, + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE, 0) != oldbrk) + return mm->brk; + return mm->brk = brk; +} + +/* + * Combine the mmap "prot" and "flags" argument into one "vm_flags" used + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits + * into "VM_xxx". + */ +static inline unsigned long vm_flags(unsigned long prot, unsigned long flags) +{ +#define _trans(x,bit1,bit2) \ +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0) + + unsigned long prot_bits, flag_bits; + prot_bits = + _trans(prot, PROT_READ, VM_READ) | + _trans(prot, PROT_WRITE, VM_WRITE) | + _trans(prot, PROT_EXEC, VM_EXEC); + flag_bits = + _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) | + _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) | + _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE); + return prot_bits | flag_bits; +#undef _trans +} + unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long off) { - int error; + struct mm_struct * mm = current->mm; struct vm_area_struct * vma; if ((len = PAGE_ALIGN(len)) == 0) return addr; - if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len) + if (len > TASK_SIZE || addr > TASK_SIZE-len) return -EINVAL; /* offset overflow? */ if (off + len < off) return -EINVAL; + /* mlock MCL_FUTURE? */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + /* * do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open @@ -68,6 +172,11 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, case MAP_SHARED: if ((prot & PROT_WRITE) && !(file->f_mode & 2)) return -EACCES; + /* + * make sure there are no mandatory locks on the file. 
+ */ + if (locks_verify_locked(file->f_inode)) + return -EAGAIN; /* fall through */ case MAP_PRIVATE: if (!(file->f_mode & 1)) @@ -77,8 +186,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, default: return -EINVAL; } - if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0)) - return -ETXTBSY; + if (flags & MAP_DENYWRITE) { + if (file->f_inode->i_writecount > 0) + return -ETXTBSY; + } } else if ((flags & MAP_TYPE) != MAP_PRIVATE) return -EINVAL; @@ -90,8 +201,6 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if (flags & MAP_FIXED) { if (addr & ~PAGE_MASK) return -EINVAL; - if (len > TASK_SIZE || addr > TASK_SIZE - len) - return -EINVAL; } else { addr = get_unmapped_area(addr, len); if (!addr) @@ -111,11 +220,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if (!vma) return -ENOMEM; - vma->vm_task = current; + vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC); - vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE); + vma->vm_flags = vm_flags(prot,flags) | mm->def_flags; if (file) { if (file->f_mode & 1) @@ -145,17 +253,48 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, do_munmap(addr, len); /* Clear old maps */ - if (file) - error = file->f_op->mmap(file->f_inode, file, vma); - else - error = anon_map(NULL, NULL, vma); - - if (error) { + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) { kfree(vma); - return error; + return -ENOMEM; + } + + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE) { + if (!(flags & MAP_NORESERVE) && + !vm_enough_memory(len >> PAGE_SHIFT)) { + kfree(vma); + return -ENOMEM; + } + } + + if (file) { + int error = file->f_op->mmap(file->f_inode, file, vma); + + if (error) { + kfree(vma); + return error; + } + } + + flags = vma->vm_flags; + insert_vm_struct(mm, vma); + merge_segments(mm, vma->vm_start, vma->vm_end); + + /* merge_segments might have merged our vma, so we can't use it any more */ + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + unsigned long start = addr; + mm->locked_vm += len >> PAGE_SHIFT; + do { + char c; + get_user(c,(char *) start); + len -= PAGE_SIZE; + start += PAGE_SIZE; + __asm__ __volatile__("": :"r" (c)); + } while (len > 0); } - insert_vm_struct(current, vma); - merge_segments(current, vma->vm_start, vma->vm_end); return addr; } @@ -174,41 +313,16 @@ unsigned long get_unmapped_area(unsigned long addr, unsigned long len) addr = TASK_SIZE / 3; addr = PAGE_ALIGN(addr); - for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) { + for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { + /* At this point: (!vmm || addr < vmm->vm_end). 
*/ if (TASK_SIZE - len < addr) return 0; - if (!vmm) + if (!vmm || addr + len <= vmm->vm_start) return addr; - if (addr > vmm->vm_end) - continue; - if (addr + len > vmm->vm_start) { - addr = vmm->vm_end; - continue; - } - return addr; + addr = vmm->vm_end; } } -asmlinkage int sys_mmap(unsigned long *buffer) -{ - int error; - unsigned long flags; - struct file * file = NULL; - - error = verify_area(VERIFY_READ, buffer, 6*sizeof(long)); - if (error) - return error; - flags = get_fs_long(buffer+3); - if (!(flags & MAP_ANONYMOUS)) { - unsigned long fd = get_fs_long(buffer+4); - if (fd >= NR_OPEN || !(file = current->files->fd[fd])) - return -EBADF; - } - return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1), - get_fs_long(buffer+2), flags, get_fs_long(buffer+5)); -} - - /* * Searching a VMA in the linear list task->mm->mmap is horribly slow. * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search @@ -230,7 +344,6 @@ asmlinkage int sys_mmap(unsigned long *buffer) * vm_avl_height 1+max(heightof(left),heightof(right)) * The empty tree is represented as NULL. */ -#define avl_empty (struct vm_area_struct *) NULL /* Since the trees are balanced, their height will never be large. */ #define avl_maxheight 41 /* why this? a small exercise */ @@ -243,60 +356,8 @@ asmlinkage int sys_mmap(unsigned long *buffer) * foreach node in tree->vm_avl_right: node->vm_avl_key >= tree->vm_avl_key. */ -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma (struct task_struct * task, unsigned long addr) -{ -#if 0 /* equivalent, but slow */ - struct vm_area_struct * vma; - - for (vma = task->mm->mmap ; ; vma = vma->vm_next) { - if (!vma) - return NULL; - if (vma->vm_end > addr) - return vma; - } -#else - struct vm_area_struct * result = NULL; - struct vm_area_struct * tree; - - for (tree = task->mm->mmap_avl ; ; ) { - if (tree == avl_empty) - return result; - if (tree->vm_end > addr) { - if (tree->vm_start <= addr) - return tree; - result = tree; - tree = tree->vm_avl_left; - } else - tree = tree->vm_avl_right; - } -#endif -} - -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -struct vm_area_struct * find_vma_intersection (struct task_struct * task, unsigned long start_addr, unsigned long end_addr) -{ - struct vm_area_struct * vma; - -#if 0 /* equivalent, but slow */ - for (vma = task->mm->mmap; vma; vma = vma->vm_next) { - if (end_addr <= vma->vm_start) - break; - if (start_addr < vma->vm_end) - return vma; - } - return NULL; -#else - vma = find_vma(task,start_addr); - if (!vma || end_addr <= vma->vm_start) - return NULL; - return vma; -#endif -} - /* Look up the nodes at the left and at the right of a given node. */ -static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) +static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) { vm_avl_key_t key = node->vm_avl_key; @@ -342,7 +403,7 @@ static void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * nodes[0]..nodes[k-1] such that * nodes[0] is the root and nodes[i+1] = nodes[i]->{vm_avl_left|vm_avl_right}. 
*/ -static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) +static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) { for ( ; count > 0 ; count--) { struct vm_area_struct ** nodeplace = *--nodeplaces_ptr; @@ -419,7 +480,7 @@ static void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count) } /* Insert a node into a tree. */ -static void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) +static inline void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct ** ptree) { vm_avl_key_t key = new_node->vm_avl_key; struct vm_area_struct ** nodeplace = ptree; @@ -446,7 +507,7 @@ static void avl_insert (struct vm_area_struct * new_node, struct vm_area_struct /* Insert a node into a tree, and * return the node to the left of it and the node to the right of it. */ -static void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, +static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right) { vm_avl_key_t key = new_node->vm_avl_key; @@ -476,7 +537,7 @@ static void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_a } /* Removes a node out of a tree. */ -static void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) +static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree) { vm_avl_key_t key = node_to_delete->vm_avl_key; struct vm_area_struct ** nodeplace = ptree; @@ -652,7 +713,7 @@ static void avl_check (struct task_struct * task, char *caller) * Case 4 involves the creation of 2 new areas, for each side of * the hole. */ -void unmap_fixup(struct vm_area_struct *area, +static void unmap_fixup(struct vm_area_struct *area, unsigned long addr, size_t len) { struct vm_area_struct *mpnt; @@ -666,6 +727,9 @@ void unmap_fixup(struct vm_area_struct *area, area->vm_start, area->vm_end, addr, end); return; } + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; /* Unmapping the whole area */ if (addr == area->vm_start && end == area->vm_end) { @@ -699,7 +763,7 @@ void unmap_fixup(struct vm_area_struct *area, if (mpnt->vm_ops && mpnt->vm_ops->open) mpnt->vm_ops->open(mpnt); area->vm_end = addr; /* Truncate area */ - insert_vm_struct(current, mpnt); + insert_vm_struct(current->mm, mpnt); } /* construct whatever mapping is needed */ @@ -713,7 +777,7 @@ void unmap_fixup(struct vm_area_struct *area, area->vm_end = area->vm_start; area->vm_ops->close(area); } - insert_vm_struct(current, mpnt); + insert_vm_struct(current->mm, mpnt); } asmlinkage int sys_munmap(unsigned long addr, size_t len) @@ -743,7 +807,7 @@ int do_munmap(unsigned long addr, size_t len) * every area affected in some way (by any overlap) is put * on the list. If nothing is put on, nothing is affected. */ - mpnt = find_vma(current, addr); + mpnt = find_vma(current->mm, addr); if (!mpnt) return 0; avl_neighbours(mpnt, current->mm->mmap_avl, &prev, &next); @@ -768,7 +832,7 @@ int do_munmap(unsigned long addr, size_t len) * If the one of the segments is only being partially unmapped, * it will put new vm_area_struct(s) into the address space. 
*/ - while (free) { + do { unsigned long st, end; mpnt = free; @@ -782,38 +846,47 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, end-st); - + zap_page_range(current->mm, st, end-st); unmap_fixup(mpnt, st, end-st); kfree(mpnt); - } + } while (free); + + /* we could zap the page tables here too.. */ - unmap_page_range(addr, len); return 0; } /* Build the AVL tree corresponding to the VMA list. */ -void build_mmap_avl(struct task_struct * task) +void build_mmap_avl(struct mm_struct * mm) { struct vm_area_struct * vma; - task->mm->mmap_avl = NULL; - for (vma = task->mm->mmap; vma; vma = vma->vm_next) - avl_insert(vma, &task->mm->mmap_avl); + mm->mmap_avl = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) + avl_insert(vma, &mm->mmap_avl); } /* Release all mmaps. */ -void exit_mmap(struct task_struct * task) +void exit_mmap(struct mm_struct * mm) { struct vm_area_struct * mpnt; - mpnt = task->mm->mmap; - task->mm->mmap = NULL; - task->mm->mmap_avl = NULL; + mpnt = mm->mmap; + mm->mmap = NULL; + mm->mmap_avl = NULL; + mm->rss = 0; + mm->total_vm = 0; + mm->locked_vm = 0; while (mpnt) { struct vm_area_struct * next = mpnt->vm_next; - if (mpnt->vm_ops && mpnt->vm_ops->close) - mpnt->vm_ops->close(mpnt); + if (mpnt->vm_ops) { + if (mpnt->vm_ops->unmap) + mpnt->vm_ops->unmap(mpnt, mpnt->vm_start, mpnt->vm_end-mpnt->vm_start); + if (mpnt->vm_ops->close) + mpnt->vm_ops->close(mpnt); + } remove_shared_vm_struct(mpnt); + zap_page_range(mm, mpnt->vm_start, mpnt->vm_end-mpnt->vm_start); if (mpnt->vm_inode) iput(mpnt->vm_inode); kfree(mpnt); @@ -825,7 +898,7 @@ void exit_mmap(struct task_struct * task) * Insert vm structure into process list sorted by address * and into the inode's i_mmap ring. */ -void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp) +void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { struct vm_area_struct *share; struct inode * inode; @@ -833,7 +906,7 @@ void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp) #if 0 /* equivalent, but slow */ struct vm_area_struct **p, *mpnt; - p = &t->mm->mmap; + p = &mm->mmap; while ((mpnt = *p) != NULL) { if (mpnt->vm_start > vmp->vm_start) break; @@ -846,13 +919,13 @@ void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp) #else struct vm_area_struct * prev, * next; - avl_insert_neighbours(vmp, &t->mm->mmap_avl, &prev, &next); - if ((prev ? prev->vm_next : t->mm->mmap) != next) + avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next); + if ((prev ? prev->vm_next : mm->mmap) != next) printk("insert_vm_struct: tree inconsistent with list\n"); if (prev) prev->vm_next = vmp; else - t->mm->mmap = vmp; + mm->mmap = vmp; vmp->vm_next = next; #endif @@ -901,14 +974,16 @@ void remove_shared_vm_struct(struct vm_area_struct *mpnt) * We don't need to traverse the entire list, only those segments * which intersect or are adjacent to a given interval. 
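[Editor's note] merge_segments() below only needs to visit areas that intersect or touch the given interval. A minimal sketch of that interval test, with a hypothetical helper name that is not part of the patch:

#include <stdio.h>

/* Return nonzero if [a_start, a_end) intersects or is adjacent to
 * [b_start, b_end).  "Adjacent" means they share an endpoint, which is
 * exactly the situation in which two mappings may be merged. */
static int overlaps_or_touches(unsigned long a_start, unsigned long a_end,
                               unsigned long b_start, unsigned long b_end)
{
    return a_start <= b_end && b_start <= a_end;
}

int main(void)
{
    printf("%d\n", overlaps_or_touches(0x1000, 0x2000, 0x2000, 0x3000)); /* 1: adjacent */
    printf("%d\n", overlaps_or_touches(0x1000, 0x2000, 0x4000, 0x5000)); /* 0: disjoint */
    return 0;
}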
*/ -void merge_segments (struct task_struct * task, unsigned long start_addr, unsigned long end_addr) +void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { struct vm_area_struct *prev, *mpnt, *next; - mpnt = find_vma(task, start_addr); + down(&mm->mmap_sem); + mpnt = find_vma(mm, start_addr); if (!mpnt) - return; - avl_neighbours(mpnt, task->mm->mmap_avl, &prev, &next); + goto no_vma; + + avl_neighbours(mpnt, mm->mmap_avl, &prev, &next); /* we have prev->vm_next == mpnt && mpnt->vm_next = next */ if (!prev) { @@ -952,7 +1027,7 @@ void merge_segments (struct task_struct * task, unsigned long start_addr, unsign * big segment can possibly merge with the next one. * The old unused mpnt is freed. */ - avl_remove(mpnt, &task->mm->mmap_avl); + avl_remove(mpnt, &mm->mmap_avl); prev->vm_end = mpnt->vm_end; prev->vm_next = mpnt->vm_next; if (mpnt->vm_ops && mpnt->vm_ops->close) { @@ -966,15 +1041,6 @@ void merge_segments (struct task_struct * task, unsigned long start_addr, unsign kfree_s(mpnt, sizeof(*mpnt)); mpnt = prev; } -} - -/* - * Map memory not associated with any file into a process - * address space. Adjacent memory is merged. - */ -static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma) -{ - if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -ENOMEM; - return 0; +no_vma: + up(&mm->mmap_sem); } diff --git a/mm/mprotect.c b/mm/mprotect.c index ecf73730c..5aa7794a4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -13,7 +13,7 @@ #include <linux/string.h> #include <linux/malloc.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -38,7 +38,7 @@ static inline void change_pte_range(pmd_t * pmd, unsigned long address, do { pte_t entry = *pte; if (pte_present(entry)) - *pte = pte_modify(entry, newprot); + set_pte(pte, pte_modify(entry, newprot)); address += PAGE_SIZE; pte++; } while (address < end); @@ -72,14 +72,16 @@ static inline void change_pmd_range(pgd_t * pgd, unsigned long address, static void change_protection(unsigned long start, unsigned long end, pgprot_t newprot) { pgd_t *dir; + unsigned long beg = start; - dir = pgd_offset(current, start); + dir = pgd_offset(current->mm, start); + flush_cache_range(current->mm, beg, end); while (start < end) { change_pmd_range(dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_range(current->mm, beg, end); return; } @@ -110,7 +112,7 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma, n->vm_inode->i_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - insert_vm_struct(current, n); + insert_vm_struct(current->mm, n); return 0; } @@ -133,7 +135,7 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma, n->vm_inode->i_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); - insert_vm_struct(current, n); + insert_vm_struct(current->mm, n); return 0; } @@ -167,8 +169,8 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, vma->vm_ops->open(left); vma->vm_ops->open(right); } - insert_vm_struct(current, left); - insert_vm_struct(current, right); + insert_vm_struct(current->mm, left); + insert_vm_struct(current->mm, right); return 0; } @@ -214,7 +216,7 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) return -EINVAL; if (end == start) return 0; - vma = find_vma(current, start); + vma = find_vma(current->mm, start); if (!vma 
|| vma->vm_start > start) return -EFAULT; @@ -246,6 +248,6 @@ asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) break; } } - merge_segments(current, start, end); + merge_segments(current->mm, start, end); return error; } diff --git a/mm/mremap.c b/mm/mremap.c new file mode 100644 index 000000000..a3e941055 --- /dev/null +++ b/mm/mremap.c @@ -0,0 +1,224 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> +#include <linux/swap.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/pgtable.h> + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + printk("move_one_page: bad source pgd (%08lx)\n", pgd_val(*pgd)); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + printk("move_one_page: bad source pmd (%08lx)\n", pmd_val(*pmd)); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(pmd, addr); + return pte; +} + +static inline int copy_one_pte(pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte = *src; + + if (!pte_none(pte)) { + error++; + if (dst) { + pte_clear(src); + set_pte(dst, pte); + error--; + } + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src; + + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(src, alloc_one_pte(mm, new_addr)); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + flush_tlb_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. 
In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + flush_tlb_range(mm, new_addr, new_addr + len); + zap_page_range(mm, new_addr, new_addr + len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len) +{ + struct vm_area_struct * new_vma; + + new_vma = (struct vm_area_struct *) + kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (new_vma) { + unsigned long new_addr = get_unmapped_area(addr, new_len); + + if (new_addr && !move_page_tables(current->mm, new_addr, addr, old_len)) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); + if (new_vma->vm_inode) + new_vma->vm_inode->i_count++; + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end); + do_munmap(addr, old_len); + current->mm->total_vm += new_len >> PAGE_SHIFT; + return new_addr; + } + kfree(new_vma); + } + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + */ +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags) +{ + struct vm_area_struct *vma; + + if (addr & ~PAGE_MASK) + return -EINVAL; + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + if (old_len > new_len) { + do_munmap(addr+new_len, old_len - new_len); + return addr; + } + + /* + * Ok, we need to grow.. + */ + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + return -EFAULT; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + return -EFAULT; + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + /* old_len exactly to the end of the area.. */ + if (old_len == vma->vm_end - addr && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + vma->vm_end = addr + new_len; + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) + current->mm->locked_vm += pages; + return addr; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
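[Editor's note] sys_mremap() above tries three strategies in order: shrink in place, grow in place, and only then move. A hedged userspace sketch of that decision flow follows; the types, constant and helper name are hypothetical, and the rlimit and locking checks shown in the real code are deliberately omitted.

#include <stdio.h>

#define MREMAP_MAYMOVE_FLAG 1UL   /* hypothetical stand-in for MREMAP_MAYMOVE */

/* Simplified picture of the sys_mremap() decision flow above:
 *  1. shrinking: just unmap the tail and keep the same address;
 *  2. growing: extend in place if nothing is mapped up to addr + new_len;
 *  3. otherwise: move the whole mapping, but only if MAYMOVE was given. */
static const char *mremap_strategy(unsigned long old_len, unsigned long new_len,
                                   unsigned long room_after, unsigned long flags)
{
    if (new_len <= old_len)
        return "shrink in place (unmap the tail)";
    if (room_after >= new_len - old_len)
        return "grow in place (extend vm_end)";
    if (flags & MREMAP_MAYMOVE_FLAG)
        return "allocate a new area and move the page tables";
    return "fail with -ENOMEM";
}

int main(void)
{
    /* a 16K mapping asked to grow to 32K, with only 8K of free room after it */
    printf("%s\n", mremap_strategy(0x4000, 0x8000, 0x2000, MREMAP_MAYMOVE_FLAG));
    return 0;
}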
+ */ + if (flags & MREMAP_MAYMOVE) + return move_vma(vma, addr, old_len, new_len); + return -ENOMEM; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c new file mode 100644 index 000000000..09373b3c9 --- /dev/null +++ b/mm/page_alloc.c @@ -0,0 +1,339 @@ +/* + * linux/mm/page_alloc.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/swapctl.h> +#include <linux/interrupt.h> + +#include <asm/dma.h> +#include <asm/system.h> /* for cli()/sti() */ +#include <asm/uaccess.h> /* for copy_to/from_user */ +#include <asm/bitops.h> +#include <asm/pgtable.h> + +int nr_swap_pages = 0; +int nr_free_pages = 0; + +/* + * Free area management + * + * The free_area_list arrays point to the queue heads of the free areas + * of different sizes + */ + +#define NR_MEM_LISTS 6 + +/* The start of this MUST match the start of "struct page" */ +struct free_area_struct { + struct page *next; + struct page *prev; + unsigned int * map; +}; + +#define memory_head(x) ((struct page *)(x)) + +static struct free_area_struct free_area[NR_MEM_LISTS]; + +static inline void init_mem_queue(struct free_area_struct * head) +{ + head->next = memory_head(head); + head->prev = memory_head(head); +} + +static inline void add_mem_queue(struct free_area_struct * head, struct page * entry) +{ + struct page * next = head->next; + + entry->prev = memory_head(head); + entry->next = next; + next->prev = entry; + head->next = entry; +} + +static inline void remove_mem_queue(struct page * entry) +{ + struct page * next = entry->next; + struct page * prev = entry->prev; + next->prev = prev; + prev->next = next; +} + +/* + * Free_page() adds the page to the free lists. This is optimized for + * fast normal cases (no error jumps taken normally). + * + * The way to optimize jumps for gcc-2.2.2 is to: + * - select the "normal" case and put it inside the if () { XXX } + * - no else-statements if you can avoid them + * + * With the above two rules, you get a straight-line execution path + * for the normal case, giving better asm-code. + * + * free_page() may sleep since the page being freed may be a buffer + * page or present in the swap cache. It will not sleep, however, + * for a freshly allocated page (get_free_page()). + */ + +/* + * Buddy system. Hairy. 
You really aren't expected to understand this + * + * Hint: -mask = 1+~mask + */ +static inline void free_pages_ok(unsigned long map_nr, unsigned long order) +{ + struct free_area_struct *area = free_area + order; + unsigned long index = map_nr >> (1 + order); + unsigned long mask = (~0UL) << order; + unsigned long flags; + + save_flags(flags); + cli(); + +#define list(x) (mem_map+(x)) + + map_nr &= mask; + nr_free_pages -= mask; + while (mask + (1 << (NR_MEM_LISTS-1))) { + if (!change_bit(index, area->map)) + break; + remove_mem_queue(list(map_nr ^ -mask)); + mask <<= 1; + area++; + index >>= 1; + map_nr &= mask; + } + add_mem_queue(area, list(map_nr)); + +#undef list + + restore_flags(flags); +} + +void __free_page(struct page *page) +{ + if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { + unsigned long map_nr = page->map_nr; + delete_from_swap_cache(map_nr); + free_pages_ok(map_nr, 0); + } +} + +void free_pages(unsigned long addr, unsigned long order) +{ + unsigned long map_nr = MAP_NR(addr); + + if (map_nr < max_mapnr) { + mem_map_t * map = mem_map + map_nr; + if (PageReserved(map)) + return; + if (atomic_dec_and_test(&map->count)) { + delete_from_swap_cache(map_nr); + free_pages_ok(map_nr, order); + return; + } + } +} + +/* + * Some ugly macros to speed up __get_free_pages().. + */ +#define MARK_USED(index, order, area) \ + change_bit((index) >> (1+(order)), (area)->map) +#define CAN_DMA(x) (PageDMA(x)) +#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) +#define RMQUEUE(order, dma) \ +do { struct free_area_struct * area = free_area+order; \ + unsigned long new_order = order; \ + do { struct page *prev = memory_head(area), *ret; \ + while (memory_head(area) != (ret = prev->next)) { \ + if (!dma || CAN_DMA(ret)) { \ + unsigned long map_nr = ret->map_nr; \ + (prev->next = ret->next)->prev = prev; \ + MARK_USED(map_nr, new_order, area); \ + nr_free_pages -= 1 << order; \ + EXPAND(ret, map_nr, order, new_order, area); \ + restore_flags(flags); \ + return ADDRESS(map_nr); \ + } \ + prev = ret; \ + } \ + new_order++; area++; \ + } while (new_order < NR_MEM_LISTS); \ +} while (0) + +#define EXPAND(map,index,low,high,area) \ +do { unsigned long size = 1 << high; \ + while (high > low) { \ + area--; high--; size >>= 1; \ + add_mem_queue(area, map); \ + MARK_USED(index, high, area); \ + index += size; \ + map += size; \ + } \ + map->count = 1; \ + map->age = PAGE_INITIAL_AGE; \ +} while (0) + +unsigned long __get_free_pages(int priority, unsigned long order, int dma) +{ + unsigned long flags; + int reserved_pages; + + if (order >= NR_MEM_LISTS) + return 0; + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + return_address()); + priority = GFP_ATOMIC; + } + } + reserved_pages = 5; + if (priority != GFP_NFS) + reserved_pages = min_free_pages; + save_flags(flags); +repeat: + cli(); + if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { + RMQUEUE(order, dma); + restore_flags(flags); + return 0; + } + restore_flags(flags); + if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) + goto repeat; + return 0; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. 
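[Editor's note] The buddy arithmetic in free_pages_ok() earlier in this file is terse. A small standalone illustration of the same index math (not kernel code): for an order-aligned block starting at map_nr, the buddy is map_nr ^ (1 << order), which the kernel writes as map_nr ^ -mask because mask = ~0UL << order, so -mask = 1 + ~mask = 1 << order.

#include <stdio.h>

/* Buddy index math used by free_pages_ok() above: with
 *   mask = ~0UL << order
 * the kernel computes the buddy of an order-aligned block as
 *   map_nr ^ -mask
 * and -mask is just 1 << order (the "Hint: -mask = 1+~mask"). */
static unsigned long buddy_of(unsigned long map_nr, unsigned long order)
{
    unsigned long mask = ~0UL << order;

    map_nr &= mask;               /* align to the block size, as the kernel does */
    return map_nr ^ -mask;        /* same as map_nr ^ (1UL << order) */
}

int main(void)
{
    /* page 12, order 2 (a 4-page block): its buddy starts at page 8 */
    printf("buddy of 12 at order 2 is %lu\n", buddy_of(12, 2));
    return 0;
}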
+ */ +void show_free_areas(void) +{ + unsigned long order, flags; + unsigned long total = 0; + + printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + save_flags(flags); + cli(); + for (order=0 ; order < NR_MEM_LISTS; order++) { + struct page * tmp; + unsigned long nr = 0; + for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) { + nr ++; + } + total += nr * ((PAGE_SIZE>>10) << order); + printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); + } + restore_flags(flags); + printk("= %lukB)\n", total); +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +/* + * set up the free-area data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +{ + mem_map_t * p; + unsigned long mask = PAGE_MASK; + int i; + + /* + * select nr of pages we try to keep free for important stuff + * with a minimum of 16 pages. This is totally arbitrary + */ + i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7); + if (i < 16) + i = 16; + min_free_pages = i; + free_pages_low = i + (i>>1); + free_pages_high = i + i; + start_mem = init_swap_cache(start_mem, end_mem); + mem_map = (mem_map_t *) start_mem; + p = mem_map + MAP_NR(end_mem); + start_mem = LONG_ALIGN((unsigned long) p); + memset(mem_map, 0, start_mem - (unsigned long) mem_map); + do { + --p; + p->flags = (1 << PG_DMA) | (1 << PG_reserved); + p->map_nr = p - mem_map; + } while (p > mem_map); + + for (i = 0 ; i < NR_MEM_LISTS ; i++) { + unsigned long bitmap_size; + init_mem_queue(free_area+i); + mask += mask; + end_mem = (end_mem + ~mask) & mask; + bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i); + bitmap_size = (bitmap_size + 7) >> 3; + bitmap_size = LONG_ALIGN(bitmap_size); + free_area[i].map = (unsigned int *) start_mem; + memset((void *) start_mem, 0, bitmap_size); + start_mem += bitmap_size; + } + return start_mem; +} + +/* + * The tests may look silly, but it essentially makes sure that + * no other process did a swap-in on us just as we were waiting. + * + * Also, don't bother to add to the swap cache if this page-in + * was due to a write access. + */ +void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, + pte_t * page_table, unsigned long entry, int write_access) +{ + unsigned long page = __get_free_page(GFP_KERNEL); + + if (pte_val(*page_table) != entry) { + free_page(page); + return; + } + if (!page) { + set_pte(page_table, BAD_PAGE); + swap_free(entry); + oom(tsk); + return; + } + read_swap_page(entry, (char *) page); + if (pte_val(*page_table) != entry) { + free_page(page); + return; + } + vma->vm_mm->rss++; + tsk->maj_flt++; + if (!write_access && add_to_swap_cache(MAP_NR(page), entry)) { + /* keep swap page allocated for the moment (swap cache) */ + set_pte(page_table, mk_pte(page, vma->vm_page_prot)); + return; + } + set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); + swap_free(entry); + return; +} + diff --git a/mm/page_io.c b/mm/page_io.c new file mode 100644 index 000000000..9980c52b7 --- /dev/null +++ b/mm/page_io.c @@ -0,0 +1,193 @@ +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. Stephen Tweedie + * Removed race in async swapping. 14.4.1996. 
Bruno Haible + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/locks.h> +#include <linux/swapctl.h> + +#include <asm/dma.h> +#include <asm/system.h> /* for cli()/sti() */ +#include <asm/uaccess.h> /* for copy_to/from_user */ +#include <asm/bitops.h> +#include <asm/pgtable.h> + +static struct wait_queue * lock_queue = NULL; + +/* + * Reads or writes a swap page. + * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. + * + * Important prevention of race condition: The first thing we do is set a lock + * on this swap page, which lasts until I/O completes. This way a + * write_swap_page(entry) immediately followed by a read_swap_page(entry) + * on the same entry will first complete the write_swap_page(). Fortunately, + * not more than one write_swap_page() request can be pending per entry. So + * all races the caller must catch are: multiple read_swap_page() requests + * on the same entry. + */ +void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) +{ + unsigned long type, offset; + struct swap_info_struct * p; + struct page *page; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk("Internal error: bad swap-device\n"); + return; + } + p = &swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("rw_swap_page: weirdness\n"); + return; + } + if (p->swap_map && !p->swap_map[offset]) { + printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to swap to unused swap-device\n"); + return; + } + /* Make sure we are the only process doing I/O with this swap page. */ + while (set_bit(offset,p->swap_lockmap)) { + run_task_queue(&tq_disk); + sleep_on(&lock_queue); + } + if (rw == READ) + kstat.pswpin++; + else + kstat.pswpout++; + page = mem_map + MAP_NR(buf); + atomic_inc(&page->count); + wait_on_page(page); + if (p->swap_device) { + if (!wait) { + set_bit(PG_free_after, &page->flags); + set_bit(PG_decr_after, &page->flags); + set_bit(PG_swap_unlock_after, &page->flags); + page->swap_unlock_entry = entry; + atomic_inc(&nr_async_pages); + } + ll_rw_page(rw,p->swap_device,offset,buf); + /* + * NOTE! We don't decrement the page count if we + * don't wait - that will happen asynchronously + * when the IO completes. + */ + if (!wait) + return; + wait_on_page(page); + } else if (p->swap_file) { + struct inode *swapf = p->swap_file; + unsigned int zones[PAGE_SIZE/512]; + int i; + if (swapf->i_op->bmap == NULL + && swapf->i_op->smap != NULL){ + /* + With MsDOS, we use msdos_smap which return + a sector number (not a cluster or block number). + It is a patch to enable the UMSDOS project. + Other people are working on better solution. + + It sounds like ll_rw_swap_file defined + it operation size (sector size) based on + PAGE_SIZE and the number of block to read. + So using bmap or smap should work even if + smap will require more blocks. 
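[Editor's note] In the file-backed branch of rw_swap_page() just below, a page-sized swap slot is mapped onto several filesystem blocks. A standalone sketch of that shift arithmetic, assuming 4096-byte pages purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT_EX 12U          /* assume 4096-byte pages for the example */

/* First block number of swap slot 'offset' on a swap file whose
 * filesystem uses blocks of (1 << blocksize_bits) bytes.  This mirrors
 *   block = offset << (PAGE_SHIFT - s_blocksize_bits)
 * in rw_swap_page(); the smap path is the special case of 512-byte
 * sectors, where the shift becomes PAGE_SHIFT - 9 = 3. */
static unsigned int first_block(unsigned int offset, unsigned int blocksize_bits)
{
    return offset << (PAGE_SHIFT_EX - blocksize_bits);
}

int main(void)
{
    /* swap slot 5 on a 1K-block filesystem starts at block 20
     * and covers PAGE_SIZE / 1024 = 4 consecutive blocks */
    printf("slot 5, 1K blocks: first block %u\n", first_block(5, 10));
    /* the 512-byte sector case used by the smap path: 5 << 3 = 40 */
    printf("slot 5, 512B sectors: first sector %u\n", first_block(5, 9));
    return 0;
}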
+ */ + int j; + unsigned int block = offset << 3; + + for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){ + if (!(zones[i] = swapf->i_op->smap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + return; + } + } + }else{ + int j; + unsigned int block = offset + << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); + + for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) + if (!(zones[i] = bmap(swapf,block++))) { + printk("rw_swap_page: bad swap file\n"); + } + } + ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); + } else + printk("rw_swap_page: no swap file or device\n"); + atomic_dec(&page->count); + if (offset && !clear_bit(offset,p->swap_lockmap)) + printk("rw_swap_page: lock already cleared\n"); + wake_up(&lock_queue); +} + +/* This is run when asynchronous page I/O has completed. */ +void swap_after_unlock_page (unsigned long entry) +{ + unsigned long type, offset; + struct swap_info_struct * p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk("swap_after_unlock_page: bad swap-device\n"); + return; + } + p = &swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("swap_after_unlock_page: weirdness\n"); + return; + } + if (!clear_bit(offset,p->swap_lockmap)) + printk("swap_after_unlock_page: lock already cleared\n"); + wake_up(&lock_queue); +} + +/* + * Swap partitions are now read via brw_page. ll_rw_page is an + * asynchronous function now --- we must call wait_on_page afterwards + * if synchronous IO is required. + */ +void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) +{ + int block = offset; + struct page *page; + + switch (rw) { + case READ: + break; + case WRITE: + if (is_read_only(dev)) { + printk("Can't page to read-only device %s\n", + kdevname(dev)); + return; + } + break; + default: + panic("ll_rw_page: bad block dev cmd, must be R/W"); + } + page = mem_map + MAP_NR(buffer); + if (set_bit(PG_locked, &page->flags)) + panic ("ll_rw_page: page already locked"); + brw_page(rw, page, dev, &block, PAGE_SIZE, 0); +} @@ -7,6 +7,8 @@ /* * This file should contain most things doing the swapping from/to disk. * Started 18.12.91 + * + * Swap aging added 23.2.95, Stephen Tweedie. 
*/ #include <linux/mm.h> @@ -17,1215 +19,88 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> +#include <linux/swap.h> #include <linux/fs.h> +#include <linux/swapctl.h> +#include <linux/pagemap.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ +#include <asm/uaccess.h> /* for copy_to/from_user */ #include <asm/bitops.h> #include <asm/pgtable.h> -#define MAX_SWAPFILES 8 - -#define SWP_USED 1 -#define SWP_WRITEOK 3 - -int min_free_pages = 20; - -static int nr_swapfiles = 0; -static struct wait_queue * lock_queue = NULL; - -static struct swap_info_struct { - unsigned long flags; - struct inode * swap_file; - unsigned int swap_device; - unsigned char * swap_map; - unsigned char * swap_lockmap; - int pages; - int lowest_bit; - int highest_bit; - unsigned long max; -} swap_info[MAX_SWAPFILES]; - -extern int shm_swap (int); - -unsigned long *swap_cache; - -#ifdef SWAP_CACHE_INFO -unsigned long swap_cache_add_total = 0; -unsigned long swap_cache_add_success = 0; -unsigned long swap_cache_del_total = 0; -unsigned long swap_cache_del_success = 0; -unsigned long swap_cache_find_total = 0; -unsigned long swap_cache_find_success = 0; - -extern inline void show_swap_cache_info(void) -{ - printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", - swap_cache_add_total, swap_cache_add_success, - swap_cache_del_total, swap_cache_del_success, - swap_cache_find_total, swap_cache_find_success); -} -#endif - -static int add_to_swap_cache(unsigned long addr, unsigned long entry) -{ - struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; - -#ifdef SWAP_CACHE_INFO - swap_cache_add_total++; -#endif - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - entry = (unsigned long) xchg_ptr(swap_cache + MAP_NR(addr), (void *) entry); - if (entry) { - printk("swap_cache: replacing non-NULL entry\n"); - } -#ifdef SWAP_CACHE_INFO - swap_cache_add_success++; -#endif - return 1; - } - return 0; -} - -static unsigned long init_swap_cache(unsigned long mem_start, - unsigned long mem_end) -{ - unsigned long swap_cache_size; - - mem_start = (mem_start + 15) & ~15; - swap_cache = (unsigned long *) mem_start; - swap_cache_size = MAP_NR(mem_end); - memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long)); - return (unsigned long) (swap_cache + swap_cache_size); -} - -void rw_swap_page(int rw, unsigned long entry, char * buf) -{ - unsigned long type, offset; - struct swap_info_struct * p; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) { - printk("Internal error: bad swap-device\n"); - return; - } - p = &swap_info[type]; - offset = SWP_OFFSET(entry); - if (offset >= p->max) { - printk("rw_swap_page: weirdness\n"); - return; - } - if (p->swap_map && !p->swap_map[offset]) { - printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry); - return; - } - if (!(p->flags & SWP_USED)) { - printk("Trying to swap to unused swap-device\n"); - return; - } - while (set_bit(offset,p->swap_lockmap)) - sleep_on(&lock_queue); - if (rw == READ) - kstat.pswpin++; - else - kstat.pswpout++; - if (p->swap_device) { - ll_rw_page(rw,p->swap_device,offset,buf); - } else if (p->swap_file) { - struct inode *swapf = p->swap_file; - unsigned int zones[PAGE_SIZE/512]; - int i; - if (swapf->i_op->bmap == NULL - && swapf->i_op->smap != NULL){ - /* - With MsDOS, we use msdos_smap which return - a sector number (not a cluster or block number). - It is a patch to enable the UMSDOS project. - Other people are working on better solution. 
- - It sounds like ll_rw_swap_file defined - it operation size (sector size) based on - PAGE_SIZE and the number of block to read. - So using bmap or smap should work even if - smap will require more blocks. - */ - int j; - unsigned int block = offset << 3; - - for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){ - if (!(zones[i] = swapf->i_op->smap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return; - } - } - }else{ - int j; - unsigned int block = offset - << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - - for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) - if (!(zones[i] = bmap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return; - } - } - ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); - } else - printk("re_swap_page: no swap file or device\n"); - if (offset && !clear_bit(offset,p->swap_lockmap)) - printk("rw_swap_page: lock already cleared\n"); - wake_up(&lock_queue); -} - -unsigned long get_swap_page(void) -{ - struct swap_info_struct * p; - unsigned long offset, type; - - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) { - if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) - continue; - for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) { - if (p->swap_map[offset]) - continue; - if (test_bit(offset, p->swap_lockmap)) - continue; - p->swap_map[offset] = 1; - nr_swap_pages--; - if (offset == p->highest_bit) - p->highest_bit--; - p->lowest_bit = offset; - return SWP_ENTRY(type,offset); - } - } - return 0; -} - -void swap_duplicate(unsigned long entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - - if (!entry) - return; - offset = SWP_OFFSET(entry); - type = SWP_TYPE(entry); - if (type == SHM_SWP_TYPE) - return; - if (type >= nr_swapfiles) { - printk("Trying to duplicate nonexistent swap-page\n"); - return; - } - p = type + swap_info; - if (offset >= p->max) { - printk("swap_duplicate: weirdness\n"); - return; - } - if (!p->swap_map[offset]) { - printk("swap_duplicate: trying to duplicate unused page\n"); - return; - } - p->swap_map[offset]++; - return; -} - -void swap_free(unsigned long entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - - if (!entry) - return; - type = SWP_TYPE(entry); - if (type == SHM_SWP_TYPE) - return; - if (type >= nr_swapfiles) { - printk("Trying to free nonexistent swap-page\n"); - return; - } - p = & swap_info[type]; - offset = SWP_OFFSET(entry); - if (offset >= p->max) { - printk("swap_free: weirdness\n"); - return; - } - if (!(p->flags & SWP_USED)) { - printk("Trying to free swap from unused swap-device\n"); - return; - } - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (!p->swap_map[offset]) - printk("swap_free: swap-space map bad (entry %08lx)\n",entry); - else - if (!--p->swap_map[offset]) - nr_swap_pages++; -} - -/* - * The tests may look silly, but it essentially makes sure that - * no other process did a swap-in on us just as we were waiting. - * - * Also, don't bother to add to the swap cache if this page-in - * was due to a write access. 
- */ -void swap_in(struct vm_area_struct * vma, pte_t * page_table, - unsigned long entry, int write_access) -{ - unsigned long page = get_free_page(GFP_KERNEL); - - if (pte_val(*page_table) != entry) { - free_page(page); - return; - } - if (!page) { - *page_table = BAD_PAGE; - swap_free(entry); - oom(current); - return; - } - read_swap_page(entry, (char *) page); - if (pte_val(*page_table) != entry) { - free_page(page); - return; - } - vma->vm_task->mm->rss++; - vma->vm_task->mm->maj_flt++; - if (!write_access && add_to_swap_cache(page, entry)) { - *page_table = mk_pte(page, vma->vm_page_prot); - return; - } - *page_table = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - swap_free(entry); - return; -} - -/* - * The swap-out functions return 1 if they successfully - * threw something out, and we got a free page. It returns - * zero if it couldn't do anything, and any other value - * indicates it decreased rss, but the page was shared. - * - * NOTE! If it sleeps, it *must* return 1 to make sure we - * don't continue with the swap-out. Otherwise we may be - * using a process that no longer actually exists (it might - * have died while we slept). - */ -static inline int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table) -{ - pte_t pte; - unsigned long entry; - unsigned long page; - - pte = *page_table; - if (!pte_present(pte)) - return 0; - page = pte_page(pte); - if (page >= high_memory) - return 0; - if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) - return 0; - if ((pte_dirty(pte) && delete_from_swap_cache(page)) || pte_young(pte)) { - *page_table = pte_mkold(pte); - return 0; - } - if (pte_dirty(pte)) { - if (mem_map[MAP_NR(page)] != 1) - return 0; - if (vma->vm_ops && vma->vm_ops->swapout) { - vma->vm_task->mm->rss--; - vma->vm_ops->swapout(vma, address-vma->vm_start, page_table); - } else { - if (!(entry = get_swap_page())) - return 0; - vma->vm_task->mm->rss--; - pte_val(*page_table) = entry; - invalidate(); - write_swap_page(entry, (char *) page); - } - free_page(page); - return 1; /* we slept: the process may not exist any more */ - } - if ((entry = find_in_swap_cache(page))) { - if (mem_map[MAP_NR(page)] != 1) { - *page_table = pte_mkdirty(pte); - printk("Aiee.. duplicated cached swap-cache entry\n"); - return 0; - } - vma->vm_task->mm->rss--; - pte_val(*page_table) = entry; - invalidate(); - free_page(page); - return 1; - } - vma->vm_task->mm->rss--; - pte_clear(page_table); - invalidate(); - entry = mem_map[MAP_NR(page)]; - free_page(page); - return entry; -} - -/* - * A new implementation of swap_out(). We do not swap complete processes, - * but only a small number of blocks, before we continue with the next - * process. The number of blocks actually swapped is determined on the - * number of page faults, that this process actually had in the last time, - * so we won't swap heavily used processes all the time ... - * - * Note: the priority argument is a hint on much CPU to waste with the - * swap block search, not a hint, of how much blocks to swap with - * each process. 
- * - * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de - */ - -/* - * These are the minimum and maximum number of pages to swap from one process, - * before proceeding to the next: - */ -#define SWAP_MIN 4 -#define SWAP_MAX 32 - -/* - * The actual number of pages to swap is determined as: - * SWAP_RATIO / (number of recent major page faults) - */ -#define SWAP_RATIO 128 - -static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long end) -{ - pte_t * pte; - unsigned long pmd_end; - - if (pmd_none(*dir)) - return 0; - if (pmd_bad(*dir)) { - printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); - pmd_clear(dir); - return 0; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; - - do { - int result; - vma->vm_task->mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(vma, address, pte); - if (result) - return result; - address += PAGE_SIZE; - pte++; - } while (address < end); - return 0; -} - -static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long end) -{ - pmd_t * pmd; - unsigned long pgd_end; - - if (pgd_none(*dir)) - return 0; - if (pgd_bad(*dir)) { - printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); - pgd_clear(dir); - return 0; - } - - pmd = pmd_offset(dir, address); - - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (end > pgd_end) - end = pgd_end; - - do { - int result = swap_out_pmd(vma, pmd, address, end); - if (result) - return result; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return 0; -} - -static int swap_out_vma(struct vm_area_struct * vma, pgd_t *pgdir, - unsigned long start) -{ - unsigned long end; - - /* Don't swap out areas like shared memory which have their - own separate swapping mechanism. */ - if (vma->vm_flags & VM_SHM) - return 0; - - end = vma->vm_end; - while (start < end) { - int result = swap_out_pgd(vma, pgdir, start, end); - if (result) - return result; - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } - return 0; -} - -static int swap_out_process(struct task_struct * p) -{ - unsigned long address; - struct vm_area_struct* vma; - - /* - * Go through process' page directory. - */ - address = p->mm->swap_address; - p->mm->swap_address = 0; - - /* - * Find the proper vm-area - */ - vma = find_vma(p, address); - if (!vma) - return 0; - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - int result = swap_out_vma(vma, pgd_offset(p, address), address); - if (result) - return result; - vma = vma->vm_next; - if (!vma) - break; - address = vma->vm_start; - } - p->mm->swap_address = 0; - return 0; -} - -static int swap_out(unsigned int priority) -{ - static int swap_task; - int loop, counter; - struct task_struct *p; - - counter = 6*nr_tasks >> priority; - for(; counter >= 0; counter--) { - /* - * Check that swap_task is suitable for swapping. If not, look for - * the next suitable process. - */ - loop = 0; - while(1) { - if (swap_task >= NR_TASKS) { - swap_task = 1; - if (loop) - /* all processes are unswappable or already swapped out */ - return 0; - loop = 1; - } - - p = task[swap_task]; - if (p && p->mm->swappable && p->mm->rss) - break; - - swap_task++; - } - - /* - * Determine the number of pages to swap from this process. 
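[Editor's note] The SWAP_RATIO heuristic in the code being removed here is easiest to see as a clamp. A small standalone rendering, using the values from the #defines above; this only restates the deleted swap_out() logic, it is not part of the new code.

#include <stdio.h>

#define SWAP_MIN   4
#define SWAP_MAX   32
#define SWAP_RATIO 128

/* Number of pages to take from one process before moving on, as in the
 * removed swap_out(): SWAP_RATIO divided by the recent fault estimate,
 * clamped to the range [SWAP_MIN, SWAP_MAX]. */
static int pages_to_swap(int dec_flt)
{
    if (dec_flt >= SWAP_RATIO / SWAP_MIN)
        return SWAP_MIN;        /* faulting heavily: take only a few pages */
    if (dec_flt <= SWAP_RATIO / SWAP_MAX)
        return SWAP_MAX;        /* barely faulting: take the maximum */
    return SWAP_RATIO / dec_flt;
}

int main(void)
{
    printf("%d %d %d\n", pages_to_swap(40), pages_to_swap(16), pages_to_swap(2));
    /* prints: 4 8 32 */
    return 0;
}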
- */ - if (!p->mm->swap_cnt) { - p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt; - p->mm->old_maj_flt = p->mm->maj_flt; - - if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) { - p->mm->dec_flt = SWAP_RATIO / SWAP_MIN; - p->mm->swap_cnt = SWAP_MIN; - } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX) - p->mm->swap_cnt = SWAP_MAX; - else - p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt; - } - if (!--p->mm->swap_cnt) - swap_task++; - switch (swap_out_process(p)) { - case 0: - if (p->mm->swap_cnt) - swap_task++; - break; - case 1: - return 1; - default: - break; - } - } - return 0; -} - /* - * we keep on shrinking one resource until it's considered "too hard", - * and then switch to the next one (priority being an indication on how - * hard we should try with the resource). + * We identify three levels of free memory. We never let free mem + * fall below the min_free_pages except for atomic allocations. We + * start background swapping if we fall below free_pages_high free + * pages, and we begin intensive swapping below free_pages_low. * - * This should automatically find the resource that can most easily be - * free'd, so hopefully we'll get reasonable behaviour even under very - * different circumstances. + * Keep these three variables contiguous for sysctl(2). */ -static int try_to_free_page(int priority) -{ - static int state = 0; - int i=6; - - switch (state) { - do { - case 0: - if (priority != GFP_NOBUFFER && shrink_buffers(i)) - return 1; - state = 1; - case 1: - if (shm_swap(i)) - return 1; - state = 2; - default: - if (swap_out(i)) - return 1; - state = 0; - } while(i--); - } - return 0; -} - -static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry) -{ - entry->prev = head; - (entry->next = head->next)->prev = entry; - head->next = entry; -} - -static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry) -{ - entry->next->prev = entry->prev; - entry->prev->next = entry->next; -} - -/* - * Free_page() adds the page to the free lists. This is optimized for - * fast normal cases (no error jumps taken normally). - * - * The way to optimize jumps for gcc-2.2.2 is to: - * - select the "normal" case and put it inside the if () { XXX } - * - no else-statements if you can avoid them - * - * With the above two rules, you get a straight-line execution path - * for the normal case, giving better asm-code. - * - * free_page() may sleep since the page being freed may be a buffer - * page or present in the swap cache. It will not sleep, however, - * for a freshly allocated page (get_free_page()). - */ - -/* - * Buddy system. Hairy. 
You really aren't expected to understand this - */ -static inline void free_pages_ok(unsigned long addr, unsigned long order) -{ - unsigned long index = MAP_NR(addr) >> (1 + order); - unsigned long mask = PAGE_MASK << order; - - addr &= mask; - nr_free_pages += 1 << order; - while (order < NR_MEM_LISTS-1) { - if (!change_bit(index, free_area_map[order])) - break; - remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask))); - order++; - index >>= 1; - mask <<= 1; - addr &= mask; - } - add_mem_queue(free_area_list+order, (struct mem_list *) addr); -} - -static inline void check_free_buffers(unsigned long addr) -{ - struct buffer_head * bh; - - bh = buffer_pages[MAP_NR(addr)]; - if (bh) { - struct buffer_head *tmp = bh; - do { - if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff) - refile_buffer(tmp); - tmp = tmp->b_this_page; - } while (tmp != bh); - } -} - -void free_pages(unsigned long addr, unsigned long order) -{ - if (addr < high_memory) { - unsigned long flag; - mem_map_t * map = mem_map + MAP_NR(addr); - if (*map) { - if (!(*map & MAP_PAGE_RESERVED)) { - save_flags(flag); - cli(); - if (!--*map) { - free_pages_ok(addr, order); - delete_from_swap_cache(addr); - } - restore_flags(flag); - if (*map == 1) - check_free_buffers(addr); - } - return; - } - printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr); - printk("PC = %p\n", __builtin_return_address(0)); - return; - } -} - -/* - * Some ugly macros to speed up __get_free_pages().. - */ -#define RMQUEUE(order) \ -do { struct mem_list * queue = free_area_list+order; \ - unsigned long new_order = order; \ - do { struct mem_list *next = queue->next; \ - if (queue != next) { \ - (queue->next = next->next)->prev = queue; \ - mark_used((unsigned long) next, new_order); \ - nr_free_pages -= 1 << order; \ - restore_flags(flags); \ - EXPAND(next, order, new_order); \ - return (unsigned long) next; \ - } new_order++; queue++; \ - } while (new_order < NR_MEM_LISTS); \ -} while (0) - -static inline int mark_used(unsigned long addr, unsigned long order) -{ - return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]); -} - -#define EXPAND(addr,low,high) \ -do { unsigned long size = PAGE_SIZE << high; \ - while (high > low) { \ - high--; size >>= 1; cli(); \ - add_mem_queue(free_area_list+high, addr); \ - mark_used((unsigned long) addr, high); \ - restore_flags(flags); \ - addr = (struct mem_list *) (size + (unsigned long) addr); \ - } mem_map[MAP_NR((unsigned long) addr)] = 1; \ -} while (0) - -unsigned long __get_free_pages(int priority, unsigned long order) -{ - unsigned long flags; - int reserved_pages; - - if (intr_count && priority != GFP_ATOMIC) { - static int count = 0; - if (++count < 5) { - printk("gfp called nonatomically from interrupt %p\n", - __builtin_return_address(0)); - priority = GFP_ATOMIC; - } - } - reserved_pages = 5; - if (priority != GFP_NFS) - reserved_pages = min_free_pages; - save_flags(flags); -repeat: - cli(); - if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { - RMQUEUE(order); - restore_flags(flags); - return 0; - } - restore_flags(flags); - if (priority != GFP_BUFFER && try_to_free_page(priority)) - goto repeat; - return 0; -} - -/* - * Yes, I know this is ugly. Don't tell me. 
- */ -unsigned long __get_dma_pages(int priority, unsigned long order) -{ - unsigned long list = 0; - unsigned long result; - unsigned long limit = MAX_DMA_ADDRESS; - - /* if (EISA_bus) limit = ~0UL; */ - if (priority != GFP_ATOMIC) - priority = GFP_BUFFER; - for (;;) { - result = __get_free_pages(priority, order); - if (result < limit) /* covers failure as well */ - break; - *(unsigned long *) result = list; - list = result; - } - while (list) { - unsigned long tmp = list; - list = *(unsigned long *) list; - free_pages(tmp, order); - } - return result; -} - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - */ -void show_free_areas(void) -{ - unsigned long order, flags; - unsigned long total = 0; - - printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); - save_flags(flags); - cli(); - for (order=0 ; order < NR_MEM_LISTS; order++) { - struct mem_list * tmp; - unsigned long nr = 0; - for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) { - nr ++; - } - total += nr * ((PAGE_SIZE>>10) << order); - printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); - } - restore_flags(flags); - printk("= %lukB)\n", total); -#ifdef SWAP_CACHE_INFO - show_swap_cache_info(); -#endif -} - -/* - * Trying to stop swapping from a file is fraught with races, so - * we repeat quite a bit here when we have to pause. swapoff() - * isn't exactly timing-critical, so who cares (but this is /really/ - * inefficient, ugh). - * - * We return 1 after having slept, which makes the process start over - * from the beginning for this process.. - */ -static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, unsigned int type, unsigned long page) -{ - pte_t pte = *dir; - - if (pte_none(pte)) - return 0; - if (pte_present(pte)) { - unsigned long page = pte_page(pte); - if (page >= high_memory) - return 0; - if (!in_swap_cache(page)) - return 0; - if (SWP_TYPE(in_swap_cache(page)) != type) - return 0; - delete_from_swap_cache(page); - *dir = pte_mkdirty(pte); - return 0; - } - if (SWP_TYPE(pte_val(pte)) != type) - return 0; - read_swap_page(pte_val(pte), (char *) page); - if (pte_val(*dir) != pte_val(pte)) { - free_page(page); - return 1; - } - *dir = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - ++vma->vm_task->mm->rss; - swap_free(pte_val(pte)); - return 1; -} - -static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long size, unsigned long offset, - unsigned int type, unsigned long page) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*dir)) - return 0; - if (pmd_bad(*dir)) { - printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); - pmd_clear(dir); - return 0; - } - pte = pte_offset(dir, address); - offset += address & PMD_MASK; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page)) - return 1; - address += PAGE_SIZE; - pte++; - } while (address < end); - return 0; -} - -static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long size, - unsigned int type, unsigned long page) -{ - pmd_t * pmd; - unsigned long offset, end; - - if (pgd_none(*dir)) - return 0; - if (pgd_bad(*dir)) { - printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); - pgd_clear(dir); 
- return 0; - } - pmd = pmd_offset(dir, address); - offset = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - if (unuse_pmd(vma, pmd, address, end - address, offset, type, page)) - return 1; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return 0; -} - -static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - unsigned long start, unsigned long end, - unsigned int type, unsigned long page) -{ - while (start < end) { - if (unuse_pgd(vma, pgdir, start, end - start, type, page)) - return 1; - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } - return 0; -} - -static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page) -{ - struct vm_area_struct* vma; - - /* - * Go through process' page directory. - */ - vma = p->mm->mmap; - while (vma) { - pgd_t * pgd = pgd_offset(p, vma->vm_start); - if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page)) - return 1; - vma = vma->vm_next; - } - return 0; -} - -/* - * To avoid races, we repeat for each process after having - * swapped something in. That gets rid of a few pesky races, - * and "swapoff" isn't exactly timing critical. - */ -static int try_to_unuse(unsigned int type) -{ - int nr; - unsigned long page = get_free_page(GFP_KERNEL); - - if (!page) - return -ENOMEM; - nr = 0; - while (nr < NR_TASKS) { - if (task[nr]) { - if (unuse_process(task[nr], type, page)) { - page = get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - continue; - } - } - nr++; - } - free_page(page); - return 0; -} - -asmlinkage int sys_swapoff(const char * specialfile) -{ - struct swap_info_struct * p; - struct inode * inode; - unsigned int type; - struct file filp; +int min_free_pages = 20; +int free_pages_low = 30; +int free_pages_high = 40; + +/* We track the number of pages currently being asynchronously swapped + out, so that we don't try to swap TOO many pages out at once */ +atomic_t nr_async_pages = 0; + +/* + * Constants for the page aging mechanism: the maximum age (actually, + * the maximum "youthfulness"); the quanta by which pages rejuvenate + * and age; and the initial age for new pages. 
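[Editor's note] A few lines up, this hunk introduces three free-memory watermarks (min_free_pages, free_pages_low, free_pages_high). A trivial standalone sketch of how they partition the free-page count, as described in the comment; the helper is hypothetical and purely illustrative.

#include <stdio.h>

/* The three watermarks described above (default values from this patch). */
static int min_free_pages  = 20;   /* hard floor, only atomic allocations below */
static int free_pages_low  = 30;   /* below this: swap intensively */
static int free_pages_high = 40;   /* below this: start background swapping */

/* Purely illustrative classification of memory pressure. */
static const char *pressure(int nr_free_pages)
{
    if (nr_free_pages <= min_free_pages)
        return "critical: only GFP_ATOMIC allocations may dip below this";
    if (nr_free_pages < free_pages_low)
        return "low: swap out intensively";
    if (nr_free_pages < free_pages_high)
        return "getting low: begin background swapping";
    return "ok: no swapping needed";
}

int main(void)
{
    printf("%s\n", pressure(35));   /* between free_pages_low and free_pages_high */
    return 0;
}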
+ */ + +swap_control_t swap_control = { + 20, 3, 1, 3, /* Page aging */ + 10, 2, 2, 4, /* Buffer aging */ + 32, 4, /* Aging cluster */ + 8192, 8192, /* Pageout and bufferout weights */ + -200, /* Buffer grace */ + 1, 1, /* Buffs/pages to free */ + RCL_ROUND_ROBIN /* Balancing policy */ +}; + +swapstat_t swapstats = {0}; + +/* General swap control */ + +/* Parse the kernel command line "swap=" option at load time: */ +void swap_setup(char *str, int *ints) +{ + int * swap_vars[8] = { + &MAX_PAGE_AGE, + &PAGE_ADVANCE, + &PAGE_DECLINE, + &PAGE_INITIAL_AGE, + &AGE_CLUSTER_FRACT, + &AGE_CLUSTER_MIN, + &PAGEOUT_WEIGHT, + &BUFFEROUT_WEIGHT + }; int i; - - if (!suser()) - return -EPERM; - i = namei(specialfile,&inode); - if (i) - return i; - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) { - if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) - continue; - if (p->swap_file) { - if (p->swap_file == inode) - break; - } else { - if (!S_ISBLK(inode->i_mode)) - continue; - if (p->swap_device == inode->i_rdev) - break; - } + for (i=0; i < ints[0] && i < 8; i++) { + if (ints[i+1]) + *(swap_vars[i]) = ints[i+1]; } - - if (type >= nr_swapfiles){ - iput(inode); - return -EINVAL; - } - p->flags = SWP_USED; - i = try_to_unuse(type); - if (i) { - iput(inode); - p->flags = SWP_WRITEOK; - return i; - } - - if(p->swap_device){ - memset(&filp, 0, sizeof(filp)); - filp.f_inode = inode; - filp.f_mode = 3; /* read write */ - /* open it again to get fops */ - if( !blkdev_open(inode, &filp) && - filp.f_op && filp.f_op->release){ - filp.f_op->release(inode,&filp); - filp.f_op->release(inode,&filp); - } - } - iput(inode); - - nr_swap_pages -= p->pages; - iput(p->swap_file); - p->swap_file = NULL; - p->swap_device = 0; - vfree(p->swap_map); - p->swap_map = NULL; - free_page((long) p->swap_lockmap); - p->swap_lockmap = NULL; - p->flags = 0; - return 0; -} - -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
- * - * The swapon system call - */ -asmlinkage int sys_swapon(const char * specialfile) -{ - struct swap_info_struct * p; - struct inode * swap_inode; - unsigned int type; - int i,j; - int error; - struct file filp; - - memset(&filp, 0, sizeof(filp)); - if (!suser()) - return -EPERM; - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) - break; - if (type >= MAX_SWAPFILES) - return -EPERM; - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - p->flags = SWP_USED; - p->swap_file = NULL; - p->swap_device = 0; - p->swap_map = NULL; - p->swap_lockmap = NULL; - p->lowest_bit = 0; - p->highest_bit = 0; - p->max = 1; - error = namei(specialfile,&swap_inode); - if (error) - goto bad_swap_2; - p->swap_file = swap_inode; - error = -EBUSY; - if (swap_inode->i_count != 1) - goto bad_swap_2; - error = -EINVAL; - - if (S_ISBLK(swap_inode->i_mode)) { - p->swap_device = swap_inode->i_rdev; - - filp.f_inode = swap_inode; - filp.f_mode = 3; /* read write */ - error = blkdev_open(swap_inode, &filp); - p->swap_file = NULL; - iput(swap_inode); - if(error) - goto bad_swap_2; - error = -ENODEV; - if (!p->swap_device) - goto bad_swap; - error = -EBUSY; - for (i = 0 ; i < nr_swapfiles ; i++) { - if (i == type) - continue; - if (p->swap_device == swap_info[i].swap_device) - goto bad_swap; - } - } else if (!S_ISREG(swap_inode->i_mode)) - goto bad_swap; - p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); - if (!p->swap_lockmap) { - printk("Unable to start swapping: out of memory :-)\n"); - error = -ENOMEM; - goto bad_swap; - } - read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); - if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) { - printk("Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; - } - memset(p->swap_lockmap+PAGE_SIZE-10,0,10); - j = 0; - p->lowest_bit = 0; - p->highest_bit = 0; - for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,p->swap_lockmap)) { - if (!p->lowest_bit) - p->lowest_bit = i; - p->highest_bit = i; - p->max = i+1; - j++; - } - } - if (!j) { - printk("Empty swap-file\n"); - error = -EINVAL; - goto bad_swap; - } - p->swap_map = (unsigned char *) vmalloc(p->max); - if (!p->swap_map) { - error = -ENOMEM; - goto bad_swap; - } - for (i = 1 ; i < p->max ; i++) { - if (test_bit(i,p->swap_lockmap)) - p->swap_map[i] = 0; - else - p->swap_map[i] = 0x80; - } - p->swap_map[0] = 0x80; - memset(p->swap_lockmap,0,PAGE_SIZE); - p->flags = SWP_WRITEOK; - p->pages = j; - nr_swap_pages += j; - printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10)); - return 0; -bad_swap: - if(filp.f_op && filp.f_op->release) - filp.f_op->release(filp.f_inode,&filp); -bad_swap_2: - free_page((long) p->swap_lockmap); - vfree(p->swap_map); - iput(p->swap_file); - p->swap_device = 0; - p->swap_file = NULL; - p->swap_map = NULL; - p->swap_lockmap = NULL; - p->flags = 0; - return error; } -void si_swapinfo(struct sysinfo *val) +/* Parse the kernel command line "buff=" option at load time: */ +void buff_setup(char *str, int *ints) { - unsigned int i, j; - - val->freeswap = val->totalswap = 0; - for (i = 0; i < nr_swapfiles; i++) { - if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) - continue; - for (j = 0; j < swap_info[i].max; ++j) - switch (swap_info[i].swap_map[j]) { - case 128: - continue; - case 0: - ++val->freeswap; - default: - ++val->totalswap; - } - } - val->freeswap <<= PAGE_SHIFT; - val->totalswap <<= PAGE_SHIFT; - return; -} - -/* - * set up the free-area data structures: - * - mark all pages 
MAP_PAGE_RESERVED
- * - mark all memory queues empty
- * - clear the memory bitmaps
- */
-unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
-{
-	mem_map_t * p;
-	unsigned long mask = PAGE_MASK;
+	int * buff_vars[6] = {
+		&MAX_BUFF_AGE,
+		&BUFF_ADVANCE,
+		&BUFF_DECLINE,
+		&BUFF_INITIAL_AGE,
+		&BUFFEROUT_WEIGHT,
+		&BUFFERMEM_GRACE
+	};
 	int i;
-
-	/*
-	 * select nr of pages we try to keep free for important stuff
-	 * with a minimum of 16 pages. This is totally arbitrary
-	 */
-	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
-	if (i < 16)
-		i = 16;
-	min_free_pages = i;
-	start_mem = init_swap_cache(start_mem, end_mem);
-	mem_map = (mem_map_t *) start_mem;
-	p = mem_map + MAP_NR(end_mem);
-	start_mem = (unsigned long) p;
-	while (p > mem_map)
-		*--p = MAP_PAGE_RESERVED;
-
-	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
-		unsigned long bitmap_size;
-		free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
-		mask += mask;
-		end_mem = (end_mem + ~mask) & mask;
-		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
-		bitmap_size = (bitmap_size + 7) >> 3;
-		bitmap_size = (bitmap_size + sizeof(unsigned long) - 1) & ~(sizeof(unsigned long)-1);
-		free_area_map[i] = (unsigned char *) start_mem;
-		memset((void *) start_mem, 0, bitmap_size);
-		start_mem += bitmap_size;
+	for (i=0; i < ints[0] && i < 6; i++) {
+		if (ints[i+1])
+			*(buff_vars[i]) = ints[i+1];
 	}
-	return start_mem;
 }
+
diff --git a/mm/swap_state.c b/mm/swap_state.c new file mode 100644 index 000000000..044180721 --- /dev/null +++ b/mm/swap_state.c @@ -0,0 +1,111 @@
+/*
+ * linux/mm/swap_state.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/swapctl.h>
+
+#include <asm/dma.h>
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/uaccess.h> /* for copy_to/from_user */
+#include <asm/bitops.h>
+#include <asm/pgtable.h>
+
+/*
+ * To save us from swapping out pages which have just been swapped in and
+ * have not been modified since then, we keep in swap_cache[page>>PAGE_SHIFT]
+ * the swap entry which was last used to fill the page, or zero if the
+ * page does not currently correspond to a page in swap. PAGE_DIRTY makes
+ * this info useless.
+ */ +unsigned long *swap_cache; + +#ifdef SWAP_CACHE_INFO +unsigned long swap_cache_add_total = 0; +unsigned long swap_cache_add_success = 0; +unsigned long swap_cache_del_total = 0; +unsigned long swap_cache_del_success = 0; +unsigned long swap_cache_find_total = 0; +unsigned long swap_cache_find_success = 0; + +void show_swap_cache_info(void) +{ + printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", + swap_cache_add_total, swap_cache_add_success, + swap_cache_del_total, swap_cache_del_success, + swap_cache_find_total, swap_cache_find_success); +} +#endif + +int add_to_swap_cache(unsigned long index, unsigned long entry) +{ + struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; + +#ifdef SWAP_CACHE_INFO + swap_cache_add_total++; +#endif + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + entry = xchg(swap_cache + index, entry); + if (entry) { + printk("swap_cache: replacing non-NULL entry\n"); + } +#ifdef SWAP_CACHE_INFO + swap_cache_add_success++; +#endif + return 1; + } + return 0; +} + +unsigned long init_swap_cache(unsigned long mem_start, + unsigned long mem_end) +{ + unsigned long swap_cache_size; + + mem_start = (mem_start + 15) & ~15; + swap_cache = (unsigned long *) mem_start; + swap_cache_size = MAP_NR(mem_end); + memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long)); + return (unsigned long) (swap_cache + swap_cache_size); +} + +void swap_duplicate(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return; + offset = SWP_OFFSET(entry); + type = SWP_TYPE(entry); + if (type & SHM_SWP_TYPE) + return; + if (type >= nr_swapfiles) { + printk("Trying to duplicate nonexistent swap-page\n"); + return; + } + p = type + swap_info; + if (offset >= p->max) { + printk("swap_duplicate: weirdness\n"); + return; + } + if (!p->swap_map[offset]) { + printk("swap_duplicate: trying to duplicate unused page\n"); + return; + } + p->swap_map[offset]++; + return; +} + diff --git a/mm/swapfile.c b/mm/swapfile.c new file mode 100644 index 000000000..0ee8b30c1 --- /dev/null +++ b/mm/swapfile.c @@ -0,0 +1,577 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/swapctl.h> +#include <linux/blkdev.h> /* for blk_size */ +#include <linux/vmalloc.h> + +#include <asm/dma.h> +#include <asm/system.h> /* for cli()/sti() */ +#include <asm/uaccess.h> /* for copy_to/from_user */ +#include <asm/bitops.h> +#include <asm/pgtable.h> + +unsigned int nr_swapfiles = 0; + +static struct { + int head; /* head of priority-ordered swapfile list */ + int next; /* swapfile to be used next */ +} swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAP_CLUSTER_MAX pages this way, however, we resort to + * first-free allocation, starting a new cluster. This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. 
-- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + if (test_bit(offset, si->swap_lockmap)) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAP_CLUSTER_MAX; + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + if (test_bit(offset, si->swap_lockmap)) + continue; + si->lowest_bit = offset; +got_page: + si->swap_map[offset] = 1; + nr_swap_pages--; + if (offset == si->highest_bit) + si->highest_bit--; + si->cluster_next = offset; + return offset; + } + return 0; +} + +unsigned long get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset, entry; + int type, wrapped = 0; + + type = swap_list.next; + if (type < 0) + return 0; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + offset = scan_swap_map(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) + { + swap_list.next = swap_list.head; + } + else + { + swap_list.next = type; + } + return entry; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else if (type < 0) { + return 0; /* out of swap space */ + } + } +} + +void swap_free(unsigned long entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry) + return; + type = SWP_TYPE(entry); + if (type & SHM_SWP_TYPE) + return; + if (type >= nr_swapfiles) { + printk("Trying to free nonexistent swap-page\n"); + return; + } + p = & swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("swap_free: weirdness\n"); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to free swap from unused swap-device\n"); + return; + } + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (!p->swap_map[offset]) + printk("swap_free: swap-space map bad (entry %08lx)\n",entry); + else + if (!--p->swap_map[offset]) + nr_swap_pages++; + if (p->prio > swap_info[swap_list.next].prio) { + swap_list.next = swap_list.head; + } +} + +/* + * Trying to stop swapping from a file is fraught with races, so + * we repeat quite a bit here when we have to pause. swapoff() + * isn't exactly timing-critical, so who cares (but this is /really/ + * inefficient, ugh). + * + * We return 1 after having slept, which makes the process start over + * from the beginning for this process.. 
+ */ +static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, unsigned int type, unsigned long page) +{ + pte_t pte = *dir; + + if (pte_none(pte)) + return 0; + if (pte_present(pte)) { + unsigned long page_nr = MAP_NR(pte_page(pte)); + if (page_nr >= max_mapnr) + return 0; + if (!in_swap_cache(page_nr)) + return 0; + if (SWP_TYPE(in_swap_cache(page_nr)) != type) + return 0; + delete_from_swap_cache(page_nr); + set_pte(dir, pte_mkdirty(pte)); + return 0; + } + if (SWP_TYPE(pte_val(pte)) != type) + return 0; + read_swap_page(pte_val(pte), (char *) page); + if (pte_val(*dir) != pte_val(pte)) { + free_page(page); + return 1; + } + set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); + flush_tlb_page(vma, address); + ++vma->vm_mm->rss; + swap_free(pte_val(pte)); + return 1; +} + +static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + unsigned int type, unsigned long page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return 0; + if (pmd_bad(*dir)) { + printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); + pmd_clear(dir); + return 0; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if (unuse_pte(vma, offset+address-vma->vm_start, pte, type, page)) + return 1; + address += PAGE_SIZE; + pte++; + } while (address < end); + return 0; +} + +static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + unsigned int type, unsigned long page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + if (unuse_pmd(vma, pmd, address, end - address, offset, type, page)) + return 1; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + unsigned long start, unsigned long end, + unsigned int type, unsigned long page) +{ + while (start < end) { + if (unuse_pgd(vma, pgdir, start, end - start, type, page)) + return 1; + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } + return 0; +} + +static int unuse_process(struct mm_struct * mm, unsigned int type, unsigned long page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + if (!mm || mm == &init_mm) + return 0; + vma = mm->mmap; + while (vma) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page)) + return 1; + vma = vma->vm_next; + } + return 0; +} + +/* + * To avoid races, we repeat for each process after having + * swapped something in. That gets rid of a few pesky races, + * and "swapoff" isn't exactly timing critical. 
+ */ +static int try_to_unuse(unsigned int type) +{ + int nr; + unsigned long page = get_free_page(GFP_KERNEL); + + if (!page) + return -ENOMEM; + nr = 0; + while (nr < NR_TASKS) { + struct task_struct * p = task[nr]; + if (p) { + if (unuse_process(p->mm, type, page)) { + page = get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + continue; + } + } + nr++; + } + free_page(page); + return 0; +} + +asmlinkage int sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * inode; + struct file filp; + int i, type, prev; + int err; + + if (!suser()) + return -EPERM; + err = namei(specialfile,&inode); + if (err) + return err; + prev = -1; + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file) { + if (p->swap_file == inode) + break; + } else { + if (S_ISBLK(inode->i_mode) + && (p->swap_device == inode->i_rdev)) + break; + } + } + prev = type; + } + if (type < 0){ + iput(inode); + return -EINVAL; + } + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + p->flags = SWP_USED; + err = try_to_unuse(type); + if (err) { + iput(inode); + /* re-insert swap space back into swap_list */ + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + p->flags = SWP_WRITEOK; + return err; + } + if(p->swap_device){ + memset(&filp, 0, sizeof(filp)); + filp.f_inode = inode; + filp.f_mode = 3; /* read write */ + /* open it again to get fops */ + if( !blkdev_open(inode, &filp) && + filp.f_op && filp.f_op->release){ + filp.f_op->release(inode,&filp); + filp.f_op->release(inode,&filp); + } + } + iput(inode); + + nr_swap_pages -= p->pages; + iput(p->swap_file); + p->swap_file = NULL; + p->swap_device = 0; + vfree(p->swap_map); + p->swap_map = NULL; + free_page((long) p->swap_lockmap); + p->swap_lockmap = NULL; + p->flags = 0; + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage int sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + struct file filp; + static int least_priority = 0; + + memset(&filp, 0, sizeof(filp)); + if (!suser()) + return -EPERM; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) + return -EPERM; + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->max = 1; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + error = namei(specialfile,&swap_inode); + if (error) + goto bad_swap_2; + p->swap_file = swap_inode; + error = -EBUSY; + if (swap_inode->i_count != 1) + goto bad_swap_2; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + p->swap_device = swap_inode->i_rdev; + set_blocksize(p->swap_device, PAGE_SIZE); + + filp.f_inode = swap_inode; + filp.f_mode = 3; /* read write */ + error = blkdev_open(swap_inode, &filp); + p->swap_file = NULL; + iput(swap_inode); + if(error) + goto bad_swap_2; + error = -ENODEV; + if (!p->swap_device || + (blk_size[MAJOR(p->swap_device)] && + !blk_size[MAJOR(p->swap_device)][MINOR(p->swap_device)])) + goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_device == swap_info[i].swap_device) + goto bad_swap; + } + } else if (!S_ISREG(swap_inode->i_mode)) + goto bad_swap; + p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); + if (!p->swap_lockmap) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); + if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,p->swap_lockmap)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + p->max = i+1; + j++; + } + } + if (!j) { + printk("Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map = (unsigned char *) vmalloc(p->max); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < p->max ; i++) { + if (test_bit(i,p->swap_lockmap)) + p->swap_map[i] = 0; + else + p->swap_map[i] = 0x80; + } + p->swap_map[0] = 0x80; + clear_page(p->swap_lockmap); + p->flags = SWP_WRITEOK; + p->pages = j; + nr_swap_pages += j; + printk("Adding Swap: %dk swap-space (priority %d)\n", + j<<(PAGE_SHIFT-10), p->prio); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + return 0; +bad_swap: + if(filp.f_op && filp.f_op->release) + filp.f_op->release(filp.f_inode,&filp); +bad_swap_2: + free_page((long) p->swap_lockmap); + vfree(p->swap_map); + iput(p->swap_file); + p->swap_device = 0; + p->swap_file = 
NULL; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->flags = 0; + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i, j; + + val->freeswap = val->totalswap = 0; + for (i = 0; i < nr_swapfiles; i++) { + if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (j = 0; j < swap_info[i].max; ++j) + switch (swap_info[i].swap_map[j]) { + case 128: + continue; + case 0: + ++val->freeswap; + default: + ++val->totalswap; + } + } + val->freeswap <<= PAGE_SHIFT; + val->totalswap <<= PAGE_SHIFT; + return; +} + diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 107be5546..142e6d256 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4,37 +4,14 @@ * Copyright (C) 1993 Linus Torvalds */ -#include <asm/system.h> - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/head.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/malloc.h> -#include <linux/mm.h> - -#include <asm/segment.h> -#include <asm/pgtable.h> +#include <linux/vmalloc.h> -struct vm_struct { - unsigned long flags; - void * addr; - unsigned long size; - struct vm_struct * next; -}; +#include <asm/uaccess.h> +#include <asm/system.h> static struct vm_struct * vmlist = NULL; -static inline void set_pgdir(unsigned long address, pgd_t entry) -{ - struct task_struct * p; - - for_each_task(p) - *pgd_offset(p,address) = entry; -} - static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) { pte_t * pte; @@ -96,13 +73,14 @@ static void free_area_pages(unsigned long address, unsigned long size) pgd_t * dir; unsigned long end = address + size; - dir = pgd_offset(&init_task, address); + dir = pgd_offset(&init_mm, address); + flush_cache_all(); while (address < end) { free_area_pmd(dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_all(); } static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned long size) @@ -120,7 +98,7 @@ static inline int alloc_area_pte(pte_t * pte, unsigned long address, unsigned lo page = __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; - *pte = mk_pte(page, PAGE_KERNEL); + set_pte(pte, mk_pte(page, PAGE_KERNEL)); address += PAGE_SIZE; pte++; } @@ -152,7 +130,8 @@ static int alloc_area_pages(unsigned long address, unsigned long size) pgd_t * dir; unsigned long end = address + size; - dir = pgd_offset(&init_task, address); + dir = pgd_offset(&init_mm, address); + flush_cache_all(); while (address < end) { pmd_t *pmd = pmd_alloc_kernel(dir, address); if (!pmd) @@ -163,10 +142,32 @@ static int alloc_area_pages(unsigned long address, unsigned long size) address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } - invalidate(); + flush_tlb_all(); return 0; } +struct vm_struct * get_vm_area(unsigned long size) +{ + void *addr; + struct vm_struct **p, *tmp, *area; + + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + addr = (void *) VMALLOC_START; + area->size = size + PAGE_SIZE; + area->next = NULL; + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + if (size + (unsigned long) addr < (unsigned long) tmp->addr) + break; + addr = (void *) (tmp->size + (unsigned long) tmp->addr); + } + area->addr = addr; + area->next = *p; + *p = area; + return area; +} + void vfree(void * addr) { struct vm_struct **p, *tmp; @@ -191,25 +192,15 @@ void vfree(void * addr) void * vmalloc(unsigned long size) { void * addr; - struct vm_struct **p, *tmp, *area; + struct vm_struct *area; size = 
PAGE_ALIGN(size); - if (!size || size > high_memory) + if (!size || size > (max_mapnr << PAGE_SHIFT)) return NULL; - area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + area = get_vm_area(size); if (!area) return NULL; - addr = (void *) VMALLOC_START; - area->size = size + PAGE_SIZE; - area->next = NULL; - for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { - if (size + (unsigned long) addr < (unsigned long) tmp->addr) - break; - addr = (void *) (tmp->size + (unsigned long) tmp->addr); - } - area->addr = addr; - area->next = *p; - *p = area; + addr = area->addr; if (alloc_area_pages(VMALLOC_VMADDR(addr), size)) { vfree(addr); return NULL; @@ -228,7 +219,10 @@ int vread(char *buf, char *addr, int count) while (addr < vaddr) { if (count == 0) goto finished; - put_fs_byte('\0', buf++), addr++, count--; + put_user('\0', buf); + buf++; + addr++; + count--; } n = tmp->size - PAGE_SIZE; if (addr > vaddr) @@ -236,7 +230,10 @@ int vread(char *buf, char *addr, int count) while (--n >= 0) { if (count == 0) goto finished; - put_fs_byte(*addr++, buf++), count--; + put_user(*addr, buf); + buf++; + addr++; + count--; } } finished: diff --git a/mm/vmscan.c b/mm/vmscan.c new file mode 100644 index 000000000..d14a82f0b --- /dev/null +++ b/mm/vmscan.c @@ -0,0 +1,453 @@ +/* + * linux/mm/vmscan.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Version: $Id: vmscan.c,v 1.4.2.2 1996/01/20 18:22:47 linux Exp $ + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/swapctl.h> +#include <linux/smp_lock.h> + +#include <asm/dma.h> +#include <asm/system.h> /* for cli()/sti() */ +#include <asm/uaccess.h> /* for copy_to/from_user */ +#include <asm/bitops.h> +#include <asm/pgtable.h> + +/* + * When are we next due for a page scan? + */ +static int next_swap_jiffies = 0; + +/* + * How often do we do a pageout scan during normal conditions? + * Default is four times a second. + */ +int swapout_interval = HZ / 4; + +/* + * The wait queue for waking up the pageout daemon: + */ +static struct wait_queue * kswapd_wait = NULL; + +/* + * We avoid doing a reschedule if the pageout daemon is already awake; + */ +static int kswapd_awake = 0; + +/* + * sysctl-modifiable parameters to control the aggressiveness of the + * page-searching within the kswapd page recovery daemon. + */ +kswapd_control_t kswapd_ctl = {4, -1, -1, -1, -1}; + +static void init_swap_timer(void); + +/* + * The swap-out functions return 1 if they successfully + * threw something out, and we got a free page. It returns + * zero if it couldn't do anything, and any other value + * indicates it decreased rss, but the page was shared. + * + * NOTE! If it sleeps, it *must* return 1 to make sure we + * don't continue with the swap-out. Otherwise we may be + * using a process that no longer actually exists (it might + * have died while we slept). 
+ */
+static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
+	unsigned long address, pte_t * page_table, int dma, int wait)
+{
+	pte_t pte;
+	unsigned long entry;
+	unsigned long page;
+	struct page * page_map;
+
+	pte = *page_table;
+	if (!pte_present(pte))
+		return 0;
+	page = pte_page(pte);
+	if (MAP_NR(page) >= max_mapnr)
+		return 0;
+
+	page_map = mem_map + MAP_NR(page);
+	if (PageReserved(page_map)
+	    || PageLocked(page_map)
+	    || (dma && !PageDMA(page_map)))
+		return 0;
+	/* Deal with page aging. Pages age from being unused; they
+	 * rejuvenate on being accessed. Only swap old pages (age==0
+	 * is oldest). */
+	if ((pte_dirty(pte) && delete_from_swap_cache(MAP_NR(page)))
+	    || pte_young(pte)) {
+		set_pte(page_table, pte_mkold(pte));
+		touch_page(page_map);
+		return 0;
+	}
+	age_page(page_map);
+	if (page_map->age)
+		return 0;
+	if (pte_dirty(pte)) {
+		if (vma->vm_ops && vma->vm_ops->swapout) {
+			pid_t pid = tsk->pid;
+			vma->vm_mm->rss--;
+			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
+				kill_proc(pid, SIGBUS, 1);
+		} else {
+			if (page_map->count != 1)
+				return 0;
+			if (!(entry = get_swap_page())) {
+				/* Aieee!!! Out of swap space! */
+				int retval = -1;
+				if (nr_swapfiles == 0)
+					retval = 0;
+				return retval;
+			}
+			vma->vm_mm->rss--;
+			flush_cache_page(vma, address);
+			set_pte(page_table, __pte(entry));
+			flush_tlb_page(vma, address);
+			tsk->nswap++;
+			rw_swap_page(WRITE, entry, (char *) page, wait);
+		}
+		free_page(page);
+		return 1;	/* we slept: the process may not exist any more */
+	}
+	if ((entry = find_in_swap_cache(MAP_NR(page)))) {
+		if (page_map->count != 1) {
+			set_pte(page_table, pte_mkdirty(pte));
+			printk("Aiee.. duplicated cached swap-cache entry\n");
+			return 0;
+		}
+		vma->vm_mm->rss--;
+		flush_cache_page(vma, address);
+		set_pte(page_table, __pte(entry));
+		flush_tlb_page(vma, address);
+		free_page(page);
+		return 1;
+	}
+	vma->vm_mm->rss--;
+	flush_cache_page(vma, address);
+	pte_clear(page_table);
+	flush_tlb_page(vma, address);
+	entry = page_unuse(page);
+	free_page(page);
+	return entry;
+}
+
+/*
+ * A new implementation of swap_out(). We do not swap complete processes,
+ * but only a small number of blocks, before we continue with the next
+ * process. The number of blocks actually swapped is determined by the
+ * number of page faults that this process recently had, so we won't
+ * swap heavily used processes all the time ...
+ *
+ * Note: the priority argument is a hint on how much CPU to waste on the
+ *       swap block search, not a hint of how many blocks to swap from
+ *       each process.
+ * + * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de + */ + +static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma, + pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait) +{ + pte_t * pte; + unsigned long pmd_end; + + if (pmd_none(*dir)) + return 0; + if (pmd_bad(*dir)) { + printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); + pmd_clear(dir); + return 0; + } + + pte = pte_offset(dir, address); + + pmd_end = (address + PMD_SIZE) & PMD_MASK; + if (end > pmd_end) + end = pmd_end; + + do { + int result; + tsk->swap_address = address + PAGE_SIZE; + result = try_to_swap_out(tsk, vma, address, pte, dma, wait); + if (result) + return result; + address += PAGE_SIZE; + pte++; + } while (address < end); + return 0; +} + +static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma, + pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait) +{ + pmd_t * pmd; + unsigned long pgd_end; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); + pgd_clear(dir); + return 0; + } + + pmd = pmd_offset(dir, address); + + pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (end > pgd_end) + end = pgd_end; + + do { + int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait); + if (result) + return result; + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return 0; +} + +static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma, + pgd_t *pgdir, unsigned long start, int dma, int wait) +{ + unsigned long end; + + /* Don't swap out areas like shared memory which have their + own separate swapping mechanism or areas which are locked down */ + if (vma->vm_flags & (VM_SHM | VM_LOCKED)) + return 0; + + end = vma->vm_end; + while (start < end) { + int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait); + if (result) + return result; + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } + return 0; +} + +static int swap_out_process(struct task_struct * p, int dma, int wait) +{ + unsigned long address; + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + address = p->swap_address; + p->swap_address = 0; + + /* + * Find the proper vm-area + */ + vma = find_vma(p->mm, address); + if (!vma) + return 0; + if (address < vma->vm_start) + address = vma->vm_start; + + for (;;) { + int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait); + if (result) + return result; + vma = vma->vm_next; + if (!vma) + break; + address = vma->vm_start; + } + p->swap_address = 0; + return 0; +} + +static int swap_out(unsigned int priority, int dma, int wait) +{ + static int swap_task; + int loop, counter; + struct task_struct *p; + + counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority; + for(; counter >= 0; counter--) { + /* + * Check that swap_task is suitable for swapping. If not, look for + * the next suitable process. + */ + loop = 0; + while(1) { + if (swap_task >= NR_TASKS) { + swap_task = 1; + if (loop) + /* all processes are unswappable or already swapped out */ + return 0; + loop = 1; + } + + p = task[swap_task]; + if (p && p->swappable && p->mm->rss) + break; + + swap_task++; + } + + /* + * Determine the number of pages to swap from this process. 
+ */ + if (!p->swap_cnt) { + /* Normalise the number of pages swapped by + multiplying by (RSS / 1MB) */ + p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss); + } + if (!--p->swap_cnt) + swap_task++; + switch (swap_out_process(p, dma, wait)) { + /* out of swap space? */ + case -1: + return 0; + case 0: + if (p->swap_cnt) + swap_task++; + break; + case 1: + return 1; + default: + break; + } + } + return 0; +} + +/* + * We are much more aggressive about trying to swap out than we used + * to be. This works out OK, because we now do proper aging on page + * contents. + */ +int try_to_free_page(int priority, int dma, int wait) +{ + static int state = 0; + int i=6; + int stop; + + /* we don't try as hard if we're not waiting.. */ + stop = 3; + if (wait) + stop = 0; + switch (state) { + do { + case 0: + if (shrink_mmap(i, dma)) + return 1; + state = 1; + case 1: + if (shm_swap(i, dma)) + return 1; + state = 2; + default: + if (swap_out(i, dma, wait)) + return 1; + state = 0; + i--; + } while ((i - stop) >= 0); + } + return 0; +} + + +/* + * The background pageout daemon. + * Started as a kernel thread from the init process. + */ +int kswapd(void *unused) +{ + int i; + char *revision="$Revision: 1.4.2.2 $", *s, *e; + + current->session = 1; + current->pgrp = 1; + sprintf(current->comm, "kswapd"); + current->blocked = ~0UL; + + /* + * As a kernel thread we want to tamper with system buffers + * and other internals and thus be subject to the SMP locking + * rules. (On a uniprocessor box this does nothing). + */ + +#ifdef __SMP__ + lock_kernel(); + syscall_count++; +#endif + + /* Give kswapd a realtime priority. */ + current->policy = SCHED_FIFO; + current->priority = 32; /* Fixme --- we need to standardise our + namings for POSIX.4 realtime scheduling + priorities. */ + + init_swap_timer(); + + if ((s = strchr(revision, ':')) && + (e = strchr(s, '$'))) + s++, i = e - s; + else + s = revision, i = -1; + printk ("Started kswapd v%.*s\n", i, s); + + while (1) { + kswapd_awake = 0; + current->signal = 0; + run_task_queue(&tq_disk); + interruptible_sleep_on(&kswapd_wait); + kswapd_awake = 1; + swapstats.wakeups++; + /* Do the background pageout: */ + for (i=0; i < kswapd_ctl.maxpages; i++) + try_to_free_page(GFP_KERNEL, 0, 0); + } +} + +/* + * The swap_tick function gets called on every clock tick. + */ + +void swap_tick(void) +{ + if ((nr_free_pages + nr_async_pages) < free_pages_low || + ((nr_free_pages + nr_async_pages) < free_pages_high && + jiffies >= next_swap_jiffies)) { + if (!kswapd_awake && kswapd_ctl.maxpages > 0) { + wake_up(&kswapd_wait); + need_resched = 1; + } + next_swap_jiffies = jiffies + swapout_interval; + } + timer_active |= (1<<SWAP_TIMER); +} + + +/* + * Initialise the swap timer + */ + +void init_swap_timer(void) +{ + timer_table[SWAP_TIMER].expires = 0; + timer_table[SWAP_TIMER].fn = swap_tick; + timer_active |= (1<<SWAP_TIMER); +} |
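Editor's note on the "swap=" option parsed by swap_setup() in mm/swap.c above: ints[0] carries the number of comma-separated values on the boot line, and a value of 0 leaves the corresponding tunable at its compiled-in default from swap_control. The snippet below is an illustrative userspace sketch only; the lower-case variable names are stand-ins for the kernel tunables (MAX_PAGE_AGE, PAGE_ADVANCE, ..., BUFFEROUT_WEIGHT), initialised to the defaults shown in swap_control, and the example boot line is hypothetical.

/*
 * Illustrative only: userspace model of the "swap=" parsing above.
 * Stand-in variables, initialised to the swap_control defaults.
 */
#include <stdio.h>

static int max_page_age = 20, page_advance = 3, page_decline = 1,
	   page_initial_age = 3, age_cluster_fract = 32, age_cluster_min = 4,
	   pageout_weight = 8192, bufferout_weight = 8192;

static void model_swap_setup(int *ints)
{
	int *swap_vars[8] = {
		&max_page_age, &page_advance, &page_decline, &page_initial_age,
		&age_cluster_fract, &age_cluster_min,
		&pageout_weight, &bufferout_weight
	};
	int i;

	for (i = 0; i < ints[0] && i < 8; i++)
		if (ints[i + 1])	/* a zero keeps the compiled-in default */
			*(swap_vars[i]) = ints[i + 1];
}

int main(void)
{
	/* What a boot line like "swap=30,0,0,0,16" would arrive as. */
	int ints[] = { 5, 30, 0, 0, 0, 16 };

	model_swap_setup(ints);
	printf("max_page_age=%d age_cluster_fract=%d pageout_weight=%d\n",
	       max_page_age, age_cluster_fract, pageout_weight);
	return 0;
}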
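Editor's note on scan_swap_map() in mm/swapfile.c above: it hands out consecutive offsets until SWAP_CLUSTER_MAX pages have been placed, then restarts with a first-free scan that begins a new cluster. The sketch below models only that search order; the lockmap, lowest_bit/highest_bit and nr_swap_pages bookkeeping are deliberately omitted, and NPAGES and MODEL_CLUSTER_MAX are arbitrary stand-ins.

/*
 * Illustrative only: userspace model of the clustered search order
 * used by scan_swap_map().
 */
#include <stdio.h>

#define NPAGES            64
#define MODEL_CLUSTER_MAX  8

static unsigned char swap_map[NPAGES];	/* 0 = free, >0 = in use */
static unsigned int cluster_next, cluster_nr;

static int model_scan_swap_map(void)
{
	unsigned int offset;

	if (cluster_nr) {
		/* Keep allocating sequentially within the current cluster. */
		while (cluster_next < NPAGES) {
			offset = cluster_next++;
			if (swap_map[offset])
				continue;
			cluster_nr--;
			goto got_page;
		}
	}
	/* Cluster used up (or ran off the end): start a new one first-free. */
	cluster_nr = MODEL_CLUSTER_MAX;
	for (offset = 0; offset < NPAGES; offset++) {
		if (swap_map[offset])
			continue;
got_page:
		swap_map[offset] = 1;
		cluster_next = offset;
		return offset;
	}
	return -1;	/* no free page left */
}

int main(void)
{
	int i;

	swap_map[2] = swap_map[3] = 1;	/* pretend two slots are already taken */
	for (i = 0; i < 12; i++)
		printf("%d ", model_scan_swap_map());
	printf("\n");	/* mostly consecutive offsets, skipping 2 and 3 */
	return 0;
}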
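Editor's note on the priority list kept by sys_swapon()/sys_swapoff() above: swap areas are linked through ->next in descending ->prio order, and areas activated without SWAP_FLAG_PREFER receive successively more negative priorities, so they land behind every explicitly prioritised area. The sketch below reproduces just that insertion walk on local stand-in structures, not the real swap_info array or swap_list.

/*
 * Illustrative only: the priority-ordered insertion done at the end of
 * sys_swapon(), on stand-in data structures.
 */
#include <stdio.h>

struct area {
	int prio;
	int next;
};

static struct area areas[8];
static int list_head = -1;
static int least_priority;	/* mirrors the static in sys_swapon() */

static void insert_area(int type, int has_prio, int prio)
{
	int i, prev;

	areas[type].prio = has_prio ? prio : --least_priority;

	/* Walk the descending-priority list and splice the new area in. */
	prev = -1;
	for (i = list_head; i >= 0; i = areas[i].next) {
		if (areas[type].prio >= areas[i].prio)
			break;
		prev = i;
	}
	areas[type].next = i;
	if (prev < 0)
		list_head = type;
	else
		areas[prev].next = type;
}

int main(void)
{
	int i;

	insert_area(0, 0, 0);	/* no SWAP_FLAG_PREFER: gets prio -1 */
	insert_area(1, 1, 5);	/* explicit priority 5 */
	insert_area(2, 0, 0);	/* no SWAP_FLAG_PREFER: gets prio -2 */

	for (i = list_head; i >= 0; i = areas[i].next)
		printf("area %d (prio %d)\n", i, areas[i].prio);
	/* Prints: area 1 (prio 5), area 0 (prio -1), area 2 (prio -2). */
	return 0;
}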