Diffstat (limited to 'mm')
-rw-r--r--  mm/.cvsignore   |   1
-rw-r--r--  mm/filemap.c    | 187
-rw-r--r--  mm/memory.c     | 231
-rw-r--r--  mm/mlock.c      |  10
-rw-r--r--  mm/mmap.c       |  73
-rw-r--r--  mm/mprotect.c   |  10
-rw-r--r--  mm/mremap.c     |   4
-rw-r--r--  mm/page_alloc.c | 128
-rw-r--r--  mm/page_io.c    | 133
-rw-r--r--  mm/simp.c       |   4
-rw-r--r--  mm/slab.c       |  48
-rw-r--r--  mm/swap_state.c | 219
-rw-r--r--  mm/swapfile.c   | 169
-rw-r--r--  mm/vmscan.c     | 253
14 files changed, 954 insertions, 516 deletions
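A recurring pattern in this patch is the replacement of separate priority/dma/wait arguments with a single gfp_mask whose bits gate what the allocator and the reclaim paths are allowed to do: __GFP_WAIT to sleep, __GFP_IO to start disk I/O, __GFP_DMA to restrict the request to DMA-capable pages. The stand-alone sketch below shows the idiom outside the kernel; the flag values and helper names are illustrative placeholders, not the kernel's actual definitions.

#include <stdio.h>

/* Illustrative placeholder values -- not the kernel's real bit layout. */
#define __GFP_WAIT 0x01 /* caller may sleep */
#define __GFP_IO   0x02 /* caller may start disk I/O */
#define __GFP_DMA  0x04 /* only DMA-capable pages are acceptable */

/* Mimics the gates at the top of the reworked shrink_mmap():
 * skip non-DMA pages when DMA memory was requested, and only
 * attempt buffer-cache pages when starting I/O is permitted. */
static int may_reclaim(int gfp_mask, int page_is_dma, int page_has_buffers)
{
	if ((gfp_mask & __GFP_DMA) && !page_is_dma)
		return 0;	/* wrong kind of page for this request */
	if (page_has_buffers && !(gfp_mask & __GFP_IO))
		return 0;	/* freeing buffers may require I/O */
	return 1;
}

int main(void)
{
	int atomic_mask = 0;			/* GFP_ATOMIC-style: no sleep, no I/O */
	int kernel_mask = __GFP_WAIT | __GFP_IO; /* GFP_KERNEL-style request */

	printf("atomic, buffer page:      %d\n", may_reclaim(atomic_mask, 0, 1));
	printf("kernel, buffer page:      %d\n", may_reclaim(kernel_mask, 0, 1));
	printf("kernel DMA, non-DMA page: %d\n",
	       may_reclaim(kernel_mask | __GFP_DMA, 0, 0));
	return 0;
}

Collapsing the three parameters into one mask is what lets callers in this patch shrink to a single argument, as in shrink_mmap(int priority, int gfp_mask), kmem_cache_reap(int gfp_mask), and try_to_free_page(gfp_mask) below.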
diff --git a/mm/.cvsignore b/mm/.cvsignore index 4671378ae..857dd22e9 100644 --- a/mm/.cvsignore +++ b/mm/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/mm/filemap.c b/mm/filemap.c index 6d718c01d..7a4e20e21 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,6 +25,8 @@ #include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/blkdev.h> +#include <linux/file.h> +#include <linux/swapctl.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -115,7 +117,7 @@ repeat: } } -int shrink_mmap(int priority, int dma) +int shrink_mmap(int priority, int gfp_mask) { static unsigned long clock = 0; struct page * page; @@ -134,7 +136,7 @@ int shrink_mmap(int priority, int dma) if (PageLocked(page)) goto next; - if (dma && !PageDMA(page)) + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) goto next; /* First of all, regenerate the page's referenced bit from any buffers in the page */ @@ -158,20 +160,31 @@ int shrink_mmap(int priority, int dma) switch (atomic_read(&page->count)) { case 1: - /* If it has been referenced recently, don't free it */ - if (test_and_clear_bit(PG_referenced, &page->flags)) - break; - - /* is it a page cache page? */ + /* is it a swap-cache or page-cache page? */ if (page->inode) { + if (test_and_clear_bit(PG_referenced, &page->flags)) { + touch_page(page); + break; + } + age_page(page); + if (page->age) + break; + if (PageSwapCache(page)) { + delete_from_swap_cache(page); + return 1; + } remove_page_from_hash_queue(page); remove_page_from_inode_queue(page); __free_page(page); return 1; } + /* It's not a cache page, so we don't do aging. + * If it has been referenced recently, don't free it */ + if (test_and_clear_bit(PG_referenced, &page->flags)) + break; /* is it a buffer cache page? */ - if (bh && try_to_free_buffer(bh, &bh, 6)) + if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6)) return 1; break; @@ -208,6 +221,8 @@ unsigned long page_unuse(unsigned long page) return count; if (!p->inode) return count; + if (PageSwapCache(p)) + panic ("Doing a normal page_unuse of a swap cache page"); remove_page_from_hash_queue(p); remove_page_from_inode_queue(p); free_page(page); @@ -260,8 +275,10 @@ static inline void add_to_page_cache(struct page * page, * that we could use for the cache (if it is 0 we can try to create one, * this is all overlapped with the IO on the previous page finishing anyway) */ -static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache) +static unsigned long try_to_read_ahead(struct file * file, + unsigned long offset, unsigned long page_cache) { + struct inode *inode = file->f_dentry->d_inode; struct page * page; struct page ** hash; @@ -282,7 +299,7 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse */ page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, offset, hash); - inode->i_op->readpage(inode, page); + inode->i_op->readpage(file, page); page_cache = 0; } release_page(page); @@ -299,18 +316,20 @@ static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offse */ void __wait_on_page(struct page *page) { - struct wait_queue wait = { current, NULL }; + struct task_struct *tsk = current; + struct wait_queue wait; + wait.task = tsk; add_wait_queue(&page->wait, &wait); repeat: + tsk->state = TASK_UNINTERRUPTIBLE; run_task_queue(&tq_disk); - current->state = TASK_UNINTERRUPTIBLE; if (PageLocked(page)) { schedule(); goto repeat; } + tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); - current->state = 
TASK_RUNNING; } #if 0 @@ -436,16 +455,6 @@ static void profile_readahead(int async, struct file *filp) * 64k if defined (4K page size assumed). */ -#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK) - -#if 0 /* small readahead */ -#define MAX_READAHEAD PageAlignSize(4096*7) -#define MIN_READAHEAD PageAlignSize(4096*2) -#else /* large readahead */ -#define MAX_READAHEAD PageAlignSize(4096*18) -#define MIN_READAHEAD PageAlignSize(4096*3) -#endif - static inline int get_max_readahead(struct inode * inode) { if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) @@ -453,9 +462,9 @@ static inline int get_max_readahead(struct inode * inode) return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; } -static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, - unsigned long ppos, struct page * page, - unsigned long page_cache) +static inline unsigned long generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + unsigned long ppos, struct page * page, unsigned long page_cache) { unsigned long max_ahead, ahead; unsigned long raend; @@ -519,7 +528,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f ahead = 0; while (ahead < max_ahead) { ahead += PAGE_SIZE; - page_cache = try_to_read_ahead(inode, raend + ahead, page_cache); + page_cache = try_to_read_ahead(filp, raend + ahead, + page_cache); } /* * If we tried to read ahead some pages, @@ -567,7 +577,8 @@ static inline unsigned long generic_file_readahead(int reada_ok, struct file * f ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) { - struct inode *inode = filp->f_dentry->d_inode; + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; ssize_t error, read; size_t pos, pgpos, page_cache; int reada_ok; @@ -724,7 +735,7 @@ no_cached_page: if (reada_ok && filp->f_ramax > MIN_READAHEAD) filp->f_ramax = MIN_READAHEAD; - error = inode->i_op->readpage(inode, page); + error = inode->i_op->readpage(filp, page); if (!error) goto found_page; release_page(page); @@ -736,7 +747,7 @@ page_read_error: * Try to re-read it _once_. We do this synchronously, * because this happens only if there were errors. */ - error = inode->i_op->readpage(inode, page); + error = inode->i_op->readpage(filp, page); if (!error) { wait_on_page(page); if (PageUptodate(page) && !PageError(page)) @@ -751,7 +762,7 @@ page_read_error: filp->f_reada = 1; if (page_cache) free_page(page_cache); - UPDATE_ATIME(inode) + UPDATE_ATIME(inode); if (!read) read = error; return read; @@ -771,11 +782,11 @@ page_read_error: */ static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) { -/* XXX: Check the flushes in this code. At least sometimes we do - duplicate flushes. ... 
*/ + struct file * file = area->vm_file; + struct dentry * dentry = file->f_dentry; + struct inode * inode = dentry->d_inode; unsigned long offset; struct page * page, **hash; - struct inode * inode = area->vm_dentry->d_inode; unsigned long old_page, new_page; new_page = 0; @@ -856,14 +867,14 @@ no_cached_page: new_page = 0; add_to_page_cache(page, inode, offset, hash); - if (inode->i_op->readpage(inode, page) != 0) + if (inode->i_op->readpage(file, page) != 0) goto failure; /* * Do a very limited read-ahead if appropriate */ if (PageLocked(page)) - new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0); + new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0); goto found_page; page_locked_wait: @@ -878,7 +889,7 @@ page_read_error: * because there really aren't any performance issues here * and we need to check for errors. */ - if (inode->i_op->readpage(inode, page) != 0) + if (inode->i_op->readpage(file, page) != 0) goto failure; wait_on_page(page); if (PageError(page)) @@ -907,6 +918,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, { int retval; unsigned long size; + loff_t loff = offset; mm_segment_t old_fs; size = offset + PAGE_SIZE; @@ -922,8 +934,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, old_fs = get_fs(); set_fs(KERNEL_DS); retval = -EIO; - if (size == file->f_op->write(file, (const char *) page, - size, &file->f_pos)) + if (size == file->f_op->write(file, (const char *) page, size, &loff)) retval = 0; set_fs(old_fs); return retval; @@ -934,7 +945,7 @@ static int filemap_write_page(struct vm_area_struct * vma, unsigned long page) { int result; - struct file file; + struct file * file; struct dentry * dentry; struct inode * inode; struct buffer_head * bh; @@ -954,27 +965,21 @@ static int filemap_write_page(struct vm_area_struct * vma, return 0; } - dentry = vma->vm_dentry; + file = vma->vm_file; + dentry = file->f_dentry; inode = dentry->d_inode; - file.f_op = inode->i_op->default_file_ops; - if (!file.f_op->write) + if (!file->f_op->write) return -EIO; - file.f_mode = 3; - file.f_flags = 0; - file.f_count = 1; - file.f_dentry = dentry; - file.f_pos = offset; - file.f_reada = 0; /* * If a task terminates while we're swapping the page, the vma and - * and dentry could be released ... increment the count to be safe. + * and file could be released ... increment the count to be safe. 
*/ - dget(dentry); + file->f_count++; down(&inode->i_sem); - result = do_write_page(inode, &file, (const char *) page, offset); + result = do_write_page(inode, file, (const char *) page, offset); up(&inode->i_sem); - dput(dentry); + fput(file); return result; } @@ -1209,7 +1214,8 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) if (!inode->i_op || !inode->i_op->readpage) return -ENOEXEC; UPDATE_ATIME(inode); - vma->vm_dentry = dget(file->f_dentry); + vma->vm_file = file; + file->f_count++; vma->vm_ops = ops; return 0; } @@ -1222,15 +1228,16 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) static int msync_interval(struct vm_area_struct * vma, unsigned long start, unsigned long end, int flags) { - if (vma->vm_dentry && vma->vm_ops && vma->vm_ops->sync) { + if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) { int error; error = vma->vm_ops->sync(vma, start, end-start, flags); if (!error && (flags & MS_SYNC)) { - struct dentry * dentry = vma->vm_dentry; - if (dentry) { + struct file * file = vma->vm_file; + if (file) { + struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; down(&inode->i_sem); - error = file_fsync(NULL,dentry); + error = file_fsync(file, dentry); up(&inode->i_sem); } } @@ -1315,7 +1322,8 @@ ssize_t generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; struct page *page, **hash; unsigned long page_cache = 0; unsigned long pgpos, offset; @@ -1349,11 +1357,10 @@ generic_file_write(struct file *file, const char *buf, if (!(page = __find_page(inode, pgpos, *hash))) { if (!page_cache) { page_cache = __get_free_page(GFP_KERNEL); - if (!page_cache) { - status = -ENOMEM; - break; - } - continue; + if (page_cache) + continue; + status = -ENOMEM; + break; } page = mem_map + MAP_NR(page_cache); add_to_page_cache(page, inode, pgpos, hash); @@ -1361,36 +1368,47 @@ generic_file_write(struct file *file, const char *buf, } /* - * WSH 06/05/97: restructured slightly to make sure we release - * the page on an error exit. Removed explicit setting of - * PG_locked, as that's handled below the i_op->xxx interface. + * Note: setting of the PG_locked bit is handled + * below the i_op->xxx interface. */ didread = 0; page_wait: wait_on_page(page); + if (PageUptodate(page)) + goto do_update_page; /* - * If the page is not uptodate, and we're writing less + * The page is not up-to-date ... if we're writing less * than a full page of data, we may have to read it first. - * However, don't bother with reading the page when it's - * after the current end of file. + * But if the page is past the current end of file, we must + * clear it before updating. */ - if (!PageUptodate(page)) { - if (bytes < PAGE_SIZE && pgpos < inode->i_size) { - if (didread < 2) - status = inode->i_op->readpage(inode, page); - else - status = -EIO; /* two tries ... error out */ + if (bytes < PAGE_SIZE) { + if (pgpos < inode->i_size) { + status = -EIO; + if (didread >= 2) + goto done_with_page; + status = inode->i_op->readpage(file, page); if (status < 0) goto done_with_page; didread++; goto page_wait; + } else { + /* Must clear for partial writes */ + memset((void *) page_address(page), 0, + PAGE_SIZE); } - set_bit(PG_uptodate, &page->flags); } + /* + * N.B. We should defer setting PG_uptodate at least until + * the data is copied. 
A failure in i_op->updatepage() could + * leave the page with garbage data. + */ + set_bit(PG_uptodate, &page->flags); +do_update_page: /* Alright, the page is there. Now update it. */ - status = inode->i_op->updatepage(inode, page, buf, + status = inode->i_op->updatepage(file, page, buf, offset, bytes, sync); done_with_page: __free_page(page); @@ -1408,9 +1426,7 @@ done_with_page: if (page_cache) free_page(page_cache); - if (written) - return written; - return status; + return written ? written : status; } /* @@ -1429,7 +1445,7 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, { struct page * page; struct page ** hash; - unsigned long page_cache; + unsigned long page_cache = 0; hash = page_hash(inode, offset); page = __find_page(inode, offset, *hash); @@ -1443,14 +1459,15 @@ unsigned long get_cached_page(struct inode * inode, unsigned long offset, add_to_page_cache(page, inode, offset, hash); } if (atomic_read(&page->count) != 2) - printk("get_cached_page: page count=%d\n", + printk(KERN_ERR "get_cached_page: page count=%d\n", atomic_read(&page->count)); if (test_bit(PG_locked, &page->flags)) - printk("get_cached_page: page already locked!\n"); + printk(KERN_ERR "get_cached_page: page already locked!\n"); set_bit(PG_locked, &page->flags); + page_cache = page_address(page); out: - return page_address(page); + return page_cache; } /* diff --git a/mm/memory.c b/mm/memory.c index 82ed6c986..66cdf0bc1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -175,100 +175,16 @@ int new_page_tables(struct task_struct * tsk) return 0; } -static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte, int cow) -{ - pte_t pte = *old_pte; - unsigned long page_nr; - - if (pte_none(pte)) - return; - if (!pte_present(pte)) { - swap_duplicate(pte_val(pte)); - set_pte(new_pte, pte); - return; - } - page_nr = MAP_NR(pte_page(pte)); - if (page_nr >= max_mapnr || PageReserved(mem_map+page_nr)) { - set_pte(new_pte, pte); - return; - } - if (cow) - pte = pte_wrprotect(pte); - if (delete_from_swap_cache(&mem_map[page_nr])) - pte = pte_mkdirty(pte); - set_pte(new_pte, pte_mkold(pte)); - set_pte(old_pte, pte); - atomic_inc(&mem_map[page_nr].count); -} - -static inline int copy_pte_range(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long address, unsigned long size, int cow) -{ - pte_t * src_pte, * dst_pte; - unsigned long end; - - if (pmd_none(*src_pmd)) - return 0; - if (pmd_bad(*src_pmd)) { - printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd)); - pmd_clear(src_pmd); - return 0; - } - src_pte = pte_offset(src_pmd, address); - if (pmd_none(*dst_pmd)) { - if (!pte_alloc(dst_pmd, 0)) - return -ENOMEM; - } - dst_pte = pte_offset(dst_pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end >= PMD_SIZE) - end = PMD_SIZE; - do { - /* I would like to switch arguments here, to make it - * consistent with copy_xxx_range and memcpy syntax. 
- */ - copy_one_pte(src_pte++, dst_pte++, cow); - address += PAGE_SIZE; - } while (address < end); - return 0; -} - -static inline int copy_pmd_range(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long address, unsigned long size, int cow) -{ - pmd_t * src_pmd, * dst_pmd; - unsigned long end; - int error = 0; - - if (pgd_none(*src_pgd)) - return 0; - if (pgd_bad(*src_pgd)) { - printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*src_pgd)); - pgd_clear(src_pgd); - return 0; - } - src_pmd = pmd_offset(src_pgd, address); - if (pgd_none(*dst_pgd)) { - if (!pmd_alloc(dst_pgd, 0)) - return -ENOMEM; - } - dst_pmd = pmd_offset(dst_pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - error = copy_pte_range(dst_pmd++, src_pmd++, address, end - address, cow); - if (error) - break; - address = (address + PMD_SIZE) & PMD_MASK; - } while (address < end); - return error; -} +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj */ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -276,18 +192,105 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, pgd_t * src_pgd, * dst_pgd; unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; - int error = 0, cow; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + printk("copy_pmd_range: bad pgd (%08lx)\n", + pgd_val(*src_pgd)); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (address >= end) + goto out; + continue; + } + if (pgd_none(*dst_pgd)) { + if (!pmd_alloc(dst_pgd, 0)) + goto nomem; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_offset(dst_pgd, address); + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*src_pmd)); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + if (pmd_none(*dst_pmd)) { + if (!pte_alloc(dst_pmd, 0)) + goto nomem; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_offset(dst_pmd, address); + + do { + pte_t pte = *src_pte; + unsigned long page_nr; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range; + if (!pte_present(pte)) { + swap_duplicate(pte_val(pte)); + set_pte(dst_pte, pte); + goto cont_copy_pte_range; + } + page_nr = MAP_NR(pte_page(pte)); + if (page_nr >= max_mapnr || + PageReserved(mem_map+page_nr)) { + set_pte(dst_pte, pte); + goto cont_copy_pte_range; + } + if (cow) + pte = pte_wrprotect(pte); +#if 0 /* No longer needed with the new swap cache code */ + if (delete_from_swap_cache(&mem_map[page_nr])) + pte = pte_mkdirty(pte); +#endif + set_pte(dst_pte, pte_mkold(pte)); + set_pte(src_pte, pte); + 
atomic_inc(&mem_map[page_nr].count); + +cont_copy_pte_range: address += PAGE_SIZE; + if (address >= end) + goto out; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out: + return 0; - cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; - src_pgd = pgd_offset(src, address); - dst_pgd = pgd_offset(dst, address); - while (address < end) { - error = copy_pmd_range(dst_pgd++, src_pgd++, address, end - address, cow); - if (error) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - } - return error; +nomem: + return -ENOMEM; } /* @@ -299,7 +302,11 @@ static inline int free_pte(pte_t page) unsigned long addr = pte_page(page); if (MAP_NR(addr) >= max_mapnr || PageReserved(mem_map+MAP_NR(addr))) return 0; - free_page(addr); + /* + * free_page() used to be able to clear swap cache + * entries. We may now have to do it manually. + */ + free_page_and_swap_cache(addr); return 1; } swap_free(pte_val(page)); @@ -542,7 +549,7 @@ int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long static void put_page(pte_t * page_table, pte_t pte) { if (!pte_none(*page_table)) { - free_page(pte_page(pte)); + free_page_and_swap_cache(pte_page(pte)); return; } /* no need for flush_tlb */ @@ -609,9 +616,13 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, { pte_t pte; unsigned long old_page, new_page; - - new_page = __get_free_page(GFP_KERNEL); + struct page * page_map; + pte = *page_table; + new_page = __get_free_page(GFP_KERNEL); + /* Did someone else copy this page for us while we slept? */ + if (pte_val(*page_table) != pte_val(pte)) + goto end_wp_page; if (!pte_present(pte)) goto end_wp_page; if (pte_write(pte)) @@ -620,10 +631,12 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, if (MAP_NR(old_page) >= max_mapnr) goto bad_wp_page; tsk->min_flt++; + page_map = mem_map + MAP_NR(old_page); + /* * Do we need to copy? 
*/ - if (atomic_read(&mem_map[MAP_NR(old_page)].count) != 1) { + if (is_page_shared(page_map)) { if (new_page) { if (PageReserved(mem_map + MAP_NR(old_page))) ++vma->vm_mm->rss; @@ -643,6 +656,8 @@ static void do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma, oom(tsk); return; } + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); flush_cache_page(vma, address); set_pte(page_table, pte_mkdirty(pte_mkwrite(pte))); flush_tlb_page(vma, address); @@ -867,12 +882,14 @@ static inline void handle_pte_fault(struct task_struct *tsk, do_no_page(tsk, vma, address, write_access, pte, entry); return; } - set_pte(pte, pte_mkyoung(entry)); + entry = pte_mkyoung(entry); + set_pte(pte, entry); flush_tlb_page(vma, address); if (!write_access) return; if (pte_write(entry)) { - set_pte(pte, pte_mkdirty(*pte)); + entry = pte_mkdirty(entry); + set_pte(pte, entry); flush_tlb_page(vma, address); return; } diff --git a/mm/mlock.c b/mm/mlock.c index eea100add..5bffab93f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -38,7 +38,8 @@ static inline int mlock_fixup_start(struct vm_area_struct * vma, n->vm_end = end; vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; - n->vm_dentry = dget(vma->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -58,7 +59,8 @@ static inline int mlock_fixup_end(struct vm_area_struct * vma, n->vm_start = start; n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; - n->vm_dentry = dget(vma->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -87,8 +89,8 @@ static inline int mlock_fixup_middle(struct vm_area_struct * vma, vma->vm_offset += vma->vm_start - left->vm_start; right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; - if (vma->vm_dentry) - vma->vm_dentry->d_count += 2; + if (vma->vm_file) + vma->vm_file->f_count += 2; if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/file.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -74,11 +75,11 @@ int vm_enough_memory(long pages) /* Remove one vm structure from the inode's i_mmap ring. */ static inline void remove_shared_vm_struct(struct vm_area_struct *vma) { - struct dentry * dentry = vma->vm_dentry; + struct file * file = vma->vm_file; - if (dentry) { + if (file) { if (vma->vm_flags & VM_DENYWRITE) - dentry->d_inode->i_writecount++; + file->f_dentry->d_inode->i_writecount++; if(vma->vm_next_share) vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; *vma->vm_pprev_share = vma->vm_next_share; @@ -173,6 +174,10 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, if (off + len < off) return -EINVAL; + /* Too many mappings? */ + if (mm->map_count > MAX_MAP_COUNT) + return -ENOMEM; + /* mlock MCL_FUTURE? 
*/ if (mm->def_flags & VM_LOCKED) { unsigned long locked = mm->locked_vm << PAGE_SHIFT; @@ -257,7 +262,7 @@ unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len, vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; vma->vm_offset = off; - vma->vm_dentry = NULL; + vma->vm_file = NULL; vma->vm_pte = 0; /* Clear old maps */ @@ -390,8 +395,8 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, if (addr == area->vm_start && end == area->vm_end) { if (area->vm_ops && area->vm_ops->close) area->vm_ops->close(area); - if (area->vm_dentry) - dput(area->vm_dentry); + if (area->vm_file) + fput(area->vm_file); return 0; } @@ -414,7 +419,9 @@ static int unmap_fixup(struct vm_area_struct *area, unsigned long addr, mpnt->vm_flags = area->vm_flags; mpnt->vm_ops = area->vm_ops; mpnt->vm_offset = area->vm_offset + (end - area->vm_start); - mpnt->vm_dentry = dget(area->vm_dentry); + mpnt->vm_file = area->vm_file; + if (mpnt->vm_file) + mpnt->vm_file->f_count++; if (mpnt->vm_ops && mpnt->vm_ops->open) mpnt->vm_ops->open(mpnt); area->vm_end = addr; /* Truncate area */ @@ -452,6 +459,7 @@ asmlinkage int sys_munmap(unsigned long addr, size_t len) */ int do_munmap(unsigned long addr, size_t len) { + struct mm_struct * mm; struct vm_area_struct *mpnt, *next, *free, *extra; int freed; @@ -466,7 +474,8 @@ int do_munmap(unsigned long addr, size_t len) * every area affected in some way (by any overlap) is put * on the list. If nothing is put on, nothing is affected. */ - mpnt = current->mm->mmap; + mm = current->mm; + mpnt = mm->mmap; while(mpnt && mpnt->vm_end <= addr) mpnt = mpnt->vm_next; if (!mpnt) @@ -496,6 +505,13 @@ int do_munmap(unsigned long addr, size_t len) mpnt = next; } + if (free && (free->vm_start < addr) && (free->vm_end > addr+len)) { + if (mm->map_count > MAX_MAP_COUNT) { + kmem_cache_free(vm_area_cachep, extra); + return -ENOMEM; + } + } + /* Ok - we have the memory areas we should free on the 'free' list, * so release them, and unmap the page range.. * If the one of the segments is only being partially unmapped, @@ -508,6 +524,7 @@ int do_munmap(unsigned long addr, size_t len) free = free->vm_next; freed = 1; + mm->map_count--; remove_shared_vm_struct(mpnt); st = addr < mpnt->vm_start ? mpnt->vm_start : addr; @@ -518,9 +535,9 @@ int do_munmap(unsigned long addr, size_t len) if (mpnt->vm_ops && mpnt->vm_ops->unmap) mpnt->vm_ops->unmap(mpnt, st, size); - flush_cache_range(current->mm, st, end); - zap_page_range(current->mm, st, size); - flush_tlb_range(current->mm, st, end); + flush_cache_range(mm, st, end); + zap_page_range(mm, st, size); + flush_tlb_range(mm, st, end); /* * Fix the mapping, and free the old area if it wasn't reused. @@ -534,7 +551,7 @@ int do_munmap(unsigned long addr, size_t len) kmem_cache_free(vm_area_cachep, extra); if (freed) - current->mm->mmap_cache = NULL; /* Kill the cache. */ + mm->mmap_cache = NULL; /* Kill the cache. 
*/ return 0; } @@ -560,13 +577,18 @@ void exit_mmap(struct mm_struct * mm) if (mpnt->vm_ops->close) mpnt->vm_ops->close(mpnt); } + mm->map_count--; remove_shared_vm_struct(mpnt); zap_page_range(mm, start, size); - if (mpnt->vm_dentry) - dput(mpnt->vm_dentry); + if (mpnt->vm_file) + fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); mpnt = next; } + + /* This is just debugging */ + if (mm->map_count) + printk("exit_mmap: map count is %d\n", mm->map_count); } /* Insert vm structure into process list sorted by address @@ -575,7 +597,9 @@ void exit_mmap(struct mm_struct * mm) void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) { struct vm_area_struct **pprev = &mm->mmap; - struct dentry * dentry; + struct file * file; + + mm->map_count++; /* Find where to link it in. */ while(*pprev && (*pprev)->vm_start <= vmp->vm_start) @@ -587,9 +611,9 @@ void insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp) *pprev = vmp; vmp->vm_pprev = pprev; - dentry = vmp->vm_dentry; - if (dentry) { - struct inode * inode = dentry->d_inode; + file = vmp->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; if (vmp->vm_flags & VM_DENYWRITE) inode->i_writecount--; @@ -636,8 +660,8 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l for ( ; mpnt && prev->vm_start < end_addr ; prev = mpnt, mpnt = next) { next = mpnt->vm_next; - /* To share, we must have the same dentry, operations.. */ - if ((mpnt->vm_dentry != prev->vm_dentry)|| + /* To share, we must have the same file, operations.. */ + if ((mpnt->vm_file != prev->vm_file)|| (mpnt->vm_pte != prev->vm_pte) || (mpnt->vm_ops != prev->vm_ops) || (mpnt->vm_flags != prev->vm_flags) || @@ -645,10 +669,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l continue; /* - * If we have a dentry or it's a shared memory area + * If we have a file or it's a shared memory area * the offsets must be contiguous.. 
*/ - if ((mpnt->vm_dentry != NULL) || (mpnt->vm_flags & VM_SHM)) { + if ((mpnt->vm_file != NULL) || (mpnt->vm_flags & VM_SHM)) { unsigned long off = prev->vm_offset+prev->vm_end-prev->vm_start; if (off != mpnt->vm_offset) continue; @@ -668,9 +692,10 @@ void merge_segments (struct mm_struct * mm, unsigned long start_addr, unsigned l mpnt->vm_start = mpnt->vm_end; mpnt->vm_ops->close(mpnt); } + mm->map_count--; remove_shared_vm_struct(mpnt); - if (mpnt->vm_dentry) - dput(mpnt->vm_dentry); + if (mpnt->vm_file) + fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); mpnt = prev; } diff --git a/mm/mprotect.c b/mm/mprotect.c index ddf4f4ed6..a34225d83 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,7 +110,8 @@ static inline int mprotect_fixup_start(struct vm_area_struct * vma, vma->vm_offset += vma->vm_start - n->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; - n->vm_dentry = dget(n->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -132,7 +133,8 @@ static inline int mprotect_fixup_end(struct vm_area_struct * vma, n->vm_offset += n->vm_start - vma->vm_start; n->vm_flags = newflags; n->vm_page_prot = prot; - n->vm_dentry = dget(n->vm_dentry); + if (n->vm_file) + n->vm_file->f_count++; if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); insert_vm_struct(current->mm, n); @@ -163,8 +165,8 @@ static inline int mprotect_fixup_middle(struct vm_area_struct * vma, right->vm_offset += right->vm_start - left->vm_start; vma->vm_flags = newflags; vma->vm_page_prot = prot; - if (vma->vm_dentry) - vma->vm_dentry->d_count += 2; + if (vma->vm_file) + vma->vm_file->f_count += 2; if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); vma->vm_ops->open(right); diff --git a/mm/mremap.c b/mm/mremap.c index aaabde322..a31a0ae14 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -140,7 +140,9 @@ static inline unsigned long move_vma(struct vm_area_struct * vma, new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_offset = vma->vm_offset + (addr - vma->vm_start); - new_vma->vm_dentry = dget(vma->vm_dentry); + new_vma->vm_file = vma->vm_file; + if (new_vma->vm_file) + new_vma->vm_file->f_count++; if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); insert_vm_struct(current->mm, new_vma); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07264f81e..ed748bbfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include <linux/swapctl.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/pagemap.h> #include <asm/dma.h> #include <asm/system.h> /* for cli()/sti() */ @@ -101,6 +102,46 @@ static inline void remove_mem_queue(struct page * entry) static spinlock_t page_alloc_lock; #endif +/* + * This routine is used by the kernel swap deamon to determine + * whether we have "enough" free pages. It is fairly arbitrary, + * but this had better return false if any reasonable "get_free_page()" + * allocation could currently fail.. 
+ * + * Currently we approve of the following situations: + * - the highest memory order has two entries + * - the highest memory order has one free entry and: + * - the next-highest memory order has two free entries + * - the highest memory order has one free entry and: + * - the next-highest memory order has one free entry + * - the next-next-highest memory order has two free entries + * + * [previously, there had to be two entries of the highest memory + * order, but this lead to problems on large-memory machines.] + */ +int free_memory_available(void) +{ + int i, retval = 0; + unsigned long flags; + struct free_area_struct * list = NULL; + + spin_lock_irqsave(&page_alloc_lock, flags); + /* We fall through the loop if the list contains one + * item. -- thanks to Colin Plumb <colin@nyx.net> + */ + for (i = 1; i < 4; ++i) { + list = free_area + NR_MEM_LISTS - i; + if (list->next == memory_head(list)) + break; + if (list->next->next == memory_head(list)) + continue; + retval = 1; + break; + } + spin_unlock_irqrestore(&page_alloc_lock, flags); + return retval; +} + static inline void free_pages_ok(unsigned long map_nr, unsigned long order) { struct free_area_struct *area = free_area + order; @@ -133,9 +174,12 @@ static inline void free_pages_ok(unsigned long map_nr, unsigned long order) void __free_page(struct page *page) { if (!PageReserved(page) && atomic_dec_and_test(&page->count)) { - delete_from_swap_cache(page); + if (PageSwapCache(page)) + panic ("Freeing swap cache page"); free_pages_ok(page->map_nr, 0); } + if (PageSwapCache(page) && atomic_read(&page->count) == 1) + panic ("Releasing swap cache page"); } void free_pages(unsigned long addr, unsigned long order) @@ -147,10 +191,14 @@ void free_pages(unsigned long addr, unsigned long order) if (PageReserved(map)) return; if (atomic_dec_and_test(&map->count)) { - delete_from_swap_cache(map); + if (PageSwapCache(map)) + panic ("Freeing swap cache pages"); free_pages_ok(map_nr, order); return; } + if (PageSwapCache(map) && atomic_read(&map->count) == 1) + panic ("Releasing swap cache pages at %p", + __builtin_return_address(0)); } } @@ -161,11 +209,13 @@ void free_pages(unsigned long addr, unsigned long order) change_bit((index) >> (1+(order)), (area)->map) #define CAN_DMA(x) (PageDMA(x)) #define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT)) -#define RMQUEUE(order, dma) \ +#define RMQUEUE(order, maxorder, dma) \ do { struct free_area_struct * area = free_area+order; \ unsigned long new_order = order; \ - do { struct page *prev = memory_head(area), *ret; \ - while (memory_head(area) != (ret = prev->next)) { \ + do { struct page *prev = memory_head(area), *ret = prev->next; \ + while (memory_head(area) != ret) { \ + if (new_order >= maxorder && ret->next == prev) \ + break; \ if (!dma || CAN_DMA(ret)) { \ unsigned long map_nr = ret->map_nr; \ (prev->next = ret->next)->prev = prev; \ @@ -176,6 +226,7 @@ do { struct free_area_struct * area = free_area+order; \ return ADDRESS(map_nr); \ } \ prev = ret; \ + ret = ret->next; \ } \ new_order++; area++; \ } while (new_order < NR_MEM_LISTS); \ @@ -194,36 +245,40 @@ do { unsigned long size = 1 << high; \ map->age = PAGE_INITIAL_AGE; \ } while (0) -unsigned long __get_free_pages(int priority, unsigned long order, int dma) +unsigned long __get_free_pages(int gfp_mask, unsigned long order) { - unsigned long flags; - int reserved_pages; + unsigned long flags, maxorder; if (order >= NR_MEM_LISTS) - return 0; + goto nopage; - if (in_interrupt() && priority != GFP_ATOMIC) { + /* + * "maxorder" is the 
highest order number that we're allowed + * to empty in order to find a free page.. + */ + maxorder = order + NR_MEM_LISTS/3; + if (gfp_mask & __GFP_MED) + maxorder += NR_MEM_LISTS/3; + if ((gfp_mask & __GFP_HIGH) || maxorder > NR_MEM_LISTS) + maxorder = NR_MEM_LISTS; + + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { static int count = 0; if (++count < 5) { printk("gfp called nonatomically from interrupt %p\n", - return_address()); - priority = GFP_ATOMIC; + return_address()); + gfp_mask &= ~__GFP_WAIT; } } - reserved_pages = 5; - if (priority != GFP_NFS) - reserved_pages = min_free_pages; repeat: spin_lock_irqsave(&page_alloc_lock, flags); - if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { - RMQUEUE(order, dma); - spin_unlock_irqrestore(&page_alloc_lock, flags); - return 0; - } + RMQUEUE(order, maxorder, (gfp_mask & GFP_DMA)); spin_unlock_irqrestore(&page_alloc_lock, flags); - if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1)) + if ((gfp_mask & __GFP_WAIT) && try_to_free_page(gfp_mask)) goto repeat; + +nopage: return 0; } @@ -315,31 +370,38 @@ __initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long e void swap_in(struct task_struct * tsk, struct vm_area_struct * vma, pte_t * page_table, unsigned long entry, int write_access) { - unsigned long page = __get_free_page(GFP_KERNEL); + unsigned long page; + struct page *page_map; + + page_map = read_swap_cache(entry); if (pte_val(*page_table) != entry) { - free_page(page); + if (page_map) + free_page_and_swap_cache(page_address(page_map)); return; } - if (!page) { + if (!page_map) { set_pte(page_table, BAD_PAGE); swap_free(entry); oom(tsk); return; } - read_swap_page(entry, (char *) page); - if (pte_val(*page_table) != entry) { - free_page(page); - return; - } + + page = page_address(page_map); vma->vm_mm->rss++; - tsk->maj_flt++; - if (!write_access && add_to_swap_cache(&mem_map[MAP_NR(page)], entry)) { - /* keep swap page allocated for the moment (swap cache) */ + tsk->min_flt++; + swap_free(entry); + + if (!write_access || is_page_shared(page_map)) { set_pte(page_table, mk_pte(page, vma->vm_page_prot)); return; } + + /* The page is unshared, and we want write access. In this + case, it is safe to tear down the swap cache and give the + page over entirely to this process. */ + + delete_from_swap_cache(page_map); set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); - swap_free(entry); return; } diff --git a/mm/page_io.c b/mm/page_io.c index 5ebea3f09..e02565def 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -6,6 +6,7 @@ * Swap reorganised 29.12.95, * Asynchronous swapping added 30.12.95. Stephen Tweedie * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie */ #include <linux/mm.h> @@ -27,26 +28,38 @@ #include <asm/bitops.h> #include <asm/pgtable.h> -static struct wait_queue * lock_queue = NULL; - /* * Reads or writes a swap page. * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. + * All IO to swap files (as opposed to swap partitions) is done + * synchronously. * - * Important prevention of race condition: The first thing we do is set a lock - * on this swap page, which lasts until I/O completes. This way a - * write_swap_page(entry) immediately followed by a read_swap_page(entry) - * on the same entry will first complete the write_swap_page(). Fortunately, - * not more than one write_swap_page() request can be pending per entry. 
So - * all races the caller must catch are: multiple read_swap_page() requests - * on the same entry. + * Important prevention of race condition: the caller *must* atomically + * create a unique swap cache entry for this swap page before calling + * rw_swap_page, and must lock that page. By ensuring that there is a + * single page of memory reserved for the swap entry, the normal VM page + * lock on that page also doubles as a lock on swap entries. Having only + * one lock to deal with per swap entry (rather than locking swap and memory + * independently) also makes it easier to make certain swapping operations + * atomic, which is particularly important when we are trying to ensure + * that shared pages stay shared while being swapped. */ + void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) { unsigned long type, offset; struct swap_info_struct * p; - struct page *page; - + struct page *page = mem_map + MAP_NR(buf); + +#ifdef DEBUG_SWAP + printk ("DebugVM: %s_swap_page entry %08lx, page %p (count %d), %s\n", + (rw == READ) ? "read" : "write", + entry, buf, atomic_read(&page->count), + wait ? "wait" : "nowait"); +#endif + + if (page->inode && page->inode != &swapper_inode) + panic ("Tried to swap a non-swapper page"); type = SWP_TYPE(entry); if (type >= nr_swapfiles) { printk("Internal error: bad swap-device\n"); @@ -59,33 +72,49 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) return; } if (p->swap_map && !p->swap_map[offset]) { - printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry); + printk("Hmm.. Trying to %s unallocated swap (%08lx)\n", + (rw == READ) ? "read" : "write", + entry); return; } if (!(p->flags & SWP_USED)) { printk("Trying to swap to unused swap-device\n"); return; } - /* Make sure we are the only process doing I/O with this swap page. */ - while (test_and_set_bit(offset,p->swap_lockmap)) { - run_task_queue(&tq_disk); - sleep_on(&lock_queue); + + if (!PageLocked(page)) { + printk("VM: swap page is unlocked\n"); + return; } - if (rw == READ) + + if (rw == READ) { + clear_bit(PG_uptodate, &page->flags); kstat.pswpin++; - else + } else kstat.pswpout++; - page = mem_map + MAP_NR(buf); + atomic_inc(&page->count); - wait_on_page(page); + /* + * Make sure that we have a swap cache association for this + * page. We need this to find which swap page to unlock once + * the swap IO has completed to the physical page. If the page + * is not already in the cache, just overload the offset entry + * as if it were: we are not allowed to manipulate the inode + * hashing for locked pages. + */ + if (!PageSwapCache(page)) { + printk("VM: swap page is not in swap cache\n"); + return; + } + if (page->offset != entry) { + printk ("swap entry mismatch"); + return; + } + if (p->swap_device) { if (!wait) { set_bit(PG_free_after, &page->flags); set_bit(PG_decr_after, &page->flags); - set_bit(PG_swap_unlock_after, &page->flags); - /* swap-cache shouldn't be set, but play safe */ - PageClearSwapCache(page); - page->pg_swap_entry = entry; atomic_inc(&nr_async_pages); } ll_rw_page(rw,p->swap_device,offset,buf); @@ -132,39 +161,55 @@ void rw_swap_page(int rw, unsigned long entry, char * buf, int wait) for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize) if (!(zones[i] = bmap(swapf,block++))) { printk("rw_swap_page: bad swap file\n"); + return; } } ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf); + /* Unlike ll_rw_page, ll_rw_swap_file won't unlock the + page for us. 
*/ + clear_bit(PG_locked, &page->flags); + wake_up(&page->wait); } else printk("rw_swap_page: no swap file or device\n"); + atomic_dec(&page->count); - if (offset && !test_and_clear_bit(offset,p->swap_lockmap)) - printk("rw_swap_page: lock already cleared\n"); - wake_up(&lock_queue); +#ifdef DEBUG_SWAP + printk ("DebugVM: %s_swap_page finished on page %p (count %d)\n", + (rw == READ) ? "read" : "write", + buf, atomic_read(&page->count)); +#endif } -/* This is run when asynchronous page I/O has completed. */ -void swap_after_unlock_page (unsigned long entry) +/* + * Setting up a new swap file needs a simple wrapper just to read the + * swap signature. SysV shared memory also needs a simple wrapper. + */ +void rw_swap_page_nocache(int rw, unsigned long entry, char *buffer) { - unsigned long type, offset; - struct swap_info_struct * p; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) { - printk("swap_after_unlock_page: bad swap-device\n"); + struct page *page; + + page = mem_map + MAP_NR((unsigned long) buffer); + wait_on_page(page); + set_bit(PG_locked, &page->flags); + if (test_and_set_bit(PG_swap_cache, &page->flags)) { + printk ("VM: read_swap_page: page already in swap cache!\n"); return; } - p = &swap_info[type]; - offset = SWP_OFFSET(entry); - if (offset >= p->max) { - printk("swap_after_unlock_page: weirdness\n"); + if (page->inode) { + printk ("VM: read_swap_page: page already in page cache!\n"); return; } - if (!test_and_clear_bit(offset,p->swap_lockmap)) - printk("swap_after_unlock_page: lock already cleared\n"); - wake_up(&lock_queue); + page->inode = &swapper_inode; + page->offset = entry; + atomic_inc(&page->count); /* Protect from shrink_mmap() */ + rw_swap_page(rw, entry, buffer, 1); + atomic_dec(&page->count); + page->inode = 0; + clear_bit(PG_swap_cache, &page->flags); } + + /* * Swap partitions are now read via brw_page. ll_rw_page is an * asynchronous function now --- we must call wait_on_page afterwards @@ -189,7 +234,7 @@ void ll_rw_page(int rw, kdev_t dev, unsigned long offset, char * buffer) panic("ll_rw_page: bad block dev cmd, must be R/W"); } page = mem_map + MAP_NR(buffer); - if (test_and_set_bit(PG_locked, &page->flags)) - panic ("ll_rw_page: page already locked"); + if (!PageLocked(page)) + panic ("ll_rw_page: page not already locked"); brw_page(rw, page, dev, &block, PAGE_SIZE, 0); } @@ -115,7 +115,7 @@ struct simp * simp_create(char * name, long size, if(!global) { #ifdef __SMP__ - global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER, 0); + global = (struct global_data*)__get_free_pages(GFP_KERNEL, ORDER); memset(global, 0, CHUNK_SIZE); #else global = (struct global_data*)get_free_page(GFP_KERNEL); @@ -167,7 +167,7 @@ static void alloc_header(struct simp * simp) spin_unlock(&simp->lock); for(;;) { - hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER, 0); + hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER); if(hdr) break; if(!simp_garbage()) @@ -506,8 +506,7 @@ kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma) void *addr; *dma = flags & SLAB_DMA; - addr = (void*) __get_free_pages(flags & SLAB_LEVEL_MASK, - cachep->c_gfporder, *dma); + addr = (void*) __get_free_pages(flags, cachep->c_gfporder); /* Assume that now we have the pages no one else can legally * messes with the 'struct page's. * However vm_scan() might try to test the structure to see if @@ -1732,19 +1731,18 @@ kmem_find_general_cachep(size_t size) * This function _cannot_ be called within a int, but it * can be interrupted. 
*/ -int -kmem_cache_reap(int pri, int dma, int wait) +void +kmem_cache_reap(int gfp_mask) { kmem_slab_t *slabp; kmem_cache_t *searchp; kmem_cache_t *best_cachep; unsigned int scan; unsigned int reap_level; - static unsigned long call_count = 0; if (in_interrupt()) { printk("kmem_cache_reap() called within int!\n"); - return 0; + return; } /* We really need a test semphore op so we can avoid sleeping when @@ -1752,28 +1750,8 @@ kmem_cache_reap(int pri, int dma, int wait) */ down(&cache_chain_sem); - scan = 10-pri; - if (pri == 6 && !dma) { - if (++call_count == 199) { - /* Hack Alert! - * Occassionally we try hard to reap a slab. - */ - call_count = 0UL; - reap_level = 0; - scan += 2; - } else - reap_level = 3; - } else { - if (pri >= 5) { - /* We also come here for dma==1 at pri==6, just - * to try that bit harder (assumes that there are - * less DMAable pages in a system - not always true, - * but this doesn't hurt). - */ - reap_level = 2; - } else - reap_level = 0; - } + scan = 10; + reap_level = 0; best_cachep = NULL; searchp = clock_searchp; @@ -1812,7 +1790,7 @@ kmem_cache_reap(int pri, int dma, int wait) } spin_unlock_irq(&searchp->c_spinlock); - if (dma && !dma_flag) + if ((gfp_mask & GFP_DMA) && !dma_flag) goto next; if (full_free) { @@ -1825,10 +1803,6 @@ kmem_cache_reap(int pri, int dma, int wait) * more than one page per slab (as it can be difficult * to get high orders from gfp()). */ - if (pri == 6) { /* magic '6' from try_to_free_page() */ - if (searchp->c_gfporder || searchp->c_ctor) - full_free--; - } if (full_free >= reap_level) { reap_level = full_free; best_cachep = searchp; @@ -1846,12 +1820,12 @@ next: if (!best_cachep) { /* couldn't find anthying to reap */ - return 0; + return; } spin_lock_irq(&best_cachep->c_spinlock); if (!best_cachep->c_growing && !(slabp = best_cachep->c_lastp)->s_inuse && slabp != kmem_slab_end(best_cachep)) { - if (dma) { + if (gfp_mask & GFP_DMA) { do { if (slabp->s_dma) goto good_dma; @@ -1874,11 +1848,11 @@ good_dma: */ spin_unlock_irq(&best_cachep->c_spinlock); kmem_slab_destroy(best_cachep, slabp); - return 1; + return; } dma_fail: spin_unlock_irq(&best_cachep->c_spinlock); - return 0; + return; } #if SLAB_SELFTEST diff --git a/mm/swap_state.c b/mm/swap_state.c index 75f284124..4ebc5c05f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -3,6 +3,8 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> @@ -17,6 +19,7 @@ #include <linux/fs.h> #include <linux/swapctl.h> #include <linux/init.h> +#include <linux/pagemap.h> #include <asm/bitops.h> #include <asm/pgtable.h> @@ -29,6 +32,18 @@ unsigned long swap_cache_del_success = 0; unsigned long swap_cache_find_total = 0; unsigned long swap_cache_find_success = 0; +/* + * Keep a reserved false inode which we will use to mark pages in the + * page cache are acting as swap cache instead of file cache. + * + * We only need a unique pointer to satisfy the page cache, but we'll + * reserve an entire zeroed inode structure for the purpose just to + * ensure that any mistaken dereferences of this structure cause a + * kernel oops. 
+ */ +struct inode swapper_inode; + + void show_swap_cache_info(void) { printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", @@ -40,21 +55,33 @@ void show_swap_cache_info(void) int add_to_swap_cache(struct page *page, unsigned long entry) { - struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; - #ifdef SWAP_CACHE_INFO swap_cache_add_total++; #endif - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - page->pg_swap_entry = entry; - if (PageTestandSetSwapCache(page)) - printk("swap_cache: replacing non-empty entry\n"); -#ifdef SWAP_CACHE_INFO - swap_cache_add_success++; +#ifdef DEBUG_SWAP + printk("DebugVM: add_to_swap_cache(%08lx count %d, entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); #endif - return 1; + if (PageTestandSetSwapCache(page)) { + printk("swap_cache: replacing non-empty entry %08lx " + "on page %08lx", + page->offset, page_address(page)); + return 0; } - return 0; + if (page->inode) { + printk("swap_cache: replacing page-cached entry " + "on page %08lx", page_address(page)); + return 0; + } + atomic_inc(&page->count); + page->inode = &swapper_inode; + page->offset = entry; + add_page_to_hash_queue(page, &swapper_inode, entry); + add_page_to_inode_queue(&swapper_inode, page); +#ifdef SWAP_CACHE_INFO + swap_cache_add_success++; +#endif + return 1; } /* @@ -87,6 +114,10 @@ void swap_duplicate(unsigned long entry) entry, p->swap_map[offset]); p->swap_map[offset] = 127; } +#ifdef DEBUG_SWAP + printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n", + entry, p->swap_map[offset]); +#endif out: return; @@ -97,7 +128,173 @@ bad_offset: printk("swap_duplicate: offset exceeds max\n"); goto out; bad_unused: - printk("swap_duplicate: unused page\n"); + printk("swap_duplicate at %8p: unused page\n", + __builtin_return_address(0)); goto out; } + +void remove_from_swap_cache(struct page *page) +{ + if (!page->inode) { + printk ("VM: Removing swap cache page with zero inode hash " + "on page %08lx", page_address(page)); + return; + } + if (page->inode != &swapper_inode) { + printk ("VM: Removing swap cache page with wrong inode hash " + "on page %08lx", page_address(page)); + } + /* + * This will be a legal case once we have a more mature swap cache. + */ + if (atomic_read(&page->count) == 1) { + printk ("VM: Removing page cache on unshared page %08lx", + page_address(page)); + return; + } + + +#ifdef DEBUG_SWAP + printk("DebugVM: remove_from_swap_cache(%08lx count %d)\n", + page_address(page), atomic_read(&page->count)); +#endif + remove_page_from_hash_queue (page); + remove_page_from_inode_queue (page); + PageClearSwapCache (page); + __free_page (page); +} + + +long find_in_swap_cache(struct page *page) +{ +#ifdef SWAP_CACHE_INFO + swap_cache_find_total++; +#endif + if (PageSwapCache (page)) { + long entry = page->offset; +#ifdef SWAP_CACHE_INFO + swap_cache_find_success++; +#endif + remove_from_swap_cache (page); + return entry; + } + return 0; +} + +int delete_from_swap_cache(struct page *page) +{ +#ifdef SWAP_CACHE_INFO + swap_cache_del_total++; +#endif + if (PageSwapCache (page)) { + long entry = page->offset; +#ifdef SWAP_CACHE_INFO + swap_cache_del_success++; +#endif +#ifdef DEBUG_SWAP + printk("DebugVM: delete_from_swap_cache(%08lx count %d, " + "entry %08lx)\n", + page_address(page), atomic_read(&page->count), entry); +#endif + remove_from_swap_cache (page); + swap_free (entry); + return 1; + } + return 0; +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. 
+ */ + +void free_page_and_swap_cache(unsigned long addr) +{ + struct page *page = mem_map + MAP_NR(addr); + /* + * If we are the only user, then free up the swap cache. + */ + if (PageSwapCache(page) && !is_page_shared(page)) { + delete_from_swap_cache(page); + } + + free_page(addr); +} + + +/* + * Lookup a swap entry in the swap cache. We need to be careful about + * locked pages. A found page will be returned with its refcount + * incremented. + */ + +static struct page * lookup_swap_cache(unsigned long entry) +{ + struct page *found; + + while (1) { + found = find_page(&swapper_inode, entry); + if (!found) + return 0; + if (found->inode != &swapper_inode + || !PageSwapCache(found)) { + __free_page(found); + printk ("VM: Found a non-swapper swap page!\n"); + return 0; + } + if (!PageLocked(found)) + return found; + __free_page(found); + __wait_on_page(found); + } +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. If wait==0, we are + * only doing readahead, so don't worry if the page is already locked. + */ + +struct page * read_swap_cache_async(unsigned long entry, int wait) +{ + struct page *found_page, *new_page = 0; + unsigned long new_page_addr = 0; + +#ifdef DEBUG_SWAP + printk("DebugVM: read_swap_cache_async entry %08lx%s\n", + entry, wait ? ", wait" : ""); +#endif +repeat: + found_page = lookup_swap_cache(entry); + if (found_page) { + if (new_page) + __free_page(new_page); + return found_page; + } + + /* The entry is not present. Lock down a new page, add it to + * the swap cache and read its contents. */ + if (!new_page) { + new_page_addr = __get_free_page(GFP_KERNEL); + if (!new_page_addr) + return 0; /* Out of memory */ + new_page = mem_map + MAP_NR(new_page_addr); + goto repeat; /* We might have stalled */ + } + + if (!add_to_swap_cache(new_page, entry)) { + free_page(new_page_addr); + return 0; + } + swap_duplicate(entry); /* Account for the swap cache */ + set_bit(PG_locked, &new_page->flags); + rw_swap_page(READ, entry, (char *) new_page_addr, wait); +#ifdef DEBUG_SWAP + printk("DebugVM: read_swap_cache_async created " + "entry %08lx at %p\n", + entry, (char *) page_address(new_page)); +#endif + return new_page; +} + diff --git a/mm/swapfile.c b/mm/swapfile.c index 13d2436ba..8608db8d8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -21,6 +21,7 @@ #include <linux/malloc.h> #include <linux/blkdev.h> /* for blk_size */ #include <linux/vmalloc.h> +#include <linux/pagemap.h> #include <asm/bitops.h> #include <asm/pgtable.h> @@ -51,8 +52,6 @@ static inline int scan_swap_map(struct swap_info_struct *si) offset = si->cluster_next++; if (si->swap_map[offset]) continue; - if (test_bit(offset, si->swap_lockmap)) - continue; si->cluster_nr--; goto got_page; } @@ -61,8 +60,6 @@ static inline int scan_swap_map(struct swap_info_struct *si) for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { if (si->swap_map[offset]) continue; - if (test_bit(offset, si->swap_lockmap)) - continue; si->lowest_bit = offset; got_page: si->swap_map[offset] = 1; @@ -129,6 +126,7 @@ void swap_free(unsigned long entry) if (!entry) goto out; + type = SWP_TYPE(entry); if (type & SHM_SWP_TYPE) goto out; @@ -152,6 +150,10 @@ void swap_free(unsigned long entry) if (!--p->swap_map[offset]) nr_swap_pages++; } +#ifdef DEBUG_SWAP + printk("DebugVM: swap_free(entry %08lx, count now %d)\n", + entry, p->swap_map[offset]); +#endif out: return; @@ -172,42 +174,38 @@ bad_free: /* * The swap entry has been read in advance, 
and we return 1 to indicate * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many ptes will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. */ -static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address, +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, pte_t *dir, unsigned long entry, unsigned long page) { pte_t pte = *dir; if (pte_none(pte)) - return 0; + return; if (pte_present(pte)) { - struct page *pg; - unsigned long page_nr = MAP_NR(pte_page(pte)); - unsigned long pg_swap_entry; - - if (page_nr >= max_mapnr) - return 0; - pg = mem_map + page_nr; - if (!(pg_swap_entry = in_swap_cache(pg))) - return 0; - if (SWP_TYPE(pg_swap_entry) != SWP_TYPE(entry)) - return 0; - delete_from_swap_cache(pg); + /* If this entry is swap-cached, then page must already + hold the right address for any copies in physical + memory */ + if (pte_page(pte) != page) + return; + /* We will be removing the swap cache in a moment, so... */ set_pte(dir, pte_mkdirty(pte)); - if (pg_swap_entry != entry) - return 0; - free_page(page); - return 1; + return; } if (pte_val(pte) != entry) - return 0; - set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)))); - ++vma->vm_mm->rss; + return; + set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); - return 1; + atomic_inc(&mem_map[MAP_NR(page)].count); + ++vma->vm_mm->rss; } -static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, unsigned long entry, unsigned long page) { @@ -215,11 +213,11 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long end; if (pmd_none(*dir)) - return 0; + return; if (pmd_bad(*dir)) { printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir)); pmd_clear(dir); - return 0; + return; } pte = pte_offset(dir, address); offset += address & PMD_MASK; @@ -228,16 +226,13 @@ static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, if (end > PMD_SIZE) end = PMD_SIZE; do { - if (unuse_pte(vma, offset+address-vma->vm_start, pte, entry, - page)) - return 1; + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); address += PAGE_SIZE; pte++; } while (address < end); - return 0; } -static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, unsigned long entry, unsigned long page) { @@ -245,11 +240,11 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long offset, end; if (pgd_none(*dir)) - return 0; + return; if (pgd_bad(*dir)) { printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir)); pgd_clear(dir); - return 0; + return; } pmd = pmd_offset(dir, address); offset = address & PGDIR_MASK; @@ -258,30 +253,26 @@ static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - if (unuse_pmd(vma, pmd, address, end - address, offset, entry, - page)) - return 1; + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); - return 0; } -static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, 
+static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, unsigned long entry, unsigned long page) { unsigned long start = vma->vm_start, end = vma->vm_end; while (start < end) { - if (unuse_pgd(vma, pgdir, start, end - start, entry, page)) - return 1; + unuse_pgd(vma, pgdir, start, end - start, entry, page); start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } - return 0; } -static int unuse_process(struct mm_struct * mm, unsigned long entry, +static void unuse_process(struct mm_struct * mm, unsigned long entry, unsigned long page) { struct vm_area_struct* vma; @@ -290,13 +281,12 @@ static int unuse_process(struct mm_struct * mm, unsigned long entry, * Go through process' page directory. */ if (!mm || mm == &init_mm) - return 0; + return; for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page)) - return 1; + unuse_vma(vma, pgd, entry, page); } - return 0; + return; } /* @@ -309,19 +299,14 @@ static int try_to_unuse(unsigned int type) struct swap_info_struct * si = &swap_info[type]; struct task_struct *p; unsigned long page = 0; + struct page *page_map; unsigned long entry; int i; while (1) { - if (!page) { - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - } - /* - * Find a swap page in use and read it in. - */ + * Find a swap page in use and read it in. + */ for (i = 1 , entry = 0; i < si->max ; i++) { if (si->swap_map[i] > 0 && si->swap_map[i] != 0x80) { entry = SWP_ENTRY(type, i); @@ -330,36 +315,31 @@ static int try_to_unuse(unsigned int type) } if (!entry) break; - read_swap_page(entry, (char *) page); + /* Get a page for the entry, using the existing swap + cache page if there is one. Otherwise, get a clean + page and read the swap into it. */ + page_map = read_swap_cache(entry); + if (!page_map) + return -ENOMEM; + page = page_address(page_map); read_lock(&tasklist_lock); - for_each_task(p) { - if (unuse_process(p->mm, entry, page)) { - page = 0; - goto unlock; - } - } - unlock: + for_each_task(p) + unuse_process(p->mm, entry, page); read_unlock(&tasklist_lock); - if (page) { - /* - * If we couldn't find an entry, there are several - * possible reasons: someone else freed it first, - * we freed the last reference to an overflowed entry, - * or the system has lost track of the use counts. - */ - if (si->swap_map[i] != 0) { - if (si->swap_map[i] != 127) - printk("try_to_unuse: entry %08lx " - "not in use\n", entry); - si->swap_map[i] = 0; - nr_swap_pages++; - } + /* Now get rid of the extra reference to the temporary + page we've been using. 
*/ + if (PageSwapCache(page_map)) + delete_from_swap_cache(page_map); + free_page(page); + if (si->swap_map[i] != 0) { + if (si->swap_map[i] != 127) + printk("try_to_unuse: entry %08lx " + "not in use\n", entry); + si->swap_map[i] = 0; + nr_swap_pages++; } } - - if (page) - free_page(page); return 0; } @@ -370,7 +350,7 @@ asmlinkage int sys_swapoff(const char * specialfile) struct file filp; int i, type, prev; int err = -EPERM; - + lock_kernel(); if (!suser()) goto out; @@ -444,8 +424,6 @@ asmlinkage int sys_swapoff(const char * specialfile) p->swap_device = 0; vfree(p->swap_map); p->swap_map = NULL; - free_page((long) p->swap_lockmap); - p->swap_lockmap = NULL; p->flags = 0; err = 0; out: @@ -505,6 +483,7 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) int error = -EPERM; struct file filp; static int least_priority = 0; + unsigned char *avail_map = 0; lock_kernel(); if (!suser()) @@ -522,7 +501,6 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) p->swap_file = NULL; p->swap_device = 0; p->swap_map = NULL; - p->swap_lockmap = NULL; p->lowest_bit = 0; p->highest_bit = 0; p->cluster_nr = 0; @@ -565,24 +543,24 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) } } else if (!S_ISREG(swap_dentry->d_inode->i_mode)) goto bad_swap; - p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); - if (!p->swap_lockmap) { + avail_map = (unsigned char *) get_free_page(GFP_USER); + if (!avail_map) { printk("Unable to start swapping: out of memory :-)\n"); error = -ENOMEM; goto bad_swap; } - read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); - if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) { + rw_swap_page_nocache(READ, SWP_ENTRY(type,0), (char *) avail_map); + if (memcmp("SWAP-SPACE",avail_map+PAGE_SIZE-10,10)) { printk("Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } - memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + memset(avail_map+PAGE_SIZE-10,0,10); j = 0; p->lowest_bit = 0; p->highest_bit = 0; for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,p->swap_lockmap)) { + if (test_bit(i,avail_map)) { if (!p->lowest_bit) p->lowest_bit = i; p->highest_bit = i; @@ -601,13 +579,12 @@ asmlinkage int sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } for (i = 1 ; i < p->max ; i++) { - if (test_bit(i,p->swap_lockmap)) + if (test_bit(i,avail_map)) p->swap_map[i] = 0; else p->swap_map[i] = 0x80; } p->swap_map[0] = 0x80; - clear_page(p->swap_lockmap); p->flags = SWP_WRITEOK; p->pages = j; nr_swap_pages += j; @@ -634,15 +611,15 @@ bad_swap: if(filp.f_op && filp.f_op->release) filp.f_op->release(filp.f_dentry->d_inode,&filp); bad_swap_2: - free_page((long) p->swap_lockmap); vfree(p->swap_map); dput(p->swap_file); p->swap_device = 0; p->swap_file = NULL; p->swap_map = NULL; - p->swap_lockmap = NULL; p->flags = 0; out: + if (avail_map) + free_page((long) avail_map); unlock_kernel(); return error; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a50684973..ebef7a362 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7,7 +7,7 @@ * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to free_pages_high: 2.4.97, Rik van Riel. - * Version: $Id: vmscan.c,v 1.23 1997/04/12 04:31:05 davem Exp $ + * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $ */ #include <linux/mm.h> @@ -61,7 +61,7 @@ static void init_swap_timer(void); * have died while we slept). 
 */
 static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
-	unsigned long address, pte_t * page_table, int dma, int wait)
+	unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
 	unsigned long entry;
@@ -78,20 +78,62 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
 	page_map = mem_map + MAP_NR(page);
 
 	if (PageReserved(page_map)
 	    || PageLocked(page_map)
-	    || (dma && !PageDMA(page_map)))
+	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
 		return 0;
-	/* Deal with page aging.  Pages age from being unused; they
-	 * rejuvenate on being accessed.  Only swap old pages (age==0
-	 * is oldest). */
-	if ((pte_dirty(pte) && delete_from_swap_cache(page_map))
-	    || pte_young(pte)) {
+
+	/*
+	 * Deal with page aging.  There are several special cases to
+	 * consider:
+	 *
+	 * Page has been accessed, but is swap cached.  If the page is
+	 * getting sufficiently "interesting" --- its age is getting
+	 * high --- then if we are sufficiently short of free swap
+	 * pages, then delete the swap cache.  We can only do this if
+	 * the swap page's reference count is one: ie. there are no
+	 * other references to it beyond the swap cache (as there must
+	 * still be pte's pointing to it if count > 1).
+	 *
+	 * If the page has NOT been touched, and its age reaches zero,
+	 * then we are swapping it out:
+	 *
+	 * If there is already a swap cache page for this page, then
+	 * another process has already allocated swap space, so just
+	 * dereference the physical page and copy in the swap entry
+	 * from the swap cache.
+	 *
+	 * Note, we rely on all pages read in from swap either having
+	 * the swap cache flag set, OR being marked writable in the pte,
+	 * but NEVER BOTH.  (It IS legal to be neither cached nor dirty,
+	 * however.)
+	 *
+	 * -- Stephen Tweedie 1998 */
+
+	if (PageSwapCache(page_map)) {
+		if (pte_write(pte)) {
+			printk ("VM: Found a writable swap-cached page!\n");
+			return 0;
+		}
+	}
+
+	if (pte_young(pte)) {
 		set_pte(page_table, pte_mkold(pte));
 		touch_page(page_map);
+		/*
+		 * We should test here to see if we want to recover any
+		 * swap cache page.  We do this if the page is seeing
+		 * enough activity, AND we are sufficiently low on swap.
+		 *
+		 * We need to track both the number of available swap
+		 * pages and the total number present before we can do
+		 * this...
+		 */
 		return 0;
 	}
+
 	age_page(page_map);
 	if (page_map->age)
 		return 0;
+
 	if (pte_dirty(pte)) {
 		if (vma->vm_ops && vma->vm_ops->swapout) {
 			pid_t pid = tsk->pid;
@@ -99,33 +141,83 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
 			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
 				kill_proc(pid, SIGBUS, 1);
 		} else {
-			if (atomic_read(&page_map->count) != 1)
-				return 0;
-			if (!(entry = get_swap_page()))
-				return 0;
+			/*
+			 * This is a dirty, swappable page.  First of all,
+			 * get a suitable swap entry for it, and make sure
+			 * we have the swap cache set up to associate the
+			 * page with that swap entry.
+			 */
+			if (PageSwapCache(page_map)) {
+				entry = page_map->offset;
+			} else {
+				entry = get_swap_page();
+				if (!entry)
+					return 0; /* No swap space left */
+			}
+
+			vma->vm_mm->rss--;
+			tsk->nswap++;
 			flush_cache_page(vma, address);
 			set_pte(page_table, __pte(entry));
 			flush_tlb_page(vma, address);
-			tsk->nswap++;
-			rw_swap_page(WRITE, entry, (char *) page, wait);
+			swap_duplicate(entry);
+
+			/* Now to write back the page.  We have two
+			 * cases: if the page is already part of the
+			 * swap cache, then it is already on disk.  Just
+			 * free the page and return (we release the swap
+			 * cache on the last accessor too).
+			 *
+			 * If we have made a new swap entry, then we
+			 * start the write out to disk.  If the page is
+			 * shared, however, we still need to keep the
+			 * copy in memory, so we add it to the swap
+			 * cache. */
+			if (PageSwapCache(page_map)) {
+				free_page_and_swap_cache(page);
+				return (atomic_read(&page_map->count) == 0);
+			}
+			add_to_swap_cache(page_map, entry);
+			/* We checked we were unlocked way up above, and we
+			   have been careful not to stall until here */
+			set_bit(PG_locked, &page_map->flags);
+
+			/* OK, do a physical write to swap.  */
+			rw_swap_page(WRITE, entry, (char *) page, (gfp_mask & __GFP_WAIT));
 		}
-		free_page(page);
+
+		/* Now we can free the current physical page.  We also
+		 * free up the swap cache if this is the last use of the
+		 * page.  Note that there is a race here: the page may
+		 * still be shared COW by another process, but that
+		 * process may exit while we are writing out the page
+		 * asynchronously.  That's no problem, shrink_mmap() can
+		 * correctly clean up the occasional unshared page
+		 * which gets left behind in the swap cache. */
+		free_page_and_swap_cache(page);
 		return 1;	/* we slept: the process may not exist any more */
 	}
 
-	if ((entry = find_in_swap_cache(page_map)))  {
-		if (atomic_read(&page_map->count) != 1) {
-			set_pte(page_table, pte_mkdirty(pte));
-			printk("Aiee.. duplicated cached swap-cache entry\n");
-			return 0;
-		}
+
+	/* The page was _not_ dirty, but still has a zero age.  It must
+	 * already be uptodate on disk.  If it is in the swap cache,
	 * then we can just unlink the page now.  Remove the swap cache
+	 * too if this is the last user.  */
+	if ((entry = in_swap_cache(page_map))) {
 		vma->vm_mm->rss--;
 		flush_cache_page(vma, address);
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
-		free_page(page);
-		return 1;
+		swap_duplicate(entry);
+		free_page_and_swap_cache(page);
+		return (atomic_read(&page_map->count) == 0);
 	}
+
+	/*
+	 * A clean page to be discarded?  Must be mmap()ed from
+	 * somewhere.  Unlink the pte, and tell the filemap code to
+	 * discard any cached backing page if this is the last user.
+	 */
+	if (PageSwapCache(page_map)) {
+		printk ("VM: How can this page _still_ be cached?\n");
+		return 0;
+	}
 	vma->vm_mm->rss--;
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
@@ -150,7 +242,7 @@ static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struc
  */
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -172,7 +264,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 	do {
 		int result;
 		tsk->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
+		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
 		address += PAGE_SIZE;
@@ -182,7 +274,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -202,7 +294,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 		end = pgd_end;
 
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -212,7 +304,7 @@ static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct *
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *pgdir, unsigned long start, int dma, int wait)
+	pgd_t *pgdir, unsigned long start, int gfp_mask)
 {
 	unsigned long end;
 
@@ -223,7 +315,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	end = vma->vm_end;
 	while (start < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
+		int result = swap_out_pgd(tsk, vma, pgdir, start, end, gfp_mask);
 		if (result)
 			return result;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -232,7 +324,7 @@ static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int dma, int wait)
+static int swap_out_process(struct task_struct * p, int gfp_mask)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -241,19 +333,20 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
 	 * Go through process' page directory.
 	 */
 	address = p->swap_address;
-	p->swap_address = 0;
 
 	/*
 	 * Find the proper vm-area
 	 */
 	vma = find_vma(p->mm, address);
-	if (!vma)
+	if (!vma) {
+		p->swap_address = 0;
 		return 0;
+	}
 	if (address < vma->vm_start)
 		address = vma->vm_start;
 
 	for (;;) {
-		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
+		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, gfp_mask);
 		if (result)
 			return result;
 		vma = vma->vm_next;
@@ -270,7 +363,7 @@ static int swap_out_process(struct task_struct * p, int dma, int wait)
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
-static int swap_out(unsigned int priority, int dma, int wait)
+static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
 	int counter, assign, max_cnt;
@@ -321,7 +414,7 @@ static int swap_out(unsigned int priority, int dma, int wait)
 		}
 		pbest->swap_cnt--;
 
-		switch (swap_out_process(pbest, dma, wait)) {
+		switch (swap_out_process(pbest, gfp_mask)) {
 		case 0:
			/*
			 * Clear swap_cnt so we don't look at this task
@@ -345,7 +438,7 @@ out:
 * to be.  This works out OK, because we now do proper aging on page
 * contents.
 */
-static inline int do_try_to_free_page(int priority, int dma, int wait)
+static inline int do_try_to_free_page(int gfp_mask)
 {
 	static int state = 0;
 	int i=6;
@@ -353,25 +446,27 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
 
 	/* Let the dcache know we're looking for memory ... */
 	shrink_dcache_memory();
+
 	/* Always trim SLAB caches when memory gets low. */
-	(void) kmem_cache_reap(0, dma, wait);
+	kmem_cache_reap(gfp_mask);
 
-	/* we don't try as hard if we're not waiting.. */
+	/* We try harder if we are waiting. */
 	stop = 3;
-	if (wait)
+	if (gfp_mask & __GFP_WAIT)
 		stop = 0;
+
 	switch (state) {
 		do {
 		case 0:
-			if (shrink_mmap(i, dma))
+			if (shrink_mmap(i, gfp_mask))
 				return 1;
 			state = 1;
 		case 1:
-			if (shm_swap(i, dma))
+			if ((gfp_mask & __GFP_IO) && shm_swap(i, gfp_mask))
 				return 1;
 			state = 2;
 		default:
-			if (swap_out(i, dma, wait))
+			if (swap_out(i, gfp_mask))
 				return 1;
 			state = 0;
 		i--;
@@ -387,12 +482,12 @@ static inline int do_try_to_free_page(int priority, int dma, int wait)
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 */
-int try_to_free_page(int priority, int dma, int wait)
+int try_to_free_page(int gfp_mask)
 {
 	int retval;
 
 	lock_kernel();
-	retval = do_try_to_free_page(priority,dma,wait);
+	retval = do_try_to_free_page(gfp_mask);
 	unlock_kernel();
 	return retval;
 }
@@ -406,7 +501,7 @@ int try_to_free_page(int priority, int dma, int wait)
 void kswapd_setup(void)
 {
        int i;
-       char *revision="$Revision: 1.23 $", *s, *e;
+       char *revision="$Revision: 1.5 $", *s, *e;
 
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
@@ -423,6 +518,7 @@ void kswapd_setup(void)
 */
 int kswapd(void *unused)
 {
+	struct wait_queue wait = { current, NULL };
 	current->session = 1;
 	current->pgrp = 1;
 	sprintf(current->comm, "kswapd");
@@ -442,42 +538,63 @@ int kswapd(void *unused)
 	   priorities.  */
 
 	init_swap_timer();
-
+	add_wait_queue(&kswapd_wait, &wait);
 	while (1) {
-		int fail;
+		int tries;
 
 		kswapd_awake = 0;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
-		interruptible_sleep_on(&kswapd_wait);
+		schedule();
+		current->state = TASK_INTERRUPTIBLE;
 		kswapd_awake = 1;
 		swapstats.wakeups++;
 		/* Do the background pageout:
-		 * We now only swap out as many pages as needed.
-		 * When we are truly low on memory, we swap out
-		 * synchronously (WAIT == 1).  -- Rik.
-		 * If we've had too many consecutive failures,
-		 * go back to sleep to let other tasks run.
+		 * When we've got loads of memory, we try
+		 * (free_pages_high - nr_free_pages) times to
+		 * free memory.  As memory gets tighter, kswapd
+		 * gets more and more aggressive. -- Rik.
		 */
-		for (fail = 0; fail++ < MAX_SWAP_FAIL;) {
-			int pages, wait;
+		tries = free_pages_high - nr_free_pages;
+		if (tries < min_free_pages) {
+			tries = min_free_pages;
+		}
+		else if (nr_free_pages < (free_pages_high + free_pages_low) / 2) {
+			tries <<= 1;
+			if (nr_free_pages < free_pages_low) {
+				tries <<= 1;
+				if (nr_free_pages <= min_free_pages) {
+					tries <<= 1;
+				}
+			}
+		}
+		while (tries--) {
+			int gfp_mask;
 
-			pages = nr_free_pages;
-			if (nr_free_pages >= min_free_pages)
-				pages += atomic_read(&nr_async_pages);
-			if (pages >= free_pages_high)
+			if (free_memory_available())
 				break;
-			wait = (pages < free_pages_low);
-			if (try_to_free_page(GFP_KERNEL, 0, wait))
-				fail = 0;
+			gfp_mask = __GFP_IO;
+			try_to_free_page(gfp_mask);
+			/*
+			 * Syncing large chunks is faster than swapping
+			 * synchronously (less head movement). -- Rik.
+			 */
+			if (atomic_read(&nr_async_pages) >= SWAP_CLUSTER_MAX)
+				run_task_queue(&tq_disk);
+
 		}
-		/*
-		 * Report failure if we couldn't reach the minimum goal.
-		 */
-		if (nr_free_pages < min_free_pages)
-			printk("kswapd: failed, got %d of %d\n",
-				nr_free_pages, min_free_pages);
+#if 0
+		/*
+		 * Report failure if we couldn't even reach min_free_pages.
		 */
+		if (nr_free_pages < min_free_pages)
+			printk("kswapd: failed, got %d of %d\n",
+				nr_free_pages, min_free_pages);
+#endif
 	}
+	/* As if we could ever get here - maybe we want to make this killable */
+	remove_wait_queue(&kswapd_wait, &wait);
+	return 0;
 }
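
To make the new kswapd balancing concrete: instead of retrying until MAX_SWAP_FAIL consecutive failures, kswapd now derives an explicit number of freeing attempts from the free-page watermarks and doubles it each time free memory falls through another threshold. Below is a minimal userspace sketch of just that calculation, not kernel code: the function parameters stand in for the kernel globals of the same names, and the watermarks in main() are invented for illustration.

#include <stdio.h>

/* Mirror of the "tries" computation in kswapd() above. */
static int kswapd_tries(int nr_free_pages, int min_free_pages,
			int free_pages_low, int free_pages_high)
{
	int tries = free_pages_high - nr_free_pages;

	if (tries < min_free_pages) {
		tries = min_free_pages;
	} else if (nr_free_pages < (free_pages_high + free_pages_low) / 2) {
		tries <<= 1;			/* below the midpoint: 2x */
		if (nr_free_pages < free_pages_low) {
			tries <<= 1;		/* below the low mark: 4x */
			if (nr_free_pages <= min_free_pages)
				tries <<= 1;	/* nearly out: 8x */
		}
	}
	return tries;
}

int main(void)
{
	int free[] = { 55, 40, 25, 8 };		/* sample nr_free_pages values */
	int i;

	/* Invented watermarks: min 10, low 30, high 60 pages. */
	for (i = 0; i < 4; i++)
		printf("free=%2d -> tries=%d\n",
		       free[i], kswapd_tries(free[i], 10, 30, 60));
	return 0;
}

With those invented watermarks the sketch prints 10, 40, 140 and 416 attempts, which is the point of the change: the effort ramps up sharply as memory gets scarce, rather than depending on a failure counter.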
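
Further up, the rewritten try_to_swap_out() reduces to a four-way decision once the aging is done, governed by the invariant stated in Stephen's comment: a page read from swap is either swap-cached or pte-writable, never both. The sketch below models that decision in userspace; struct fake_page and its fields are stand-ins, not the kernel's struct page, and all the locking, pte and flushing detail is elided.

#include <stdio.h>

struct fake_page {
	int swap_cached;	/* PageSwapCache()                 */
	int pte_write;		/* pte allows writes               */
	int pte_young;		/* referenced since the last scan  */
	int pte_dirty;		/* modified since it was read in   */
	int age;		/* page age; 0 is oldest           */
};

static const char *swap_out_decision(struct fake_page *p)
{
	/* The invariant: swap-cached pages are never left writable. */
	if (p->swap_cached && p->pte_write)
		return "BUG: writable swap-cached page";

	if (p->pte_young) {
		p->age++;		/* touch_page(): rejuvenate */
		return "keep: referenced recently";
	}
	if (p->age) {
		p->age--;		/* age_page(): grow older */
		if (p->age)
			return "keep: not old enough yet";
	}
	if (p->pte_dirty) {
		if (p->swap_cached)	/* a copy is already on disk */
			return "free page and drop swap cache entry";
		return "allocate swap entry, add to cache, write out";
	}
	if (p->swap_cached)		/* clean, and already on disk */
		return "unlink pte to the swap entry, free page";
	return "discard: clean mmap()ed page, the file has a copy";
}

int main(void)
{
	struct fake_page dirty_uncached = { 0, 0, 0, 1, 1 };
	struct fake_page clean_cached   = { 1, 0, 0, 0, 1 };

	printf("%s\n", swap_out_decision(&dirty_uncached));
	printf("%s\n", swap_out_decision(&clean_cached));
	return 0;
}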
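
Finally, the read side. read_swap_cache_async() uses a classic optimistic lookup-or-allocate loop: allocate the new page first (the allocation may block), then re-check the cache, because another task may have installed the entry while we slept ("We might have stalled"). Here is the same shape as a self-contained userspace sketch; cache_lookup(), cache_insert() and the tiny direct-mapped table are hypothetical stand-ins for the swapper_inode page hash, malloc() stands in for __get_free_page(), and, being single-threaded, it only imitates the race.

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

static void *cache[NSLOTS];	/* entry -> page, NULL if absent */

static void *cache_lookup(unsigned int entry)
{
	return cache[entry % NSLOTS];
}

static void cache_insert(unsigned int entry, void *page)
{
	cache[entry % NSLOTS] = page;
}

/* Return the cached page for a swap entry, or read one in. */
static void *read_swap_cache(unsigned int entry)
{
	void *new_page = NULL;

	for (;;) {
		void *found = cache_lookup(entry);
		if (found) {
			free(new_page);	/* lost the race: discard ours */
			return found;
		}
		if (!new_page) {
			new_page = malloc(4096);	/* may block... */
			if (!new_page)
				return NULL;		/* out of memory */
			continue;	/* ...so re-check the cache */
		}
		cache_insert(entry, new_page);
		/* The kernel would now swap_duplicate() the entry, lock
		 * the page and start the disk read with rw_swap_page(). */
		return new_page;
	}
}

int main(void)
{
	void *a = read_swap_cache(42);
	void *b = read_swap_cache(42);	/* second call hits the cache */

	printf("same page: %s\n", a == b ? "yes" : "no");
	return 0;
}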