author     Ralf Baechle <ralf@linux-mips.org>    2000-04-28 01:09:25 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-04-28 01:09:25 +0000
commit     b9ba7aeb165cffecdffb60aec8c3fa8d590d9ca9 (patch)
tree       42d07b0c7246ae2536a702e7c5de9e2732341116 /mm
parent     7406b0a326f2d70ade2671c37d1beef62249db97 (diff)
Merge with 2.3.99-pre6.
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     113
-rw-r--r--  mm/memory.c       62
-rw-r--r--  mm/mmap.c         50
-rw-r--r--  mm/mremap.c        6
-rw-r--r--  mm/page_alloc.c   13
-rw-r--r--  mm/slab.c          6
-rw-r--r--  mm/swap_state.c    7
-rw-r--r--  mm/swapfile.c     49
-rw-r--r--  mm/vmscan.c       54
9 files changed, 233 insertions(+), 127 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 533747f96..d0df8bd2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -44,6 +44,7 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
+struct list_head lru_cache;

 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

 /*
@@ -77,6 +78,15 @@ static void remove_page_from_hash_queue(struct page * page)
 	atomic_dec(&page_cache_size);
 }

+static inline int sync_page(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
+		return mapping->a_ops->sync_page(page);
+	return 0;
+}
+
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
@@ -87,6 +97,9 @@ void remove_inode_page(struct page *page)
 	if (!PageLocked(page))
 		PAGE_BUG(page);

+	/* Initiate completion of any async operations */
+	sync_page(page);
+
 	spin_lock(&pagecache_lock);
 	remove_page_from_inode_queue(page);
 	remove_page_from_hash_queue(page);
@@ -99,6 +112,7 @@ void invalidate_inode_pages(struct inode * inode)
 	struct list_head *head, *curr;
 	struct page * page;

+repeat:
 	head = &inode->i_mapping->pages;
 	spin_lock(&pagecache_lock);
 	curr = head->next;
@@ -110,14 +124,13 @@ void invalidate_inode_pages(struct inode * inode)
 		/* We cannot invalidate a locked page */
 		if (TryLockPage(page))
 			continue;
+		spin_unlock(&pagecache_lock);

 		lru_cache_del(page);
-		remove_page_from_inode_queue(page);
-		remove_page_from_hash_queue(page);
-		page->mapping = NULL;
+		remove_inode_page(page);
 		UnlockPage(page);
-		page_cache_release(page);
+		goto repeat;
 	}
 	spin_unlock(&pagecache_lock);
 }
@@ -149,11 +162,16 @@ repeat:
 		/* page wholly truncated - free it */
 		if (offset >= start) {
+			if (TryLockPage(page)) {
+				spin_unlock(&pagecache_lock);
+				get_page(page);
+				wait_on_page(page);
+				put_page(page);
+				goto repeat;
+			}
 			get_page(page);
 			spin_unlock(&pagecache_lock);

-			lock_page(page);
-
 			if (!page->buffers || block_flushpage(page, 0))
 				lru_cache_del(page);
@@ -191,11 +209,13 @@ repeat:
 			continue;

 		/* partial truncate, clear end of page */
+		if (TryLockPage(page)) {
+			spin_unlock(&pagecache_lock);
+			goto repeat;
+		}
 		get_page(page);
 		spin_unlock(&pagecache_lock);

-		lock_page(page);
-
 		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 		if (page->buffers)
 			block_flushpage(page, partial);
@@ -208,6 +228,9 @@ repeat:
 		 */
 		UnlockPage(page);
 		page_cache_release(page);
+		get_page(page);
+		wait_on_page(page);
+		put_page(page);
 		goto repeat;
 	}
 	spin_unlock(&pagecache_lock);
@@ -215,46 +238,55 @@ repeat:

 int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 {
-	int ret = 0, count;
+	int ret = 0, loop = 0, count;
 	LIST_HEAD(young);
 	LIST_HEAD(old);
 	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
-	struct page * page;
-
+	struct page * page = NULL;
+	struct zone_struct * p_zone;
+	int maxloop = 256 >> priority;
+
 	if (!zone)
 		BUG();

-	count = nr_lru_pages / (priority+1);
+	count = nr_lru_pages >> priority;
+	if (!count)
+		return ret;

 	spin_lock(&pagemap_lru_lock);
-
-	while (count > 0 && (page_lru = zone->lru_cache.prev) != &zone->lru_cache) {
+again:
+	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
+		p_zone = page->zone;

-		dispose = &zone->lru_cache;
-		if (test_and_clear_bit(PG_referenced, &page->flags))
-			/* Roll the page at the top of the lru list,
-			 * we could also be more aggressive putting
-			 * the page in the young-dispose-list, so
-			 * avoiding to free young pages in each pass.
-			 */
-			goto dispose_continue;
-
+		/*
+		 * These two tests are there to make sure we don't free too
+		 * many pages from the "wrong" zone. We free some anyway,
+		 * they are the least recently used pages in the system.
+		 * When we don't free them, leave them in &old.
+		 */
 		dispose = &old;
-		/* don't account passes over not DMA pages */
-		if (zone && (!memclass(page->zone, zone)))
+		if (p_zone != zone && (loop > (maxloop / 4) ||
+				p_zone->free_pages > p_zone->pages_high))
 			goto dispose_continue;

-		count--;
-
+		/* The page is in use, or was used very recently, put it in
+		 * &young to make sure that we won't try to free it the next
+		 * time */
 		dispose = &young;
-		/* avoid unscalable SMP locking */
+		if (test_and_clear_bit(PG_referenced, &page->flags))
+			goto dispose_continue;
+
+		count--;
 		if (!page->buffers && page_count(page) > 1)
 			goto dispose_continue;

+		/* Page not used -> free it; if that fails -> &old */
+		dispose = &old;
 		if (TryLockPage(page))
 			goto dispose_continue;
@@ -327,6 +359,7 @@ unlock_continue:
 	list_add(page_lru, dispose);
 	continue;

+	/* we're holding pagemap_lru_lock, so we can just loop again */
dispose_continue:
 	list_add(page_lru, dispose);
 }
@@ -342,9 +375,14 @@ made_buffer_progress:
 	/* nr_lru_pages needs the spinlock */
 	nr_lru_pages--;

+	loop++;
+	/* wrong zone? not looped too often? roll again... */
+	if (page->zone != zone && loop < maxloop)
+		goto again;
+
out:
-	list_splice(&young, &zone->lru_cache);
-	list_splice(&old, zone->lru_cache.prev);
+	list_splice(&young, &lru_cache);
+	list_splice(&old, lru_cache.prev);

 	spin_unlock(&pagemap_lru_lock);
@@ -467,6 +505,9 @@ static inline void __add_to_page_cache(struct page * page,
 	struct page *alias;
 	unsigned long flags;

+	if (PageLocked(page))
+		BUG();
+
 	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty));
 	page->flags = flags | (1 << PG_locked) | (1 << PG_referenced);
 	get_page(page);
@@ -574,7 +615,7 @@ void ___wait_on_page(struct page *page)
 	add_wait_queue(&page->wait, &wait);
 	do {
-		run_task_queue(&tq_disk);
+		sync_page(page);
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!PageLocked(page))
 			break;
@@ -619,7 +660,7 @@ repeat:
 		struct task_struct *tsk = current;
 		DECLARE_WAITQUEUE(wait, tsk);

-		run_task_queue(&tq_disk);
+		sync_page(page);
 		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		add_wait_queue(&page->wait, &wait);
@@ -669,7 +710,7 @@ repeat:
 		struct task_struct *tsk = current;
 		DECLARE_WAITQUEUE(wait, tsk);

-		run_task_queue(&tq_disk);
+		sync_page(page);
 		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		add_wait_queue(&page->wait, &wait);
@@ -1496,7 +1537,7 @@ static int filemap_write_page(struct file *file,
 	 * mmap_sem is held.
 	 */
 	lock_page(page);
-	result = inode->i_mapping->a_ops->writepage(dentry, page);
+	result = inode->i_mapping->a_ops->writepage(file, dentry, page);
 	UnlockPage(page);
 	return result;
 }
@@ -1707,8 +1748,8 @@ static int msync_interval(struct vm_area_struct * vma,
 	error = vma->vm_ops->sync(vma, start, end-start, flags);
 	if (!error && (flags & MS_SYNC)) {
 		struct file * file = vma->vm_file;
-		if (file)
-			error = file_fsync(file, file->f_dentry);
+		if (file && file->f_op && file->f_op->fsync)
+			error = file->f_op->fsync(file, file->f_dentry);
 	}
 	return error;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 28791baa2..84ecb57b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -472,7 +472,7 @@ int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
 			goto out_unlock;
 		}
 	}
-	if (handle_mm_fault(current, vma, ptr, datain) <= 0)
+	if (handle_mm_fault(current->mm, vma, ptr, datain) <= 0)
 		goto out_unlock;
 	spin_lock(&mm->page_table_lock);
 	map = follow_page(ptr);
@@ -815,7 +815,7 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * old_page
  * We enter with the page table read-lock held, and need to exit without
  * it.
  */
-static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	unsigned long address, pte_t *page_table, pte_t pte)
 {
@@ -824,7 +824,7 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 	map_nr = pte_pagenr(pte);
 	if (map_nr >= max_mapnr)
 		goto bad_wp_page;
-	tsk->min_flt++;
+	mm->min_flt++;
 	old_page = mem_map + map_nr;

 	/*
@@ -854,36 +854,36 @@ static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
 	case 1:
 		flush_cache_page(vma, address);
 		establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
-		spin_unlock(&tsk->mm->page_table_lock);
+		spin_unlock(&mm->page_table_lock);
 		return 1;
 	}

 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	new_page = alloc_page(GFP_HIGHUSER);
 	if (!new_page)
 		return -1;
-	spin_lock(&tsk->mm->page_table_lock);
+	spin_lock(&mm->page_table_lock);

 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
 	if (pte_val(*page_table) == pte_val(pte)) {
 		if (PageReserved(old_page))
-			++vma->vm_mm->rss;
+			++mm->rss;
 		break_cow(vma, old_page, new_page, address, page_table);

 		/* Free the old page.. */
 		new_page = old_page;
 	}
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	__free_page(new_page);
 	return 1;

bad_wp_page:
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	printk("do_wp_page: bogus page at address %08lx (nr %ld)\n",address,map_nr);
 	return -1;
 }
@@ -1029,7 +1029,7 @@ void swapin_readahead(swp_entry_t entry)
 	return;
 }

-static int do_swap_page(struct task_struct * tsk,
+static int do_swap_page(struct mm_struct * mm,
 	struct vm_area_struct * vma, unsigned long address,
 	pte_t * page_table, swp_entry_t entry, int write_access)
 {
@@ -1048,8 +1048,8 @@ static int do_swap_page(struct task_struct * tsk,
 		flush_icache_page(vma, page);
 	}

-	vma->vm_mm->rss++;
-	tsk->min_flt++;
+	mm->rss++;
+	mm->min_flt++;

 	pte = mk_pte(page, vma->vm_page_prot);
@@ -1080,7 +1080,7 @@ static int do_swap_page(struct task_struct * tsk,
 /*
  * This only needs the MM semaphore
  */
-static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
+static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 {
 	int high = 0;
 	struct page *page = NULL;
@@ -1093,8 +1093,8 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
 			high = 1;
 		clear_user_highpage(page, addr);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-		vma->vm_mm->rss++;
-		tsk->min_flt++;
+		mm->rss++;
+		mm->min_flt++;
 		flush_page_to_ram(page);
 	}
 	set_pte(page_table, entry);
@@ -1114,14 +1114,14 @@ static int do_anonymous_page(struct task_struct * tsk, struct vm_area_struct * v
  *
  * This is called with the MM semaphore held.
  */
-static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
+static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	unsigned long address, int write_access, pte_t *page_table)
 {
 	struct page * new_page;
 	pte_t entry;

 	if (!vma->vm_ops || !vma->vm_ops->nopage)
-		return do_anonymous_page(tsk, vma, page_table, write_access, address);
+		return do_anonymous_page(mm, vma, page_table, write_access, address);

 	/*
 	 * The third argument is "no_share", which tells the low-level code
@@ -1133,8 +1133,8 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 		return 0;
 	if (new_page == NOPAGE_OOM)
 		return -1;
-	++tsk->maj_flt;
-	++vma->vm_mm->rss;
+	++mm->maj_flt;
+	++mm->rss;
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
 	 * due to the bad i386 page protection. But it's valid
@@ -1177,7 +1177,7 @@ static int do_no_page(struct task_struct * tsk, struct vm_area_struct * vma,
 * so we don't need to worry about a page being suddenly been added into
 * our VM.
 */
-static inline int handle_pte_fault(struct task_struct *tsk,
+static inline int handle_pte_fault(struct mm_struct *mm,
 	struct vm_area_struct * vma, unsigned long address,
 	int write_access, pte_t * pte)
 {
@@ -1186,8 +1186,8 @@ static inline int handle_pte_fault(struct task_struct *tsk,
 	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry))
-			return do_no_page(tsk, vma, address, write_access, pte);
-		return do_swap_page(tsk, vma, address, pte, pte_to_swp_entry(entry), write_access);
+			return do_no_page(mm, vma, address, write_access, pte);
+		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
 	}

 	/*
@@ -1195,38 +1195,38 @@ static inline int handle_pte_fault(struct task_struct *tsk,
 	 * lock to synchronize with kswapd, and verify that the entry
 	 * didn't change from under us..
 	 */
-	spin_lock(&tsk->mm->page_table_lock);
+	spin_lock(&mm->page_table_lock);
 	if (pte_val(entry) == pte_val(*pte)) {
 		if (write_access) {
 			if (!pte_write(entry))
-				return do_wp_page(tsk, vma, address, pte, entry);
+				return do_wp_page(mm, vma, address, pte, entry);

 			entry = pte_mkdirty(entry);
 		}
 		entry = pte_mkyoung(entry);
 		establish_pte(vma, address, pte, entry);
 	}
-	spin_unlock(&tsk->mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	return 1;
 }

 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 	unsigned long address, int write_access)
 {
 	int ret = -1;
 	pgd_t *pgd;
 	pmd_t *pmd;

-	pgd = pgd_offset(vma->vm_mm, address);
+	pgd = pgd_offset(mm, address);
 	pmd = pmd_alloc(pgd, address);

 	if (pmd) {
 		pte_t * pte = pte_alloc(pmd, address);
 		if (pte)
-			ret = handle_pte_fault(tsk, vma, address, write_access, pte);
+			ret = handle_pte_fault(mm, vma, address, write_access, pte);
 	}
 	return ret;
 }
@@ -1237,15 +1237,15 @@ int handle_mm_fault(struct task_struct *tsk, struct vm_area_struct * vma,
 int make_pages_present(unsigned long addr, unsigned long end)
 {
 	int write;
-	struct task_struct *tsk = current;
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct * vma;

-	vma = find_vma(tsk->mm, addr);
+	vma = find_vma(mm, addr);
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	if (addr >= end)
 		BUG();
 	do {
-		if (handle_mm_fault(tsk, vma, addr, write) < 0)
+		if (handle_mm_fault(mm, vma, addr, write) < 0)
 			return -1;
 		addr += PAGE_SIZE;
 	} while (addr < end);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -110,7 +110,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
 	/* Always allow shrinking brk. */
 	if (brk <= mm->brk) {
-		if (!do_munmap(newbrk, oldbrk-newbrk))
+		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
 			goto set_brk;
 		goto out;
 	}
@@ -220,8 +220,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 		default:
 			return -EINVAL;
 		}
-	} else if ((flags & MAP_TYPE) != MAP_PRIVATE)
-		return -EINVAL;
+	}

 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
@@ -269,8 +268,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 			if (!(file->f_mode & FMODE_WRITE))
 				vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
 		}
-	} else
+	} else {
 		vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+		if (flags & MAP_SHARED)
+			vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
+	}
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
 	vma->vm_ops = NULL;
 	vma->vm_pgoff = pgoff;
@@ -279,7 +281,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 	/* Clear old maps */
 	error = -ENOMEM;
-	if (do_munmap(addr, len))
+	if (do_munmap(mm, addr, len))
 		goto free_vma;

 	/* Check against address space limit. */
@@ -316,6 +318,8 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned lon
 		atomic_inc(&file->f_dentry->d_inode->i_writecount);
 		if (error)
 			goto unmap_and_free_vma;
+	} else if (flags & MAP_SHARED) {
+		error = map_zero_setup(vma);
 	}

 	/*
@@ -468,13 +472,13 @@ struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 	return NULL;
 }

-struct vm_area_struct * find_extend_vma(struct task_struct * tsk, unsigned long addr)
+struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr)
 {
 	struct vm_area_struct * vma;
 	unsigned long start;

 	addr &= PAGE_MASK;
-	vma = find_vma(tsk->mm,addr);
+	vma = find_vma(mm,addr);
 	if (!vma)
 		return NULL;
 	if (vma->vm_start <= addr)
@@ -513,8 +517,9 @@ struct vm_area_struct * find_extend_vma(struct task_struct * tsk, unsigned long
 * allocate a new one, and the return indicates whether the old
 * area was reused.
 */
-static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
-	unsigned long addr, size_t len, struct vm_area_struct *extra)
+static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
+	struct vm_area_struct *area, unsigned long addr, size_t len,
+	struct vm_area_struct *extra)
 {
 	struct vm_area_struct *mpnt;
 	unsigned long end = addr + len;
@@ -536,11 +541,11 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
 	/* Work out to one of the ends. */
 	if (end == area->vm_end) {
 		area->vm_end = addr;
-		vmlist_modify_lock(current->mm);
+		vmlist_modify_lock(mm);
 	} else if (addr == area->vm_start) {
 		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
 		area->vm_start = end;
-		vmlist_modify_lock(current->mm);
+		vmlist_modify_lock(mm);
 	} else {
 		/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
 		/* Add end mapping -- leave beginning for below */
@@ -562,12 +567,12 @@ static struct vm_area_struct * unmap_fixup(struct vm_area_struct *area,
 		if (mpnt->vm_ops && mpnt->vm_ops->open)
 			mpnt->vm_ops->open(mpnt);
 		area->vm_end = addr;	/* Truncate area */
-		vmlist_modify_lock(current->mm);
-		insert_vm_struct(current->mm, mpnt);
+		vmlist_modify_lock(mm);
+		insert_vm_struct(mm, mpnt);
 	}

-	insert_vm_struct(current->mm, area);
-	vmlist_modify_unlock(current->mm);
+	insert_vm_struct(mm, area);
+	vmlist_modify_unlock(mm);
 	return extra;
 }
@@ -634,9 +639,8 @@ no_mmaps:
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardine <jeremy@sw.oz.au>
 */
-int do_munmap(unsigned long addr, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 {
-	struct mm_struct * mm;
 	struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;

 	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
@@ -650,7 +654,6 @@ int do_munmap(unsigned long addr, size_t len)
 	 * every area affected in some way (by any overlap) is put
 	 * on the list.  If nothing is put on, nothing is affected.
 	 */
-	mm = current->mm;
 	mpnt = find_vma_prev(mm, addr, &prev);
 	if (!mpnt)
 		return 0;
@@ -713,7 +716,7 @@ int do_munmap(unsigned long addr, size_t len)
 		/*
 		 * Fix the mapping, and free the old area if it wasn't reused.
 		 */
-		extra = unmap_fixup(mpnt, st, size, extra);
+		extra = unmap_fixup(mm, mpnt, st, size, extra);
 	}

 	/* Release the extra vma struct if it wasn't used */
@@ -728,10 +731,11 @@ int do_munmap(unsigned long addr, size_t len)
 asmlinkage long sys_munmap(unsigned long addr, size_t len)
 {
 	int ret;
+	struct mm_struct *mm = current->mm;

-	down(&current->mm->mmap_sem);
-	ret = do_munmap(addr, len);
-	up(&current->mm->mmap_sem);
+	down(&mm->mmap_sem);
+	ret = do_munmap(mm, addr, len);
+	up(&mm->mmap_sem);
 	return ret;
 }
@@ -763,7 +767,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	/*
 	 * Clear old maps.  this also does some error checking for us
 	 */
-	retval = do_munmap(addr, len);
+	retval = do_munmap(mm, addr, len);
 	if (retval != 0)
 		return retval;
diff --git a/mm/mremap.c b/mm/mremap.c
index d8d18cf62..0404dd795 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -145,7 +145,7 @@ static inline unsigned long move_vma(struct vm_area_struct * vma,
 		insert_vm_struct(current->mm, new_vma);
 		merge_segments(current->mm, new_vma->vm_start, new_vma->vm_end);
 		vmlist_modify_unlock(vma->vm_mm);
-		do_munmap(addr, old_len);
+		do_munmap(current->mm, addr, old_len);
 		current->mm->total_vm += new_len >> PAGE_SHIFT;
 		if (new_vma->vm_flags & VM_LOCKED) {
 			current->mm->locked_vm += new_len >> PAGE_SHIFT;
@@ -201,7 +201,7 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;

-		do_munmap(new_addr, new_len);
+		do_munmap(current->mm, new_addr, new_len);
 	}

 	/*
@@ -210,7 +210,7 @@ unsigned long do_mremap(unsigned long addr,
 	 */
 	ret = addr;
 	if (old_len >= new_len) {
-		do_munmap(addr+new_len, old_len - new_len);
+		do_munmap(current->mm, addr+new_len, old_len - new_len);
 		if (!(flags & MREMAP_FIXED) || (new_addr == addr))
 			goto out;
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07fdaa021..ba5ba3013 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,7 +25,7 @@
 #endif

 int nr_swap_pages = 0;
-int nr_lru_pages;
+int nr_lru_pages = 0;
 pg_data_t *pgdat_list = (pg_data_t *)0;

 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -273,6 +273,8 @@ static int zone_balance_memory(zonelist_t *zonelist)
 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 {
 	zone_t **zone = zonelist->zones;
+	int gfp_mask = zonelist->gfp_mask;
+	static int low_on_memory;

 	/*
 	 * If this is a recursive call, we'd better
@@ -282,6 +284,11 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 	if (current->flags & PF_MEMALLOC)
 		goto allocate_ok;

+	/* If we're a memory hog, unmap some pages */
+	if (current->hog && low_on_memory &&
+			(gfp_mask & __GFP_WAIT))
+		swap_out(4, gfp_mask);
+
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
 	 * will sooner or later tripped up by a schedule().)
@@ -299,11 +306,13 @@ struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 		/* Are we supposed to free memory? Don't make it worse.. */
 		if (!z->zone_wake_kswapd && z->free_pages > z->pages_low) {
 			struct page *page = rmqueue(z, order);
+			low_on_memory = 0;
 			if (page)
 				return page;
 		}
 	}

+	low_on_memory = 1;
 	/*
 	 * Ok, no obvious zones were available, start
 	 * balancing things a bit..
@@ -530,6 +539,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
+	memlist_init(&lru_cache);

 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -609,7 +619,6 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
 			unsigned long bitmap_size;

 			memlist_init(&zone->free_area[i].free_list);
-			memlist_init(&zone->lru_cache);
 			mask += mask;
 			size = (size + ~mask) & mask;
 			bitmap_size = size >> i;
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1534,7 +1534,7 @@ nul_ptr:
 * it should be in this state _before_ it is released.
 */
 static inline void
-__kmem_cache_free(kmem_cache_t *cachep, const void *objp)
+__kmem_cache_free(kmem_cache_t *cachep, void *objp)
 {
 	kmem_slab_t *slabp;
 	kmem_bufctl_t *bufp;
@@ -1739,7 +1739,7 @@ kfree(const void *objp)
 	 */
 	cachep = SLAB_GET_PAGE_CACHE(page);
 	if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
-		__kmem_cache_free(cachep, objp);
+		__kmem_cache_free(cachep, (void *)objp);
 		return;
 	}
 }
@@ -1774,7 +1774,7 @@ kfree_s(const void *objp, size_t size)
 	cachep = SLAB_GET_PAGE_CACHE(page);
 	if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) {
 		if (size <= cachep->c_org_size) {	/* XXX better check */
-			__kmem_cache_free(cachep, objp);
+			__kmem_cache_free(cachep, (void *)objp);
 			return;
 		}
 	}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index defe9b463..29ba0d78b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,12 +17,17 @@
 #include <asm/pgtable.h>

+static struct address_space_operations swap_aops = {
+	sync_page: block_sync_page
+};
+
 struct address_space swapper_space = {
 	{ /* pages */
 		&swapper_space.pages,	/* .next */
 		&swapper_space.pages	/* .prev */
 	},
-	0 /* nrpages */
+	0,	/* nrpages */
+	&swap_aops,
 };

 #ifdef SWAP_CACHE_INFO
diff --git a/mm/swapfile.c b/mm/swapfile.c
index abdb08e57..da2dd9147 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -444,6 +444,7 @@ asmlinkage long sys_swapoff(const char * specialfile)
 {
 	struct swap_info_struct * p = NULL;
 	struct dentry * dentry;
+	struct vfsmount *mnt;
 	int i, type, prev;
 	int err;
@@ -513,11 +514,14 @@ asmlinkage long sys_swapoff(const char * specialfile)

 	dentry = p->swap_file;
 	p->swap_file = NULL;
+	mnt = p->swap_vfsmnt;
+	p->swap_vfsmnt = NULL;
 	p->swap_device = 0;
 	vfree(p->swap_map);
 	p->swap_map = NULL;
 	p->flags = 0;
 	err = 0;
+	mntput(mnt);

out_dput:
 	dput(dentry);
@@ -538,7 +542,8 @@ int get_swaparea_info(char *buf)
 	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
 	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
 		if (ptr->flags & SWP_USED) {
-			char * path = d_path(ptr->swap_file, NULL, page, PAGE_SIZE);
+			char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
+						page, PAGE_SIZE);

 			len += sprintf(buf + len, "%-31s ", path);
@@ -584,7 +589,8 @@ int is_swap_partition(kdev_t dev) {
 asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 {
 	struct swap_info_struct * p;
-	struct dentry * swap_dentry;
+	struct nameidata nd;
+	struct inode * swap_inode;
 	unsigned int type;
 	int i, j, prev;
 	int error;
@@ -595,6 +601,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	unsigned long maxpages;
 	int swapfilesize;
 	struct block_device *bdev = NULL;
+	char *name;

 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -610,6 +617,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	nr_swapfiles = type+1;
 	p->flags = SWP_USED;
 	p->swap_file = NULL;
+	p->swap_vfsmnt = NULL;
 	p->swap_device = 0;
 	p->swap_map = NULL;
 	p->lowest_bit = 0;
@@ -624,24 +632,31 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 	} else {
 		p->prio = --least_priority;
 	}
-	swap_dentry = namei(specialfile);
-	error = PTR_ERR(swap_dentry);
-	if (IS_ERR(swap_dentry))
+	name = getname(specialfile);
+	error = PTR_ERR(name);
+	if (IS_ERR(name))
+		goto bad_swap_2;
+	error = 0;
+	if (walk_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
+		error = walk_name(name, &nd);
+	putname(name);
+	if (error)
 		goto bad_swap_2;

-	p->swap_file = swap_dentry;
+	p->swap_file = nd.dentry;
+	p->swap_vfsmnt = nd.mnt;
+	swap_inode = nd.dentry->d_inode;
 	error = -EINVAL;

-	if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
-		kdev_t dev = swap_dentry->d_inode->i_rdev;
+	if (S_ISBLK(swap_inode->i_mode)) {
+		kdev_t dev = swap_inode->i_rdev;
 		struct block_device_operations *bdops;

 		p->swap_device = dev;
 		set_blocksize(dev, PAGE_SIZE);

-		bdev = swap_dentry->d_inode->i_bdev;
-		bdops = devfs_get_ops ( devfs_get_handle_from_inode
-					(swap_dentry->d_inode) );
+		bdev = swap_inode->i_bdev;
+		bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
 		if (bdops) bdev->bd_op = bdops;

 		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
@@ -663,15 +678,15 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
 		if (blk_size[MAJOR(dev)])
 			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
 				>> (PAGE_SHIFT - 10);
-	} else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
+	} else if (S_ISREG(swap_inode->i_mode)) {
 		error = -EBUSY;
 		for (i = 0 ; i < nr_swapfiles ; i++) {
 			if (i == type || !swap_info[i].swap_file)
 				continue;
-			if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
+			if (swap_inode == swap_info[i].swap_file->d_inode)
 				goto bad_swap;
 		}
-		swapfilesize = swap_dentry->d_inode->i_size >> PAGE_SHIFT;
+		swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
 	} else
 		goto bad_swap;
@@ -811,13 +826,17 @@ bad_swap:
bad_swap_2:
 	if (p->swap_map)
 		vfree(p->swap_map);
-	dput(p->swap_file);
+	nd.mnt = p->swap_vfsmnt;
+	nd.dentry = p->swap_file;
 	p->swap_device = 0;
 	p->swap_file = NULL;
+	p->swap_vfsmnt = NULL;
 	p->swap_map = NULL;
 	p->flags = 0;
 	if (!(swap_flags & SWAP_FLAG_PREFER))
 		++least_priority;
+	dput(nd.dentry);
+	mntput(nd.mnt);
out:
 	if (swap_header)
 		free_page((long) swap_header);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1057dbb60..691d47f18 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,7 +34,7 @@
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
-static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -48,6 +48,7 @@ static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pt
 	if ((page-mem_map >= max_mapnr) || PageReserved(page))
 		goto out_failed;

+	mm->swap_cnt--;
 	/* Don't look at this pte if it's been accessed recently. */
 	if (pte_young(pte)) {
 		/*
@@ -194,7 +195,7 @@ out_failed:
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
-static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -216,16 +217,18 @@ static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned
 	do {
 		int result;
 		vma->vm_mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(vma, address, pte, gfp_mask);
+		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
+		if (!mm->swap_cnt)
+			return 0;
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
 	return 0;
 }

-static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -245,16 +248,18 @@ static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned
 		end = pgd_end;

 	do {
-		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
 		if (result)
 			return result;
+		if (!mm->swap_cnt)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
 	return 0;
 }

-static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -269,9 +274,11 @@ static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
 		if (result)
 			return result;
+		if (!mm->swap_cnt)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (address && (address < end));
@@ -299,7 +306,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 		address = vma->vm_start;

 		for (;;) {
-			int result = swap_out_vma(vma, address, gfp_mask);
+			int result = swap_out_vma(mm, vma, address, gfp_mask);
 			if (result)
 				return result;
 			vma = vma->vm_next;
@@ -321,7 +328,7 @@ static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
-static int swap_out(unsigned int priority, int gfp_mask)
+int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p;
 	int counter;
@@ -356,6 +363,7 @@ static int swap_out(unsigned int priority, int gfp_mask)
 		p = init_task.next_task;
 		for (; p != &init_task; p = p->next_task) {
 			struct mm_struct *mm = p->mm;
+			p->hog = 0;
 			if (!p->swappable || !mm)
 				continue;
 			if (mm->rss <= 0)
@@ -369,9 +377,26 @@ static int swap_out(unsigned int priority, int gfp_mask)
 				pid = p->pid;
 			}
 		}
-		read_unlock(&tasklist_lock);
-		if (assign == 1)
+		if (assign == 1) {
+			/* we just assigned swap_cnt, normalise values */
 			assign = 2;
+			p = init_task.next_task;
+			for (; p != &init_task; p = p->next_task) {
+				int i = 0;
+				struct mm_struct *mm = p->mm;
+				if (!p->swappable || !mm || mm->rss <= 0)
+					continue;
+				/* small processes are swapped out less */
+				while ((mm->swap_cnt << 2 * (i + 1) < max_cnt))
+					i++;
+				mm->swap_cnt >>= i;
+				mm->swap_cnt += i;	/* if swap_cnt reaches 0 */
+				/* we're big -> hog treatment */
+				if (!i)
+					p->hog = 1;
+			}
+		}
+		read_unlock(&tasklist_lock);
 		if (!best) {
 			if (!assign) {
 				assign = 1;
@@ -412,13 +437,14 @@ static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
+	int ret;

 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);

 	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask, zone)) {
+		while ((ret = shrink_mmap(priority, gfp_mask, zone))) {
 			if (!--count)
 				goto done;
 		}
@@ -441,7 +467,9 @@ static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 		}
 	}

-	/* Then, try to page stuff out.. */
+	/* Then, try to page stuff out..
+	 * We use swapcount here because this doesn't actually
+	 * free pages */
 	while (swap_out(priority, gfp_mask)) {
 		if (!--count)
 			goto done;
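Editor's note on the sync_page hook: the page-wait paths in mm/filemap.c above stop kicking the global disk task queue (run_task_queue(&tq_disk)) and instead dispatch through an optional per-mapping sync_page operation, which mm/swap_state.c wires to block_sync_page for swap pages. The following is a minimal standalone C sketch of that dispatch pattern only; the struct layouts are simplified stand-ins for illustration, not the real kernel headers.

	#include <stdio.h>

	struct page;

	struct address_space_operations {
		int (*sync_page)(struct page *);	/* optional hook */
	};

	struct address_space {
		struct address_space_operations *a_ops;
	};

	struct page {
		struct address_space *mapping;		/* NULL for anonymous pages */
	};

	/* Mirrors the new mm/filemap.c helper: every link in the chain may be
	 * absent, in which case syncing is a safe no-op. */
	static int sync_page(struct page *page)
	{
		struct address_space *mapping = page->mapping;

		if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
			return mapping->a_ops->sync_page(page);
		return 0;
	}

	/* Stand-in for block_sync_page: start whatever I/O this mapping needs. */
	static int block_sync_page_stub(struct page *page)
	{
		(void)page;
		puts("kicking I/O for this mapping only");
		return 0;
	}

	int main(void)
	{
		struct address_space_operations aops = { block_sync_page_stub };
		struct address_space mapping = { &aops };
		struct page filebacked = { &mapping };
		struct page anonymous = { NULL };

		sync_page(&filebacked);	/* dispatches to the per-mapping hook */
		sync_page(&anonymous);	/* safe no-op */
		return 0;
	}

The point of the indirection is that a waiter only has to prod the backing store of the one mapping it is waiting on, rather than flushing the whole disk queue for everyone.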
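Editor's note on the mm_struct threading: the other recurring change in this merge is mechanical; do_wp_page(), handle_pte_fault(), handle_mm_fault(), do_munmap() and friends now take the mm_struct they operate on instead of a task_struct, and callers pass current->mm (or whichever mm they hold). A simplified standalone sketch of the new call-site shape, with types reduced to the bare minimum (these are not the real kernel declarations):

	struct mm_struct { int rss; };
	struct vm_area_struct { struct mm_struct *vm_mm; };
	struct task_struct { struct mm_struct *mm; };

	static struct task_struct current_task;
	#define current (&current_task)	/* stand-in for the kernel's current */

	/* New-style prototype, as in the patched mm/memory.c: the fault path
	 * never needs the task, only the address space. */
	int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long address, int write_access)
	{
		mm->rss++;		/* accounting now hangs off the mm */
		(void)vma; (void)address; (void)write_access;
		return 1;
	}

	int make_present(struct vm_area_struct *vma, unsigned long addr)
	{
		/* callers pass the mm, not the task */
		return handle_mm_fault(current->mm, vma, addr, 1);
	}

	int main(void)
	{
		static struct mm_struct mm;
		struct vm_area_struct vma = { &mm };
		current_task.mm = &mm;
		return make_present(&vma, 0x1000) == 1 ? 0 : 1;
	}

This also lets kernel threads and sys_munmap() operate on an mm they looked up once, instead of repeatedly dereferencing current->mm.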
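Editor's note on the swap_cnt normalisation: the loop added to swap_out() in mm/vmscan.c is easiest to see with concrete numbers. The standalone sketch below replays the same shift arithmetic on made-up rss values (the sample values are hypothetical; max_cnt stands in for the largest swap_cnt just assigned). Processes much smaller than the biggest one get their scan quota cut, while anything within a factor of four of the maximum keeps its full quota and is flagged as a hog.

	#include <stdio.h>

	int main(void)
	{
		unsigned long max_cnt = 20000;	/* largest rss seen this pass */
		unsigned long samples[] = { 20000, 6000, 1200, 100 };

		for (int n = 0; n < 4; n++) {
			unsigned long swap_cnt = samples[n];
			int i = 0;

			/* same condition as the patch: halve (two bits per
			 * step) while the process is much smaller than max_cnt */
			while ((swap_cnt << 2 * (i + 1)) < max_cnt)
				i++;
			swap_cnt >>= i;
			swap_cnt += i;	/* keep the quota non-zero */

			/* i == 0 means "within 4x of the biggest": hog */
			printf("rss %5lu -> quota %5lu%s\n", samples[n],
			       swap_cnt, i == 0 ? "  (hog)" : "");
		}
		return 0;
	}

Note that both 20000 and 6000 come out as hogs here (6000 << 2 is already 24000, above max_cnt), which is exactly the p->hog flag that __alloc_pages() in mm/page_alloc.c uses to make big processes pay for their own allocations when memory is low.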