Diffstat (limited to 'ipc/shm.c')
-rw-r--r-- | ipc/shm.c | 409
1 files changed, 313 insertions, 96 deletions
@@ -11,6 +11,7 @@
  * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
  * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
  * Christoph Rohland <hans-christoph.rohland@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/config.h>
@@ -70,6 +71,13 @@ static int shm_swapout(struct page *, struct file *);
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
+static void zmap_unuse(swp_entry_t entry, struct page *page);
+static void shmzero_open(struct vm_area_struct *shmd);
+static void shmzero_close(struct vm_area_struct *shmd);
+static int zero_id;
+static struct shmid_kernel zshmid_kernel;
+
 size_t shm_ctlmax = SHMMAX;
 int shm_ctlall = SHMALL;
 int shm_ctlmni = SHMMNI;
@@ -103,6 +111,8 @@ void __init shm_init (void)
 #ifdef CONFIG_PROC_FS
         create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
 #endif
+        zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni);
+        shm_unlock(zero_id);
         return;
 }
 
@@ -179,6 +189,26 @@ static int shm_revalidate(struct shmid_kernel* shp, int shmid, int pagecount, in
         return 0;
 }
 
+static inline struct shmid_kernel *newseg_alloc(int numpages)
+{
+        struct shmid_kernel *shp;
+
+        shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+        if (!shp)
+                return 0;
+
+        shp->shm_dir = shm_alloc (numpages);
+        if (!shp->shm_dir) {
+                kfree(shp);
+                return 0;
+        }
+        shp->shm_npages = numpages;
+        shp->attaches = NULL;
+        shp->shm_nattch = 0;
+        init_MUTEX(&shp->sem);
+        return(shp);
+}
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
         struct shmid_kernel *shp;
@@ -193,15 +223,8 @@ static int newseg (key_t key, int shmflg, size_t size)
         if (shm_tot + numpages >= shm_ctlall)
                 return -ENOSPC;
 
-        shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
-        if (!shp)
-                return -ENOMEM;
-
-        shp->shm_dir = shm_alloc (numpages);
-        if (!shp->shm_dir) {
-                kfree(shp);
+        if (!(shp = newseg_alloc(numpages)))
                 return -ENOMEM;
-        }
         id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
         if(id == -1) {
                 shm_free(shp->shm_dir,numpages);
@@ -212,13 +235,10 @@ static int newseg (key_t key, int shmflg, size_t size)
         shp->shm_perm.mode = (shmflg & S_IRWXUGO);
         shp->shm_segsz = size;
         shp->shm_cpid = current->pid;
-        shp->attaches = NULL;
-        shp->shm_lpid = shp->shm_nattch = 0;
+        shp->shm_lpid = 0;
         shp->shm_atime = shp->shm_dtime = 0;
         shp->shm_ctime = CURRENT_TIME;
-        shp->shm_npages = numpages;
         shp->id = shm_buildid(id,shp->shm_perm.seq);
-        init_MUTEX(&shp->sem);
         shm_tot += numpages;
         shm_unlock(id);
 
@@ -255,6 +275,35 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
         return err;
 }
 
+static void killseg_core(struct shmid_kernel *shp, int doacc)
+{
+        int i, numpages, rss, swp;
+
+        numpages = shp->shm_npages;
+        for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
+                pte_t pte;
+                pte = SHM_ENTRY (shp,i);
+                if (pte_none(pte))
+                        continue;
+                if (pte_present(pte)) {
+                        __free_page (pte_page(pte));
+                        rss++;
+                } else {
+                        swap_free(pte_to_swp_entry(pte));
+                        swp++;
+                }
+        }
+        shm_free (shp->shm_dir, numpages);
+        kfree(shp);
+        if (doacc) {
+                shm_lockall();
+                shm_rss -= rss;
+                shm_swp -= swp;
+                shm_tot -= numpages;
+                shm_unlockall();
+        }
+}
+
 /*
  * Only called after testing nattch and SHM_DEST.
  * Here pages, pgtable and shmid_kernel are freed.
@@ -262,8 +311,6 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
 static void killseg (int shmid)
 {
         struct shmid_kernel *shp;
-        int i, numpages;
-        int rss, swp;
 
         down(&shm_ids.sem);
         shp = shm_lock(shmid);
@@ -284,28 +331,8 @@ out_up:
                 BUG();
         shm_unlock(shmid);
         up(&shm_ids.sem);
+        killseg_core(shp, 1);
 
-        numpages = shp->shm_npages;
-        for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
-                pte_t pte;
-                pte = SHM_ENTRY (shp,i);
-                if (pte_none(pte))
-                        continue;
-                if (pte_present(pte)) {
-                        __free_page (pte_page(pte));
-                        rss++;
-                } else {
-                        swap_free(pte_to_swp_entry(pte));
-                        swp++;
-                }
-        }
-        shm_free (shp->shm_dir, numpages);
-        kfree(shp);
-        shm_lockall();
-        shm_rss -= rss;
-        shm_swp -= swp;
-        shm_tot -= numpages;
-        shm_unlockall();
         return;
 }
 
@@ -458,6 +485,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
         shp = shm_lock(shmid);
         if(shp==NULL)
                 return -EINVAL;
+        if (shp == &zshmid_kernel) {
+                shm_unlock(shmid);
+                return -EINVAL;
+        }
         if(cmd==SHM_STAT) {
                 err = -EINVAL;
                 if (shmid > shm_ids.max_id)
@@ -498,6 +529,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
         shp = shm_lock(shmid);
         if(shp==NULL)
                 return -EINVAL;
+        if (shp == &zshmid_kernel) {
+                shm_unlock(shmid);
+                return -EINVAL;
+        }
         err=-EIDRM;
         if(shm_checkid(shp,shmid))
                 goto out_unlock;
@@ -532,6 +567,8 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
         err=-EINVAL;
         if(shp==NULL)
                 goto out_up;
+        if (shp == &zshmid_kernel)
+                goto out_unlock_up;
         err=-EIDRM;
         if(shm_checkid(shp,shmid))
                 goto out_unlock_up;
@@ -653,6 +690,8 @@ asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
         shp = shm_lock(shmid);
         if (!shp)
                 goto out_up;
+        if (shp == &zshmid_kernel)
+                goto out_unlock_up;
 
         err = -EACCES;
         if (ipcperms(&shp->shm_perm, flg))
@@ -835,10 +874,12 @@ static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long addr
         struct shmid_kernel *shp;
         unsigned int idx;
         struct page * page;
+        int is_shmzero;
 
         shp = (struct shmid_kernel *) shmd->vm_private_data;
         idx = (address - shmd->vm_start) >> PAGE_SHIFT;
         idx += shmd->vm_pgoff;
+        is_shmzero = (shp->id == zero_id);
 
         /*
          * A shared mapping past the last page of the file is an error
@@ -850,7 +891,7 @@ static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long addr
                 return NULL;
         }
         down(&shp->sem);
-        if(shp != shm_lock(shp->id))
+        if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
                 BUG();
 
         pte = SHM_ENTRY(shp,idx);
@@ -864,7 +905,7 @@ static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long addr
                         if (!page)
                                 goto oom;
                         clear_highpage(page);
-                        if(shp != shm_lock(shp->id))
+                        if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
                                 BUG();
                 } else {
                         swp_entry_t entry = pte_to_swp_entry(pte);
@@ -882,11 +923,11 @@ static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long addr
                         delete_from_swap_cache(page);
                         page = replace_with_highmem(page);
                         swap_free(entry);
-                        if(shp != shm_lock(shp->id))
+                        if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
                                 BUG();
-                        shm_swp--;
+                        if (is_shmzero == 0) shm_swp--;
                 }
-                shm_rss++;
+                if (is_shmzero == 0) shm_rss++;
                 pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
                 SHM_ENTRY(shp, idx) = pte;
         } else
@@ -904,6 +945,65 @@ oom:
         return NOPAGE_OOM;
 }
 
+#define OKAY    0
+#define RETRY   1
+#define FAILED  2
+
+static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
+{
+        pte_t page;
+        struct page *page_map;
+
+        page = SHM_ENTRY(shp, idx);
+        if (!pte_present(page))
+                return RETRY;
+        page_map = pte_page(page);
+        if (zone && (!memclass(page_map->zone, zone)))
+                return RETRY;
+        if (shp->id != zero_id) swap_attempts++;
+
+        if (--(*counter) < 0) /* failed */
+                return FAILED;
+        if (page_count(page_map) != 1)
+                return RETRY;
+
+        if (!(page_map = prepare_highmem_swapout(page_map)))
+                return FAILED;
+        SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
+
+        /* add the locked page to the swap cache before allowing
+           the swapin path to run lookup_swap_cache(). This avoids
+           reading a not yet uptodate block from disk.
+           NOTE: we just accounted the swap space reference for this
+           swap cache page at __get_swap_page() time. */
+        add_to_swap_cache(*outpage = page_map, swap_entry);
+        return OKAY;
+}
+
+static void shm_swap_postop(struct page *page)
+{
+        lock_kernel();
+        rw_swap_page(WRITE, page, 0);
+        unlock_kernel();
+        __free_page(page);
+}
+
+static int shm_swap_preop(swp_entry_t *swap_entry)
+{
+        lock_kernel();
+        /* subtle: preload the swap count for the swap cache. We can't
+           increase the count inside the critical section as we can't release
+           the shm_lock there. And we can't acquire the big lock with the
+           shm_lock held (otherwise we would deadlock too easily). */
+        *swap_entry = __get_swap_page(2);
+        if (!(*swap_entry).val) {
+                unlock_kernel();
+                return 1;
+        }
+        unlock_kernel();
+        return 0;
+}
+
 /*
  * Goes through counter = (shm_rss >> prio) present shm pages.
  */
@@ -912,28 +1012,19 @@ static unsigned long swap_idx = 0; /* next to swap */
 int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
-        pte_t page;
         struct shmid_kernel *shp;
         swp_entry_t swap_entry;
         unsigned long id, idx;
         int loop = 0;
         int counter;
         struct page * page_map;
-
+
+        zshm_swap(prio, gfp_mask, zone);
         counter = shm_rss >> prio;
         if (!counter)
                 return 0;
-        lock_kernel();
-        /* subtle: preload the swap count for the swap cache. We can't
-           increase the count inside the critical section as we can't release
-           the shm_lock there. And we can't acquire the big lock with the
-           shm_lock held (otherwise we would deadlock too easily). */
-        swap_entry = __get_swap_page(2);
-        if (!swap_entry.val) {
-                unlock_kernel();
+        if (shm_swap_preop(&swap_entry))
                 return 0;
-        }
-        unlock_kernel();
 
         shm_lockall();
 check_id:
@@ -943,8 +1034,12 @@ next_id:
         swap_idx = 0;
         if (++swap_id > shm_ids.max_id) {
                 swap_id = 0;
-                if (loop)
-                        goto failed;
+                if (loop) {
+failed:
+                        shm_unlockall();
+                        __swap_free(swap_entry, 2);
+                        return 0;
+                }
                 loop = 1;
         }
         goto check_id;
@@ -956,43 +1051,16 @@ check_table:
         if (idx >= shp->shm_npages)
                 goto next_id;
 
-        page = SHM_ENTRY(shp, idx);
-        if (!pte_present(page))
-                goto check_table;
-        page_map = pte_page(page);
-        if (zone && (!memclass(page_map->zone, zone)))
-                goto check_table;
-        swap_attempts++;
-
-        if (--counter < 0) { /* failed */
-failed:
-                shm_unlockall();
-                __swap_free(swap_entry, 2);
-                return 0;
+        switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+                case RETRY:     goto check_table;
+                case FAILED:    goto failed;
         }
-        if (page_count(page_map) != 1)
-                goto check_table;
-
-        if (!(page_map = prepare_highmem_swapout(page_map)))
-                goto failed;
-        SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
 
         swap_successes++;
         shm_swp++;
         shm_rss--;
-
-        /* add the locked page to the swap cache before allowing
-           the swapin path to run lookup_swap_cache(). This avoids
-           reading a not yet uptodate block from disk.
-           NOTE: we just accounted the swap space reference for this
-           swap cache page at __get_swap_page() time. */
-        add_to_swap_cache(page_map, swap_entry);
         shm_unlockall();
-        lock_kernel();
-        rw_swap_page(WRITE, page_map, 0);
-        unlock_kernel();
-
-        __free_page(page_map);
+        shm_swap_postop(page_map);
 
         return 1;
 }
@@ -1014,31 +1082,41 @@ static void shm_unuse_page(struct shmid_kernel *shp, unsigned long idx,
         swap_free(entry);
 }
 
+static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
+{
+        int n;
+
+        for (n = 0; n < shp->shm_npages; n++) {
+                if (pte_none(SHM_ENTRY(shp,n)))
+                        continue;
+                if (pte_present(SHM_ENTRY(shp,n)))
+                        continue;
+                if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
+                        shm_unuse_page(shp, n, entry, page);
+                        return 1;
+                }
+        }
+        return 0;
+}
+
 /*
  * unuse_shm() search for an eventually swapped out shm page.
  */
 void shm_unuse(swp_entry_t entry, struct page *page)
 {
-        int i, n;
+        int i;
 
         shm_lockall();
         for (i = 0; i <= shm_ids.max_id; i++) {
                 struct shmid_kernel *shp = shm_get(i);
                 if(shp==NULL)
                         continue;
-                for (n = 0; n < shp->shm_npages; n++) {
-                        if (pte_none(SHM_ENTRY(shp,n)))
-                                continue;
-                        if (pte_present(SHM_ENTRY(shp,n)))
-                                continue;
-                        if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
-                                shm_unuse_page(shp, n, entry, page);
-                                goto out;
-                        }
-                }
+                if (shm_unuse_core(shp, entry, page))
+                        goto out;
         }
 out:
         shm_unlockall();
+        zmap_unuse(entry, page);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1053,6 +1131,10 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int l
         for(i = 0; i <= shm_ids.max_id; i++) {
                 struct shmid_kernel* shp = shm_lock(i);
+                if (shp == &zshmid_kernel) {
+                        shm_unlock(i);
+                        continue;
+                }
                 if(shp!=NULL) {
 #define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
 #define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
@@ -1100,3 +1182,138 @@ done:
         return len;
 }
 #endif
+
+static struct shmid_kernel *zmap_list = 0;
+static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long zswap_idx = 0; /* next to swap */
+static struct shmid_kernel *zswap_shp = 0;
+
+static struct vm_operations_struct shmzero_vm_ops = {
+        open:           shmzero_open,
+        close:          shmzero_close,
+        nopage:         shm_nopage,
+        swapout:        shm_swapout,
+};
+
+int map_zero_setup(struct vm_area_struct *vma)
+{
+        struct shmid_kernel *shp;
+
+        if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
+                return -ENOMEM;
+        shp->id = zero_id;      /* hack for shm_lock et al */
+        vma->vm_private_data = shp;
+        vma->vm_ops = &shmzero_vm_ops;
+        shmzero_open(vma);
+        spin_lock(&zmap_list_lock);
+        shp->attaches = (struct vm_area_struct *)zmap_list;
+        zmap_list = shp;
+        spin_unlock(&zmap_list_lock);
+        return 0;
+}
+
+static void shmzero_open(struct vm_area_struct *shmd)
+{
+        struct shmid_kernel *shp;
+
+        shp = (struct shmid_kernel *) shmd->vm_private_data;
+        down(&shp->sem);
+        shp->shm_nattch++;
+        up(&shp->sem);
+}
+
+static void shmzero_close(struct vm_area_struct *shmd)
+{
+        int done = 0;
+        struct shmid_kernel *shp, *prev, *cur;
+
+        shp = (struct shmid_kernel *) shmd->vm_private_data;
+        down(&shp->sem);
+        if (--shp->shm_nattch == 0)
+                done = 1;
+        up(&shp->sem);
+        if (done) {
+                spin_lock(&zmap_list_lock);
+                if (shp == zswap_shp)
+                        zswap_shp = (struct shmid_kernel *)(shp->attaches);
+                if (shp == zmap_list)
+                        zmap_list = (struct shmid_kernel *)(shp->attaches);
+                else {
+                        prev = zmap_list;
+                        cur = (struct shmid_kernel *)(prev->attaches);
+                        while (cur != shp) {
+                                prev = cur;
+                                cur = (struct shmid_kernel *)(prev->attaches);
+                        }
+                        prev->attaches = (struct vm_area_struct *)(shp->attaches);
+                }
+                spin_unlock(&zmap_list_lock);
+                killseg_core(shp, 0);
+        }
+}
+
+static void zmap_unuse(swp_entry_t entry, struct page *page)
+{
+        struct shmid_kernel *shp;
+
+        spin_lock(&zmap_list_lock);
+        shp = zmap_list;
+        while (shp) {
+                if (shm_unuse_core(shp, entry, page))
+                        break;
+                shp = (struct shmid_kernel *)shp->attaches;
+        }
+        spin_unlock(&zmap_list_lock);
+}
+
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
+{
+        struct shmid_kernel *shp;
+        swp_entry_t swap_entry;
+        unsigned long idx;
+        int loop = 0;
+        int counter;
+        struct page * page_map;
+
+        counter = 10;   /* maybe we should use zshm_rss */
+        if (!counter)
+                return;
+next:
+        if (shm_swap_preop(&swap_entry))
+                return;
+
+        spin_lock(&zmap_list_lock);
+        if (zmap_list == 0)
+                goto failed;
+next_id:
+        if ((shp = zswap_shp) == 0) {
+                if (loop) {
+failed:
+                        spin_unlock(&zmap_list_lock);
+                        __swap_free(swap_entry, 2);
+                        return;
+                }
+                zswap_shp = shp = zmap_list;
+                zswap_idx = 0;
+                loop = 1;
+        }
+
+check_table:
+        idx = zswap_idx++;
+        if (idx >= shp->shm_npages) {
+                zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
+                zswap_idx = 0;
+                goto next_id;
+        }
+
+        switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+                case RETRY:     goto check_table;
+                case FAILED:    goto failed;
+        }
+        spin_unlock(&zmap_list_lock);
+
+        shm_swap_postop(page_map);
+        if (counter)
+                goto next;
+        return;
+}
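For context on what this kernel-side machinery enables: with map_zero_setup() in place, a MAP_SHARED mapping of /dev/zero becomes fork-shared, zero-filled memory, each mapping getting its own shmid_kernel on zmap_list so zshm_swap() can page it out and shmzero_close() can free it via killseg_core() when the last attach goes away. The fragment below is a minimal user-space sketch of that idiom; it is not part of the patch, and the mmap hook that actually routes /dev/zero to map_zero_setup() lives in drivers/char/mem.c, outside this diff (which is limited to ipc/shm.c).

/* Minimal user-space sketch (not part of the patch): a MAP_SHARED
 * mapping of /dev/zero is shared across fork(), which is exactly
 * the case the zshm_* code above has to swap out and tear down. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/zero", O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* One page, shared: both sides of the fork() see the same memory. */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        close(fd);                      /* the mapping survives the close */

        if (fork() == 0) {
                strcpy(p, "hello from child");  /* child writes ...    */
                _exit(0);
        }
        wait(NULL);
        printf("%s\n", p);                      /* ... parent reads it */
        return 0;
}

Note how the sysvipc entry points (sys_shmctl, sys_shmat, the /proc dump) all reject &zshmid_kernel: the reserved zero_id slot exists only so shm_lock() and friends work on these anonymous segments, and must never be reachable through a user-supplied shmid.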