author	Ralf Baechle <ralf@linux-mips.org>	1995-11-14 08:00:00 +0000
committer	<ralf@linux-mips.org>	1995-11-14 08:00:00 +0000
commit	e7c2a72e2680827d6a733931273a93461c0d8d1b (patch)
tree	c9abeda78ef7504062bb2e816bcf3e3c9d680112 /ipc
parent	ec6044459060a8c9ce7f64405c465d141898548c (diff)
Import of Linux/MIPS 1.3.0
Diffstat (limited to 'ipc')
-rw-r--r--	ipc/Makefile	  1
-rw-r--r--	ipc/sem.c	519
-rw-r--r--	ipc/shm.c	330
-rw-r--r--	ipc/util.c	 58
4 files changed, 572 insertions(+), 336 deletions(-)
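
One user-visible semantic change rides along with this import, spelled out
in the implementation notes at the top of the sem.c diff below: semop()
now returns 0 on success instead of the value of the last semaphore
element examined. A minimal userspace check of that behavior might look
like the following sketch (error handling elided):

#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
	/* private one-semaphore set, created just for this check */
	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	struct sembuf up = { 0, +1, 0 };	/* sem_num, sem_op, sem_flg */
	/* the pre-rewrite kernel returned the new semval (1) here */
	int ret = semop(id, &up, 1);

	printf("semop returned %d\n", ret);	/* manual page mandates 0 */
	semctl(id, 0, IPC_RMID);		/* tear the set down again */
	return 0;
}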
diff --git a/ipc/Makefile b/ipc/Makefile
index a3a18a7ae..936d1cf50 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -28,6 +28,7 @@ ipc.o: $(OBJS)
dep:
$(CPP) -M $(SRCS) > .depend
+modules:
dummy:
#
diff --git a/ipc/sem.c b/ipc/sem.c
index 0aeaf588b..6dbe8e4fe 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1,6 +1,34 @@
/*
* linux/ipc/sem.c
- * Copyright (C) 1992 Krishna Balasubramanian
+ * Copyright (C) 1992 Krishna Balasubramanian
+ * Copyright (C) 1995 Eric Schenk, Bruno Haible
+ *
+ * IMPLEMENTATION NOTES ON CODE REWRITE (Eric Schenk, January 1995):
+ * This code underwent a massive rewrite in order to solve some problems
+ * with the original code. In particular the original code failed to
+ * wake up processes that were waiting for semval to go to 0 if the
+ * value went to 0 and was then incremented rapidly enough. In solving
+ * this problem I have also modified the implementation so that it
+ * processes pending operations in a FIFO manner, thus giving a guarantee
+ * that processes waiting for a lock on the semaphore won't starve
+ * unless another locking process fails to unlock.
+ * In addition the following two changes in behavior have been introduced:
+ * - The original implementation of semop returned the value of the
+ * last semaphore element examined on success. This does not
+ * match the manual page specifications, and effectively
+ * allows the user to read the semaphore even if they do not
+ * have read permissions. The implementation now returns 0
+ * on success as stated in the manual page.
+ * - There is some confusion over whether the set of undo adjustments
+ * to be performed at exit should be done in an atomic manner.
+ * That is, if we are attempting to decrement the semval should we queue
+ * up and wait until we can do so legally?
+ * The original implementation attempted to do this.
+ * The current implementation does not do so. This is because I don't
+ * think it is the right thing (TM) to do, and because I couldn't
+ * see a clean way to get the old behavior with the new design.
+ * The POSIX standard and SVID should be consulted to determine
+ * what behavior is mandated.
*/
#include <linux/errno.h>
@@ -18,7 +46,7 @@ static int findkey (key_t key);
static void freeary (int id);
static struct semid_ds *semary[SEMMNI];
-static int used_sems = 0, used_semids = 0;
+static int used_sems = 0, used_semids = 0;
static struct wait_queue *sem_lock = NULL;
static int max_semid = 0;
@@ -27,7 +55,7 @@ static unsigned short sem_seq = 0;
void sem_init (void)
{
int i;
-
+
sem_lock = NULL;
used_sems = used_semids = max_semid = sem_seq = 0;
for (i = 0; i < SEMMNI; i++)
@@ -39,9 +67,9 @@ static int findkey (key_t key)
{
int id;
struct semid_ds *sma;
-
+
for (id = 0; id <= max_semid; id++) {
- while ((sma = semary[id]) == IPC_NOID)
+ while ((sma = semary[id]) == IPC_NOID)
interruptible_sleep_on (&sem_lock);
if (sma == IPC_UNUSED)
continue;
@@ -62,7 +90,7 @@ static int newary (key_t key, int nsems, int semflg)
return -EINVAL;
if (used_sems + nsems > SEMMNS)
return -ENOSPC;
- for (id = 0; id < SEMMNI; id++)
+ for (id = 0; id < SEMMNI; id++)
if (semary[id] == IPC_UNUSED) {
semary[id] = (struct semid_ds *) IPC_NOID;
goto found;
@@ -87,10 +115,12 @@ found:
ipcp->cuid = ipcp->uid = current->euid;
ipcp->gid = ipcp->cgid = current->egid;
sma->sem_perm.seq = sem_seq;
- sma->eventn = sma->eventz = NULL;
+ /* sma->sem_pending = NULL; */
+ sma->sem_pending_last = &sma->sem_pending;
+ /* sma->undo = NULL; */
sma->sem_nsems = nsems;
sma->sem_ctime = CURRENT_TIME;
- if (id > max_semid)
+ if (id > max_semid)
max_semid = id;
used_semids++;
semary[id] = sma;
@@ -103,10 +133,10 @@ int sys_semget (key_t key, int nsems, int semflg)
{
int id;
struct semid_ds *sma;
-
- if (nsems < 0 || nsems > SEMMSL)
+
+ if (nsems < 0 || nsems > SEMMSL)
return -EINVAL;
- if (key == IPC_PRIVATE)
+ if (key == IPC_PRIVATE)
return newary(key, nsems, semflg);
if ((id = findkey (key)) == -1) { /* key not used */
if (!(semflg & IPC_CREAT))
@@ -121,13 +151,188 @@ int sys_semget (key_t key, int nsems, int semflg)
if (ipcperms(&sma->sem_perm, semflg))
return -EACCES;
return (unsigned int) sma->sem_perm.seq * SEMMNI + id;
-}
+}
+/* Manage the doubly linked list sma->sem_pending as a FIFO:
+ * insert new queue elements at the tail sma->sem_pending_last.
+ */
+static inline void insert_into_queue (struct semid_ds * sma, struct sem_queue * q)
+{
+ *(q->prev = sma->sem_pending_last) = q;
+ *(sma->sem_pending_last = &q->next) = NULL;
+}
+static inline void remove_from_queue (struct semid_ds * sma, struct sem_queue * q)
+{
+ *(q->prev) = q->next;
+ if (q->next)
+ q->next->prev = q->prev;
+ else /* sma->sem_pending_last == &q->next */
+ sma->sem_pending_last = q->prev;
+ q->prev = NULL; /* mark as removed */
+}
+
+/* Determine whether a sequence of semaphore operations would succeed
+ * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
+ */
+static int try_semop (struct semid_ds * sma, struct sembuf * sops, int nsops)
+{
+ int result = 0;
+ int i = 0;
+
+ while (i < nsops) {
+ struct sembuf * sop = &sops[i];
+ struct sem * curr = &sma->sem_base[sop->sem_num];
+ if (sop->sem_op + curr->semval > SEMVMX) {
+ result = -ERANGE;
+ break;
+ }
+ if (!sop->sem_op && curr->semval) {
+ if (sop->sem_flg & IPC_NOWAIT)
+ result = -EAGAIN;
+ else
+ result = 1;
+ break;
+ }
+ i++;
+ curr->semval += sop->sem_op;
+ if (curr->semval < 0) {
+ if (sop->sem_flg & IPC_NOWAIT)
+ result = -EAGAIN;
+ else
+ result = 1;
+ break;
+ }
+ }
+ while (--i >= 0) {
+ struct sembuf * sop = &sops[i];
+ struct sem * curr = &sma->sem_base[sop->sem_num];
+ curr->semval -= sop->sem_op;
+ }
+ return result;
+}
+
+/* Actually perform a sequence of semaphore operations. Atomically. */
+/* This assumes that try_semop() already returned 0. */
+static int do_semop (struct semid_ds * sma, struct sembuf * sops, int nsops,
+ struct sem_undo * un, int pid)
+{
+ int i;
+
+ for (i = 0; i < nsops; i++) {
+ struct sembuf * sop = &sops[i];
+ struct sem * curr = &sma->sem_base[sop->sem_num];
+ if (sop->sem_op + curr->semval > SEMVMX) {
+ printk("do_semop: race\n");
+ break;
+ }
+ if (!sop->sem_op) {
+ if (curr->semval) {
+ printk("do_semop: race\n");
+ break;
+ }
+ } else {
+ curr->semval += sop->sem_op;
+ if (curr->semval < 0) {
+ printk("do_semop: race\n");
+ break;
+ }
+ if (sop->sem_flg & SEM_UNDO)
+ un->semadj[sop->sem_num] -= sop->sem_op;
+ }
+ curr->sempid = pid;
+ }
+ sma->sem_otime = CURRENT_TIME;
+
+ /* Previous implementation returned the last semaphore's semval.
+ * This is wrong because we may not have checked read permission,
+ * only write permission.
+ */
+ return 0;
+}
+
+/* Go through the pending queue for the indicated semaphore
+ * looking for tasks that can be completed. Keep cycling through
+ * the queue until a pass is made in which no process is woken up.
+ */
+static void update_queue (struct semid_ds * sma)
+{
+ int wokeup, error;
+ struct sem_queue * q;
+
+ do {
+ wokeup = 0;
+ for (q = sma->sem_pending; q; q = q->next) {
+ error = try_semop(sma, q->sops, q->nsops);
+ /* Does q->sleeper still need to sleep? */
+ if (error > 0)
+ continue;
+ /* Perform the operations the sleeper was waiting for */
+ if (!error)
+ error = do_semop(sma, q->sops, q->nsops, q->undo, q->pid);
+ q->status = error;
+ /* Remove it from the queue */
+ remove_from_queue(sma,q);
+ /* Wake it up */
+ wake_up_interruptible(&q->sleeper); /* doesn't sleep! */
+ wokeup++;
+ }
+ } while (wokeup);
+}
+
+/* The following counts are associated with each semaphore:
+ * semncnt number of tasks waiting on semval being nonzero
+ * semzcnt number of tasks waiting on semval being zero
+ * This model assumes that a task waits on exactly one semaphore.
+ * Since semaphore operations are to be performed atomically, tasks actually
+ * wait on a whole sequence of semaphores simultaneously.
+ * The counts we return here are a rough approximation, but still
+ * warrant that semncnt+semzcnt>0 if the task is on the pending queue.
+ */
+static int count_semncnt (struct semid_ds * sma, ushort semnum)
+{
+ int semncnt;
+ struct sem_queue * q;
+
+ semncnt = 0;
+ for (q = sma->sem_pending; q; q = q->next) {
+ struct sembuf * sops = q->sops;
+ int nsops = q->nsops;
+ int i;
+ for (i = 0; i < nsops; i++)
+ if (sops[i].sem_num == semnum
+ && (sops[i].sem_op < 0)
+ && !(sops[i].sem_flg & IPC_NOWAIT))
+ semncnt++;
+ }
+ return semncnt;
+}
+static int count_semzcnt (struct semid_ds * sma, ushort semnum)
+{
+ int semzcnt;
+ struct sem_queue * q;
+
+ semzcnt = 0;
+ for (q = sma->sem_pending; q; q = q->next) {
+ struct sembuf * sops = q->sops;
+ int nsops = q->nsops;
+ int i;
+ for (i = 0; i < nsops; i++)
+ if (sops[i].sem_num == semnum
+ && (sops[i].sem_op == 0)
+ && !(sops[i].sem_flg & IPC_NOWAIT))
+ semzcnt++;
+ }
+ return semzcnt;
+}
+
+/* Free a semaphore set. */
static void freeary (int id)
{
struct semid_ds *sma = semary[id];
struct sem_undo *un;
+ struct sem_queue *q;
+ /* Invalidate this semaphore set */
sma->sem_perm.seq++;
sem_seq = (sem_seq+1) % ((unsigned)(1<<31)/SEMMNI); /* increment, but avoid overflow */
used_sems -= sma->sem_nsems;
@@ -135,17 +340,21 @@ static void freeary (int id)
while (max_semid && (semary[--max_semid] == IPC_UNUSED));
semary[id] = (struct semid_ds *) IPC_UNUSED;
used_semids--;
+
+ /* Invalidate the existing undo structures for this semaphore set.
+ * (They will be freed without any further action in sem_exit().)
+ */
for (un = sma->undo; un; un = un->id_next)
- un->semadj = 0;
- while (sma->eventz || sma->eventn) {
- if (sma->eventz)
- wake_up (&sma->eventz);
- if (sma->eventn)
- wake_up (&sma->eventn);
- schedule();
+ un->semid = -1;
+
+ /* Wake up all pending processes and let them fail with EIDRM. */
+ for (q = sma->sem_pending; q; q = q->next) {
+ q->status = -EIDRM;
+ q->prev = NULL;
+ wake_up_interruptible(&q->sleeper); /* doesn't sleep! */
}
+
kfree(sma);
- return;
}
int sys_semctl (int semid, int semnum, int cmd, union semun arg)
@@ -155,7 +364,7 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
int i, id, val = 0;
struct semid_ds *sma;
struct ipc_perm *ipcp;
- struct sem *curr;
+ struct sem *curr = NULL;
struct sem_undo *un;
unsigned int nsems;
ushort *array = NULL;
@@ -165,8 +374,8 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
return -EINVAL;
switch (cmd) {
- case IPC_INFO:
- case SEM_INFO:
+ case IPC_INFO:
+ case SEM_INFO:
{
struct seminfo seminfo, *tmp = arg.__buf;
seminfo.semmni = SEMMNI;
@@ -174,8 +383,8 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
seminfo.semmsl = SEMMSL;
seminfo.semopm = SEMOPM;
seminfo.semvmx = SEMVMX;
- seminfo.semmnu = SEMMNU;
- seminfo.semmap = SEMMAP;
+ seminfo.semmnu = SEMMNU;
+ seminfo.semmap = SEMMAP;
seminfo.semume = SEMUME;
seminfo.semusz = SEMUSZ;
seminfo.semaem = SEMAEM;
@@ -219,9 +428,18 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
nsems = sma->sem_nsems;
if (sma->sem_perm.seq != (unsigned int) semid / SEMMNI)
return -EIDRM;
- if (semnum >= nsems)
- return -EINVAL;
- curr = &sma->sem_base[semnum];
+
+ switch (cmd) {
+ case GETVAL:
+ case GETPID:
+ case GETNCNT:
+ case GETZCNT:
+ case SETVAL:
+ if (semnum >= nsems)
+ return -EINVAL;
+ curr = &sma->sem_base[semnum];
+ break;
+ }
switch (cmd) {
case GETVAL:
@@ -232,10 +450,10 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
if (ipcperms (ipcp, S_IRUGO))
return -EACCES;
switch (cmd) {
- case GETVAL : return curr->semval;
+ case GETVAL : return curr->semval;
case GETPID : return curr->sempid;
- case GETNCNT: return curr->semncnt;
- case GETZCNT: return curr->semzcnt;
+ case GETNCNT: return count_semncnt(sma,semnum);
+ case GETZCNT: return count_semzcnt(sma,semnum);
case GETALL:
array = arg.array;
i = verify_area (VERIFY_WRITE, array, nsems*sizeof(ushort));
@@ -245,13 +463,12 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
break;
case SETVAL:
val = arg.val;
- if (val > SEMVMX || val < 0)
+ if (val > SEMVMX || val < 0)
return -ERANGE;
break;
case IPC_RMID:
- if (suser() || current->euid == ipcp->cuid ||
- current->euid == ipcp->uid) {
- freeary (id);
+ if (suser() || current->euid == ipcp->cuid || current->euid == ipcp->uid) {
+ freeary (id);
return 0;
}
return -EPERM;
@@ -276,12 +493,12 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
memcpy_fromfs (&tbuf, buf, sizeof (*buf));
break;
}
-
+
if (semary[id] == IPC_UNUSED || semary[id] == IPC_NOID)
return -EIDRM;
if (sma->sem_perm.seq != (unsigned int) semid / SEMMNI)
return -EIDRM;
-
+
switch (cmd) {
case GETALL:
if (ipcperms (ipcp, S_IRUGO))
@@ -294,18 +511,14 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
if (ipcperms (ipcp, S_IWUGO))
return -EACCES;
for (un = sma->undo; un; un = un->id_next)
- if (semnum == un->sem_num)
- un->semadj = 0;
- sma->sem_ctime = CURRENT_TIME;
+ un->semadj[semnum] = 0;
curr->semval = val;
- if (sma->eventn)
- wake_up (&sma->eventn);
- if (sma->eventz)
- wake_up (&sma->eventz);
+ sma->sem_ctime = CURRENT_TIME;
+ /* maybe some queued-up processes were waiting for this */
+ update_queue(sma);
break;
case IPC_SET:
- if (suser() || current->euid == ipcp->cuid ||
- current->euid == ipcp->uid) {
+ if (suser() || current->euid == ipcp->cuid || current->euid == ipcp->uid) {
ipcp->uid = tbuf.sem_perm.uid;
ipcp->gid = tbuf.sem_perm.gid;
ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
@@ -326,15 +539,14 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
case SETALL:
if (ipcperms (ipcp, S_IWUGO))
return -EACCES;
- for (i = 0; i < nsems; i++)
+ for (i = 0; i < nsems; i++)
sma->sem_base[i].semval = sem_io[i];
for (un = sma->undo; un; un = un->id_next)
- un->semadj = 0;
- if (sma->eventn)
- wake_up (&sma->eventn);
- if (sma->eventz)
- wake_up (&sma->eventz);
+ for (i = 0; i < nsems; i++)
+ un->semadj[i] = 0;
sma->sem_ctime = CURRENT_TIME;
+ /* maybe some queued-up processes were waiting for this */
+ update_queue(sma);
break;
default:
return -EINVAL;
@@ -344,170 +556,155 @@ int sys_semctl (int semid, int semnum, int cmd, union semun arg)
int sys_semop (int semid, struct sembuf *tsops, unsigned nsops)
{
- int i, id;
+ int i, id, size, error;
struct semid_ds *sma;
- struct sem *curr = NULL;
struct sembuf sops[SEMOPM], *sop;
struct sem_undo *un;
- int undos = 0, alter = 0, semncnt = 0, semzcnt = 0;
-
+ int undos = 0, alter = 0;
+
if (nsops < 1 || semid < 0)
return -EINVAL;
if (nsops > SEMOPM)
return -E2BIG;
- if (!tsops)
+ if (!tsops)
return -EFAULT;
if ((i = verify_area (VERIFY_READ, tsops, nsops * sizeof(*tsops))))
return i;
- memcpy_fromfs (sops, tsops, nsops * sizeof(*tsops));
+ memcpy_fromfs (sops, tsops, nsops * sizeof(*tsops));
id = (unsigned int) semid % SEMMNI;
if ((sma = semary[id]) == IPC_UNUSED || sma == IPC_NOID)
return -EINVAL;
- for (i = 0; i < nsops; i++) {
+ if (sma->sem_perm.seq != (unsigned int) semid / SEMMNI)
+ return -EIDRM;
+ for (i = 0; i < nsops; i++) {
sop = &sops[i];
- if (sop->sem_num > sma->sem_nsems)
+ if (sop->sem_num >= sma->sem_nsems)
return -EFBIG;
if (sop->sem_flg & SEM_UNDO)
undos++;
- if (sop->sem_op) {
+ if (sop->sem_op)
alter++;
- if (sop->sem_op > 0)
- semncnt ++;
- }
}
if (ipcperms(&sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
return -EACCES;
- /*
- * ensure every sop with undo gets an undo structure
- */
+ error = try_semop(sma, sops, nsops);
+ if (error < 0)
+ return error;
if (undos) {
- for (i = 0; i < nsops; i++) {
- if (!(sops[i].sem_flg & SEM_UNDO))
- continue;
- for (un = current->semundo; un; un = un->proc_next)
- if ((un->semid == semid) &&
- (un->sem_num == sops[i].sem_num))
- break;
- if (un)
- continue;
- un = (struct sem_undo *)
- kmalloc (sizeof(*un), GFP_ATOMIC);
+ /* Make sure we have an undo structure
+ * for this process and this semaphore set.
+ */
+ for (un = current->semundo; un; un = un->proc_next)
+ if (un->semid == semid)
+ break;
+ if (!un) {
+ size = sizeof(struct sem_undo) + sizeof(short)*sma->sem_nsems;
+ un = (struct sem_undo *) kmalloc(size, GFP_ATOMIC);
if (!un)
- return -ENOMEM; /* freed on exit */
+ return -ENOMEM;
+ memset(un, 0, size);
+ un->semadj = (short *) &un[1];
un->semid = semid;
- un->semadj = 0;
- un->sem_num = sops[i].sem_num;
un->proc_next = current->semundo;
current->semundo = un;
un->id_next = sma->undo;
sma->undo = un;
}
- }
-
- slept:
- if (sma->sem_perm.seq != (unsigned int) semid / SEMMNI)
- return -EIDRM;
- for (i = 0; i < nsops; i++) {
- sop = &sops[i];
- curr = &sma->sem_base[sop->sem_num];
- if (sop->sem_op + curr->semval > SEMVMX)
- return -ERANGE;
- if (!sop->sem_op && curr->semval) {
- if (sop->sem_flg & IPC_NOWAIT)
- return -EAGAIN;
- if (current->signal & ~current->blocked)
- return -EINTR;
- curr->semzcnt++;
- interruptible_sleep_on (&sma->eventz);
- curr->semzcnt--;
- goto slept;
- }
- if ((sop->sem_op + curr->semval < 0) ) {
- if (sop->sem_flg & IPC_NOWAIT)
- return -EAGAIN;
- if (current->signal & ~current->blocked)
- return -EINTR;
- curr->semncnt++;
- interruptible_sleep_on (&sma->eventn);
- curr->semncnt--;
- goto slept;
- }
- }
-
- for (i = 0; i < nsops; i++) {
- sop = &sops[i];
- curr = &sma->sem_base[sop->sem_num];
- curr->sempid = current->pid;
- if (!(curr->semval += sop->sem_op))
- semzcnt++;
- if (!(sop->sem_flg & SEM_UNDO))
- continue;
- for (un = current->semundo; un; un = un->proc_next)
- if ((un->semid == semid) &&
- (un->sem_num == sop->sem_num))
- break;
- if (!un) {
- printk ("semop : no undo for op %d\n", i);
- continue;
+ } else
+ un = NULL;
+ if (error == 0) {
+ /* the operations go through immediately */
+ error = do_semop(sma, sops, nsops, un, current->pid);
+ /* maybe some queued-up processes were waiting for this */
+ update_queue(sma);
+ return error;
+ } else {
+ /* We need to sleep on this operation, so we put the current
+ * task into the pending queue and go to sleep.
+ */
+ struct sem_queue queue;
+
+ queue.sma = sma;
+ queue.sops = sops;
+ queue.nsops = nsops;
+ queue.undo = un;
+ queue.pid = current->pid;
+ queue.status = 0;
+ insert_into_queue(sma,&queue);
+ queue.sleeper = NULL;
+ current->semsleeping = &queue;
+ interruptible_sleep_on(&queue.sleeper);
+ current->semsleeping = NULL;
+ /* When we wake up, either the operation is finished,
+ * or some kind of error happened.
+ */
+ if (!queue.prev) {
+ /* operation is finished, update_queue() removed us */
+ return queue.status;
+ } else {
+ remove_from_queue(sma,&queue);
+ return -EINTR;
}
- un->semadj -= sop->sem_op;
}
- sma->sem_otime = CURRENT_TIME;
- if (semncnt && sma->eventn)
- wake_up(&sma->eventn);
- if (semzcnt && sma->eventz)
- wake_up(&sma->eventz);
- return curr->semval;
}
/*
* add semadj values to semaphores, free undo structures.
* undo structures are not freed when semaphore arrays are destroyed
* so some of them may be out of date.
+ * IMPLEMENTATION NOTE: There is some confusion over whether the
+ * set of adjustments that needs to be done should be done in an atomic
+ * manner or not. That is, if we are attempting to decrement the semval
+ * should we queue up and wait until we can do so legally?
+ * The original implementation attempted to do this (queue and wait).
+ * The current implementation does not do so. The POSIX standard
+ * and SVID should be consulted to determine what behavior is mandated.
*/
void sem_exit (void)
{
+ struct sem_queue *q;
struct sem_undo *u, *un = NULL, **up, **unp;
struct semid_ds *sma;
- struct sem *sem = NULL;
-
+ int nsems, i;
+
+ /* If the current process was sleeping for a semaphore,
+ * remove it from the queue.
+ */
+ if ((q = current->semsleeping)) {
+ if (q->prev)
+ remove_from_queue(q->sma,q);
+ current->semsleeping = NULL;
+ }
+
for (up = &current->semundo; (u = *up); *up = u->proc_next, kfree(u)) {
+ if (u->semid == -1)
+ continue;
sma = semary[(unsigned int) u->semid % SEMMNI];
- if (sma == IPC_UNUSED || sma == IPC_NOID)
+ if (sma == IPC_UNUSED || sma == IPC_NOID)
continue;
if (sma->sem_perm.seq != (unsigned int) u->semid / SEMMNI)
continue;
+ /* remove u from the sma->undo list */
for (unp = &sma->undo; (un = *unp); unp = &un->id_next) {
- if (u == un)
+ if (u == un)
goto found;
}
printk ("sem_exit undo list error id=%d\n", u->semid);
break;
found:
*unp = un->id_next;
- if (!un->semadj)
- continue;
- while (1) {
- if (sma->sem_perm.seq != (unsigned int) un->semid / SEMMNI)
- break;
- sem = &sma->sem_base[un->sem_num];
- if (sem->semval + un->semadj >= 0) {
- sem->semval += un->semadj;
- sem->sempid = current->pid;
- sma->sem_otime = CURRENT_TIME;
- if (un->semadj > 0 && sma->eventn)
- wake_up (&sma->eventn);
- if (!sem->semval && sma->eventz)
- wake_up (&sma->eventz);
- break;
- }
- if (current->signal & ~current->blocked)
- break;
- sem->semncnt++;
- interruptible_sleep_on (&sma->eventn);
- sem->semncnt--;
+ /* perform adjustments registered in u */
+ nsems = sma->sem_nsems;
+ for (i = 0; i < nsems; i++) {
+ struct sem * sem = &sma->sem_base[i];
+ sem->semval += u->semadj[i];
+ if (sem->semval < 0)
+ sem->semval = 0; /* shouldn't happen */
+ sem->sempid = current->pid;
}
+ sma->sem_otime = CURRENT_TIME;
+ /* maybe some queued-up processes were waiting for this */
+ update_queue(sma);
}
current->semundo = NULL;
- return;
}
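
The heart of the sem.c rewrite is the pending queue: a doubly linked FIFO
whose tail pointer (sem_pending_last) holds the address of the final next
link, so insert_into_queue() and remove_from_queue() both run in O(1) and
removal works from any position. Here is a standalone sketch of that
pointer-to-pointer technique; the node/fifo names are illustrative, not
kernel identifiers:

#include <stdio.h>

struct node {
	struct node *next;
	struct node **prev;	/* the next slot (or head) pointing at us */
	int id;
};

struct fifo {
	struct node *head;
	struct node **tail;	/* &head while empty, else &last->next */
};

static void insert_into_fifo(struct fifo *f, struct node *q)
{
	*(q->prev = f->tail) = q;	/* old tail slot now points at q */
	*(f->tail = &q->next) = NULL;	/* q is last; terminate the list */
}

static void remove_from_fifo(struct fifo *f, struct node *q)
{
	*(q->prev) = q->next;		/* predecessor skips over q */
	if (q->next)
		q->next->prev = q->prev;
	else				/* q was last; pull the tail back */
		f->tail = q->prev;
	q->prev = NULL;			/* mark as removed, as sem.c does */
}

int main(void)
{
	struct fifo f = { NULL, &f.head };
	struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct node *p;

	insert_into_fifo(&f, &a);
	insert_into_fifo(&f, &b);
	insert_into_fifo(&f, &c);
	remove_from_fifo(&f, &b);	/* O(1) removal from the middle */
	for (p = f.head; p; p = p->next)
		printf("%d\n", p->id);	/* 1 then 3: FIFO order preserved */
	return 0;
}

The q->prev = NULL marker is what sys_semop() tests after waking: a NULL
prev means update_queue() already completed the operations and dequeued
us; anything else means we were interrupted and must dequeue ourselves.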
diff --git a/ipc/shm.c b/ipc/shm.c
index 562539a90..9dc89ec22 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -6,7 +6,6 @@
*/
#include <linux/errno.h>
-#include <asm/segment.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/ipc.h>
@@ -14,15 +13,18 @@
#include <linux/stat.h>
#include <linux/malloc.h>
+#include <asm/segment.h>
+#include <asm/pgtable.h>
+
extern int ipcperms (struct ipc_perm *ipcp, short shmflg);
-extern unsigned int get_swap_page (void);
+extern unsigned long get_swap_page (void);
static int findkey (key_t key);
static int newseg (key_t key, int shmflg, int size);
-static int shm_map (struct vm_area_struct *shmd, int remap);
+static int shm_map (struct vm_area_struct *shmd);
static void killseg (int id);
static void shm_open (struct vm_area_struct *shmd);
static void shm_close (struct vm_area_struct *shmd);
-static unsigned long shm_swap_in (struct vm_area_struct *, unsigned long);
+static pte_t shm_swap_in(struct vm_area_struct *, unsigned long, unsigned long);
static int shm_tot = 0; /* total number of shared memory pages */
static int shm_rss = 0; /* number of shared memory pages that are in memory */
@@ -161,7 +163,6 @@ static void killseg (int id)
{
struct shmid_ds *shp;
int i, numpages;
- ulong page;
shp = shm_segs[id];
if (shp == IPC_NOID || shp == IPC_UNUSED) {
@@ -180,13 +181,15 @@ static void killseg (int id)
}
numpages = shp->shm_npages;
for (i = 0; i < numpages ; i++) {
- if (!(page = shp->shm_pages[i]))
+ pte_t pte;
+ pte_val(pte) = shp->shm_pages[i];
+ if (pte_none(pte))
continue;
- if (page & PAGE_PRESENT) {
- free_page (page & PAGE_MASK);
+ if (pte_present(pte)) {
+ free_page (pte_page(pte));
shm_rss--;
} else {
- swap_free (page);
+ swap_free(pte_val(pte));
shm_swp--;
}
}
@@ -351,7 +354,7 @@ int sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
* shmd->vm_start virt addr of attach, multiple of SHMLBA
* shmd->vm_end multiple of SHMLBA
* shmd->vm_next next attach for task
- * shmd->vm_share next attach for segment
+ * shmd->vm_next_share next attach for segment
* shmd->vm_offset offset into segment
* shmd->vm_pte signature for this attach
*/
@@ -359,78 +362,84 @@ int sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
static struct vm_operations_struct shm_vm_ops = {
shm_open, /* open */
shm_close, /* close */
+ NULL, /* unmap */
+ NULL, /* protect */
+ NULL, /* sync */
+ NULL, /* advise */
NULL, /* nopage (done with swapin) */
NULL, /* wppage */
- NULL, /* share */
- NULL, /* unmap */
NULL, /* swapout (hardcoded right now) */
shm_swap_in /* swapin */
};
+/* Insert shmd into the circular list shp->attaches */
+static inline void insert_attach (struct shmid_ds * shp, struct vm_area_struct * shmd)
+{
+ struct vm_area_struct * attaches;
+
+ if ((attaches = shp->attaches)) {
+ shmd->vm_next_share = attaches;
+ shmd->vm_prev_share = attaches->vm_prev_share;
+ shmd->vm_prev_share->vm_next_share = shmd;
+ attaches->vm_prev_share = shmd;
+ } else
+ shp->attaches = shmd->vm_next_share = shmd->vm_prev_share = shmd;
+}
+
+/* Remove shmd from circular list shp->attaches */
+static inline void remove_attach (struct shmid_ds * shp, struct vm_area_struct * shmd)
+{
+ if (shmd->vm_next_share == shmd) {
+ if (shp->attaches != shmd) {
+ printk("shm_close: shm segment (id=%ld) attach list inconsistent\n",
+ (shmd->vm_pte >> SHM_ID_SHIFT) & SHM_ID_MASK);
+ printk("shm_close: %d %08lx-%08lx %c%c%c%c %08lx %08lx\n",
+ shmd->vm_task->pid, shmd->vm_start, shmd->vm_end,
+ shmd->vm_flags & VM_READ ? 'r' : '-',
+ shmd->vm_flags & VM_WRITE ? 'w' : '-',
+ shmd->vm_flags & VM_EXEC ? 'x' : '-',
+ shmd->vm_flags & VM_MAYSHARE ? 's' : 'p',
+ shmd->vm_offset, shmd->vm_pte);
+ }
+ shp->attaches = NULL;
+ } else {
+ if (shp->attaches == shmd)
+ shp->attaches = shmd->vm_next_share;
+ shmd->vm_prev_share->vm_next_share = shmd->vm_next_share;
+ shmd->vm_next_share->vm_prev_share = shmd->vm_prev_share;
+ }
+}
+
/*
- * check range is unmapped, ensure page tables exist
+ * ensure page tables exist
* mark page table entries with shm_sgn.
- * if remap != 0 the range is remapped.
*/
-static int shm_map (struct vm_area_struct *shmd, int remap)
+static int shm_map (struct vm_area_struct *shmd)
{
- unsigned long *page_table;
+ pgd_t *page_dir;
+ pmd_t *page_middle;
+ pte_t *page_table;
unsigned long tmp, shm_sgn;
- unsigned long page_dir = shmd->vm_task->tss.cr3;
-
- /* check that the range is unmapped */
- if (!remap)
- for (tmp = shmd->vm_start; tmp < shmd->vm_end; tmp += PAGE_SIZE) {
- page_table = PAGE_DIR_OFFSET(page_dir,tmp);
- if (*page_table & PAGE_PRESENT) {
- page_table = (ulong *) (PAGE_MASK & *page_table);
- page_table += ((tmp >> PAGE_SHIFT) & (PTRS_PER_PAGE-1));
- if (*page_table) {
- /* printk("shmat() -> EINVAL because address 0x%lx is already mapped.\n",tmp); */
- return -EINVAL;
- }
- }
- }
/* clear old mappings */
do_munmap(shmd->vm_start, shmd->vm_end - shmd->vm_start);
/* add new mapping */
insert_vm_struct(current, shmd);
- merge_segments(current->mm->mmap);
-
- /* check that the range has page_tables */
- for (tmp = shmd->vm_start; tmp < shmd->vm_end; tmp += PAGE_SIZE) {
- page_table = PAGE_DIR_OFFSET(page_dir,tmp);
- if (*page_table & PAGE_PRESENT) {
- page_table = (ulong *) (PAGE_MASK & *page_table);
- page_table += ((tmp >> PAGE_SHIFT) & (PTRS_PER_PAGE-1));
- if (*page_table) {
- if (*page_table & PAGE_PRESENT) {
- --current->mm->rss;
- free_page (*page_table & PAGE_MASK);
- }
- else
- swap_free (*page_table);
- *page_table = 0;
- }
- } else {
- unsigned long new_pt;
- if (!(new_pt = get_free_page(GFP_KERNEL)))
- return -ENOMEM;
- *page_table = new_pt | PAGE_TABLE;
- tmp |= ((PAGE_SIZE << 10) - PAGE_SIZE);
- }
- }
+ merge_segments(current, shmd->vm_start, shmd->vm_end);
/* map page range */
shm_sgn = shmd->vm_pte + ((shmd->vm_offset >> PAGE_SHIFT) << SHM_IDX_SHIFT);
for (tmp = shmd->vm_start; tmp < shmd->vm_end; tmp += PAGE_SIZE,
shm_sgn += (1 << SHM_IDX_SHIFT)) {
- page_table = PAGE_DIR_OFFSET(page_dir,tmp);
- page_table = (ulong *) (PAGE_MASK & *page_table);
- page_table += (tmp >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
- *page_table = shm_sgn;
+ page_dir = pgd_offset(shmd->vm_task,tmp);
+ page_middle = pmd_alloc(page_dir,tmp);
+ if (!page_middle)
+ return -ENOMEM;
+ page_table = pte_alloc(page_middle,tmp);
+ if (!page_table)
+ return -ENOMEM;
+ pte_val(*page_table) = shm_sgn;
}
invalidate();
return 0;
@@ -438,7 +447,6 @@ static int shm_map (struct vm_area_struct *shmd, int remap)
/*
* Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
- * raddr is needed to return addresses above 2Gig.
*/
int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
{
@@ -453,12 +461,6 @@ int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
return -EINVAL;
}
- if (raddr) {
- err = verify_area(VERIFY_WRITE, raddr, sizeof(ulong));
- if (err)
- return err;
- }
-
shp = shm_segs[id = (unsigned int) shmid % SHMMNI];
if (shp == IPC_UNUSED || shp == IPC_NOID) {
/* printk("shmat() -> EINVAL because shmid = %d is invalid\n",shmid); */
@@ -468,7 +470,7 @@ int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
if (!(addr = (ulong) shmaddr)) {
if (shmflg & SHM_REMAP)
return -EINVAL;
- if (!(addr = get_unmapped_area(shp->shm_segsz)))
+ if (!(addr = get_unmapped_area(0, shp->shm_segsz)))
return -ENOMEM;
} else if (addr & (SHMLBA-1)) {
if (shmflg & SHM_RND)
@@ -481,12 +483,11 @@ int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
return -EINVAL;
}
if (!(shmflg & SHM_REMAP))
- for (shmd = current->mm->mmap; shmd; shmd = shmd->vm_next)
- if (!(addr >= shmd->vm_end || addr + shp->shm_segsz <= shmd->vm_start)) {
- /* printk("shmat() -> EINVAL because the interval [0x%lx,0x%lx) intersects an already mapped interval [0x%lx,0x%lx).\n",
- addr, addr + shp->shm_segsz, shmd->vm_start, shmd->vm_end); */
- return -EINVAL;
- }
+ if ((shmd = find_vma_intersection(current, addr, addr + shp->shm_segsz))) {
+ /* printk("shmat() -> EINVAL because the interval [0x%lx,0x%lx) intersects an already mapped interval [0x%lx,0x%lx).\n",
+ addr, addr + shp->shm_segsz, shmd->vm_start, shmd->vm_end); */
+ return -EINVAL;
+ }
if (ipcperms(&shp->shm_perm, shmflg & SHM_RDONLY ? S_IRUGO : S_IRUGO|S_IWUGO))
return -EACCES;
@@ -501,8 +502,7 @@ int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
return -EIDRM;
}
- shmd->vm_pte = (SHM_SWP_TYPE << 1) | (id << SHM_ID_SHIFT) |
- (shmflg & SHM_RDONLY ? SHM_READ_ONLY : 0);
+ shmd->vm_pte = (SHM_SWP_TYPE << 1) | (id << SHM_ID_SHIFT);
shmd->vm_start = addr;
shmd->vm_end = addr + shp->shm_npages * PAGE_SIZE;
shmd->vm_task = current;
@@ -510,27 +510,25 @@ int sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
shmd->vm_flags = VM_SHM | VM_MAYSHARE | VM_SHARED
| VM_MAYREAD | VM_MAYEXEC | VM_READ | VM_EXEC
| ((shmflg & SHM_RDONLY) ? 0 : VM_MAYWRITE | VM_WRITE);
- shmd->vm_share = NULL;
+ shmd->vm_next_share = shmd->vm_prev_share = NULL;
shmd->vm_inode = NULL;
shmd->vm_offset = 0;
shmd->vm_ops = &shm_vm_ops;
shp->shm_nattch++; /* prevent destruction */
- if ((err = shm_map (shmd, shmflg & SHM_REMAP))) {
+ if ((err = shm_map (shmd))) {
if (--shp->shm_nattch <= 0 && shp->shm_perm.mode & SHM_DEST)
killseg(id);
kfree(shmd);
return err;
}
- shmd->vm_share = shp->attaches;
- shp->attaches = shmd;
+ insert_attach(shp,shmd); /* insert shmd into shp->attaches */
+
shp->shm_lpid = current->pid;
shp->shm_atime = CURRENT_TIME;
- if (!raddr)
- return addr;
- put_fs_long (addr, raddr);
+ *raddr = addr;
return 0;
}
@@ -546,8 +544,7 @@ static void shm_open (struct vm_area_struct *shmd)
printk("shm_open: unused id=%d PANIC\n", id);
return;
}
- shmd->vm_share = shp->attaches;
- shp->attaches = shmd;
+ insert_attach(shp,shmd); /* insert shmd into shp->attaches */
shp->shm_nattch++;
shp->shm_atime = CURRENT_TIME;
shp->shm_lpid = current->pid;
@@ -561,7 +558,6 @@ static void shm_open (struct vm_area_struct *shmd)
*/
static void shm_close (struct vm_area_struct *shmd)
{
- struct vm_area_struct **shmdp;
struct shmid_ds *shp;
int id;
@@ -570,21 +566,7 @@ static void shm_close (struct vm_area_struct *shmd)
/* remove from the list of attaches of the shm segment */
id = (shmd->vm_pte >> SHM_ID_SHIFT) & SHM_ID_MASK;
shp = shm_segs[id];
- for (shmdp = &shp->attaches; *shmdp; shmdp = &(*shmdp)->vm_share)
- if (*shmdp == shmd) {
- *shmdp = shmd->vm_share;
- goto found;
- }
- printk("shm_close: shm segment (id=%d) attach list inconsistent\n",id);
- printk("shm_close: %d %08lx-%08lx %c%c%c%c %08lx %08lx\n",
- shmd->vm_task->pid, shmd->vm_start, shmd->vm_end,
- shmd->vm_flags & VM_READ ? 'r' : '-',
- shmd->vm_flags & VM_WRITE ? 'w' : '-',
- shmd->vm_flags & VM_EXEC ? 'x' : '-',
- shmd->vm_flags & VM_SHARED ? 's' : 'p',
- shmd->vm_offset, shmd->vm_pte);
-
- found:
+ remove_attach(shp,shmd); /* remove from shp->attaches */
shp->shm_lpid = current->pid;
shp->shm_dtime = CURRENT_TIME;
if (--shp->shm_nattch <= 0 && shp->shm_perm.mode & SHM_DEST)
@@ -611,73 +593,86 @@ int sys_shmdt (char *shmaddr)
/*
* page not present ... go through shm_pages
*/
-static unsigned long shm_swap_in(struct vm_area_struct * vma, unsigned long code)
+static pte_t shm_swap_in(struct vm_area_struct * shmd, unsigned long offset, unsigned long code)
{
- unsigned long page;
+ pte_t pte;
struct shmid_ds *shp;
unsigned int id, idx;
id = (code >> SHM_ID_SHIFT) & SHM_ID_MASK;
+ if (id != ((shmd->vm_pte >> SHM_ID_SHIFT) & SHM_ID_MASK)) {
+ printk ("shm_swap_in: code id = %d and shmd id = %ld differ\n",
+ id, (shmd->vm_pte >> SHM_ID_SHIFT) & SHM_ID_MASK);
+ return BAD_PAGE;
+ }
if (id > max_shmid) {
- printk ("shm_no_page: id=%d too big. proc mem corrupted\n", id);
- return BAD_PAGE | PAGE_SHARED;
+ printk ("shm_swap_in: id=%d too big. proc mem corrupted\n", id);
+ return BAD_PAGE;
}
shp = shm_segs[id];
if (shp == IPC_UNUSED || shp == IPC_NOID) {
- printk ("shm_no_page: id=%d invalid. Race.\n", id);
- return BAD_PAGE | PAGE_SHARED;
+ printk ("shm_swap_in: id=%d invalid. Race.\n", id);
+ return BAD_PAGE;
}
idx = (code >> SHM_IDX_SHIFT) & SHM_IDX_MASK;
+ if (idx != (offset >> PAGE_SHIFT)) {
+ printk ("shm_swap_in: code idx = %u and shmd idx = %lu differ\n",
+ idx, offset >> PAGE_SHIFT);
+ return BAD_PAGE;
+ }
if (idx >= shp->shm_npages) {
- printk ("shm_no_page : too large page index. id=%d\n", id);
- return BAD_PAGE | PAGE_SHARED;
+ printk ("shm_swap_in : too large page index. id=%d\n", id);
+ return BAD_PAGE;
}
- if (!(shp->shm_pages[idx] & PAGE_PRESENT)) {
- if(!(page = get_free_page(GFP_KERNEL))) {
+ pte_val(pte) = shp->shm_pages[idx];
+ if (!pte_present(pte)) {
+ unsigned long page = get_free_page(GFP_KERNEL);
+ if (!page) {
oom(current);
- return BAD_PAGE | PAGE_SHARED;
+ return BAD_PAGE;
}
- if (shp->shm_pages[idx] & PAGE_PRESENT) {
- free_page (page);
+ pte_val(pte) = shp->shm_pages[idx];
+ if (pte_present(pte)) {
+ free_page (page); /* doesn't sleep */
goto done;
}
- if (shp->shm_pages[idx]) {
- read_swap_page (shp->shm_pages[idx], (char *) page);
- if (shp->shm_pages[idx] & PAGE_PRESENT) {
- free_page (page);
+ if (!pte_none(pte)) {
+ read_swap_page(pte_val(pte), (char *) page);
+ pte_val(pte) = shp->shm_pages[idx];
+ if (pte_present(pte)) {
+ free_page (page); /* doesn't sleep */
goto done;
}
- swap_free (shp->shm_pages[idx]);
+ swap_free(pte_val(pte));
shm_swp--;
}
shm_rss++;
- shp->shm_pages[idx] = page | (PAGE_SHARED | PAGE_DIRTY);
+ pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
+ shp->shm_pages[idx] = pte_val(pte);
} else
--current->mm->maj_flt; /* was incremented in do_no_page */
-done:
+done: /* pte_val(pte) == shp->shm_pages[idx] */
current->mm->min_flt++;
- page = shp->shm_pages[idx];
- if (code & SHM_READ_ONLY) /* write-protect */
- page &= ~PAGE_RW;
- mem_map[MAP_NR(page)]++;
- return page;
+ mem_map[MAP_NR(pte_page(pte))]++;
+ return pte_modify(pte, shmd->vm_page_prot);
}
/*
- * Goes through counter = (shm_rss << prio) present shm pages.
+ * Goes through counter = (shm_rss >> prio) present shm pages.
*/
static unsigned long swap_id = 0; /* currently being swapped */
static unsigned long swap_idx = 0; /* next to swap */
int shm_swap (int prio)
{
- unsigned long page;
+ pte_t page;
struct shmid_ds *shp;
struct vm_area_struct *shmd;
- unsigned int swap_nr;
- unsigned long id, idx, invalid = 0;
+ unsigned long swap_nr;
+ unsigned long id, idx;
+ int loop = 0, invalid = 0;
int counter;
counter = shm_rss >> prio;
@@ -687,35 +682,43 @@ int shm_swap (int prio)
check_id:
shp = shm_segs[swap_id];
if (shp == IPC_UNUSED || shp == IPC_NOID || shp->shm_perm.mode & SHM_LOCKED ) {
+ next_id:
swap_idx = 0;
- if (++swap_id > max_shmid)
+ if (++swap_id > max_shmid) {
+ if (loop)
+ goto failed;
+ loop = 1;
swap_id = 0;
+ }
goto check_id;
}
id = swap_id;
check_table:
idx = swap_idx++;
- if (idx >= shp->shm_npages) {
- swap_idx = 0;
- if (++swap_id > max_shmid)
- swap_id = 0;
- goto check_id;
- }
+ if (idx >= shp->shm_npages)
+ goto next_id;
- page = shp->shm_pages[idx];
- if (!(page & PAGE_PRESENT))
+ pte_val(page) = shp->shm_pages[idx];
+ if (!pte_present(page))
goto check_table;
swap_attempts++;
if (--counter < 0) { /* failed */
+ failed:
if (invalid)
invalidate();
swap_free (swap_nr);
return 0;
}
- for (shmd = shp->attaches; shmd; shmd = shmd->vm_share) {
- unsigned long tmp, *pte;
+ if (shp->attaches)
+ for (shmd = shp->attaches; ; ) {
+ do {
+ pgd_t *page_dir;
+ pmd_t *page_middle;
+ pte_t *page_table, pte;
+ unsigned long tmp;
+
if ((shmd->vm_pte >> SHM_ID_SHIFT & SHM_ID_MASK) != id) {
printk ("shm_swap: id=%ld does not match shmd->vm_pte.id=%ld\n", id, shmd->vm_pte >> SHM_ID_SHIFT & SHM_ID_MASK);
continue;
@@ -723,37 +726,48 @@ int shm_swap (int prio)
tmp = shmd->vm_start + (idx << PAGE_SHIFT) - shmd->vm_offset;
if (!(tmp >= shmd->vm_start && tmp < shmd->vm_end))
continue;
- pte = PAGE_DIR_OFFSET(shmd->vm_task->tss.cr3,tmp);
- if (!(*pte & PAGE_PRESENT)) {
+ page_dir = pgd_offset(shmd->vm_task,tmp);
+ if (pgd_none(*page_dir) || pgd_bad(*page_dir)) {
printk("shm_swap: bad pgtbl! id=%ld start=%lx idx=%ld\n",
id, shmd->vm_start, idx);
- *pte = 0;
+ pgd_clear(page_dir);
continue;
}
- pte = (ulong *) (PAGE_MASK & *pte);
- pte += ((tmp >> PAGE_SHIFT) & (PTRS_PER_PAGE-1));
- tmp = *pte;
- if (!(tmp & PAGE_PRESENT))
+ page_middle = pmd_offset(page_dir,tmp);
+ if (pmd_none(*page_middle) || pmd_bad(*page_middle)) {
+ printk("shm_swap: bad pgmid! id=%ld start=%lx idx=%ld\n",
+ id, shmd->vm_start, idx);
+ pmd_clear(page_middle);
continue;
- if (tmp & PAGE_ACCESSED) {
- *pte &= ~PAGE_ACCESSED;
+ }
+ page_table = pte_offset(page_middle,tmp);
+ pte = *page_table;
+ if (!pte_present(pte))
+ continue;
+ if (pte_young(pte)) {
+ *page_table = pte_mkold(pte);
continue;
}
- tmp = shmd->vm_pte | idx << SHM_IDX_SHIFT;
- *pte = tmp;
- mem_map[MAP_NR(page)]--;
- shmd->vm_task->mm->rss--;
+ if (pte_page(pte) != pte_page(page))
+ printk("shm_swap_out: page and pte mismatch\n");
+ pte_val(*page_table) = shmd->vm_pte | idx << SHM_IDX_SHIFT;
+ mem_map[MAP_NR(pte_page(pte))]--;
+ if (shmd->vm_task->mm->rss > 0)
+ shmd->vm_task->mm->rss--;
invalid++;
+ /* continue looping through circular list */
+ } while (0);
+ if ((shmd = shmd->vm_next_share) == shp->attaches)
+ break;
}
- if (mem_map[MAP_NR(page)] != 1)
+ if (mem_map[MAP_NR(pte_page(page))] != 1)
goto check_table;
- page &= PAGE_MASK;
shp->shm_pages[idx] = swap_nr;
if (invalid)
invalidate();
- write_swap_page (swap_nr, (char *) page);
- free_page (page);
+ write_swap_page (swap_nr, (char *) pte_page(page));
+ free_page(pte_page(page));
swap_successes++;
shm_swp++;
shm_rss--;
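
shm.c now chains every attach of a segment into a circular doubly linked
list through vm_next_share/vm_prev_share, with shp->attaches as the entry
point: insert_attach() places new descriptors just before the entry point
(i.e. at the ring's tail) and remove_attach() collapses the ring when the
last attach disappears. A minimal sketch of the same ring discipline,
with illustrative ring/link names rather than kernel identifiers:

#include <stdio.h>

struct link {
	struct link *next, *prev;
	int id;
};

/* Insert l into the ring whose entry point is *head (NULL if empty). */
static void insert_ring(struct link **head, struct link *l)
{
	struct link *first = *head;

	if (first) {
		l->next = first;		/* new element sits just */
		l->prev = first->prev;		/* before the entry point */
		l->prev->next = l;
		first->prev = l;
	} else
		*head = l->next = l->prev = l;	/* ring of one */
}

/* Remove l from the ring; clear *head once the ring is empty. */
static void remove_ring(struct link **head, struct link *l)
{
	if (l->next == l)			/* l was the only element */
		*head = NULL;
	else {
		if (*head == l)			/* keep the entry point valid */
			*head = l->next;
		l->prev->next = l->next;
		l->next->prev = l->prev;
	}
}

int main(void)
{
	struct link a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct link *head = NULL, *p;

	insert_ring(&head, &a);
	insert_ring(&head, &b);
	insert_ring(&head, &c);
	remove_ring(&head, &a);		/* entry point moves on to b */
	p = head;
	do
		printf("%d\n", p->id);	/* prints 2 then 3 */
	while ((p = p->next) != head);
	return 0;
}

The consistency printk in remove_attach() covers exactly the state this
sketch cannot reach: a one-element ring whose element is not the entry
point.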
diff --git a/ipc/util.c b/ipc/util.c
index fb0e6970d..87c6c28ea 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -7,13 +7,14 @@
#include <linux/errno.h>
#include <asm/segment.h>
#include <linux/sched.h>
+#include <linux/mm.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
void ipc_init (void);
-asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr);
+asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth);
#ifdef CONFIG_SYSVIPC
@@ -62,9 +63,13 @@ int ipcperms (struct ipc_perm *ipcp, short flag)
return 0;
}
-asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr)
+asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth)
{
-
+ int version;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
if (call <= SEMCTL)
switch (call) {
case SEMOP:
@@ -89,17 +94,21 @@ asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr)
case MSGSND:
return sys_msgsnd (first, (struct msgbuf *) ptr,
second, third);
- case MSGRCV: {
- struct ipc_kludge tmp;
- int err;
- if (!ptr)
- return -EINVAL;
- if ((err = verify_area (VERIFY_READ, ptr, sizeof(tmp))))
- return err;
- memcpy_fromfs (&tmp,(struct ipc_kludge *) ptr,
- sizeof (tmp));
- return sys_msgrcv (first, tmp.msgp, second, tmp.msgtyp,
- third);
+ case MSGRCV:
+ switch (version) {
+ case 0: {
+ struct ipc_kludge tmp;
+ int err;
+ if (!ptr)
+ return -EINVAL;
+ if ((err = verify_area (VERIFY_READ, ptr, sizeof(tmp))))
+ return err;
+ memcpy_fromfs (&tmp,(struct ipc_kludge *) ptr,
+ sizeof (tmp));
+ return sys_msgrcv (first, tmp.msgp, second, tmp.msgtyp, third);
+ }
+ case 1: default:
+ return sys_msgrcv (first, (struct msgbuf *) ptr, second, fifth, third);
}
case MSGGET:
return sys_msgget ((key_t) first, second);
@@ -111,8 +120,23 @@ asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr)
if (call <= SHMCTL)
switch (call) {
case SHMAT:
- return sys_shmat (first, (char *) ptr, second,
- (ulong *) third);
+ switch (version) {
+ case 0: default: {
+ ulong raddr;
+ int err;
+ if ((err = verify_area(VERIFY_WRITE, (ulong*) third, sizeof(ulong))))
+ return err;
+ err = sys_shmat (first, (char *) ptr, second, &raddr);
+ if (err)
+ return err;
+ put_fs_long (raddr, (ulong *) third);
+ return 0;
+ }
+ case 1: /* iBCS2 emulator entry point */
+ if (get_fs() != get_ds())
+ return -EINVAL;
+ return sys_shmat (first, (char *) ptr, second, (ulong *) third);
+ }
case SHMDT:
return sys_shmdt ((char *)ptr);
case SHMGET:
@@ -127,7 +151,7 @@ asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr)
#else /* not CONFIG_SYSVIPC */
-asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr)
+asmlinkage int sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth)
{
return -ENOSYS;
}
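
The util.c side of the import threads a fifth argument into sys_ipc() and
packs a version number into the high 16 bits of the call word, so old
binaries (version 0) keep the ipc_kludge path for MSGRCV and the
indirect-pointer convention for SHMAT, while new callers select the
direct entry points. A sketch of the encoding; IPCCALL is an illustrative
helper mirroring the decode above, not a macro from this tree:

#include <stdio.h>

#define IPCCALL(version, op)	(((version) << 16) | (op))

int main(void)
{
	unsigned int call = IPCCALL(1, 12);	/* say, a "new style" MSGRCV */
	unsigned int version = call >> 16;	/* hack for backward compat */

	call &= 0xffff;				/* low 16 bits: the operation */
	printf("version=%u op=%u\n", version, call);	/* version=1 op=12 */
	return 0;
}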