-rw-r--r--  arch/i386/config.in | 4
-rw-r--r--  arch/i386/kernel/apm.c | 2
-rw-r--r--  arch/i386/lib/Makefile | 4
-rw-r--r--  arch/i386/lib/dec_and_lock.c | 40
-rw-r--r--  arch/mips/kernel/syscall.c | 7
-rw-r--r--  arch/sparc/kernel/signal.c | 3
-rw-r--r--  arch/sparc64/kernel/signal.c | 3
-rw-r--r--  arch/sparc64/kernel/signal32.c | 3
-rw-r--r--  drivers/atm/ambassador.c | 14
-rw-r--r--  drivers/atm/eni.c | 24
-rw-r--r--  drivers/atm/eni.h | 1
-rw-r--r--  drivers/atm/fore200e.c | 4
-rw-r--r--  drivers/atm/horizon.c | 3
-rw-r--r--  drivers/atm/idt77105.c | 10
-rw-r--r--  drivers/atm/nicstar.c | 218
-rw-r--r--  drivers/atm/nicstar.h | 17
-rw-r--r--  drivers/block/lvm.c | 2
-rw-r--r--  drivers/block/md.c | 4
-rw-r--r--  drivers/block/raid0.c | 8
-rw-r--r--  drivers/block/raid1.c | 1
-rw-r--r--  drivers/block/raid5.c | 2
-rw-r--r--  drivers/ide/piix.c | 2
-rw-r--r--  drivers/net/3c509.c | 4
-rw-r--r--  drivers/net/3c515.c | 7
-rw-r--r--  drivers/scsi/NCR5380.c | 2
-rw-r--r--  drivers/scsi/atp870u.c | 1
-rw-r--r--  drivers/scsi/seagate.c | 2
-rw-r--r--  drivers/sound/os.h | 4
-rw-r--r--  fs/binfmt_aout.c | 30
-rw-r--r--  fs/dcache.c | 86
-rw-r--r--  fs/file.c | 18
-rw-r--r--  fs/inode.c | 102
-rw-r--r--  fs/namei.c | 18
-rw-r--r--  fs/open.c | 1
-rw-r--r--  fs/proc/proc_misc.c | 20
-rw-r--r--  include/linux/dcache.h | 11
-rw-r--r--  include/linux/fs.h | 5
-rw-r--r--  include/linux/slab.h | 16
-rw-r--r--  include/linux/spinlock.h | 9
-rw-r--r--  include/linux/sunrpc/sched.h | 40
-rw-r--r--  include/linux/sunrpc/xprt.h | 8
-rw-r--r--  init/main.c | 1
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/ksyms.c | 3
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  lib/Makefile | 4
-rw-r--r--  lib/dec_and_lock.c | 37
-rw-r--r--  mm/slab.c | 2942
-rw-r--r--  net/atm/Makefile | 2
-rw-r--r--  net/atm/clip.c | 28
-rw-r--r--  net/atm/common.c | 10
-rw-r--r--  net/atm/common.h | 1
-rw-r--r--  net/atm/ipcommon.c | 35
-rw-r--r--  net/atm/ipcommon.h | 4
-rw-r--r--  net/atm/lec.c | 26
-rw-r--r--  net/atm/mpc.c | 2
-rw-r--r--  net/atm/proc.c | 9
-rw-r--r--  net/atm/signaling.c | 27
-rw-r--r--  net/atm/signaling.h | 8
-rw-r--r--  net/atm/svc.c | 13
-rw-r--r--  net/ipv4/ip_gre.c | 10
-rw-r--r--  net/ipv4/ipip.c | 12
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c | 28
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c | 5
-rw-r--r--  net/ipv4/netfilter/ip_fw_compat.c | 3
-rw-r--r--  net/ipv4/netfilter/ip_nat_core.c | 20
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c | 3
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c | 3
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_MIRROR.c | 43
-rw-r--r--  net/ipv4/netfilter/ipt_mac.c | 6
-rw-r--r--  net/ipv6/sit.c | 12
-rw-r--r--  net/sunrpc/clnt.c | 11
-rw-r--r--  net/sunrpc/sched.c | 138
-rw-r--r--  net/sunrpc/svcsock.c | 31
-rw-r--r--  net/sunrpc/xprt.c | 78
77 files changed, 2302 insertions, 2024 deletions
diff --git a/arch/i386/config.in b/arch/i386/config.in
index 7344e802c..a2252d7af 100644
--- a/arch/i386/config.in
+++ b/arch/i386/config.in
@@ -34,6 +34,7 @@ choice 'Processor family' \
# Define implied options from the CPU selection here
#
if [ "$CONFIG_M386" = "y" ]; then
+ define_bool CONFIG_X86_CMPXCHG n
define_int CONFIG_X86_L1_CACHE_BYTES 16
else
define_bool CONFIG_X86_WP_WORKS_OK y
@@ -139,6 +140,9 @@ if [ "$CONFIG_SMP" != "y" ]; then
define_bool CONFIG_X86_LOCAL_APIC y
fi
fi
+if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
+ define_bool CONFIG_HAVE_DEC_LOCK y
+fi
endmenu
mainmenu_option next_comment
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index c78a8f512..6a4eedf72 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -660,6 +660,7 @@ static void apm_power_off(void)
#endif
}
+#ifdef CONFIG_APM_DO_ENABLE
static int apm_enable_power_management(int enable)
{
u32 eax;
@@ -675,6 +676,7 @@ static int apm_enable_power_management(int enable)
apm_bios_info.flags |= APM_BIOS_DISABLED;
return APM_SUCCESS;
}
+#endif
static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
{
diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile
index 2394245a6..be8fcec0a 100644
--- a/arch/i386/lib/Makefile
+++ b/arch/i386/lib/Makefile
@@ -13,4 +13,8 @@ ifdef CONFIG_X86_USE_3DNOW
L_OBJS += mmx.o
endif
+ifdef CONFIG_HAVE_DEC_LOCK
+L_OBJS += dec_and_lock.o
+endif
+
include $(TOPDIR)/Rules.make
diff --git a/arch/i386/lib/dec_and_lock.c b/arch/i386/lib/dec_and_lock.c
new file mode 100644
index 000000000..ffd486900
--- /dev/null
+++ b/arch/i386/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
+/*
+ * x86 version of "atomic_dec_and_lock()" using
+ * the atomic "cmpxchg" instruction.
+ *
+ * (For CPU's lacking cmpxchg, we use the slow
+ * generic version, and this one never even gets
+ * compiled).
+ */
+
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ int counter;
+ int newcount;
+
+repeat:
+ counter = atomic_read(atomic);
+ newcount = counter-1;
+
+ if (!newcount)
+ goto slow_path;
+
+ asm volatile("lock; cmpxchgl %1,%2"
+ :"=a" (newcount)
+ :"r" (newcount), "m" (atomic->counter), "0" (counter));
+
+ /* If the above failed, "eax" will have changed */
+ if (newcount != counter)
+ goto repeat;
+ return 0;
+
+slow_path:
+ spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ spin_unlock(lock);
+ return 0;
+}
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index d98507f16..2afb00e0a 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -1,10 +1,10 @@
-/* $Id: syscall.c,v 1.13 2000/02/04 07:40:23 ralf Exp $
- *
+/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * Copyright (C) 1995 - 1999 by Ralf Baechle
+ * Copyright (C) 1995 - 2000 by Ralf Baechle
+ * Copyright (C) 2000 Silicon Graphics, Inc.
*
* TODO: Implement the compatibility syscalls.
* Don't waste that much memory for empty entries in the syscall
@@ -21,6 +21,7 @@
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/file.h>
+#include <linux/slab.h>
#include <linux/utsname.h>
#include <linux/unistd.h>
#include <asm/branch.h>
diff --git a/arch/sparc/kernel/signal.c b/arch/sparc/kernel/signal.c
index 8ff82dd7e..bcad13dbc 100644
--- a/arch/sparc/kernel/signal.c
+++ b/arch/sparc/kernel/signal.c
@@ -1,4 +1,4 @@
-/* $Id: signal.c,v 1.105 2000/06/19 06:24:37 davem Exp $
+/* $Id: signal.c,v 1.106 2000/07/07 04:25:17 davem Exp $
* linux/arch/sparc/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -1282,7 +1282,6 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs * regs,
#endif
/* fall through */
default:
- lock_kernel();
sigaddset(&current->signal, signr);
recalc_sigpending(current);
current->flags |= PF_SIGNALED;
diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c
index 2b7cde28b..c666dc408 100644
--- a/arch/sparc64/kernel/signal.c
+++ b/arch/sparc64/kernel/signal.c
@@ -1,4 +1,4 @@
-/* $Id: signal.c,v 1.51 2000/06/19 06:24:37 davem Exp $
+/* $Id: signal.c,v 1.52 2000/07/07 04:25:17 davem Exp $
* arch/sparc64/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -788,7 +788,6 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs * regs,
#endif
/* fall through */
default:
- lock_kernel();
sigaddset(&current->signal, signr);
recalc_sigpending(current);
current->flags |= PF_SIGNALED;
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index 5a772c7eb..aabde84c2 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -1,4 +1,4 @@
-/* $Id: signal32.c,v 1.64 2000/06/19 06:24:37 davem Exp $
+/* $Id: signal32.c,v 1.65 2000/07/07 04:25:17 davem Exp $
* arch/sparc64/kernel/signal32.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -1420,7 +1420,6 @@ asmlinkage int do_signal32(sigset_t *oldset, struct pt_regs * regs,
#endif
/* fall through */
default:
- lock_kernel();
sigaddset(&current->signal, signr);
recalc_sigpending(current);
current->flags |= PF_SIGNALED;
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index acf2c194c..b860253d3 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -373,16 +373,16 @@ static inline u32 rd_mem (const amb_dev * dev, size_t addr) {
static inline void dump_registers (const amb_dev * dev) {
#ifdef DEBUG_AMBASSADOR
if (debug & DBG_REGS) {
- u32 * i;
+ size_t i;
PRINTD (DBG_REGS, "reading PLX control: ");
- for (i = (u32 *) 0x00; i < (u32 *) 0x30; ++i)
- rd_mem (dev, (size_t)i);
+ for (i = 0x00; i < 0x30; i += sizeof(u32))
+ rd_mem (dev, i);
PRINTD (DBG_REGS, "reading mailboxes: ");
- for (i = (u32 *) 0x40; i < (u32 *) 0x60; ++i)
- rd_mem (dev, (size_t)i);
+ for (i = 0x40; i < 0x60; i += sizeof(u32))
+ rd_mem (dev, i);
PRINTD (DBG_REGS, "reading doorb irqev irqen reset:");
- for (i = (u32 *) 0x60; i < (u32 *) 0x70; ++i)
- rd_mem (dev, (size_t)i);
+ for (i = 0x60; i < 0x70; i += sizeof(u32))
+ rd_mem (dev, i);
}
#else
(void) dev;
diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c
index 5a2ba1521..174873f72 100644
--- a/drivers/atm/eni.c
+++ b/drivers/atm/eni.c
@@ -1187,14 +1187,12 @@ static void poll_tx(struct atm_dev *dev)
if (tx->send)
while ((skb = skb_dequeue(&tx->backlog))) {
res = do_tx(skb);
- if (res == enq_ok) atomic_dec(&tx->backlog_len);
- else {
- DPRINTK("re-queuing TX PDU\n");
- skb_queue_head(&tx->backlog,skb);
+ if (res == enq_ok) continue;
+ DPRINTK("re-queuing TX PDU\n");
+ skb_queue_head(&tx->backlog,skb);
requeued++;
- if (res == enq_jam) return;
- else break;
- }
+ if (res == enq_jam) return;
+ break;
}
}
}
@@ -1326,7 +1324,6 @@ static int reserve_or_set_tx(struct atm_vcc *vcc,struct atm_trafprm *txtp,
tx->send = mem;
tx->words = size >> 2;
skb_queue_head_init(&tx->backlog);
- atomic_set(&tx->backlog_len,0);
for (order = 0; size > (1 << (order+10)); order++);
eni_out((order << MID_SIZE_SHIFT) |
((tx->send-eni_dev->ram) >> (MID_LOC_SKIP+2)),
@@ -2064,6 +2061,8 @@ static int eni_setsockopt(struct atm_vcc *vcc,int level,int optname,
static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb)
{
+ enum enq_res res;
+
DPRINTK(">eni_send\n");
if (!ENI_VCC(vcc)->tx) {
if (vcc->pop) vcc->pop(vcc,skb);
@@ -2085,8 +2084,11 @@ static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb)
}
submitted++;
ATM_SKB(skb)->vcc = vcc;
+ tasklet_disable(&ENI_DEV(vcc->dev)->task);
+ res = do_tx(skb);
+ tasklet_enable(&ENI_DEV(vcc->dev)->task);
+ if (res == enq_ok) return 0;
skb_queue_tail(&ENI_VCC(vcc)->tx->backlog,skb);
- atomic_inc(&ENI_VCC(vcc)->tx->backlog_len);
backlogged++;
tasklet_schedule(&ENI_DEV(vcc->dev)->task);
return 0;
@@ -2186,8 +2188,8 @@ static int eni_proc_read(struct atm_dev *dev,loff_t *pos,char *page)
tx == eni_dev->ubr ? " (UBR)" : "");
}
if (--left) continue;
- return sprintf(page,"%10sbacklog %d bytes\n","",
- atomic_read(&tx->backlog_len));
+ return sprintf(page,"%10sbacklog %u packets\n","",
+ skb_queue_len(&tx->backlog));
}
for (vcc = dev->vccs; vcc; vcc = vcc->next) {
struct eni_vcc *eni_vcc = ENI_VCC(vcc);
diff --git a/drivers/atm/eni.h b/drivers/atm/eni.h
index 12a3e196c..e7af66bd8 100644
--- a/drivers/atm/eni.h
+++ b/drivers/atm/eni.h
@@ -47,7 +47,6 @@ struct eni_tx {
int reserved; /* reserved peak cell rate */
int shaping; /* shaped peak cell rate */
struct sk_buff_head backlog; /* queue of waiting TX buffers */
- atomic_t backlog_len; /* length of backlog in bytes */
};
struct eni_vcc {
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 164ef4523..a8bda5927 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -2599,9 +2599,7 @@ fore200e_detect(void)
printk(FORE200E "FORE Systems 200E-series driver - version " FORE200E_VERSION "\n");
-#if 0 /* XXX uncomment this to forbid module unloading */
MOD_INC_USE_COUNT;
-#endif
/* for each configured bus interface */
for (link = 0, bus = fore200e_bus; bus->model_name; bus++) {
@@ -2628,10 +2626,8 @@ fore200e_detect(void)
}
}
-#if 0 /* XXX uncomment this to forbid module unloading */
if (link <= 0)
MOD_DEC_USE_COUNT;
-#endif
return link;
}
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index dc39a1ce5..913b6f2ed 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -603,7 +603,8 @@ static int make_rate (const hrz_dev * dev, u32 c, rounding r,
// note: rounding the rate down means rounding 'p' up
- const unsigned long br = test_bit (ultra, &dev->flags) ? BR_ULT : BR_HRZ;
+ const unsigned long br = test_bit (ultra, (hrz_flags *) &dev->flags) ?
+ BR_ULT : BR_HRZ;
u32 div = CR_MIND;
u32 pre;
diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c
index 48ba84369..320dd0ac8 100644
--- a/drivers/atm/idt77105.c
+++ b/drivers/atm/idt77105.c
@@ -48,10 +48,12 @@ static void idt77105_stats_timer_func(unsigned long);
static void idt77105_restart_timer_func(unsigned long);
-static struct timer_list stats_timer = { NULL, NULL, 0L, 0L,
- &idt77105_stats_timer_func };
-static struct timer_list restart_timer = { NULL, NULL, 0L, 0L,
- &idt77105_restart_timer_func };
+static struct timer_list stats_timer = {
+ function: &idt77105_stats_timer_func
+};
+static struct timer_list restart_timer = {
+ function: &idt77105_restart_timer_func
+};
static int start_timer = 1;
static struct idt77105_priv *idt77105_all = NULL;
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index 733487374..3550acd41 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -21,6 +21,18 @@
******************************************************************************/
+/**** IMPORTANT INFORMATION ***************************************************
+ *
+ * There are currently three types of spinlocks:
+ *
+ * 1 - Per card interrupt spinlock (to protect structures and such)
+ * 2 - Per SCQ scq spinlock
+ * 3 - Per card resource spinlock (to access registers, etc.)
+ *
+ * These must NEVER be grabbed in reverse order.
+ *
+ ******************************************************************************/
+
/* Header files ***************************************************************/
#include <linux/module.h>
@@ -115,6 +127,85 @@
#define ATM_SKB(s) (&(s)->atm)
#endif
+ /* Spinlock debugging stuff */
+#ifdef NS_DEBUG_SPINLOCKS /* See nicstar.h */
+#define ns_grab_int_lock(card,flags) \
+ do { \
+ unsigned long nsdsf, nsdsf2; \
+ local_irq_save(flags); \
+ save_flags(nsdsf); cli();\
+ if (nsdsf & (1<<9)) printk ("nicstar.c: ints %sabled -> enabled.\n", \
+ (flags)&(1<<9)?"en":"dis"); \
+ if (spin_is_locked(&(card)->int_lock) && \
+ (card)->cpu_int == smp_processor_id()) { \
+ printk("nicstar.c: line %d (cpu %d) int_lock already locked at line %d (cpu %d)\n", \
+ __LINE__, smp_processor_id(), (card)->has_int_lock, \
+ (card)->cpu_int); \
+ printk("nicstar.c: ints were %sabled.\n", ((flags)&(1<<9)?"en":"dis")); \
+ } \
+ if (spin_is_locked(&(card)->res_lock) && \
+ (card)->cpu_res == smp_processor_id()) { \
+ printk("nicstar.c: line %d (cpu %d) res_lock locked at line %d (cpu %d)(trying int)\n", \
+ __LINE__, smp_processor_id(), (card)->has_res_lock, \
+ (card)->cpu_res); \
+ printk("nicstar.c: ints were %sabled.\n", ((flags)&(1<<9)?"en":"dis")); \
+ } \
+ spin_lock_irq(&(card)->int_lock); \
+ (card)->has_int_lock = __LINE__; \
+ (card)->cpu_int = smp_processor_id(); \
+ restore_flags(nsdsf); } while (0)
+#define ns_grab_res_lock(card,flags) \
+ do { \
+ unsigned long nsdsf, nsdsf2; \
+ local_irq_save(flags); \
+ save_flags(nsdsf); cli();\
+ if (nsdsf & (1<<9)) printk ("nicstar.c: ints %sabled -> enabled.\n", \
+ (flags)&(1<<9)?"en":"dis"); \
+ if (spin_is_locked(&(card)->res_lock) && \
+ (card)->cpu_res == smp_processor_id()) { \
+ printk("nicstar.c: line %d (cpu %d) res_lock already locked at line %d (cpu %d)\n", \
+ __LINE__, smp_processor_id(), (card)->has_res_lock, \
+ (card)->cpu_res); \
+ printk("nicstar.c: ints were %sabled.\n", ((flags)&(1<<9)?"en":"dis")); \
+ } \
+ spin_lock_irq(&(card)->res_lock); \
+ (card)->has_res_lock = __LINE__; \
+ (card)->cpu_res = smp_processor_id(); \
+ restore_flags(nsdsf); } while (0)
+#define ns_grab_scq_lock(card,scq,flags) \
+ do { \
+ unsigned long nsdsf, nsdsf2; \
+ local_irq_save(flags); \
+ save_flags(nsdsf); cli();\
+ if (nsdsf & (1<<9)) printk ("nicstar.c: ints %sabled -> enabled.\n", \
+ (flags)&(1<<9)?"en":"dis"); \
+ if (spin_is_locked(&(scq)->lock) && \
+ (scq)->cpu_lock == smp_processor_id()) { \
+ printk("nicstar.c: line %d (cpu %d) this scq_lock already locked at line %d (cpu %d)\n", \
+ __LINE__, smp_processor_id(), (scq)->has_lock, \
+ (scq)->cpu_lock); \
+ printk("nicstar.c: ints were %sabled.\n", ((flags)&(1<<9)?"en":"dis")); \
+ } \
+ if (spin_is_locked(&(card)->res_lock) && \
+ (card)->cpu_res == smp_processor_id()) { \
+ printk("nicstar.c: line %d (cpu %d) res_lock locked at line %d (cpu %d)(trying scq)\n", \
+ __LINE__, smp_processor_id(), (card)->has_res_lock, \
+ (card)->cpu_res); \
+ printk("nicstar.c: ints were %sabled.\n", ((flags)&(1<<9)?"en":"dis")); \
+ } \
+ spin_lock_irq(&(scq)->lock); \
+ (scq)->has_lock = __LINE__; \
+ (scq)->cpu_lock = smp_processor_id(); \
+ restore_flags(nsdsf); } while (0)
+#else /* !NS_DEBUG_SPINLOCKS */
+#define ns_grab_int_lock(card,flags) \
+ spin_lock_irqsave(&(card)->int_lock,(flags))
+#define ns_grab_res_lock(card,flags) \
+ spin_lock_irqsave(&(card)->res_lock,(flags))
+#define ns_grab_scq_lock(card,scq,flags) \
+ spin_lock_irqsave(&(scq)->lock,flags)
+#endif /* NS_DEBUG_SPINLOCKS */
+
/* Version definition *********************************************************/
/*
@@ -406,12 +497,12 @@ static u32 ns_read_sram(ns_dev *card, u32 sram_address)
sram_address <<= 2;
sram_address &= 0x0007FFFC; /* address must be dword aligned */
sram_address |= 0x50000000; /* SRAM read command */
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while (CMD_BUSY(card));
writel(sram_address, card->membase + CMD);
while (CMD_BUSY(card));
data = readl(card->membase + DR0);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
return data;
}
@@ -424,7 +515,7 @@ static void ns_write_sram(ns_dev *card, u32 sram_address, u32 *value, int count)
count--; /* count range now is 0..3 instead of 1..4 */
c = count;
c <<= 2; /* to use increments of 4 */
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while (CMD_BUSY(card));
for (i = 0; i <= c; i += 4)
writel(*(value++), card->membase + i);
@@ -434,14 +525,14 @@ static void ns_write_sram(ns_dev *card, u32 sram_address, u32 *value, int count)
sram_address &= 0x0007FFFC;
sram_address |= (0x40000000 | count);
writel(sram_address, card->membase + CMD);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
}
static int ns_init_card(int i, struct pci_dev *pcidev)
{
int j;
- struct ns_dev *card=NULL;
+ struct ns_dev *card = NULL;
unsigned short pci_command;
unsigned char pci_latency;
unsigned error;
@@ -468,6 +559,8 @@ static int ns_init_card(int i, struct pci_dev *pcidev)
return error;
}
cards[i] = card;
+ spin_lock_init(&card->int_lock);
+ spin_lock_init(&card->res_lock);
card->index = i;
card->atmdev = NULL;
@@ -853,9 +946,6 @@ static int ns_init_card(int i, struct pci_dev *pcidev)
card->iovpool.count++;
}
-
- card->in_handler = 0;
- card->in_poll = 0;
card->intcnt = 0;
/* Configure NICStAR */
@@ -1025,6 +1115,7 @@ static scq_info *get_scq(int size, u32 scd)
scq->tbd_count = 0;
init_waitqueue_head(&scq->scqfull_waitq);
scq->full = 0;
+ spin_lock_init(&scq->lock);
for (i = 0; i < scq->num_entries; i++)
scq->skb[i] = NULL;
@@ -1161,7 +1252,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
card->lbfqc += 2;
}
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while (CMD_BUSY(card));
writel(addr2, card->membase + DR3);
@@ -1170,7 +1261,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
writel(handle1, card->membase + DR0);
writel(NS_CMD_WRITE_FREEBUFQ | (u32) type, card->membase + CMD);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
XPRINTK("nicstar%d: Pushing %s buffers at 0x%x and 0x%x.\n", card->index,
(type == BUF_SM ? "small" : "large"), addr1, addr2);
@@ -1193,6 +1284,7 @@ static void ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
u32 stat_r;
ns_dev *card;
struct atm_dev *dev;
+ unsigned long flags;
card = (ns_dev *) dev_id;
dev = card->atmdev;
@@ -1200,19 +1292,7 @@ static void ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
PRINTK("nicstar%d: NICStAR generated an interrupt\n", card->index);
- if (card->in_handler)
- {
- printk("nicstar%d: Re-entering ns_irq_handler()???\n", card->index);
- return;
- }
- card->in_handler = 1;
- if (card->in_poll)
- {
- card->in_handler = 0;
- printk("nicstar%d: Called irq handler while in ns_poll()!?\n",
- card->index);
- return;
- }
+ ns_grab_int_lock(card, flags);
stat_r = readl(card->membase + STAT);
@@ -1377,7 +1457,7 @@ static void ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
process_rsq(card);
}
- card->in_handler = 0;
+ spin_unlock_irqrestore(&card->int_lock, flags);
PRINTK("nicstar%d: end of interrupt service\n", card->index);
}
@@ -1595,10 +1675,10 @@ static void ns_close(struct atm_vcc *vcc)
unsigned long flags;
addr = NS_RCT + (vcc->vpi << card->vcibits | vcc->vci) * NS_RCT_ENTRY_SIZE;
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while(CMD_BUSY(card));
writel(NS_CMD_CLOSE_CONNECTION | addr << 2, card->membase + CMD);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
vc->rx = 0;
if (vc->rx_iov != NULL)
@@ -1617,9 +1697,9 @@ static void ns_close(struct atm_vcc *vcc)
ATM_SKB(iovb)->iovcnt);
ATM_SKB(iovb)->iovcnt = 0;
ATM_SKB(iovb)->vcc = NULL;
- save_flags(flags); cli();
+ ns_grab_int_lock(card, flags);
recycle_iov_buf(card, iovb);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
vc->rx_iov = NULL;
}
}
@@ -1639,7 +1719,7 @@ static void ns_close(struct atm_vcc *vcc)
for (;;)
{
- save_flags(flags); cli();
+ ns_grab_scq_lock(card, scq, flags);
scqep = scq->next;
if (scqep == scq->base)
scqep = scq->last;
@@ -1647,7 +1727,7 @@ static void ns_close(struct atm_vcc *vcc)
scqep--;
if (scqep == scq->tail)
{
- restore_flags(flags);
+ spin_unlock_irqrestore(&scq->lock, flags);
break;
}
/* If the last entry is not a TSR, place one in the SCQ in order to
@@ -1675,8 +1755,8 @@ static void ns_close(struct atm_vcc *vcc)
data = (u32) virt_to_bus(scq->next);
ns_write_sram(card, scq->scd, &data, 1);
}
+ spin_unlock_irqrestore(&scq->lock, flags);
schedule();
- restore_flags(flags);
}
/* Free all TST entries */
@@ -1884,19 +1964,22 @@ static int push_scqe(ns_dev *card, vc_map *vc, scq_info *scq, ns_scqe *tbd,
u32 data;
int index;
- if (scq->tail == scq->next)
+ ns_grab_scq_lock(card, scq, flags);
+ while (scq->tail == scq->next)
{
if (in_interrupt()) {
+ spin_unlock_irqrestore(&scq->lock, flags);
printk("nicstar%d: Error pushing TBD.\n", card->index);
return 1;
}
- save_flags(flags); cli();
scq->full = 1;
+ spin_unlock_irqrestore(&scq->lock, flags);
interruptible_sleep_on_timeout(&scq->scqfull_waitq, SCQFULL_TIMEOUT);
- restore_flags(flags);
+ ns_grab_scq_lock(card, scq, flags);
if (scq->full) {
+ spin_unlock_irqrestore(&scq->lock, flags);
printk("nicstar%d: Timeout pushing TBD.\n", card->index);
return 1;
}
@@ -1926,19 +2009,23 @@ static int push_scqe(ns_dev *card, vc_map *vc, scq_info *scq, ns_scqe *tbd,
if (vc->tbd_count >= MAX_TBD_PER_VC || scq->tbd_count >= MAX_TBD_PER_SCQ)
{
- if (scq->tail == scq->next)
+ int has_run = 0;
+
+ while (scq->tail == scq->next)
{
if (in_interrupt()) {
data = (u32) virt_to_bus(scq->next);
ns_write_sram(card, scq->scd, &data, 1);
+ spin_unlock_irqrestore(&scq->lock, flags);
printk("nicstar%d: Error pushing TSR.\n", card->index);
return 0;
}
- save_flags(flags); cli();
scq->full = 1;
+ if (has_run++) break;
+ spin_unlock_irqrestore(&scq->lock, flags);
interruptible_sleep_on_timeout(&scq->scqfull_waitq, SCQFULL_TIMEOUT);
- restore_flags(flags);
+ ns_grab_scq_lock(card, scq, flags);
}
if (!scq->full)
@@ -1970,10 +2057,11 @@ static int push_scqe(ns_dev *card, vc_map *vc, scq_info *scq, ns_scqe *tbd,
else
PRINTK("nicstar%d: Timeout pushing TSR.\n", card->index);
}
-
data = (u32) virt_to_bus(scq->next);
ns_write_sram(card, scq->scd, &data, 1);
+ spin_unlock_irqrestore(&scq->lock, flags);
+
return 0;
}
@@ -2064,6 +2152,7 @@ static void drain_scq(ns_dev *card, scq_info *scq, int pos)
struct atm_vcc *vcc;
struct sk_buff *skb;
int i;
+ unsigned long flags;
XPRINTK("nicstar%d: drain_scq() called, scq at 0x%x, pos %d.\n",
card->index, (u32) scq, pos);
@@ -2073,6 +2162,7 @@ static void drain_scq(ns_dev *card, scq_info *scq, int pos)
return;
}
+ ns_grab_scq_lock(card, scq, flags);
i = (int) (scq->tail - scq->base);
if (++i == scq->num_entries)
i = 0;
@@ -2084,16 +2174,18 @@ static void drain_scq(ns_dev *card, scq_info *scq, int pos)
if (skb != NULL)
{
vcc = ATM_SKB(skb)->vcc;
- if (vcc->pop != NULL)
+ if (vcc->pop != NULL) {
vcc->pop(vcc, skb);
- else
- dev_kfree_skb_any(skb);
+ } else {
+ dev_kfree_skb_irq(skb);
+ }
scq->skb[i] = NULL;
}
if (++i == scq->num_entries)
i = 0;
}
scq->tail = scq->base + pos;
+ spin_unlock_irqrestore(&scq->lock, flags);
}
@@ -2890,10 +2982,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void *arg)
{
struct sk_buff *hb;
- save_flags(flags); cli();
+ ns_grab_int_lock(card, flags);
hb = skb_dequeue(&card->hbpool.queue);
card->hbpool.count--;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
if (hb == NULL)
printk("nicstar%d: huge buffer count inconsistent.\n",
card->index);
@@ -2908,10 +3000,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void *arg)
hb = alloc_skb(NS_HBUFSIZE, GFP_KERNEL);
if (hb == NULL)
return -ENOMEM;
- save_flags(flags); cli();
+ ns_grab_int_lock(card, flags);
skb_queue_tail(&card->hbpool.queue, hb);
card->hbpool.count++;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
}
break;
@@ -2920,10 +3012,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void *arg)
{
struct sk_buff *iovb;
- save_flags(flags); cli();
+ ns_grab_int_lock(card, flags);
iovb = skb_dequeue(&card->iovpool.queue);
card->iovpool.count--;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
if (iovb == NULL)
printk("nicstar%d: iovec buffer count inconsistent.\n",
card->index);
@@ -2938,10 +3030,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void *arg)
iovb = alloc_skb(NS_IOVBUFSIZE, GFP_KERNEL);
if (iovb == NULL)
return -ENOMEM;
- save_flags(flags); cli();
+ ns_grab_int_lock(card, flags);
skb_queue_tail(&card->iovpool.queue, iovb);
card->iovpool.count++;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
}
break;
@@ -2986,22 +3078,11 @@ static void ns_poll(unsigned long arg)
for (i = 0; i < num_cards; i++)
{
card = cards[i];
- save_flags(flags); cli();
- if (card->in_poll)
- {
- printk("nicstar: Re-entering ns_poll()???\n");
- restore_flags(flags);
- continue;
- }
- card->in_poll = 1;
- if (card->in_handler)
- {
- card->in_poll = 0;
- printk("nicstar%d: ns_poll called while in interrupt handler!?\n",
- card->index);
- restore_flags(flags);
+ if (spin_is_locked(&card->int_lock)) {
+ /* Probably it isn't worth spinning */
continue;
}
+ ns_grab_int_lock(card, flags);
stat_w = 0;
stat_r = readl(card->membase + STAT);
@@ -3014,8 +3095,7 @@ static void ns_poll(unsigned long arg)
process_rsq(card);
writel(stat_w, card->membase + STAT);
- card->in_poll = 0;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->int_lock, flags);
}
mod_timer(&ns_timer, jiffies + NS_POLL_PERIOD);
PRINTK("nicstar: Leaving ns_poll().\n");
@@ -3069,12 +3149,12 @@ static void ns_phy_put(struct atm_dev *dev, unsigned char value,
unsigned long flags;
card = dev->dev_data;
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while(CMD_BUSY(card));
writel((unsigned long) value, card->membase + DR0);
writel(NS_CMD_WRITE_UTILITY | 0x00000200 | (addr & 0x000000FF),
card->membase + CMD);
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
}
@@ -3086,12 +3166,12 @@ static unsigned char ns_phy_get(struct atm_dev *dev, unsigned long addr)
unsigned long data;
card = dev->dev_data;
- save_flags(flags); cli();
+ ns_grab_res_lock(card, flags);
while(CMD_BUSY(card));
writel(NS_CMD_READ_UTILITY | 0x00000200 | (addr & 0x000000FF),
card->membase + CMD);
while(CMD_BUSY(card));
data = readl(card->membase + DR0) & 0x000000FF;
- restore_flags(flags);
+ spin_unlock_irqrestore(&card->res_lock, flags);
return (unsigned char) data;
}
diff --git a/drivers/atm/nicstar.h b/drivers/atm/nicstar.h
index 4e90650c0..7dfa9ec6f 100644
--- a/drivers/atm/nicstar.h
+++ b/drivers/atm/nicstar.h
@@ -28,6 +28,8 @@
/* Options ********************************************************************/
+#undef NS_DEBUG_SPINLOCKS
+
#define NS_MAX_CARDS 4 /* Maximum number of NICStAR based cards
controlled by the device driver. Must
be <= 5 */
@@ -705,6 +707,11 @@ typedef struct scq_info
int tbd_count; /* Only meaningful on variable rate */
wait_queue_head_t scqfull_waitq;
volatile char full; /* SCQ full indicator */
+ spinlock_t lock; /* SCQ spinlock */
+#ifdef NS_DEBUG_SPINLOCKS
+ volatile long has_lock;
+ volatile int cpu_lock;
+#endif /* NS_DEBUG_SPINLOCKS */
} scq_info;
@@ -779,8 +786,14 @@ typedef struct ns_dev
struct sk_buff *rcbuf; /* Current raw cell buffer */
u32 rawch; /* Raw cell queue head */
unsigned intcnt; /* Interrupt counter */
- volatile int in_handler: 1;
- volatile int in_poll: 1;
+ spinlock_t int_lock; /* Interrupt lock */
+ spinlock_t res_lock; /* Card resource lock */
+#ifdef NS_DEBUG_SPINLOCKS
+ volatile long has_int_lock;
+ volatile int cpu_int;
+ volatile long has_res_lock;
+ volatile int cpu_res;
+#endif /* NS_DEBUG_SPINLOCKS */
} ns_dev;
diff --git a/drivers/block/lvm.c b/drivers/block/lvm.c
index f5c2bb1e4..1e2a21cf1 100644
--- a/drivers/block/lvm.c
+++ b/drivers/block/lvm.c
@@ -165,6 +165,8 @@ static char *lvm_short_version = "version 0.8final (15/02/2000)";
#include <linux/kerneld.h>
#endif
+#define LOCAL_END_REQUEST
+
#include <linux/blk.h>
#include <linux/blkpg.h>
diff --git a/drivers/block/md.c b/drivers/block/md.c
index 3fa5e5318..058c001c7 100644
--- a/drivers/block/md.c
+++ b/drivers/block/md.c
@@ -496,7 +496,7 @@ static int alloc_array_sb (mddev_t * mddev)
mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
if (!mddev->sb)
return -ENOMEM;
- md_clear_page((unsigned long)mddev->sb);
+ md_clear_page(mddev->sb);
return 0;
}
@@ -510,7 +510,7 @@ static int alloc_disk_sb (mdk_rdev_t * rdev)
printk (OUT_OF_MEM);
return -EINVAL;
}
- md_clear_page((unsigned long)rdev->sb);
+ md_clear_page(rdev->sb);
return 0;
}
diff --git a/drivers/block/raid0.c b/drivers/block/raid0.c
index 32821d936..f06ddc355 100644
--- a/drivers/block/raid0.c
+++ b/drivers/block/raid0.c
@@ -120,7 +120,7 @@ static int create_strip_zones (mddev_t *mddev)
static int raid0_run (mddev_t *mddev)
{
- int cur=0, i=0, size, zone0_size, nb_zone;
+ unsigned long cur=0, i=0, size, zone0_size, nb_zone;
raid0_conf_t *conf;
MOD_INC_USE_COUNT;
@@ -142,11 +142,11 @@ static int raid0_run (mddev_t *mddev)
printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
(md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
- printk("raid0 : nb_zone is %d.\n", nb_zone);
+ printk("raid0 : nb_zone is %ld.\n", nb_zone);
conf->nr_zones = nb_zone;
- printk("raid0 : Allocating %d bytes for hash.\n",
- sizeof(struct raid0_hash)*nb_zone);
+ printk("raid0 : Allocating %ld bytes for hash.\n",
+ nb_zone*sizeof(struct raid0_hash));
conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
if (!conf->hash_table)
diff --git a/drivers/block/raid1.c b/drivers/block/raid1.c
index 69d03feca..6748c8016 100644
--- a/drivers/block/raid1.c
+++ b/drivers/block/raid1.c
@@ -551,7 +551,6 @@ static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw,
int disks = MD_SB_DISKS;
int i, sum_bhs = 0, sectors;
struct mirror_info *mirror;
- DECLARE_WAITQUEUE(wait, current);
if (!buffer_locked(bh))
BUG();
diff --git a/drivers/block/raid5.c b/drivers/block/raid5.c
index e7bf85d08..116be21a1 100644
--- a/drivers/block/raid5.c
+++ b/drivers/block/raid5.c
@@ -1674,7 +1674,7 @@ static int __check_consistency (mddev_t *mddev, int row)
tmp->b_data = (char *)page_address(tmp->b_page);
if (!tmp->b_data)
goto out;
- md_clear_page((unsigned long)tmp->b_data);
+ md_clear_page(tmp->b_data);
memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
for (i = 0; i < conf->raid_disks; i++) {
dev = conf->disks[i].dev;
diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c
index 0dbb8d883..a8a0e8690 100644
--- a/drivers/ide/piix.c
+++ b/drivers/ide/piix.c
@@ -191,6 +191,7 @@ byte piix_proc = 0;
extern char *ide_xfer_verbose (byte xfer_rate);
+#if defined(CONFIG_BLK_DEV_IDEDMA) && defined(CONFIG_PIIX_TUNING)
/*
*
*/
@@ -221,6 +222,7 @@ static byte piix_dma_2_pio (byte xfer_rate) {
return 0;
}
}
+#endif /* defined(CONFIG_BLK_DEV_IDEDMA) && (CONFIG_PIIX_TUNING) */
/*
* Based on settings done by AMI BIOS
diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c
index 00c1c6e60..cea0739b0 100644
--- a/drivers/net/3c509.c
+++ b/drivers/net/3c509.c
@@ -187,7 +187,7 @@ u16 el3_isapnp_phys_addr[8][3] = {
{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}
};
#endif /* CONFIG_ISAPNP */
-#if defined(__ISAPNP__) || defined(MODULE)
+#ifdef __ISAPNP__
static int nopnp = 0;
#endif
@@ -984,7 +984,9 @@ MODULE_PARM(debug,"i");
MODULE_PARM(irq,"1-8i");
MODULE_PARM(xcvr,"1-8i");
MODULE_PARM(max_interrupt_work, "i");
+#ifdef __ISAPNP__
MODULE_PARM(nopnp, "i");
+#endif
int
init_module(void)
diff --git a/drivers/net/3c515.c b/drivers/net/3c515.c
index 4bd49bfe9..80021d00a 100644
--- a/drivers/net/3c515.c
+++ b/drivers/net/3c515.c
@@ -363,8 +363,9 @@ struct corkscrew_isapnp_adapters_struct corkscrew_isapnp_adapters[] = {
int corkscrew_isapnp_phys_addr[3] = {
0, 0, 0
};
-#endif
+
static int nopnp = 0;
+#endif
static int corkscrew_scan(struct net_device *dev);
static struct net_device *corkscrew_found_device(struct net_device *dev,
@@ -439,9 +440,11 @@ int tc515_probe(struct net_device *dev)
static int corkscrew_scan(struct net_device *dev)
{
int cards_found = 0;
- short i;
static int ioaddr;
+#ifdef __ISAPNP__
+ short i;
static int pnp_cards = 0;
+#endif
#ifdef __ISAPNP__
if(nopnp == 1)
diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c
index cac8329c3..7e9c3f9b5 100644
--- a/drivers/scsi/NCR5380.c
+++ b/drivers/scsi/NCR5380.c
@@ -2245,7 +2245,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
register unsigned char p = *phase;
register unsigned char *d = *data;
unsigned char tmp;
+#if defined(PSEUDO_DMA) && !defined(UNSAFE)
unsigned long flags;
+#endif
int foo;
#if defined(REAL_DMA_POLL)
int cnt, toPIO;
diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c
index 3eb3021f2..0cbad04fa 100644
--- a/drivers/scsi/atp870u.c
+++ b/drivers/scsi/atp870u.c
@@ -1627,7 +1627,6 @@ int atp870u_detect(Scsi_Host_Template * tpnt)
struct Scsi_Host *shpnt = NULL;
int tmpcnt = 0;
int count = 0;
- int result;
static unsigned short devid[8] = {
0x8002, 0x8010, 0x8020, 0x8030, 0x8040, 0x8050, 0x8060, 0
diff --git a/drivers/scsi/seagate.c b/drivers/scsi/seagate.c
index ff68dca96..53f194e43 100644
--- a/drivers/scsi/seagate.c
+++ b/drivers/scsi/seagate.c
@@ -498,7 +498,7 @@ int __init seagate_st0x_detect (Scsi_Host_Template * tpnt)
{
int clock;
ULOOP( 1*1000*1000 ) {
- volatile int x = STATUS;
+ STATUS;
if (TIMEOUT) break;
}
}
diff --git a/drivers/sound/os.h b/drivers/sound/os.h
index 2b2be83d5..09b8c2b86 100644
--- a/drivers/sound/os.h
+++ b/drivers/sound/os.h
@@ -4,11 +4,9 @@
#define MANUAL_PNP
#undef DO_TIMINGS
-#ifdef MODULE
-#define __NO_VERSION__
#include <linux/module.h>
#include <linux/version.h>
-#endif
+
#if LINUX_VERSION_CODE > 131328
#define LINUX21X
#endif
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ade9091a5..b16b0db6a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -321,21 +321,33 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
#endif
if (N_MAGIC(ex) == OMAGIC) {
+ unsigned long text_addr, map_size;
loff_t pos;
+
+ text_addr = N_TXTADDR(ex);
+
#if defined(__alpha__) || defined(__sparc__)
pos = fd_offset;
- do_brk(N_TXTADDR(ex) & PAGE_MASK,
- ex.a_text+ex.a_data + PAGE_SIZE - 1);
- bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
- ex.a_text+ex.a_data, &pos);
+ map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
#else
pos = 32;
- do_brk(0, ex.a_text+ex.a_data);
- bprm->file->f_op->read(bprm->file, (char *) 0,
- ex.a_text+ex.a_data, &pos);
+ map_size = ex.a_text+ex.a_data;
#endif
- flush_icache_range((unsigned long) 0,
- (unsigned long) ex.a_text+ex.a_data);
+
+ error = do_brk(text_addr & PAGE_MASK, map_size);
+ if (error != (text_addr & PAGE_MASK)) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+
+ error = bprm->file->f_op->read(bprm->file, (char *)text_addr,
+ ex.a_text+ex.a_data, &pos);
+ if (error < 0) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+
+ flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
} else {
static unsigned long error_time, error_time2;
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
diff --git a/fs/dcache.c b/fs/dcache.c
index c0c94ff1e..1841eef97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,7 +33,7 @@ spinlock_t dcache_lock = SPIN_LOCK_UNLOCKED;
/* Right now the dcache depends on the kernel lock */
#define check_lock() if (!kernel_locked()) BUG()
-kmem_cache_t *dentry_cache;
+static kmem_cache_t *dentry_cache;
/*
* This is the single most critical data structure when it comes
@@ -67,6 +67,7 @@ static inline void d_free(struct dentry *dentry)
if (dname_external(dentry))
kfree(dentry->d_name.name);
kmem_cache_free(dentry_cache, dentry);
+ dentry_stat.nr_dentry--;
}
/*
@@ -117,58 +118,54 @@ static inline void dentry_iput(struct dentry * dentry)
* they too may now get deleted.
*
* no dcache lock, please.
- *
- * Note: dput() itself is inlined and uses __dput() for slow path (after
- * decrementing the ->d_count on the argument and finding it zero).
*/
-void __dput(struct dentry *dentry)
+void dput(struct dentry *dentry)
{
- struct dentry * parent;
+ if (!dentry)
+ return;
+
repeat:
- spin_lock(&dcache_lock);
- if (atomic_read(&dentry->d_count))
- goto out;
+ if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
+ return;
+
+ /* dput on a free dentry? */
+ if (!list_empty(&dentry->d_lru))
+ BUG();
/*
* AV: ->d_delete() is _NOT_ allowed to block now.
*/
if (dentry->d_op && dentry->d_op->d_delete) {
- if (dentry->d_op->d_delete(dentry)) {
- list_del(&dentry->d_hash);
- goto kill_it;
- }
+ if (dentry->d_op->d_delete(dentry))
+ goto unhash_it;
}
+ /* Unreachable? Get rid of it */
if (list_empty(&dentry->d_hash))
goto kill_it;
- if (!list_empty(&dentry->d_lru)) {
- dentry_stat.nr_unused--;
- list_del(&dentry->d_lru);
- }
list_add(&dentry->d_lru, &dentry_unused);
dentry_stat.nr_unused++;
/*
* Update the timestamp
*/
dentry->d_reftime = jiffies;
-
-out:
spin_unlock(&dcache_lock);
return;
-kill_it:
- if (!list_empty(&dentry->d_lru)) {
- dentry_stat.nr_unused--;
- list_del(&dentry->d_lru);
- }
- list_del(&dentry->d_child);
- /* drops the lock, at that point nobody can reach this dentry */
- dentry_iput(dentry);
- parent = dentry->d_parent;
- d_free(dentry);
- if (dentry == parent)
- return;
- dentry = parent;
- if (atomic_dec_and_test(&dentry->d_count))
+
+unhash_it:
+ list_del(&dentry->d_hash);
+
+kill_it: {
+ struct dentry *parent;
+ list_del(&dentry->d_child);
+ /* drops the lock, at that point nobody can reach this dentry */
+ dentry_iput(dentry);
+ parent = dentry->d_parent;
+ d_free(dentry);
+ if (dentry == parent)
+ return;
+ dentry = parent;
goto repeat;
+ }
}
/**
@@ -329,11 +326,14 @@ void prune_dcache(int count)
list_del(tmp);
INIT_LIST_HEAD(tmp);
dentry = list_entry(tmp, struct dentry, d_lru);
- if (!atomic_read(&dentry->d_count)) {
- prune_one_dentry(dentry);
- if (!--count)
- break;
- }
+
+ /* Unused dentry with a count? */
+ if (atomic_read(&dentry->d_count))
+ BUG();
+
+ prune_one_dentry(dentry);
+ if (!--count)
+ break;
}
spin_unlock(&dcache_lock);
}
@@ -539,7 +539,7 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask)
{
int count = 0;
if (priority)
- count = dentry_stat.nr_unused / priority;
+ count = dentry_stat.nr_unused >> (priority >> 2);
prune_dcache(count);
/* FIXME: kmem_cache_shrink here should tell us
the number of pages freed, and it should
@@ -608,6 +608,7 @@ struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
} else
INIT_LIST_HEAD(&dentry->d_child);
+ dentry_stat.nr_dentry++;
return dentry;
}
@@ -705,7 +706,12 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
if (memcmp(dentry->d_name.name, str, len))
continue;
}
- dget(dentry);
+ atomic_inc(&dentry->d_count);
+ if (atomic_read(&dentry->d_count) == 1) {
+ dentry_stat.nr_unused--;
+ list_del(&dentry->d_lru);
+ INIT_LIST_HEAD(&dentry->d_lru); /* make "list_empty()" work */
+ }
spin_unlock(&dcache_lock);
return dentry;
}
diff --git a/fs/file.c b/fs/file.c
index 7bdf29179..2f8ea1918 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -24,11 +24,9 @@ struct file ** alloc_fd_array(int num)
struct file **new_fds;
int size = num * sizeof(struct file *);
- if (size < PAGE_SIZE)
+ if (size <= PAGE_SIZE)
new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
- else if (size == PAGE_SIZE)
- new_fds = (struct file **) __get_free_page(GFP_KERNEL);
- else
+ else
new_fds = (struct file **) vmalloc(size);
return new_fds;
}
@@ -44,10 +42,8 @@ void free_fd_array(struct file **array, int num)
if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
return;
- else if (size < PAGE_SIZE)
+ else if (size <= PAGE_SIZE)
kfree(array);
- else if (size == PAGE_SIZE)
- free_page((unsigned long) array);
else
vfree(array);
}
@@ -137,10 +133,8 @@ fd_set * alloc_fdset(int num)
fd_set *new_fdset;
int size = num / 8;
- if (size < PAGE_SIZE)
+ if (size <= PAGE_SIZE)
new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
- else if (size == PAGE_SIZE)
- new_fdset = (fd_set *) __get_free_page(GFP_KERNEL);
else
new_fdset = (fd_set *) vmalloc(size);
return new_fdset;
@@ -157,10 +151,8 @@ void free_fdset(fd_set *array, int num)
if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
return;
- else if (size < PAGE_SIZE)
+ else if (size <= PAGE_SIZE)
kfree(array);
- else if (size == PAGE_SIZE)
- free_page((unsigned long) array);
else
vfree(array);
}
diff --git a/fs/inode.c b/fs/inode.c
index 3dbd9f54e..28fbd0098 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,7 +26,7 @@
/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
-#define INODE_PARANOIA 1
+/* #define INODE_PARANOIA 1 */
/* #define INODE_DEBUG 1 */
/*
@@ -327,6 +327,7 @@ static void dispose_list(struct list_head * head)
truncate_inode_pages(&inode->i_data, 0);
clear_inode(inode);
destroy_inode(inode);
+ inodes_stat.nr_inodes--;
}
}
@@ -548,6 +549,7 @@ struct inode * get_empty_inode(void)
if (inode)
{
spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use);
inode->i_sb = NULL;
inode->i_dev = 0;
@@ -579,6 +581,7 @@ static struct inode * get_new_inode(struct super_block *sb, unsigned long ino, s
/* We released the lock, so.. */
old = find_inode(sb, ino, head, find_actor, opaque);
if (!old) {
+ inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use);
list_add(&inode->i_hash, head);
inode->i_sb = sb;
@@ -752,78 +755,55 @@ void iput(struct inode *inode)
{
if (inode) {
struct super_operations *op = NULL;
- int destroy = 0;
if (inode->i_sb && inode->i_sb->s_op)
op = inode->i_sb->s_op;
if (op && op->put_inode)
op->put_inode(inode);
- spin_lock(&inode_lock);
- if (atomic_dec_and_test(&inode->i_count)) {
- if (!inode->i_nlink) {
- list_del(&inode->i_hash);
- INIT_LIST_HEAD(&inode->i_hash);
- list_del(&inode->i_list);
- INIT_LIST_HEAD(&inode->i_list);
- inode->i_state|=I_FREEING;
- spin_unlock(&inode_lock);
-
- if (inode->i_data.nrpages)
- truncate_inode_pages(&inode->i_data, 0);
+ if (!atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ return;
- destroy = 1;
- if (op && op->delete_inode) {
- void (*delete)(struct inode *) = op->delete_inode;
- /* s_op->delete_inode internally recalls clear_inode() */
- delete(inode);
- } else
- clear_inode(inode);
- if (inode->i_state != I_CLEAR)
- BUG();
+ if (!inode->i_nlink) {
+ list_del(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_hash);
+ list_del(&inode->i_list);
+ INIT_LIST_HEAD(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ spin_unlock(&inode_lock);
- spin_lock(&inode_lock);
- } else {
- if (!list_empty(&inode->i_hash)) {
- if (!(inode->i_state & I_DIRTY)) {
- list_del(&inode->i_list);
- list_add(&inode->i_list,
- &inode_unused);
- }
- inodes_stat.nr_unused++;
- } else {
- /* magic nfs path */
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (op && op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ } else {
+ if (!list_empty(&inode->i_hash)) {
+ if (!(inode->i_state & I_DIRTY)) {
list_del(&inode->i_list);
- INIT_LIST_HEAD(&inode->i_list);
- inode->i_state|=I_FREEING;
- spin_unlock(&inode_lock);
- clear_inode(inode);
- destroy = 1;
- spin_lock(&inode_lock);
+ list_add(&inode->i_list,
+ &inode_unused);
}
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ return;
+ } else {
+ /* magic nfs path */
+ list_del(&inode->i_list);
+ INIT_LIST_HEAD(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ spin_unlock(&inode_lock);
+ clear_inode(inode);
}
-#ifdef INODE_PARANOIA
-if (inode->i_flock)
-printk(KERN_ERR "iput: inode %s/%ld still has locks!\n",
-kdevname(inode->i_dev), inode->i_ino);
-if (!list_empty(&inode->i_dentry))
-printk(KERN_ERR "iput: device %s inode %ld still has aliases!\n",
-kdevname(inode->i_dev), inode->i_ino);
-if (atomic_read(&inode->i_count))
-printk(KERN_ERR "iput: device %s inode %ld count changed, count=%d\n",
-kdevname(inode->i_dev), inode->i_ino, atomic_read(&inode->i_count));
-if (atomic_read(&inode->i_sem.count) != 1)
-printk(KERN_ERR "iput: Aieee, semaphore in use inode %s/%ld, count=%d\n",
-kdevname(inode->i_dev), inode->i_ino, atomic_read(&inode->i_sem.count));
-#endif
- }
- if ((unsigned)atomic_read(&inode->i_count) > (1U<<31)) {
- printk(KERN_ERR "iput: inode %s/%ld count wrapped\n",
- kdevname(inode->i_dev), inode->i_ino);
}
- spin_unlock(&inode_lock);
- if (destroy)
- destroy_inode(inode);
+ inodes_stat.nr_inodes--;
+ destroy_inode(inode);
}
}
diff --git a/fs/namei.c b/fs/namei.c
index cba4fb775..3ac2602dc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -14,6 +14,7 @@
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
*/
+#include <linux/init.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp_lock.h>
@@ -1951,3 +1952,20 @@ struct inode_operations page_symlink_inode_operations = {
readlink: page_readlink,
follow_link: page_follow_link,
};
+
+/* SLAB cache for name blocks */
+kmem_cache_t *names_cachep;
+
+static int __init namecache_init(void)
+{
+ names_cachep = kmem_cache_create("names_cache",
+ PAGE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!names_cachep)
+ panic("Cannot create names cache");
+ return 0;
+}
+
+module_init(namecache_init)
diff --git a/fs/open.c b/fs/open.c
index 2968decf9..70a3d199d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -11,6 +11,7 @@
#include <linux/smp_lock.h>
#include <linux/quotaops.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9afe2d67c..5937878c9 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -508,18 +508,6 @@ static int swaps_read_proc(char *page, char **start, off_t off,
return len;
}
-static int slabinfo_read_proc(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len = get_slabinfo(page);
- if (len <= off+count) *eof = 1;
- *start = page + off;
- len -= off;
- if (len>count) len = count;
- if (len<0) len = 0;
- return len;
-}
-
static int memory_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -671,4 +659,12 @@ void __init proc_misc_init(void)
entry->proc_fops = &ppc_htab_operations;
}
#endif
+ {
+ struct proc_dir_entry *res = create_proc_entry("slabinfo",
+ S_IWUSR | S_IRUGO, NULL);
+ if (res) {
+ res->read_proc = slabinfo_read_proc;
+ res->write_proc = slabinfo_write_proc;
+ }
+ }
}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index bae5641fd..7c0acf4bb 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -198,8 +198,8 @@ extern void d_rehash(struct dentry *);
static __inline__ void d_add(struct dentry * entry, struct inode * inode)
{
- d_rehash(entry);
d_instantiate(entry, inode);
+ d_rehash(entry);
}
/* used for rename() and baskets */
@@ -227,6 +227,8 @@ extern char * __d_path(struct dentry *, struct vfsmount *, struct dentry *,
static __inline__ struct dentry * dget(struct dentry *dentry)
{
+ if (!atomic_read(&dentry->d_count))
+ BUG();
if (dentry)
atomic_inc(&dentry->d_count);
return dentry;
@@ -244,12 +246,7 @@ static __inline__ int d_unhashed(struct dentry *dentry)
return list_empty(&dentry->d_hash);
}
-extern void __dput(struct dentry *);
-static __inline__ void dput(struct dentry *dentry)
-{
- if (dentry && atomic_dec_and_test(&dentry->d_count))
- __dput(dentry);
-}
+extern void dput(struct dentry *);
static __inline__ int d_mountpoint(struct dentry *dentry)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index da3f0e56e..dcb93786e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -883,8 +883,9 @@ extern struct file *filp_open(const char *, int, int);
extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
extern int filp_close(struct file *, fl_owner_t id);
extern char * getname(const char *);
-#define __getname() ((char *) __get_free_page(GFP_KERNEL))
-#define putname(name) free_page((unsigned long)(name))
+
+#define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL)
+#define putname(name) kmem_cache_free(names_cachep, (void *)(name))
enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 019538c7c..58edc3b0c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -35,9 +35,7 @@ typedef struct kmem_cache_s kmem_cache_t;
#define SLAB_POISON 0x00000800UL /* Poison objects */
#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */
#define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */
-#if 0
-#define SLAB_HIGH_PACK 0x00004000UL /* XXX */
-#endif
+#define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */
/* flags passed to a constructor func */
#define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */
@@ -47,7 +45,9 @@ typedef struct kmem_cache_s kmem_cache_t;
/* prototypes */
extern void kmem_cache_init(void);
extern void kmem_cache_sizes_init(void);
-extern kmem_cache_t *kmem_find_general_cachep(size_t);
+extern void kmem_cpucache_init(void);
+
+extern kmem_cache_t *kmem_find_general_cachep(size_t, int gfpflags);
extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned long,
void (*)(void *, kmem_cache_t *, unsigned long),
void (*)(void *, kmem_cache_t *, unsigned long));
@@ -58,14 +58,18 @@ extern void kmem_cache_free(kmem_cache_t *, void *);
extern void *kmalloc(size_t, int);
extern void kfree(const void *);
-extern void kfree_s(const void *, size_t);
+#define kfree_s(objp,s) kfree(objp)
extern void kmem_cache_reap(int);
-extern int get_slabinfo(char *);
+extern int slabinfo_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+extern int slabinfo_write_proc(struct file *file, const char *buffer,
+ unsigned long count, void *data);
/* System wide caches */
extern kmem_cache_t *vm_area_cachep;
extern kmem_cache_t *mm_cachep;
+extern kmem_cache_t *names_cachep;
#endif /* __KERNEL__ */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index b4a190d65..86fce1e7d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -40,6 +40,8 @@
#if (DEBUG_SPINLOCKS < 1)
+#define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
+
/*
* Your basic spinlocks, allowing only a single CPU anywhere
*
@@ -122,4 +124,11 @@ typedef struct {
#define write_unlock(lock) do { } while(0)
#endif /* !SMP */
+
+/* "lock on reference count zero" */
+#ifndef atomic_dec_and_lock
+#include <asm/atomic.h>
+extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
+#endif
+
#endif /* __LINUX_SPINLOCK_H */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a45f0ae2e..8b559703a 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -43,7 +43,7 @@ struct rpc_task {
struct rpc_task * tk_prev_task; /* global list of tasks */
struct rpc_clnt * tk_client; /* RPC client */
struct rpc_rqst * tk_rqstp; /* RPC request */
- int tk_status; /* result of last operation */
+ volatile int tk_status; /* result of last operation */
struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */
/*
@@ -56,10 +56,12 @@ struct rpc_task {
tk_suid_retry;
/*
+ * timeout_fn to be executed by timer bottom half
* callback to be executed after waking up
* action next procedure for async tasks
* exit exit async task and report to caller
*/
+ void (*tk_timeout_fn)(struct rpc_task *);
void (*tk_callback)(struct rpc_task *);
void (*tk_action)(struct rpc_task *);
void (*tk_exit)(struct rpc_task *);
@@ -76,9 +78,10 @@ struct rpc_task {
unsigned long tk_timeout; /* timeout for rpc_sleep() */
unsigned short tk_flags; /* misc flags */
unsigned short tk_lock; /* Task lock counter */
- unsigned int tk_wakeup : 1,/* Task waiting to wake up */
- tk_sleeping : 1,/* Task is truly asleep */
- tk_active : 1;/* Task has been activated */
+ unsigned char tk_active : 1,/* Task has been activated */
+ tk_wakeup : 1;/* Task waiting to wake up */
+ volatile unsigned char tk_running : 1,/* Task is running */
+ tk_sleeping : 1;/* Task is truly asleep */
#ifdef RPC_DEBUG
unsigned short tk_pid; /* debugging aid */
#endif
@@ -91,29 +94,26 @@ typedef void (*rpc_action)(struct rpc_task *);
/*
* RPC task flags
*/
-#define RPC_TASK_RUNNING 0x0001 /* is running */
-#define RPC_TASK_ASYNC 0x0002 /* is an async task */
-#define RPC_TASK_CALLBACK 0x0004 /* invoke callback */
-#define RPC_TASK_SWAPPER 0x0008 /* is swapping in/out */
-#define RPC_TASK_SETUID 0x0010 /* is setuid process */
-#define RPC_TASK_CHILD 0x0020 /* is child of other task */
-#define RPC_CALL_REALUID 0x0040 /* try using real uid */
-#define RPC_CALL_MAJORSEEN 0x0080 /* major timeout seen */
-#define RPC_TASK_ROOTCREDS 0x0100 /* force root creds */
-#define RPC_TASK_DYNAMIC 0x0200 /* task was kmalloc'ed */
-#define RPC_TASK_KILLED 0x0400 /* task was killed */
-#define RPC_TASK_NFSWRITE 0x1000 /* an NFS writeback */
-
-#define RPC_IS_RUNNING(t) ((t)->tk_flags & RPC_TASK_RUNNING)
+#define RPC_TASK_ASYNC 0x0001 /* is an async task */
+#define RPC_TASK_SWAPPER 0x0002 /* is swapping in/out */
+#define RPC_TASK_SETUID 0x0004 /* is setuid process */
+#define RPC_TASK_CHILD 0x0008 /* is child of other task */
+#define RPC_CALL_REALUID 0x0010 /* try using real uid */
+#define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */
+#define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */
+#define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */
+#define RPC_TASK_KILLED 0x0100 /* task was killed */
+
#define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC)
#define RPC_IS_SETUID(t) ((t)->tk_flags & RPC_TASK_SETUID)
#define RPC_IS_CHILD(t) ((t)->tk_flags & RPC_TASK_CHILD)
#define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER)
-#define RPC_DO_CALLBACK(t) ((t)->tk_flags & RPC_TASK_CALLBACK)
#define RPC_DO_ROOTOVERRIDE(t) ((t)->tk_flags & RPC_TASK_ROOTCREDS)
#define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED)
+#define RPC_IS_RUNNING(t) ((t)->tk_running)
#define RPC_IS_SLEEPING(t) ((t)->tk_sleeping)
#define RPC_IS_ACTIVATED(t) ((t)->tk_active)
+#define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL)
/*
* RPC synchronization objects
@@ -154,7 +154,7 @@ void rpc_wake_up_task(struct rpc_task *);
void rpc_wake_up(struct rpc_wait_queue *);
struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
void rpc_wake_up_status(struct rpc_wait_queue *, int);
-int rpc_lock_task(struct rpc_task *);
+int __rpc_lock_task(struct rpc_task *);
void rpc_unlock_task(struct rpc_task *);
void rpc_delay(struct rpc_task *, unsigned long);
void * rpc_allocate(unsigned int flags, unsigned int);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index e5e66c1de..dd27162ff 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -96,7 +96,7 @@ struct rpc_rqst {
struct rpc_task * rq_task; /* RPC task data */
__u32 rq_xid; /* request XID */
struct rpc_rqst * rq_next; /* free list */
- unsigned char rq_damaged; /* reply being received */
+ volatile unsigned char rq_received : 1;/* receive completed */
/*
* For authentication (e.g. auth_des)
@@ -138,9 +138,9 @@ struct rpc_xprt {
struct rpc_wait_queue reconn; /* waiting for reconnect */
struct rpc_rqst * free; /* free slots */
struct rpc_rqst slot[RPC_MAXREQS];
- unsigned int connected : 1, /* TCP: connected */
- write_space: 1, /* TCP: can send */
- shutdown : 1, /* being shut down */
+ volatile unsigned char connected : 1, /* TCP: connected */
+ write_space: 1; /* TCP: can send */
+ unsigned char shutdown : 1, /* being shut down */
nocong : 1, /* no congestion control */
stream : 1, /* TCP */
tcp_more : 1, /* more record fragments */
diff --git a/init/main.c b/init/main.c
index e8d8f0d99..01df23fc8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -596,6 +596,7 @@ asmlinkage void __init start_kernel(void)
* make syscalls (and thus be locked).
*/
smp_init();
+ kmem_cpucache_init();
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
unlock_kernel();
current->need_resched = 1;
diff --git a/kernel/fork.c b/kernel/fork.c
index b9cc831ec..109219e0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -799,5 +799,5 @@ void __init filescache_init(void)
SLAB_HWCACHE_ALIGN,
NULL, NULL);
if (!files_cachep)
- panic("Cannot create files cache");
+ panic("Cannot create files cache");
}
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 5a1f457de..440aaf9f5 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL(kmem_cache_alloc);
EXPORT_SYMBOL(kmem_cache_free);
EXPORT_SYMBOL(kmalloc);
EXPORT_SYMBOL(kfree);
-EXPORT_SYMBOL(kfree_s);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(__vmalloc);
EXPORT_SYMBOL(mem_map);
@@ -213,7 +212,7 @@ EXPORT_SYMBOL(posix_test_lock);
EXPORT_SYMBOL(posix_block_lock);
EXPORT_SYMBOL(posix_unblock_lock);
EXPORT_SYMBOL(locks_mandatory_area);
-EXPORT_SYMBOL(__dput);
+EXPORT_SYMBOL(dput);
EXPORT_SYMBOL(have_submounts);
EXPORT_SYMBOL(d_find_alias);
EXPORT_SYMBOL(d_prune_aliases);
diff --git a/kernel/sys.c b/kernel/sys.c
index 8bd07d55f..3079dc295 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -278,7 +278,7 @@ asmlinkage long sys_getpriority(int which, int who)
*
* reboot doesn't sync: do that yourself before calling this.
*/
-asmlinkage long sys_reboot(int magic1, int magic2, int cmd, void * arg)
+asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void * arg)
{
char buffer[256];
diff --git a/lib/Makefile b/lib/Makefile
index fb090afcd..1f9068e4d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,4 +10,8 @@ L_TARGET := lib.a
L_OBJS := errno.o ctype.o string.o vsprintf.o brlock.o
LX_OBJS := cmdline.o
+ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
+ L_OBJS += dec_and_lock.o
+endif
+
include $(TOPDIR)/Rules.make
diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c
new file mode 100644
index 000000000..281bb359c
--- /dev/null
+++ b/lib/dec_and_lock.c
@@ -0,0 +1,37 @@
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+/*
+ * This is an architecture-neutral, but slow,
+ * implementation of the notion of "decrement
+ * a reference count, and return locked if it
+ * decremented to zero".
+ *
+ * NOTE NOTE NOTE! This is _not_ equivalent to
+ *
+ * if (atomic_dec_and_test(&atomic)) {
+ * spin_lock(&lock);
+ * return 1;
+ * }
+ * return 0;
+ *
+ * because the spin-lock and the decrement must be
+ * "atomic".
+ *
+ * This slow version gets the spinlock unconditionally,
+ * and releases it if it isn't needed. Architectures
+ * are encouraged to come up with better approaches,
+ * this is trivially done efficiently using a load-locked
+ * store-conditional approach, for example.
+ */
+
+#ifndef atomic_dec_and_lock
+int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (atomic_dec_and_test(atomic))
+ return 1;
+ spin_unlock(lock);
+ return 0;
+}
+#endif
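
For reference, the same "decrement and hand back the lock only on the final reference" contract can be modelled in plain userspace C; the pthread mutex, the C11 atomics and the main() driver below are assumptions of this sketch and are not part of the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Decrement *count; if it reached zero, return 1 with *lock held. */
static int dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
	pthread_mutex_lock(lock);	/* lock first, as in the slow generic version */
	if (atomic_fetch_sub(count, 1) == 1)
		return 1;		/* hit zero: caller now owns the lock */
	pthread_mutex_unlock(lock);
	return 0;
}

int main(void)
{
	atomic_int refs = 2;

	if (!dec_and_lock(&refs, &list_lock))
		printf("still referenced\n");
	if (dec_and_lock(&refs, &list_lock)) {
		printf("last reference dropped, lock held\n");
		pthread_mutex_unlock(&list_lock);
	}
	return 0;
}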
diff --git a/mm/slab.c b/mm/slab.c
index cccc16c58..ad1147500 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -5,513 +5,485 @@
*
* kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
*
+ * Major cleanup, different bufctl logic, per-cpu arrays
+ * (c) 2000 Manfred Spraul
+ *
+ * An implementation of the Slab Allocator as described in outline in:
+ * UNIX Internals: The New Frontiers by Uresh Vahalia
+ * Pub: Prentice Hall ISBN 0-13-101908-2
+ * or with a little more detail in:
+ * The Slab Allocator: An Object-Caching Kernel Memory Allocator
+ * Jeff Bonwick (Sun Microsystems).
+ * Presented at: USENIX Summer 1994 Technical Conference
+ *
+ *
+ * The memory is organized in caches, one cache for each object type.
+ * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
+ * Each cache consists of many slabs (they are small (usually one
+ * page long) and always contiguous), and each slab contains multiple
+ * initialized objects.
+ *
+ * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
+ * normal). If you need a special memory type, then must create a new
+ * cache for that memory type.
+ *
+ * In order to reduce fragmentation, the slabs are sorted in 3 groups:
+ * full slabs with 0 free objects
+ * partial slabs
+ * empty slabs with no allocated objects
+ *
+ * If partial slabs exist, then new allocations come from these slabs;
+ * otherwise they come from empty slabs, or new slabs are allocated.
+ *
+ * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
+ * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
+ *
+ * On SMP systems, each cache has a short per-cpu head array, most allocs
+ * and frees go into that array, and if that array overflows, then 1/2
+ * of the entries in the array are given back into the global cache.
+ * This reduces the number of spinlock operations.
+ *
+ * The per-cpu arrays (cpudata) can be changed with an smp_call_function()
+ * call; they may not be read with local interrupts enabled.
+ *
+ * SMP synchronization:
+ * constructors and destructors are called without any locking.
+ * Several members in kmem_cache_t and slab_t never change; they
+ * are accessed without any locking.
+ * The per-cpu arrays are never accessed from the wrong cpu, so no locking is needed.
+ * smp_call_function() is used if one cpu must flush the arrays from
+ * other cpus.
+ * The non-constant members are protected with a per-cache irq spinlock.
+ *
+ * Further notes from the original documentation:
+ *
* 11 April '97. Started multi-threading - markhe
* The global cache-chain is protected by the semaphore 'cache_chain_sem'.
* The sem is only needed when accessing/extending the cache-chain, which
* can never happen inside an interrupt (kmem_cache_create(),
* kmem_cache_shrink() and kmem_cache_reap()).
- * This is a medium-term exclusion lock.
- *
- * Each cache has its own lock; 'c_spinlock'. This lock is needed only
- * when accessing non-constant members of a cache-struct.
- * Note: 'constant members' are assigned a value in kmem_cache_create() before
- * the cache is linked into the cache-chain. The values never change, so not
- * even a multi-reader lock is needed for these members.
- * The c_spinlock is only ever held for a few cycles.
*
* To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
 * may be sleeping and therefore not holding the semaphore/lock), the
- * c_growing field is used. This also prevents reaping from a cache.
- *
- * Note, caches can _never_ be destroyed. When a sub-system (eg module) has
- * finished with a cache, it can only be shrunk. This leaves the cache empty,
- * but already enabled for re-use, eg. during a module re-load.
- *
- * Notes:
- * o Constructors/deconstructors are called while the cache-lock
- * is _not_ held. Therefore they _must_ be threaded.
- * o Constructors must not attempt to allocate memory from the
- * same cache that they are a constructor for - infinite loop!
- * (There is no easy way to trap this.)
- * o The per-cache locks must be obtained with local-interrupts disabled.
- * o When compiled with debug support, and an object-verify (upon release)
- * is request for a cache, the verify-function is called with the cache
- * lock held. This helps debugging.
- * o The functions called from try_to_free_page() must not attempt
- * to allocate memory from a cache which is being grown.
- * The buffer sub-system might try to allocate memory, via buffer_cachep.
- * As this pri is passed to the SLAB, and then (if necessary) onto the
- * gfp() funcs (which avoid calling try_to_free_page()), no deadlock
- * should happen.
- *
- * The positioning of the per-cache lock is tricky. If the lock is
- * placed on the same h/w cache line as commonly accessed members
- * the number of L1 cache-line faults is reduced. However, this can
- * lead to the cache-line ping-ponging between processors when the
- * lock is in contention (and the common members are being accessed).
- * Decided to keep it away from common members.
- *
- * More fine-graining is possible, with per-slab locks...but this might be
- * taking fine graining too far, but would have the advantage;
- * During most allocs/frees no writes occur to the cache-struct.
- * Therefore a multi-reader/one writer lock could be used (the writer
- * needed when the slab chain is being link/unlinked).
- * As we would not have an exclusion lock for the cache-structure, one
- * would be needed per-slab (for updating s_free ptr, and/or the contents
- * of s_index).
- * The above locking would allow parallel operations to different slabs within
- * the same cache with reduced spinning.
- *
- * Per-engine slab caches, backed by a global cache (as in Mach's Zone allocator),
- * would allow most allocations from the same cache to execute in parallel.
+ * growing field is used. This also prevents reaping from a cache.
*
* At present, each engine can be growing a cache. This should be blocked.
*
- * It is not currently 100% safe to examine the page_struct outside of a kernel
- * or global cli lock. The risk is v. small, and non-fatal.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/*
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
+ * SLAB_RED_ZONE & SLAB_POISON.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS - 1 to collect stats for /proc/slabinfo.
+ * 0 for faster, smaller code (especially in the critical paths).
*
- * Calls to printk() are not 100% safe (the function is not threaded). However,
- * printk() is only used under an error condition, and the risk is v. small (not
- * sure if the console write functions 'enjoy' executing multiple contexts in
- * parallel. I guess they don't...).
- * Note, for most calls to printk() any held cache-lock is dropped. This is not
- * always done for text size reasons - having *_unlock() everywhere is bloat.
+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
*/
+#define DEBUG 0
+#define STATS 0
+#define FORCED_DEBUG 0
+
/*
- * An implementation of the Slab Allocator as described in outline in;
- * UNIX Internals: The New Frontiers by Uresh Vahalia
- * Pub: Prentice Hall ISBN 0-13-101908-2
- * or with a little more detail in;
- * The Slab Allocator: An Object-Caching Kernel Memory Allocator
- * Jeff Bonwick (Sun Microsystems).
- * Presented at: USENIX Summer 1994 Technical Conference
+ * Parameters for kmem_cache_reap
*/
+#define REAP_SCANLEN 10
+#define REAP_PERFECT 10
+
+/* Shouldn't this be in a header file somewhere? */
+#define BYTES_PER_WORD sizeof(void *)
+
+/* Legal flag mask for kmem_cache_create(). */
+#if DEBUG
+# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+ SLAB_NO_REAP | SLAB_CACHE_DMA)
+#else
+# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | SLAB_CACHE_DMA)
+#endif
/*
- * This implementation deviates from Bonwick's paper as it
- * does not use a hash-table for large objects, but rather a per slab
- * index to hold the bufctls. This allows the bufctl structure to
- * be small (one word), but limits the number of objects a slab (not
- * a cache) can contain when off-slab bufctls are used. The limit is the
- * size of the largest general cache that does not use off-slab bufctls,
- * divided by the size of a bufctl. For 32bit archs, is this 256/4 = 64.
+ * kmem_bufctl_t:
+ *
+ * Bufctl's are used for linking objs within a slab, as a list of
+ * linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, this is 56.
* This is not serious, as it is only for large objects, when it is unwise
* to have too many per slab.
* Note: This limit can be raised by introducing a general cache whose size
* is less than 512 (PAGE_SIZE<<3), but greater than 256.
*/
-#include <linux/config.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
+#define BUFCTL_END 0xffffFFFF
+#define SLAB_LIMIT 0xffffFFFE
+typedef unsigned int kmem_bufctl_t;
-/* If there is a different PAGE_SIZE around, and it works with this allocator,
- * then change the following.
+/* Max number of objs-per-slab for caches which use off-slab slabs.
+ * Needed to avoid a possible looping condition in kmem_cache_grow().
*/
-#if (PAGE_SIZE != 8192 && PAGE_SIZE != 4096 && PAGE_SIZE != 16384 && PAGE_SIZE != 32768)
-#error Your page size is probably not correctly supported - please check
-#endif
+static unsigned long offslab_limit;
-/* SLAB_MGMT_CHECKS - 1 to enable extra checks in kmem_cache_create().
- * 0 if you wish to reduce memory usage.
+/*
+ * slab_t
*
- * SLAB_DEBUG_SUPPORT - 1 for kmem_cache_create() to honour; SLAB_DEBUG_FREE,
- * SLAB_DEBUG_INITIAL, SLAB_RED_ZONE & SLAB_POISON.
- * 0 for faster, smaller, code (especially in the critical paths).
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into one ordered list: fully used, partial, then fully
+ * free slabs.
+ */
+typedef struct slab_s {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+} slab_t;
+
+#define slab_bufctl(slabp) \
+ ((kmem_bufctl_t *)(((slab_t*)slabp)+1))
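
A small standalone model may help picture the index-based freelist that slab_bufctl() exposes: each slot of the bufctl array holds the index of the next free object, and the slab's free member holds the head of the chain. NOBJ, the static arrays and the main() driver below are invented for illustration and do not appear in the patch.

#include <stdio.h>

#define NOBJ		8
#define BUFCTL_END	0xffffFFFF

typedef unsigned int kmem_bufctl_t;

static kmem_bufctl_t bufctl[NOBJ];	/* next-free index, one per object */
static kmem_bufctl_t free_head;		/* index of first free object */

static void slab_init(void)
{
	unsigned int i;

	for (i = 0; i < NOBJ - 1; i++)
		bufctl[i] = i + 1;	/* each slot points at the next one */
	bufctl[NOBJ - 1] = BUFCTL_END;
	free_head = 0;
}

static int obj_alloc(void)		/* returns an object index, or -1 */
{
	kmem_bufctl_t i = free_head;

	if (i == BUFCTL_END)
		return -1;
	free_head = bufctl[i];
	return (int)i;
}

static void obj_free(int i)
{
	bufctl[i] = free_head;		/* push back onto the freelist */
	free_head = (kmem_bufctl_t)i;
}

int main(void)
{
	slab_init();
	int a = obj_alloc(), b = obj_alloc();
	printf("got %d and %d\n", a, b);	/* 0 and 1 */
	obj_free(0);
	printf("reused %d\n", obj_alloc());	/* 0 again */
	return 0;
}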
+
+/*
+ * cpucache_t
*
- * SLAB_STATS - 1 to collect stats for /proc/slabinfo.
- * 0 for faster, smaller, code (especially in the critical paths).
+ * Per cpu structures
+ * The limit is stored in the per-cpu structure to reduce the data cache
+ * footprint.
+ */
+typedef struct cpucache_s {
+ unsigned int avail;
+ unsigned int limit;
+} cpucache_t;
+
+#define cc_entry(cpucache) \
+ ((void **)(((cpucache_t*)cpucache)+1))
+#define cc_data(cachep) \
+ ((cachep)->cpudata[smp_processor_id()])
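
The pointer arithmetic behind cc_entry() assumes the header and its array of cached object pointers share one allocation, with the array starting immediately after the header. A rough userspace sketch follows; the malloc()-based setup and the dummy object are assumptions of the demo, not how the kernel sizes the real per-cpu copies.

#include <stdio.h>
#include <stdlib.h>

typedef struct cpucache_s {
	unsigned int avail;		/* entries currently cached */
	unsigned int limit;		/* capacity of the array below */
} cpucache_t;

#define cc_entry(cc)	((void **)((cpucache_t *)(cc) + 1))

int main(void)
{
	unsigned int limit = 4;
	cpucache_t *cc = malloc(sizeof(cpucache_t) + limit * sizeof(void *));

	if (!cc)
		return 1;
	cc->avail = 0;
	cc->limit = limit;

	/* a "free" pushes into the array while there is room ... */
	int dummy;
	cc_entry(cc)[cc->avail++] = &dummy;

	/* ... and an "alloc" pops the most recently freed object */
	void *obj = cc_entry(cc)[--cc->avail];
	printf("cached entry %p, %u left\n", obj, cc->avail);

	free(cc);
	return 0;
}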
+/*
+ * kmem_cache_t
*
- * SLAB_SELFTEST - 1 to perform a few tests, mainly for development.
+ * manages a cache.
*/
-#define SLAB_MGMT_CHECKS 1
-#define SLAB_DEBUG_SUPPORT 1
-#define SLAB_STATS 0
-#define SLAB_SELFTEST 0
-/* Shouldn't this be in a header file somewhere? */
-#define BYTES_PER_WORD sizeof(void *)
+#define CACHE_NAMELEN 20 /* max name length for a slab cache */
-/* Legal flag mask for kmem_cache_create(). */
-#if SLAB_DEBUG_SUPPORT
-#if 0
-#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
- SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP| \
- SLAB_HIGH_PACK)
-#endif
-#define SLAB_C_MASK (SLAB_DEBUG_FREE|SLAB_DEBUG_INITIAL|SLAB_RED_ZONE| \
- SLAB_POISON|SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
-#else
-#if 0
-#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP|SLAB_HIGH_PACK)
+struct kmem_cache_s {
+/* 1) each alloc & free */
+ /* full, partial first, then free */
+ struct list_head slabs;
+ struct list_head *firstnotfull;
+ unsigned int objsize;
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+ spinlock_t spinlock;
+#ifdef CONFIG_SMP
+ unsigned int batchcount;
#endif
-#define SLAB_C_MASK (SLAB_HWCACHE_ALIGN|SLAB_NO_REAP)
-#endif /* SLAB_DEBUG_SUPPORT */
-
-/* Slab management struct.
- * Manages the objs in a slab. Placed either at the end of mem allocated
- * for a slab, or from an internal obj cache (cache_slabp).
- * Slabs are chained into a partially ordered list; fully used first, partial
- * next, and then fully free slabs.
- * The first 4 members are referenced during an alloc/free operation, and
- * should always appear on the same cache line.
- * Note: The offset between some members _must_ match offsets within
- * the kmem_cache_t - see kmem_cache_init() for the checks. */
-
-#define SLAB_OFFSET_BITS 16 /* could make this larger for 64bit archs */
-
-typedef struct kmem_slab_s {
- struct kmem_bufctl_s *s_freep; /* ptr to first inactive obj in slab */
- struct kmem_bufctl_s *s_index;
- unsigned long s_magic;
- unsigned long s_inuse; /* num of objs active in slab */
-
- struct kmem_slab_s *s_nextp;
- struct kmem_slab_s *s_prevp;
- void *s_mem; /* addr of first obj in slab */
- unsigned long s_offset:SLAB_OFFSET_BITS,
- s_dma:1;
-} kmem_slab_t;
-
-/* When the slab management is on-slab, this gives the size to use. */
-#define slab_align_size (L1_CACHE_ALIGN(sizeof(kmem_slab_t)))
-
-/* Test for end of slab chain. */
-#define kmem_slab_end(x) ((kmem_slab_t*)&((x)->c_offset))
-
-/* s_magic */
-#define SLAB_MAGIC_ALLOC 0xA5C32F2BUL /* slab is alive */
-#define SLAB_MAGIC_DESTROYED 0xB2F23C5AUL /* slab has been destroyed */
-
-/* Bufctl's are used for linking objs within a slab, identifying what slab an obj
- * is in, and the address of the associated obj (for sanity checking with off-slab
- * bufctls). What a bufctl contains depends upon the state of the obj and
- * the organisation of the cache.
- */
-typedef struct kmem_bufctl_s {
- union {
- struct kmem_bufctl_s *buf_nextp;
- kmem_slab_t *buf_slabp; /* slab for obj */
- void * buf_objp;
- } u;
-} kmem_bufctl_t;
-
-/* ...shorthand... */
-#define buf_nextp u.buf_nextp
-#define buf_slabp u.buf_slabp
-#define buf_objp u.buf_objp
-
-#if SLAB_DEBUG_SUPPORT
-/* Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
-#define SLAB_RED_MAGIC1 0x5A2CF071UL /* when obj is active */
-#define SLAB_RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */
-/* ...and for poisoning */
-#define SLAB_POISON_BYTE 0x5a /* byte value for poisoning */
-#define SLAB_POISON_END 0xa5 /* end-byte of poisoning */
+/* 2) slab additions/removals */
+ /* order of pgs per slab (2^n) */
+ unsigned int gfporder;
-#endif /* SLAB_DEBUG_SUPPORT */
+ /* force GFP flags, e.g. GFP_DMA */
+ unsigned int gfpflags;
-#define SLAB_CACHE_NAME_LEN 20 /* max name length for a slab cache */
+ size_t colour; /* cache colouring range */
+ unsigned int colour_off; /* colour offset */
+ unsigned int colour_next; /* cache colouring */
+ kmem_cache_t *slabp_cache;
+ unsigned int growing;
+ unsigned int dflags; /* dynamic flags */
-/* Cache struct - manages a cache.
- * First four members are commonly referenced during an alloc/free operation.
- */
-struct kmem_cache_s {
- kmem_slab_t *c_freep; /* first slab w. free objs */
- unsigned long c_flags; /* constant flags */
- unsigned long c_offset;
- unsigned long c_num; /* # of objs per slab */
-
- unsigned long c_magic;
- unsigned long c_inuse; /* kept at zero */
- kmem_slab_t *c_firstp; /* first slab in chain */
- kmem_slab_t *c_lastp; /* last slab in chain */
-
- spinlock_t c_spinlock;
- unsigned long c_growing;
- unsigned long c_dflags; /* dynamic flags */
- size_t c_org_size;
- unsigned long c_gfporder; /* order of pgs per slab (2^n) */
- void (*c_ctor)(void *, kmem_cache_t *, unsigned long); /* constructor func */
- void (*c_dtor)(void *, kmem_cache_t *, unsigned long); /* de-constructor func */
- unsigned long c_align; /* alignment of objs */
- size_t c_colour; /* cache colouring range */
- size_t c_colour_next;/* cache colouring */
- unsigned long c_failures;
- char c_name[SLAB_CACHE_NAME_LEN];
- struct kmem_cache_s *c_nextp;
- kmem_cache_t *c_index_cachep;
-#if SLAB_STATS
- unsigned long c_num_active;
- unsigned long c_num_allocations;
- unsigned long c_high_mark;
- unsigned long c_grown;
- unsigned long c_reaped;
- atomic_t c_errors;
-#endif /* SLAB_STATS */
+ /* constructor func */
+ void (*ctor)(void *, kmem_cache_t *, unsigned long);
+
+ /* de-constructor func */
+ void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
+ unsigned long failures;
+
+/* 3) cache creation/removal */
+ char name[CACHE_NAMELEN];
+ struct list_head next;
+#ifdef CONFIG_SMP
+/* 4) per-cpu data */
+ cpucache_t *cpudata[NR_CPUS];
+#endif
+#if STATS
+ unsigned long num_active;
+ unsigned long num_allocations;
+ unsigned long high_mark;
+ unsigned long grown;
+ unsigned long reaped;
+ unsigned long errors;
+#ifdef CONFIG_SMP
+ atomic_t allochit;
+ atomic_t allocmiss;
+ atomic_t freehit;
+ atomic_t freemiss;
+#endif
+#endif
};
/* internal c_flags */
-#define SLAB_CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
-#define SLAB_CFLGS_BUFCTL 0x020000UL /* bufctls in own cache */
-#define SLAB_CFLGS_GENERAL 0x080000UL /* a general cache */
-
-/* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
-#define SLAB_CFLGS_GROWN 0x000002UL /* don't reap a recently grown */
-
-#define SLAB_OFF_SLAB(x) ((x) & SLAB_CFLGS_OFF_SLAB)
-#define SLAB_BUFCTL(x) ((x) & SLAB_CFLGS_BUFCTL)
-#define SLAB_GROWN(x) ((x) & SLAB_CFLGS_GROWN)
-
-#if SLAB_STATS
-#define SLAB_STATS_INC_ACTIVE(x) ((x)->c_num_active++)
-#define SLAB_STATS_DEC_ACTIVE(x) ((x)->c_num_active--)
-#define SLAB_STATS_INC_ALLOCED(x) ((x)->c_num_allocations++)
-#define SLAB_STATS_INC_GROWN(x) ((x)->c_grown++)
-#define SLAB_STATS_INC_REAPED(x) ((x)->c_reaped++)
-#define SLAB_STATS_SET_HIGH(x) do { if ((x)->c_num_active > (x)->c_high_mark) \
- (x)->c_high_mark = (x)->c_num_active; \
- } while (0)
-#define SLAB_STATS_INC_ERR(x) (atomic_inc(&(x)->c_errors))
+#define CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */
+#define CFLGS_OPTIMIZE 0x020000UL /* optimized slab lookup */
+
+/* c_dflags (dynamic flags). Need to hold the spinlock to access this member */
+#define DFLGS_GROWN 0x000001UL /* don't reap a recently grown */
+
+#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
+#define OPTIMIZE(x) ((x)->flags & CFLGS_OPTIMIZE)
+#define GROWN(x)	((x)->dflags & DFLGS_GROWN)
+
+#if STATS
+#define STATS_INC_ACTIVE(x) ((x)->num_active++)
+#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
+#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
+#define STATS_INC_GROWN(x) ((x)->grown++)
+#define STATS_INC_REAPED(x) ((x)->reaped++)
+#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \
+ (x)->high_mark = (x)->num_active; \
+ } while (0)
+#define STATS_INC_ERR(x) ((x)->errors++)
#else
-#define SLAB_STATS_INC_ACTIVE(x)
-#define SLAB_STATS_DEC_ACTIVE(x)
-#define SLAB_STATS_INC_ALLOCED(x)
-#define SLAB_STATS_INC_GROWN(x)
-#define SLAB_STATS_INC_REAPED(x)
-#define SLAB_STATS_SET_HIGH(x)
-#define SLAB_STATS_INC_ERR(x)
-#endif /* SLAB_STATS */
-
-#if SLAB_SELFTEST
-#if !SLAB_DEBUG_SUPPORT
-#error Debug support needed for self-test
+#define STATS_INC_ACTIVE(x) do { } while (0)
+#define STATS_DEC_ACTIVE(x) do { } while (0)
+#define STATS_INC_ALLOCED(x) do { } while (0)
+#define STATS_INC_GROWN(x) do { } while (0)
+#define STATS_INC_REAPED(x) do { } while (0)
+#define STATS_SET_HIGH(x) do { } while (0)
+#define STATS_INC_ERR(x) do { } while (0)
#endif
-static void kmem_self_test(void);
-#endif /* SLAB_SELFTEST */
-/* c_magic - used to detect 'out of slabs' in __kmem_cache_alloc() */
-#define SLAB_C_MAGIC 0x4F17A36DUL
+#if STATS && defined(CONFIG_SMP)
+#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
+#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
+#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
+#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
+#else
+#define STATS_INC_ALLOCHIT(x) do { } while (0)
+#define STATS_INC_ALLOCMISS(x) do { } while (0)
+#define STATS_INC_FREEHIT(x) do { } while (0)
+#define STATS_INC_FREEMISS(x) do { } while (0)
+#endif
-/* maximum size of an obj (in 2^order pages) */
-#define SLAB_OBJ_MAX_ORDER 5 /* 32 pages */
+#if DEBUG
+/* Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_MAGIC1 0x5A2CF071UL /* when obj is active */
+#define RED_MAGIC2 0x170FC2A5UL /* when obj is inactive */
+
+/* ...and for poisoning */
+#define POISON_BYTE 0x5a /* byte value for poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
-/* maximum num of pages for a slab (prevents large requests to the VM layer) */
-#define SLAB_MAX_GFP_ORDER 5 /* 32 pages */
+#endif
-/* the 'preferred' minimum num of objs per slab - maybe less for large objs */
-#define SLAB_MIN_OBJS_PER_SLAB 4
+/* maximum size of an obj (in 2^order pages) */
+#define MAX_OBJ_ORDER 5 /* 32 pages */
-/* If the num of objs per slab is <= SLAB_MIN_OBJS_PER_SLAB,
- * then the page order must be less than this before trying the next order.
+/*
+ * Do not go above this order unless zero objects would fit into the slab
+ * at a lower order.
*/
-#define SLAB_BREAK_GFP_ORDER_HI 2
-#define SLAB_BREAK_GFP_ORDER_LO 1
-static int slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_LO;
+#define BREAK_GFP_ORDER_HI 2
+#define BREAK_GFP_ORDER_LO 1
+static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+
+/*
+ * Absolute limit for the gfp order
+ */
+#define MAX_GFP_ORDER 5 /* 32 pages */
+
 /* Macros for storing/retrieving the cachep and/or slab from the
- * global 'mem_map'. With off-slab bufctls, these are used to find the
- * slab an obj belongs to. With kmalloc(), and kfree(), these are used
- * to find the cache which an obj belongs to.
+ * global 'mem_map'. These are used to find the slab an obj belongs to.
+ * With kfree(), these are used to find the cache which an obj belongs to.
*/
-#define SLAB_SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x))
-#define SLAB_GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next)
-#define SLAB_SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x))
-#define SLAB_GET_PAGE_SLAB(pg) ((kmem_slab_t *)(pg)->list.prev)
+#define SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x))
+#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next)
+#define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x))
+#define GET_PAGE_SLAB(pg) ((slab_t *)(pg)->list.prev)
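
A quick self-contained check of the round trip these macros rely on; struct page is reduced to just its list member here, and the int stand-ins for the cache and slab are purely illustrative.

#include <assert.h>

struct list_head { struct list_head *next, *prev; };
struct page { struct list_head list; };

#define SET_PAGE_CACHE(pg,x)	((pg)->list.next = (struct list_head *)(x))
#define GET_PAGE_CACHE(pg)	((void *)(pg)->list.next)
#define SET_PAGE_SLAB(pg,x)	((pg)->list.prev = (struct list_head *)(x))
#define GET_PAGE_SLAB(pg)	((void *)(pg)->list.prev)

int main(void)
{
	struct page pg;
	int cache_obj, slab_obj;	/* stand-ins for kmem_cache_t / slab_t */

	SET_PAGE_CACHE(&pg, &cache_obj);
	SET_PAGE_SLAB(&pg, &slab_obj);
	assert(GET_PAGE_CACHE(&pg) == (void *)&cache_obj);
	assert(GET_PAGE_SLAB(&pg) == (void *)&slab_obj);
	return 0;
}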
/* Size description struct for general caches. */
typedef struct cache_sizes {
size_t cs_size;
kmem_cache_t *cs_cachep;
+ kmem_cache_t *cs_dmacachep;
} cache_sizes_t;
static cache_sizes_t cache_sizes[] = {
-#if PAGE_SIZE == 4096
- { 32, NULL},
-#endif
- { 64, NULL},
- { 128, NULL},
- { 256, NULL},
- { 512, NULL},
- {1024, NULL},
- {2048, NULL},
- {4096, NULL},
- {8192, NULL},
- {16384, NULL},
- {32768, NULL},
- {65536, NULL},
- {131072, NULL},
- {0, NULL}
-};
-
-/* Names for the general caches. Not placed into the sizes struct for
- * a good reason; the string ptr is not needed while searching in kmalloc(),
- * and would 'get-in-the-way' in the h/w cache.
- */
-static char *cache_sizes_name[] = {
-#if PAGE_SIZE == 4096
- "size-32",
+#if PAGE_SIZE == 4096
+ { 32, NULL, NULL},
#endif
- "size-64",
- "size-128",
- "size-256",
- "size-512",
- "size-1024",
- "size-2048",
- "size-4096",
- "size-8192",
- "size-16384",
- "size-32768",
- "size-65536",
- "size-131072"
+ { 64, NULL, NULL},
+ { 128, NULL, NULL},
+ { 256, NULL, NULL},
+ { 512, NULL, NULL},
+ { 1024, NULL, NULL},
+ { 2048, NULL, NULL},
+ { 4096, NULL, NULL},
+ { 8192, NULL, NULL},
+ { 16384, NULL, NULL},
+ { 32768, NULL, NULL},
+ { 65536, NULL, NULL},
+ {131072, NULL, NULL},
+ { 0, NULL, NULL}
};
/* internal cache of cache description objs */
-static kmem_cache_t cache_cache = {
-/* freep, flags */ kmem_slab_end(&cache_cache), SLAB_NO_REAP,
-/* offset, num */ sizeof(kmem_cache_t), 0,
-/* c_magic, c_inuse */ SLAB_C_MAGIC, 0,
-/* firstp, lastp */ kmem_slab_end(&cache_cache), kmem_slab_end(&cache_cache),
-/* spinlock */ SPIN_LOCK_UNLOCKED,
-/* growing */ 0,
-/* dflags */ 0,
-/* org_size, gfp */ 0, 0,
-/* ctor, dtor, align */ NULL, NULL, L1_CACHE_BYTES,
-/* colour, colour_next */ 0, 0,
-/* failures */ 0,
-/* name */ "kmem_cache",
-/* nextp */ &cache_cache,
-/* index */ NULL,
+static kmem_cache_t cache_cache = {
+ slabs: LIST_HEAD_INIT(cache_cache.slabs),
+ firstnotfull: &cache_cache.slabs,
+ objsize: sizeof(kmem_cache_t),
+ flags: SLAB_NO_REAP,
+ spinlock: SPIN_LOCK_UNLOCKED,
+ colour_off: L1_CACHE_BYTES,
+ name: "kmem_cache"
};
/* Guard access to the cache-chain. */
static struct semaphore cache_chain_sem;
/* Place maintainer for reaping. */
-static kmem_cache_t *clock_searchp = &cache_cache;
+static kmem_cache_t *clock_searchp = &cache_cache;
-/* Internal slab management cache, for when slab management is off-slab. */
-static kmem_cache_t *cache_slabp;
+#define cache_chain (cache_cache.next)
-/* Max number of objs-per-slab for caches which use bufctl's.
- * Needed to avoid a possible looping condition in kmem_cache_grow().
+#ifdef CONFIG_SMP
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
*/
-static unsigned long bufctl_limit;
+static int g_cpucache_up;
+
+static void drain_cache (void *__cachep);
+static void enable_cpucache (kmem_cache_t *cachep);
+static void enable_all_cpucaches (void);
+#endif
+
+/* Calculate the number of objs, wastage, and bytes left over for a given slab size. */
+static void kmem_cache_estimate (unsigned long gfporder, size_t size,
+ int flags, size_t *left_over, unsigned int *num)
+{
+ int i;
+ size_t wastage = PAGE_SIZE<<gfporder;
+ size_t extra = 0;
+ size_t base = 0;
+
+ if (!(flags & CFLGS_OFF_SLAB)) {
+ base = sizeof(slab_t);
+ extra = sizeof(kmem_bufctl_t);
+ }
+ i = 0;
+ while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
+ i++;
+ if (i > 0)
+ i--;
+
+ if (i > SLAB_LIMIT)
+ i = SLAB_LIMIT;
+
+ *num = i;
+ wastage -= i*size;
+ wastage -= L1_CACHE_ALIGN(base+i*extra);
+ *left_over = wastage;
+}
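
Run outside the kernel, the same fitting loop gives a feel for the numbers involved; the slab_model_t layout, the 4 kB page and the 32-byte cache line below are assumptions of this sketch rather than values taken from the patch.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define L1_CACHE_BYTES	32UL
#define L1_CACHE_ALIGN(x) (((x) + L1_CACHE_BYTES - 1) & ~(L1_CACHE_BYTES - 1))

/* rough stand-in for slab_t: list pointers, colouroff, s_mem, inuse, free */
typedef struct {
	void *next, *prev;
	unsigned long colouroff;
	void *s_mem;
	unsigned int inuse, free;
} slab_model_t;

int main(void)
{
	unsigned long size = 100;			/* object size */
	unsigned long base = sizeof(slab_model_t);	/* on-slab management */
	unsigned long extra = sizeof(unsigned int);	/* one bufctl per object */
	unsigned long wastage = PAGE_SIZE;		/* gfporder 0: one page */
	unsigned long i = 0;

	/* largest i such that i objects plus the aligned management still fit */
	while (i * size + L1_CACHE_ALIGN(base + i * extra) <= wastage)
		i++;
	if (i > 0)
		i--;

	wastage -= i * size;
	wastage -= L1_CACHE_ALIGN(base + i * extra);
	printf("%lu objects per slab, %lu bytes left over for colouring\n",
	       i, wastage);
	return 0;
}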
/* Initialisation - setup the `cache' cache. */
void __init kmem_cache_init(void)
{
- size_t size, i;
-
-#define kmem_slab_offset(x) ((unsigned long)&((kmem_slab_t *)0)->x)
-#define kmem_slab_diff(a,b) (kmem_slab_offset(a) - kmem_slab_offset(b))
-#define kmem_cache_offset(x) ((unsigned long)&((kmem_cache_t *)0)->x)
-#define kmem_cache_diff(a,b) (kmem_cache_offset(a) - kmem_cache_offset(b))
-
- /* Sanity checks... */
- if (kmem_cache_diff(c_firstp, c_magic) != kmem_slab_diff(s_nextp, s_magic) ||
- kmem_cache_diff(c_firstp, c_inuse) != kmem_slab_diff(s_nextp, s_inuse) ||
- ((kmem_cache_offset(c_lastp) -
- ((unsigned long) kmem_slab_end((kmem_cache_t*)NULL))) !=
- kmem_slab_offset(s_prevp)) ||
- kmem_cache_diff(c_lastp, c_firstp) != kmem_slab_diff(s_prevp, s_nextp)) {
- /* Offsets to the magic are incorrect, either the structures have
- * been incorrectly changed, or adjustments are needed for your
- * architecture.
- */
- panic("kmem_cache_init(): Offsets are wrong - I've been messed with!");
- /* NOTREACHED */
- }
-#undef kmem_cache_offset
-#undef kmem_cache_diff
-#undef kmem_slab_offset
-#undef kmem_slab_diff
+ size_t left_over;
init_MUTEX(&cache_chain_sem);
+ list_add(&cache_cache.next,&cache_chain);
- size = cache_cache.c_offset + sizeof(kmem_bufctl_t);
- size += (L1_CACHE_BYTES-1);
- size &= ~(L1_CACHE_BYTES-1);
- cache_cache.c_offset = size-sizeof(kmem_bufctl_t);
-
- i = (PAGE_SIZE<<cache_cache.c_gfporder)-slab_align_size;
- cache_cache.c_num = i / size; /* num of objs per slab */
+ kmem_cache_estimate(0, cache_cache.objsize, 0,
+ &left_over, &cache_cache.num);
+ if (!cache_cache.num)
+ BUG();
+
+ cache_cache.colour = left_over/cache_cache.colour_off;
+ cache_cache.colour_next = 0;
+}
- /* Cache colouring. */
- cache_cache.c_colour = (i-(cache_cache.c_num*size))/L1_CACHE_BYTES;
- cache_cache.c_colour_next = cache_cache.c_colour;
+/* Initialisation - setup remaining internal and general caches.
+ * Called after the gfp() functions have been enabled, and before smp_init().
+ */
+void __init kmem_cache_sizes_init(void)
+{
+ cache_sizes_t *sizes = cache_sizes;
+ char name[20];
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
- slab_break_gfp_order = SLAB_BREAK_GFP_ORDER_HI;
+ slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+ do {
+ /* For performance, all the general caches are L1 aligned.
+ * This should be particularly beneficial on SMP boxes, as it
+ * eliminates "false sharing".
+ * Note for systems short on memory removing the alignment will
+ * allow tighter packing of the smaller caches. */
+ sprintf(name,"size-%d",sizes->cs_size);
+ if (!(sizes->cs_cachep =
+ kmem_cache_create(name, sizes->cs_size,
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) {
+ BUG();
+ }
+
+ /* Inc off-slab bufctl limit until the ceiling is hit. */
+ if (!(OFF_SLAB(sizes->cs_cachep))) {
+ offslab_limit = sizes->cs_size-sizeof(slab_t);
+ offslab_limit /= 2;
+ }
+ sprintf(name, "size-%d(DMA)",sizes->cs_size);
+ sizes->cs_dmacachep = kmem_cache_create(name, sizes->cs_size, 0,
+ SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!sizes->cs_dmacachep)
+ BUG();
+ sizes++;
+ } while (sizes->cs_size);
}
-/* Initialisation - setup remaining internal and general caches.
- * Called after the gfp() functions have been enabled, and before smp_init().
- */
-void __init kmem_cache_sizes_init(void)
+void __init kmem_cpucache_init(void)
{
- unsigned int found = 0;
-
- cache_slabp = kmem_cache_create("slab_cache", sizeof(kmem_slab_t),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (cache_slabp) {
- char **names = cache_sizes_name;
- cache_sizes_t *sizes = cache_sizes;
- do {
- /* For performance, all the general caches are L1 aligned.
- * This should be particularly beneficial on SMP boxes, as it
- * eliminates "false sharing".
- * Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches. */
- if (!(sizes->cs_cachep =
- kmem_cache_create(*names++, sizes->cs_size,
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL)))
- goto panic_time;
- if (!found) {
- /* Inc off-slab bufctl limit until the ceiling is hit. */
- if (SLAB_BUFCTL(sizes->cs_cachep->c_flags))
- found++;
- else
- bufctl_limit =
- (sizes->cs_size/sizeof(kmem_bufctl_t));
- }
- sizes->cs_cachep->c_flags |= SLAB_CFLGS_GENERAL;
- sizes++;
- } while (sizes->cs_size);
-#if SLAB_SELFTEST
- kmem_self_test();
-#endif /* SLAB_SELFTEST */
- return;
- }
-panic_time:
- panic("kmem_cache_sizes_init: Error creating caches");
- /* NOTREACHED */
+#ifdef CONFIG_SMP
+ g_cpucache_up = 1;
+ enable_all_cpucaches();
+#endif
}
-/* Interface to system's page allocator. Dma pts to non-zero if all
- * of memory is DMAable. No need to hold the cache-lock.
+/* Interface to system's page allocator. No need to hold the cache-lock.
*/
-static inline void *
-kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
+static inline void * kmem_getpages (kmem_cache_t *cachep, unsigned long flags)
{
void *addr;
/*
- * If we requested dmaable memory, we will get it. Even if we
+ * If we requested dmaable memory, we will get it. Even if we
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
- *dma = flags & SLAB_DMA;
- addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
+ flags |= cachep->gfpflags;
+ addr = (void*) __get_free_pages(flags, cachep->gfporder);
/* Assume that now we have the pages no one else can legally
 * mess with the 'struct page's.
* However vm_scan() might try to test the structure to see if
@@ -522,11 +494,10 @@ kmem_getpages(kmem_cache_t *cachep, unsigned long flags, unsigned int *dma)
}
/* Interface to system's page release. */
-static inline void
-kmem_freepages(kmem_cache_t *cachep, void *addr)
+static inline void kmem_freepages (kmem_cache_t *cachep, void *addr)
{
- unsigned long i = (1<<cachep->c_gfporder);
- struct page *page = &mem_map[MAP_NR(addr)];
+ unsigned long i = (1<<cachep->gfporder);
+ struct page *page = mem_map + MAP_NR(addr);
/* free_pages() does not clear the type bit - we do that.
* The pages have been unlinked from their cache-slab,
@@ -537,140 +508,84 @@ kmem_freepages(kmem_cache_t *cachep, void *addr)
PageClearSlab(page);
page++;
}
- free_pages((unsigned long)addr, cachep->c_gfporder);
+ free_pages((unsigned long)addr, cachep->gfporder);
}
-#if SLAB_DEBUG_SUPPORT
-static inline void
-kmem_poison_obj(kmem_cache_t *cachep, void *addr)
+#if DEBUG
+static inline void kmem_poison_obj (kmem_cache_t *cachep, void *addr)
{
- memset(addr, SLAB_POISON_BYTE, cachep->c_org_size);
- *(unsigned char *)(addr+cachep->c_org_size-1) = SLAB_POISON_END;
+ int size = cachep->objsize;
+ if (cachep->flags & SLAB_RED_ZONE) {
+ addr += BYTES_PER_WORD;
+ size -= 2*BYTES_PER_WORD;
+ }
+ memset(addr, POISON_BYTE, size);
+ *(unsigned char *)(addr+size-1) = POISON_END;
}
-static inline int
-kmem_check_poison_obj(kmem_cache_t *cachep, void *addr)
+static inline int kmem_check_poison_obj (kmem_cache_t *cachep, void *addr)
{
+ int size = cachep->objsize;
void *end;
- end = memchr(addr, SLAB_POISON_END, cachep->c_org_size);
- if (end != (addr+cachep->c_org_size-1))
+ if (cachep->flags & SLAB_RED_ZONE) {
+ addr += BYTES_PER_WORD;
+ size -= 2*BYTES_PER_WORD;
+ }
+ end = memchr(addr, POISON_END, size);
+ if (end != (addr+size-1))
return 1;
return 0;
}
-#endif /* SLAB_DEBUG_SUPPORT */
-
-/* Three slab chain funcs - all called with ints disabled and the appropriate
- * cache-lock held.
- */
-static inline void
-kmem_slab_unlink(kmem_slab_t *slabp)
-{
- kmem_slab_t *prevp = slabp->s_prevp;
- kmem_slab_t *nextp = slabp->s_nextp;
- prevp->s_nextp = nextp;
- nextp->s_prevp = prevp;
-}
-
-static inline void
-kmem_slab_link_end(kmem_cache_t *cachep, kmem_slab_t *slabp)
-{
- kmem_slab_t *lastp = cachep->c_lastp;
- slabp->s_nextp = kmem_slab_end(cachep);
- slabp->s_prevp = lastp;
- cachep->c_lastp = slabp;
- lastp->s_nextp = slabp;
-}
-
-static inline void
-kmem_slab_link_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
-{
- kmem_slab_t *nextp = cachep->c_freep;
- kmem_slab_t *prevp = nextp->s_prevp;
- slabp->s_nextp = nextp;
- slabp->s_prevp = prevp;
- nextp->s_prevp = slabp;
- slabp->s_prevp->s_nextp = slabp;
-}
+#endif
/* Destroy all the objs in a slab, and release the mem back to the system.
* Before calling the slab must have been unlinked from the cache.
* The cache-lock is not held/needed.
*/
-static void
-kmem_slab_destroy(kmem_cache_t *cachep, kmem_slab_t *slabp)
+static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp)
{
- if (cachep->c_dtor
-#if SLAB_DEBUG_SUPPORT
- || cachep->c_flags & (SLAB_POISON | SLAB_RED_ZONE)
-#endif /*SLAB_DEBUG_SUPPORT*/
+ if (cachep->dtor
+#if DEBUG
+ || cachep->flags & (SLAB_POISON | SLAB_RED_ZONE)
+#endif
) {
- /* Doesn't use the bufctl ptrs to find objs. */
- unsigned long num = cachep->c_num;
- void *objp = slabp->s_mem;
- do {
-#if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE) {
- if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1)
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad front redzone - %s\n",
- cachep->c_name);
+ int i;
+ for (i = 0; i < cachep->num; i++) {
+ void* objp = slabp->s_mem+cachep->objsize*i;
+#if DEBUG
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*((unsigned long*)(objp)) != RED_MAGIC1)
+ BUG();
+ if (*((unsigned long*)(objp + cachep->objsize
+ -BYTES_PER_WORD)) != RED_MAGIC1)
+ BUG();
objp += BYTES_PER_WORD;
- if (*((unsigned long*)(objp+cachep->c_org_size)) !=
- SLAB_RED_MAGIC1)
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad rear redzone - %s\n",
- cachep->c_name);
- }
- if (cachep->c_dtor)
-#endif /*SLAB_DEBUG_SUPPORT*/
- (cachep->c_dtor)(objp, cachep, 0);
-#if SLAB_DEBUG_SUPPORT
- else if (cachep->c_flags & SLAB_POISON) {
- if (kmem_check_poison_obj(cachep, objp))
- printk(KERN_ERR "kmem_slab_destroy: "
- "Bad poison - %s\n", cachep->c_name);
}
- if (cachep->c_flags & SLAB_RED_ZONE)
+#endif
+ if (cachep->dtor)
+ (cachep->dtor)(objp, cachep, 0);
+#if DEBUG
+ if (cachep->flags & SLAB_RED_ZONE) {
objp -= BYTES_PER_WORD;
-#endif /* SLAB_DEBUG_SUPPORT */
- objp += cachep->c_offset;
- if (!slabp->s_index)
- objp += sizeof(kmem_bufctl_t);
- } while (--num);
+ }
+ if ((cachep->flags & SLAB_POISON) &&
+ kmem_check_poison_obj(cachep, objp))
+ BUG();
+#endif
+ }
}
- slabp->s_magic = SLAB_MAGIC_DESTROYED;
- if (slabp->s_index)
- kmem_cache_free(cachep->c_index_cachep, slabp->s_index);
- kmem_freepages(cachep, slabp->s_mem-slabp->s_offset);
- if (SLAB_OFF_SLAB(cachep->c_flags))
- kmem_cache_free(cache_slabp, slabp);
+ kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slabp);
}
-/* Cal the num objs, wastage, and bytes left over for a given slab size. */
-static inline size_t
-kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
- unsigned long flags, size_t *left_over, unsigned long *num)
-{
- size_t wastage = PAGE_SIZE<<gfporder;
-
- if (SLAB_OFF_SLAB(flags))
- gfporder = 0;
- else
- gfporder = slab_align_size;
- wastage -= gfporder;
- *num = wastage / size;
- wastage -= (*num * size);
- *left_over = wastage;
-
- return (wastage + gfporder + (extra * *num));
-}
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
- * @offset: The offset to use within the page.
+ * @offset: The offset to use within the page.
* @flags: SLAB flags
* @ctor: A constructor for the objects.
* @dtor: A destructor for the objects.
@@ -695,57 +610,27 @@ kmem_cache_cal_waste(unsigned long gfporder, size_t size, size_t extra,
* as davem.
*/
kmem_cache_t *
-kmem_cache_create(const char *name, size_t size, size_t offset,
+kmem_cache_create (const char *name, size_t size, size_t offset,
unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
- const char *func_nm= KERN_ERR "kmem_create: ";
- kmem_cache_t *searchp;
- kmem_cache_t *cachep=NULL;
- size_t extra;
- size_t left_over;
- size_t align;
-
-#if SLAB_DEBUG_SUPPORT
- flags |= SLAB_POISON;
-#endif
- /* Sanity checks... */
-#if SLAB_MGMT_CHECKS
- if (!name) {
- printk("%sNULL ptr\n", func_nm);
- goto opps;
- }
- if (strlen(name) >= SLAB_CACHE_NAME_LEN) {
- printk("%sname too long\n", func_nm);
- goto opps;
- }
- if (in_interrupt()) {
- printk("%sCalled during int - %s\n", func_nm, name);
- goto opps;
- }
+ const char *func_nm = KERN_ERR "kmem_create: ";
+ size_t left_over, align, slab_size;
+ kmem_cache_t *cachep = NULL;
- if (size < BYTES_PER_WORD) {
- printk("%sSize too small %d - %s\n", func_nm, (int) size, name);
- size = BYTES_PER_WORD;
- }
-
- if (size > ((1<<SLAB_OBJ_MAX_ORDER)*PAGE_SIZE)) {
- printk("%sSize too large %d - %s\n", func_nm, (int) size, name);
- goto opps;
- }
-
- if (dtor && !ctor) {
- /* Decon, but no con - doesn't make sense */
- printk("%sDecon but no con - %s\n", func_nm, name);
- goto opps;
- }
-
- if (offset < 0 || offset > size) {
- printk("%sOffset weird %d - %s\n", func_nm, (int) offset, name);
- offset = 0;
- }
-
-#if SLAB_DEBUG_SUPPORT
+ /*
+ * Sanity checks... these are all serious usage bugs.
+ */
+ if ((!name) ||
+ ((strlen(name) >= CACHE_NAMELEN - 1)) ||
+ in_interrupt() ||
+ (size < BYTES_PER_WORD) ||
+ (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
+ (dtor && !ctor) ||
+ (offset < 0 || offset > size))
+ BUG();
+
+#if DEBUG
if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
 /* No constructor, but initial state check requested */
printk("%sNo con, but init state check requested - %s\n", func_nm, name);
@@ -757,27 +642,24 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
flags &= ~SLAB_POISON;
}
-#if 0
- if ((flags & SLAB_HIGH_PACK) && ctor) {
- printk("%sHigh pack requested, but con given - %s\n", func_nm, name);
- flags &= ~SLAB_HIGH_PACK;
- }
- if ((flags & SLAB_HIGH_PACK) && (flags & (SLAB_POISON|SLAB_RED_ZONE))) {
- printk("%sHigh pack requested, but with poisoning/red-zoning - %s\n",
- func_nm, name);
- flags &= ~SLAB_HIGH_PACK;
- }
+#if FORCED_DEBUG
+ if (size < (PAGE_SIZE>>3))
+ /*
+ * do not red zone large objects; it causes severe
+ * fragmentation.
+ */
+ flags |= SLAB_RED_ZONE;
+ if (!ctor)
+ flags |= SLAB_POISON;
+#endif
#endif
-#endif /* SLAB_DEBUG_SUPPORT */
-#endif /* SLAB_MGMT_CHECKS */
- /* Always checks flags, a caller might be expecting debug
+ /*
+ * Always check flags; a caller might be expecting debug
* support which isn't available.
*/
- if (flags & ~SLAB_C_MASK) {
- printk("%sIllgl flg %lX - %s\n", func_nm, flags, name);
- flags &= SLAB_C_MASK;
- }
+ if (flags & ~CREATE_MASK)
+ BUG();
/* Get cache's description obj. */
cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
@@ -794,78 +676,36 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
size &= ~(BYTES_PER_WORD-1);
printk("%sForcing size word alignment - %s\n", func_nm, name);
}
-
- cachep->c_org_size = size;
-#if SLAB_DEBUG_SUPPORT
+
+#if DEBUG
if (flags & SLAB_RED_ZONE) {
- /* There is no point trying to honour cache alignment when redzoning. */
+ /*
+ * There is no point trying to honour cache alignment
+ * when redzoning.
+ */
flags &= ~SLAB_HWCACHE_ALIGN;
- size += 2*BYTES_PER_WORD; /* words for redzone */
+ size += 2*BYTES_PER_WORD; /* words for redzone */
}
-#endif /* SLAB_DEBUG_SUPPORT */
-
+#endif
align = BYTES_PER_WORD;
if (flags & SLAB_HWCACHE_ALIGN)
align = L1_CACHE_BYTES;
- /* Determine if the slab management and/or bufclts are 'on' or 'off' slab. */
- extra = sizeof(kmem_bufctl_t);
- if (size < (PAGE_SIZE>>3)) {
- /* Size is small(ish). Use packing where bufctl size per
- * obj is low, and slab management is on-slab.
- */
-#if 0
- if ((flags & SLAB_HIGH_PACK)) {
- /* Special high packing for small objects
- * (mainly for vm_mapping structs, but
- * others can use it).
- */
- if (size == (L1_CACHE_BYTES/4) || size == (L1_CACHE_BYTES/2) ||
- size == L1_CACHE_BYTES) {
- /* The bufctl is stored with the object. */
- extra = 0;
- } else
- flags &= ~SLAB_HIGH_PACK;
- }
-#endif
- } else {
- /* Size is large, assume best to place the slab management obj
+ /* Determine if the slab management is 'on' or 'off' slab. */
+ if (size >= (PAGE_SIZE>>3))
+ /*
+ * Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
- flags |= SLAB_CFLGS_OFF_SLAB;
- if (!(size & ~PAGE_MASK) || size == (PAGE_SIZE/2)
- || size == (PAGE_SIZE/4) || size == (PAGE_SIZE/8)) {
- /* To avoid waste the bufctls are off-slab... */
- flags |= SLAB_CFLGS_BUFCTL;
- extra = 0;
- } /* else slab management is off-slab, but freelist pointers are on. */
- }
- size += extra;
+ flags |= CFLGS_OFF_SLAB;
if (flags & SLAB_HWCACHE_ALIGN) {
/* Need to adjust size so that objs are cache aligned. */
- if (size > (L1_CACHE_BYTES/2)) {
- size_t words = size % L1_CACHE_BYTES;
- if (words)
- size += (L1_CACHE_BYTES-words);
- } else {
- /* Small obj size, can get at least two per cache line. */
- int num_per_line = L1_CACHE_BYTES/size;
- left_over = L1_CACHE_BYTES - (num_per_line*size);
- if (left_over) {
- /* Need to adjust size so objs cache align. */
- if (left_over%num_per_line) {
- /* Odd num of objs per line - fixup. */
- num_per_line--;
- left_over += size;
- }
- size += (left_over/num_per_line);
- }
- }
- } else if (!(size%L1_CACHE_BYTES)) {
- /* Size happens to cache align... */
- flags |= SLAB_HWCACHE_ALIGN;
- align = L1_CACHE_BYTES;
+ /* Small obj size, can get at least two per cache line. */
+ /* FIXME: only power-of-2 alignment is supported; the old packing code handled other sizes better */
+ while (size < align/2)
+ align /= 2;
+ size = (size+align-1)&(~(align-1));
}
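
The rounding above is the usual power-of-two round-up idiom; a standalone check with arbitrary demo values (a 12-byte object and an assumed 32-byte cache line):

#include <stdio.h>

int main(void)
{
	unsigned long size = 12, align = 32;	/* demo values only */

	while (size < align / 2)		/* shrink alignment while 2+ objects fit in it */
		align /= 2;
	size = (size + align - 1) & ~(align - 1);
	printf("object padded to %lu bytes, %lu-byte aligned\n", size, align);
	return 0;
}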
/* Cal size (in pages) of slabs, and the num of objs per slab.
@@ -874,133 +714,103 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
* friendly towards high-order requests, this should be changed.
*/
do {
- size_t wastage;
unsigned int break_flag = 0;
cal_wastage:
- wastage = kmem_cache_cal_waste(cachep->c_gfporder, size, extra,
- flags, &left_over, &cachep->c_num);
- if (!cachep->c_num)
- goto next;
+ kmem_cache_estimate(cachep->gfporder, size, flags,
+ &left_over, &cachep->num);
if (break_flag)
break;
- if (SLAB_BUFCTL(flags) && cachep->c_num > bufctl_limit) {
+ if (cachep->gfporder >= MAX_GFP_ORDER)
+ break;
+ if (!cachep->num)
+ goto next;
+ if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
/* Oops, this num of objs will cause problems. */
- cachep->c_gfporder--;
+ cachep->gfporder--;
break_flag++;
goto cal_wastage;
}
- if (cachep->c_gfporder == SLAB_MAX_GFP_ORDER)
- break;
- /* Large num of objs is good, but v. large slabs are currently
+ /*
+ * Large num of objs is good, but v. large slabs are currently
* bad for the gfp()s.
*/
- if (cachep->c_num <= SLAB_MIN_OBJS_PER_SLAB) {
- if (cachep->c_gfporder < slab_break_gfp_order)
- goto next;
- }
-
- /* Stop caches with small objs having a large num of pages. */
- if (left_over <= slab_align_size)
+ if (cachep->gfporder >= slab_break_gfp_order)
break;
- if ((wastage*8) <= (PAGE_SIZE<<cachep->c_gfporder))
+
+ if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
break; /* Acceptable internal fragmentation. */
next:
- cachep->c_gfporder++;
+ cachep->gfporder++;
} while (1);
- /* If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ if (!cachep->num) {
+ printk("kmem_cache_create: couldn't create cache %s.\n", name);
+ kmem_cache_free(&cache_cache, cachep);
+ cachep = NULL;
+ goto opps;
+ }
+ slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(slab_t));
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
*/
- if ((flags & SLAB_CFLGS_OFF_SLAB) && !SLAB_BUFCTL(flags) &&
- left_over >= slab_align_size) {
- flags &= ~SLAB_CFLGS_OFF_SLAB;
- left_over -= slab_align_size;
+ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+ flags &= ~CFLGS_OFF_SLAB;
+ left_over -= slab_size;
}
/* Offset must be a multiple of the alignment. */
offset += (align-1);
offset &= ~(align-1);
-
- /* Mess around with the offset alignment. */
- if (!left_over) {
- offset = 0;
- } else if (left_over < offset) {
- offset = align;
- if (flags & SLAB_HWCACHE_ALIGN) {
- if (left_over < offset)
- offset = 0;
- } else {
- /* Offset is BYTES_PER_WORD, and left_over is at
- * least BYTES_PER_WORD.
- */
- if (left_over >= (BYTES_PER_WORD*2)) {
- offset >>= 1;
- if (left_over >= (BYTES_PER_WORD*4))
- offset >>= 1;
- }
- }
- } else if (!offset) {
- /* No offset requested, but space enough - give one. */
- offset = left_over/align;
- if (flags & SLAB_HWCACHE_ALIGN) {
- if (offset >= 8) {
- /* A large number of colours - use a larger alignment. */
- align <<= 1;
- }
- } else {
- if (offset >= 10) {
- align <<= 1;
- if (offset >= 16)
- align <<= 1;
- }
- }
- offset = align;
- }
-
-#if 0
-printk("%s: Left_over:%d Align:%d Size:%d\n", name, left_over, offset, size);
-#endif
-
- if ((cachep->c_align = (unsigned long) offset))
- cachep->c_colour = (left_over/offset);
- cachep->c_colour_next = cachep->c_colour;
-
- /* If the bufctl's are on-slab, c_offset does not include the size of bufctl. */
- if (!SLAB_BUFCTL(flags))
- size -= sizeof(kmem_bufctl_t);
- else
- cachep->c_index_cachep =
- kmem_find_general_cachep(cachep->c_num*sizeof(kmem_bufctl_t));
- cachep->c_offset = (unsigned long) size;
- cachep->c_freep = kmem_slab_end(cachep);
- cachep->c_firstp = kmem_slab_end(cachep);
- cachep->c_lastp = kmem_slab_end(cachep);
- cachep->c_flags = flags;
- cachep->c_ctor = ctor;
- cachep->c_dtor = dtor;
- cachep->c_magic = SLAB_C_MAGIC;
+ if (!offset)
+ offset = L1_CACHE_BYTES;
+ cachep->colour_off = offset;
+ cachep->colour = left_over/offset;
+
+ /* init remaining fields */
+ if (!cachep->gfporder && !(flags & CFLGS_OFF_SLAB))
+ flags |= CFLGS_OPTIMIZE;
+
+ cachep->flags = flags;
+ cachep->gfpflags = 0;
+ if (flags & SLAB_CACHE_DMA)
+ cachep->gfpflags |= GFP_DMA;
+ spin_lock_init(&cachep->spinlock);
+ cachep->objsize = size;
+ INIT_LIST_HEAD(&cachep->slabs);
+ cachep->firstnotfull = &cachep->slabs;
+
+ if (flags & CFLGS_OFF_SLAB)
+ cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
+ cachep->ctor = ctor;
+ cachep->dtor = dtor;
/* Copy name over so we don't have problems with unloaded modules */
- strcpy(cachep->c_name, name);
- spin_lock_init(&cachep->c_spinlock);
+ strcpy(cachep->name, name);
+#ifdef CONFIG_SMP
+ if (g_cpucache_up)
+ enable_cpucache(cachep);
+#endif
/* Need the semaphore to access the chain. */
down(&cache_chain_sem);
- searchp = &cache_cache;
- do {
- /* The name field is constant - no lock needed. */
- if (!strcmp(searchp->c_name, name)) {
- printk("%sDup name - %s\n", func_nm, name);
- break;
+ {
+ struct list_head *p;
+
+ list_for_each(p, &cache_chain) {
+ kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
+
+ /* The name field is constant - no lock needed. */
+ if (!strcmp(pc->name, name))
+ BUG();
}
- searchp = searchp->c_nextp;
- } while (searchp != &cache_cache);
+ }
/* There is no reason to lock our new cache before we
* link it in - no one knows about it yet...
*/
- cachep->c_nextp = cache_cache.c_nextp;
- cache_cache.c_nextp = cachep;
+ list_add(&cachep->next, &cache_chain);
up(&cache_chain_sem);
opps:
return cachep;
@@ -1012,59 +822,57 @@ opps:
*/
static int is_chained_kmem_cache(kmem_cache_t * cachep)
{
- kmem_cache_t * searchp;
+ struct list_head *p;
int ret = 0;
/* Find the cache in the chain of caches. */
down(&cache_chain_sem);
- for (searchp = &cache_cache; searchp->c_nextp != &cache_cache;
- searchp = searchp->c_nextp) {
- if (searchp->c_nextp != cachep)
- continue;
-
- /* Accessing clock_searchp is safe - we hold the mutex. */
- if (cachep == clock_searchp)
- clock_searchp = cachep->c_nextp;
- ret = 1;
- break;
+ list_for_each(p, &cache_chain) {
+ if (p == &cachep->next) {
+ ret = 1;
+ break;
+ }
}
up(&cache_chain_sem);
return ret;
}
-/* returns 0 if every slab is been freed -arca */
static int __kmem_cache_shrink(kmem_cache_t *cachep)
{
- kmem_slab_t *slabp;
- int ret;
+ slab_t *slabp;
+ int ret;
- spin_lock_irq(&cachep->c_spinlock);
+#ifdef CONFIG_SMP
+ smp_call_function(drain_cache, cachep, 1, 1);
+ local_irq_disable();
+ drain_cache(cachep);
+ local_irq_enable();
+#endif
+ spin_lock_irq(&cachep->spinlock);
/* If the cache is growing, stop shrinking. */
- while (!cachep->c_growing) {
- slabp = cachep->c_lastp;
- if (slabp->s_inuse || slabp == kmem_slab_end(cachep))
+ while (!cachep->growing) {
+ struct list_head *p;
+
+ p = cachep->slabs.prev;
+ if (p == &cachep->slabs)
break;
- /*
- * If this slab is the first slab with free objects
- * (c_freep), and as we are walking the slab chain
- * backwards, it is also the last slab with free
- * objects. After unlinking it, there will be no
- * slabs with free objects, so point c_freep into the
- * cache structure.
- */
- if (cachep->c_freep == slabp)
- cachep->c_freep = kmem_slab_end(cachep);
- kmem_slab_unlink(slabp);
- spin_unlock_irq(&cachep->c_spinlock);
+
+ slabp = list_entry(cachep->slabs.prev, slab_t, list);
+ if (slabp->inuse)
+ break;
+
+ list_del(&slabp->list);
+ if (cachep->firstnotfull == &slabp->list)
+ cachep->firstnotfull = &cachep->slabs;
+
+ spin_unlock_irq(&cachep->spinlock);
kmem_slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->c_spinlock);
+ spin_lock_irq(&cachep->spinlock);
}
- ret = 1;
- if (cachep->c_lastp == kmem_slab_end(cachep))
- ret = 0; /* Cache is empty. */
- spin_unlock_irq(&cachep->c_spinlock);
+ ret = !list_empty(&cachep->slabs);
+ spin_unlock_irq(&cachep->spinlock);
return ret;
}
@@ -1075,14 +883,9 @@ static int __kmem_cache_shrink(kmem_cache_t *cachep)
* Releases as many slabs as possible for a cache.
* To help debugging, a zero exit status indicates all slabs were released.
*/
-int
-kmem_cache_shrink(kmem_cache_t *cachep)
+int kmem_cache_shrink(kmem_cache_t *cachep)
{
- if (!cachep)
- BUG();
- if (in_interrupt())
- BUG();
- if (!is_chained_kmem_cache(cachep))
+ if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep))
BUG();
return __kmem_cache_shrink(cachep);
@@ -1100,605 +903,545 @@ kmem_cache_shrink(kmem_cache_t *cachep)
* cache being allocated each time a module is loaded and unloaded, if the
* module doesn't have persistent in-kernel storage across loads and unloads.
*
+ * The caller must guarantee that no one will allocate memory from the cache
+ * during the kmem_cache_destroy().
*/
-int kmem_cache_destroy(kmem_cache_t * cachep)
+int kmem_cache_destroy (kmem_cache_t * cachep)
{
- kmem_cache_t * prev;
- int ret;
-
- if (!cachep) {
- printk(KERN_ERR "kmem_destroy: NULL ptr\n");
- return 1;
- }
- if (in_interrupt()) {
- printk(KERN_ERR "kmem_destroy: Called during int - %s\n",
- cachep->c_name);
- return 1;
- }
+ if (!cachep || in_interrupt() || cachep->growing)
+ BUG();
- ret = 0;
/* Find the cache in the chain of caches. */
down(&cache_chain_sem);
- for (prev = &cache_cache; prev->c_nextp != &cache_cache;
- prev = prev->c_nextp) {
- if (prev->c_nextp != cachep)
- continue;
-
- /* Accessing clock_searchp is safe - we hold the mutex. */
- if (cachep == clock_searchp)
- clock_searchp = cachep->c_nextp;
-
- /* remove the cachep from the cache_cache list. -arca */
- prev->c_nextp = cachep->c_nextp;
-
- ret = 1;
- break;
- }
+ /* the chain is never empty, cache_cache is never destroyed */
+ if (clock_searchp == cachep)
+ clock_searchp = list_entry(cachep->next.next,
+ kmem_cache_t, next);
+ list_del(&cachep->next);
up(&cache_chain_sem);
- if (!ret) {
- printk(KERN_ERR "kmem_destroy: Invalid cache addr %p\n",
- cachep);
- return 1;
- }
-
if (__kmem_cache_shrink(cachep)) {
- printk(KERN_ERR "kmem_destroy: Can't free all objects %p\n",
+ printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n",
cachep);
down(&cache_chain_sem);
- cachep->c_nextp = cache_cache.c_nextp;
- cache_cache.c_nextp = cachep;
+ list_add(&cachep->next,&cache_chain);
up(&cache_chain_sem);
return 1;
}
-
+#ifdef CONFIG_SMP
+ {
+ int i;
+ for (i = 0; i < NR_CPUS; i++)
+ kfree(cachep->cpudata[i]);
+ }
+#endif
kmem_cache_free(&cache_cache, cachep);
return 0;
}
/* Get the memory for a slab management obj. */
-static inline kmem_slab_t *
-kmem_cache_slabmgmt(kmem_cache_t *cachep, void *objp, int local_flags)
+static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep,
+ void *objp, int colour_off, int local_flags)
{
- kmem_slab_t *slabp;
-
- if (SLAB_OFF_SLAB(cachep->c_flags)) {
+ slab_t *slabp;
+
+ if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- slabp = kmem_cache_alloc(cache_slabp, local_flags);
+ slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
+ if (!slabp)
+ return NULL;
} else {
- /* Slab management at end of slab memory, placed so that
- * the position is 'coloured'.
+ /* FIXME: change to
+ slabp = objp
+ * if you enable OPTIMIZE
*/
- void *end;
- end = objp + (cachep->c_num * cachep->c_offset);
- if (!SLAB_BUFCTL(cachep->c_flags))
- end += (cachep->c_num * sizeof(kmem_bufctl_t));
- slabp = (kmem_slab_t *) L1_CACHE_ALIGN((unsigned long)end);
- }
-
- if (slabp) {
- slabp->s_inuse = 0;
- slabp->s_dma = 0;
- slabp->s_index = NULL;
+ slabp = objp+colour_off;
+ colour_off += L1_CACHE_ALIGN(cachep->num *
+ sizeof(kmem_bufctl_t) + sizeof(slab_t));
}
+ slabp->inuse = 0;
+ slabp->colouroff = colour_off;
+ slabp->s_mem = objp+colour_off;
return slabp;
}
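
In the on-slab case above, the slab descriptor is parked at the slab's colour offset and the first object begins after the descriptor plus its bufctl index array, rounded up to an L1 cache line. A rough userspace sketch of that offset arithmetic; all sizes below are invented stand-ins, not the kernel's:

#include <stdio.h>

#define L1_CACHE_BYTES	32			/* assumed cache line size */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~(unsigned long)((a) - 1))

/* Stand-ins for sizeof(slab_t) and sizeof(kmem_bufctl_t); not the real values. */
#define DESC_SIZE	32
#define BUFCTL_SIZE	4

int main(void)
{
	unsigned long colour_off = 64;		/* this slab's colour offset */
	unsigned long num = 30;			/* objects per slab */
	unsigned long desc_off, mem_off;

	desc_off = colour_off;			/* descriptor sits at the colour offset */
	mem_off = colour_off +			/* objects follow descriptor + bufctl[], L1-aligned */
		  ALIGN_UP(num * BUFCTL_SIZE + DESC_SIZE, L1_CACHE_BYTES);

	printf("descriptor at offset %lu, first object at offset %lu\n",
	       desc_off, mem_off);
	return 0;
}
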
-static inline void
-kmem_cache_init_objs(kmem_cache_t * cachep, kmem_slab_t * slabp, void *objp,
- unsigned long ctor_flags)
+static inline void kmem_cache_init_objs (kmem_cache_t * cachep,
+ slab_t * slabp, unsigned long ctor_flags)
{
- kmem_bufctl_t **bufpp = &slabp->s_freep;
- unsigned long num = cachep->c_num-1;
-
- do {
-#if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE) {
- *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
+ int i;
+
+ for (i = 0; i < cachep->num; i++) {
+ void* objp = slabp->s_mem+cachep->objsize*i;
+#if DEBUG
+ if (cachep->flags & SLAB_RED_ZONE) {
+ *((unsigned long*)(objp)) = RED_MAGIC1;
+ *((unsigned long*)(objp + cachep->objsize -
+ BYTES_PER_WORD)) = RED_MAGIC1;
objp += BYTES_PER_WORD;
- *((unsigned long*)(objp+cachep->c_org_size)) = SLAB_RED_MAGIC1;
}
-#endif /* SLAB_DEBUG_SUPPORT */
+#endif
- /* Constructors are not allowed to allocate memory from the same cache
- * which they are a constructor for. Otherwise, deadlock.
- * They must also be threaded.
+ /*
+ * Constructors are not allowed to allocate memory from
+ * the same cache which they are a constructor for.
+ * Otherwise, deadlock. They must also be threaded.
*/
- if (cachep->c_ctor)
- cachep->c_ctor(objp, cachep, ctor_flags);
-#if SLAB_DEBUG_SUPPORT
- else if (cachep->c_flags & SLAB_POISON) {
+ if (cachep->ctor)
+ cachep->ctor(objp, cachep, ctor_flags);
+#if DEBUG
+ if (cachep->flags & SLAB_RED_ZONE)
+ objp -= BYTES_PER_WORD;
+ if (cachep->flags & SLAB_POISON)
/* need to poison the objs */
kmem_poison_obj(cachep, objp);
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*((unsigned long*)(objp)) != RED_MAGIC1)
+ BUG();
+ if (*((unsigned long*)(objp + cachep->objsize -
+ BYTES_PER_WORD)) != RED_MAGIC1)
+ BUG();
}
-
- if (cachep->c_flags & SLAB_RED_ZONE) {
- if (*((unsigned long*)(objp+cachep->c_org_size)) !=
- SLAB_RED_MAGIC1) {
- *((unsigned long*)(objp+cachep->c_org_size)) =
- SLAB_RED_MAGIC1;
- printk(KERN_ERR "kmem_init_obj: Bad rear redzone "
- "after constructor - %s\n", cachep->c_name);
- }
- objp -= BYTES_PER_WORD;
- if (*((unsigned long*)(objp)) != SLAB_RED_MAGIC1) {
- *((unsigned long*)(objp)) = SLAB_RED_MAGIC1;
- printk(KERN_ERR "kmem_init_obj: Bad front redzone "
- "after constructor - %s\n", cachep->c_name);
- }
- }
-#endif /* SLAB_DEBUG_SUPPORT */
-
- objp += cachep->c_offset;
- if (!slabp->s_index) {
- *bufpp = objp;
- objp += sizeof(kmem_bufctl_t);
- } else
- *bufpp = &slabp->s_index[num];
- bufpp = &(*bufpp)->buf_nextp;
- } while (num--);
-
- *bufpp = NULL;
+#endif
+ slab_bufctl(slabp)[i] = i+1;
+ }
+ slab_bufctl(slabp)[i-1] = BUFCTL_END;
+ slabp->free = 0;
}
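
The initialisation loop above builds the new index-based freelist: slab_bufctl(slabp)[i] holds the index of the next free object, slabp->free is the head, and BUFCTL_END terminates the chain. A minimal standalone model of the same push/pop behaviour, with names and sizes chosen only for illustration:

#include <stdio.h>

#define NUM_OBJS	8
#define BUFCTL_END	((unsigned int)~0u)	/* end-of-chain sentinel */

static unsigned int bufctl[NUM_OBJS];	/* bufctl[i] = index of the next free object */
static unsigned int free_head;		/* plays the role of slabp->free */

static void init_freelist(void)
{
	unsigned int i;

	for (i = 0; i < NUM_OBJS; i++)
		bufctl[i] = i + 1;
	bufctl[NUM_OBJS - 1] = BUFCTL_END;
	free_head = 0;
}

/* allocation: pop the head index */
static int alloc_obj(void)
{
	unsigned int objnr;

	if (free_head == BUFCTL_END)
		return -1;			/* no free objects on this slab */
	objnr = free_head;
	free_head = bufctl[objnr];
	return (int)objnr;
}

/* free: push the index back onto the head */
static void free_obj(unsigned int objnr)
{
	bufctl[objnr] = free_head;
	free_head = objnr;
}

int main(void)
{
	int a, b;

	init_freelist();
	a = alloc_obj();
	b = alloc_obj();
	printf("allocated objects %d and %d\n", a, b);
	free_obj((unsigned int)a);
	printf("next allocation reuses object %d\n", alloc_obj());
	return 0;
}

In the patch, allocation corresponds to kmem_cache_alloc_one_tail() popping the head index, freeing corresponds to kmem_cache_free_one() pushing it back, and kmem_extra_free_checks() walks the same chain to spot a double free.
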
-/* Grow (by 1) the number of slabs within a cache. This is called by
+/*
+ * Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int
-kmem_cache_grow(kmem_cache_t * cachep, int flags)
+static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
{
- kmem_slab_t *slabp;
+ slab_t *slabp;
struct page *page;
void *objp;
size_t offset;
- unsigned int dma, local_flags;
+ unsigned int i, local_flags;
unsigned long ctor_flags;
unsigned long save_flags;
/* Be lazy and only check for valid flags here,
* keeping it out of the critical path in kmem_cache_alloc().
*/
- if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) {
- printk(KERN_WARNING "kmem_grow: Illegal flgs %X (correcting) - %s\n",
- flags, cachep->c_name);
- flags &= (SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW);
- }
-
+ if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
+ BUG();
if (flags & SLAB_NO_GROW)
return 0;
- /* The test for missing atomic flag is performed here, rather than
+ /*
+ * The test for missing atomic flag is performed here, rather than
* the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is slightly mis-behaving they
+ * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
* will eventually be caught here (where it matters).
*/
- if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC) {
- printk(KERN_ERR "kmem_grow: Called nonatomically from int - %s\n",
- cachep->c_name);
- flags &= ~SLAB_LEVEL_MASK;
- flags |= SLAB_ATOMIC;
- }
+ if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
+ BUG();
+
ctor_flags = SLAB_CTOR_CONSTRUCTOR;
local_flags = (flags & SLAB_LEVEL_MASK);
- if (local_flags == SLAB_ATOMIC) {
- /* Not allowed to sleep. Need to tell a constructor about
+ if (local_flags == SLAB_ATOMIC)
+ /*
+ * Not allowed to sleep. Need to tell a constructor about
* this - it might need to know...
*/
ctor_flags |= SLAB_CTOR_ATOMIC;
- }
/* About to mess with non-constant members - lock. */
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
+ spin_lock_irqsave(&cachep->spinlock, save_flags);
 	/* Get colour for the slab, and calculate the next value. */
- if (!(offset = cachep->c_colour_next--))
- cachep->c_colour_next = cachep->c_colour;
- offset *= cachep->c_align;
- cachep->c_dflags = SLAB_CFLGS_GROWN;
+ offset = cachep->colour_next;
+ cachep->colour_next++;
+ if (cachep->colour_next >= cachep->colour)
+ cachep->colour_next = 0;
+ offset *= cachep->colour_off;
+ cachep->dflags |= DFLGS_GROWN;
- cachep->c_growing++;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
+ cachep->growing++;
+ spin_unlock_irqrestore(&cachep->spinlock, save_flags);
/* A series of memory allocations for a new slab.
* Neither the cache-chain semaphore, or cache-lock, are
* held, but the incrementing c_growing prevents this
- * this cache from being reaped or shrunk.
+ * cache from being reaped or shrunk.
* Note: The cache could be selected in for reaping in
* kmem_cache_reap(), but when the final test is made the
* growing value will be seen.
*/
/* Get mem for the objs. */
- if (!(objp = kmem_getpages(cachep, flags, &dma)))
+ if (!(objp = kmem_getpages(cachep, flags)))
goto failed;
/* Get slab management. */
- if (!(slabp = kmem_cache_slabmgmt(cachep, objp+offset, local_flags)))
+ if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;
- if (dma)
- slabp->s_dma = 1;
- if (SLAB_BUFCTL(cachep->c_flags)) {
- slabp->s_index = kmem_cache_alloc(cachep->c_index_cachep, local_flags);
- if (!slabp->s_index)
- goto opps2;
- }
- /* Nasty!!!!!! I hope this is OK. */
- dma = 1 << cachep->c_gfporder;
- page = &mem_map[MAP_NR(objp)];
+ /* Nasty!!!!!! I hope this is OK. */
+ i = 1 << cachep->gfporder;
+ page = mem_map + MAP_NR(objp);
do {
- SLAB_SET_PAGE_CACHE(page, cachep);
- SLAB_SET_PAGE_SLAB(page, slabp);
+ SET_PAGE_CACHE(page, cachep);
+ SET_PAGE_SLAB(page, slabp);
PageSetSlab(page);
page++;
- } while (--dma);
+ } while (--i);
- slabp->s_offset = offset; /* It will fit... */
- objp += offset; /* Address of first object. */
- slabp->s_mem = objp;
+ kmem_cache_init_objs(cachep, slabp, ctor_flags);
- /* For on-slab bufctls, c_offset is the distance between the start of
- * an obj and its related bufctl. For off-slab bufctls, c_offset is
- * the distance between objs in the slab.
- */
- kmem_cache_init_objs(cachep, slabp, objp, ctor_flags);
-
- spin_lock_irq(&cachep->c_spinlock);
+ spin_lock_irqsave(&cachep->spinlock, save_flags);
+ cachep->growing--;
/* Make slab active. */
- slabp->s_magic = SLAB_MAGIC_ALLOC;
- kmem_slab_link_end(cachep, slabp);
- if (cachep->c_freep == kmem_slab_end(cachep))
- cachep->c_freep = slabp;
- SLAB_STATS_INC_GROWN(cachep);
- cachep->c_failures = 0;
- cachep->c_growing--;
-
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
+ list_add_tail(&slabp->list,&cachep->slabs);
+ if (cachep->firstnotfull == &cachep->slabs)
+ cachep->firstnotfull = &slabp->list;
+ STATS_INC_GROWN(cachep);
+ cachep->failures = 0;
+
+ spin_unlock_irqrestore(&cachep->spinlock, save_flags);
return 1;
-opps2:
- if (SLAB_OFF_SLAB(cachep->c_flags))
- kmem_cache_free(cache_slabp, slabp);
opps1:
- kmem_freepages(cachep, objp);
+ kmem_freepages(cachep, objp);
failed:
- spin_lock_irq(&cachep->c_spinlock);
- cachep->c_growing--;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
+ spin_lock_irqsave(&cachep->spinlock, save_flags);
+ cachep->growing--;
+ spin_unlock_irqrestore(&cachep->spinlock, save_flags);
return 0;
}
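
Cache colouring in kmem_cache_grow() is just a rotating counter: each new slab takes colour_next, the counter wraps at 'colour', and the chosen value times colour_off becomes that slab's starting offset, so objects of successive slabs land on different cache lines. A tiny sketch of the rotation (the numbers are made up):

#include <stdio.h>

int main(void)
{
	unsigned int colour = 4;	/* number of distinct colours for this cache */
	unsigned int colour_off = 32;	/* typically the L1 cache line size */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next * colour_off;

		colour_next++;		/* wrap the counter, as kmem_cache_grow() does */
		if (colour_next >= colour)
			colour_next = 0;
		printf("slab %d starts its objects at byte offset %u\n",
		       slab, offset);
	}
	return 0;
}
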
-static void
-kmem_report_alloc_err(const char *str, kmem_cache_t * cachep)
-{
- if (cachep)
- SLAB_STATS_INC_ERR(cachep); /* this is atomic */
- printk(KERN_ERR "kmem_alloc: %s (name=%s)\n",
- str, cachep ? cachep->c_name : "unknown");
-}
+/*
+ * Perform extra freeing checks:
+ * - detect double free
+ * - detect bad pointers.
+ * Called with the cache-lock held.
+ */
-static void
-kmem_report_free_err(const char *str, const void *objp, kmem_cache_t * cachep)
+#if DEBUG
+static int kmem_extra_free_checks (kmem_cache_t * cachep,
+ slab_t *slabp, void * objp)
{
- if (cachep)
- SLAB_STATS_INC_ERR(cachep);
- printk(KERN_ERR "kmem_free: %s (objp=%p, name=%s)\n",
- str, objp, cachep ? cachep->c_name : "unknown");
-}
+ int i;
+ unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
-/* Search for a slab whose objs are suitable for DMA.
- * Note: since testing the first free slab (in __kmem_cache_alloc()),
- * ints must not have been enabled, or the cache-lock released!
- */
-static inline kmem_slab_t *
-kmem_cache_search_dma(kmem_cache_t * cachep)
-{
- kmem_slab_t *slabp = cachep->c_freep->s_nextp;
+ if (objnr >= cachep->num)
+ BUG();
+ if (objp != slabp->s_mem + objnr*cachep->objsize)
+ BUG();
- for (; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
- if (!(slabp->s_dma))
- continue;
- kmem_slab_unlink(slabp);
- kmem_slab_link_free(cachep, slabp);
- cachep->c_freep = slabp;
- break;
+ /* Check slab's freelist to see if this obj is there. */
+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
+ if (i == objnr)
+ BUG();
}
- return slabp;
+ return 0;
}
+#endif
-#if SLAB_DEBUG_SUPPORT
-/* Perform extra freeing checks. Currently, this check is only for caches
- * that use bufctl structures within the slab. Those which use bufctl's
- * from the internal cache have a reasonable check when the address is
- * searched for. Called with the cache-lock held.
- */
-static void *
-kmem_extra_free_checks(kmem_cache_t * cachep, kmem_bufctl_t *search_bufp,
- kmem_bufctl_t *bufp, void * objp)
+static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags)
{
- if (SLAB_BUFCTL(cachep->c_flags))
- return objp;
-
- /* Check slab's freelist to see if this obj is there. */
- for (; search_bufp; search_bufp = search_bufp->buf_nextp) {
- if (search_bufp != bufp)
- continue;
- return NULL;
+#if DEBUG
+ if (flags & SLAB_DMA) {
+ if (!(cachep->gfpflags & GFP_DMA))
+ BUG();
+ } else {
+ if (cachep->gfpflags & GFP_DMA)
+ BUG();
}
- return objp;
+#endif
}
-#endif /* SLAB_DEBUG_SUPPORT */
-/* Called with cache lock held. */
-static inline void
-kmem_cache_full_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
+static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
+ slab_t *slabp)
{
- if (slabp->s_nextp->s_inuse) {
- /* Not at correct position. */
- if (cachep->c_freep == slabp)
- cachep->c_freep = slabp->s_nextp;
- kmem_slab_unlink(slabp);
- kmem_slab_link_end(cachep, slabp);
+ void *objp;
+
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ /* get obj pointer */
+ slabp->inuse++;
+ objp = slabp->s_mem + slabp->free*cachep->objsize;
+ slabp->free=slab_bufctl(slabp)[slabp->free];
+
+ if (slabp->free == BUFCTL_END)
+ /* slab now full: move to next slab for next alloc */
+ cachep->firstnotfull = slabp->list.next;
+#if DEBUG
+ if (cachep->flags & SLAB_POISON)
+ if (kmem_check_poison_obj(cachep, objp))
+ BUG();
+ if (cachep->flags & SLAB_RED_ZONE) {
+ /* Set alloc red-zone, and check old one. */
+ if (xchg((unsigned long *)objp, RED_MAGIC2) !=
+ RED_MAGIC1)
+ BUG();
+ if (xchg((unsigned long *)(objp+cachep->objsize -
+ BYTES_PER_WORD), RED_MAGIC2) != RED_MAGIC1)
+ BUG();
+ objp += BYTES_PER_WORD;
}
+#endif
+ return objp;
}
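
With SLAB_RED_ZONE the object is bracketed by two guard words that are swapped between RED_MAGIC1 and RED_MAGIC2 on allocation and free, so a write past either end (or, in the kernel, a double free) trips a BUG(). A hedged userspace approximation of the overrun check only; the magic values and sizes here are arbitrary:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WORD		sizeof(unsigned long)
#define MAGIC_FREE	0x5a2cf071UL		/* marks a free object */
#define MAGIC_ALLOC	0x170fc2a5UL		/* marks an allocated object */

/* hand out 'size' bytes bracketed by two guard words marked "allocated" */
static void *guarded_alloc(size_t size)
{
	unsigned char *raw = malloc(size + 2 * WORD);

	assert(raw);
	*(unsigned long *)raw = MAGIC_ALLOC;
	*(unsigned long *)(raw + WORD + size) = MAGIC_ALLOC;
	return raw + WORD;
}

/* verify both guards before releasing; a corrupted guard means an overrun */
static void guarded_free(void *obj, size_t size)
{
	unsigned char *raw = (unsigned char *)obj - WORD;

	assert(*(unsigned long *)raw == MAGIC_ALLOC);
	assert(*(unsigned long *)(raw + WORD + size) == MAGIC_ALLOC);
	*(unsigned long *)raw = MAGIC_FREE;
	*(unsigned long *)(raw + WORD + size) = MAGIC_FREE;
	free(raw);
}

int main(void)
{
	char *p = guarded_alloc(16);

	memset(p, 0, 16);	/* stays inside the object, guards untouched */
	guarded_free(p, 16);
	puts("no red-zone violation detected");
	return 0;
}
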
-/* Called with cache lock held. */
-static inline void
-kmem_cache_one_free(kmem_cache_t *cachep, kmem_slab_t *slabp)
+/*
+ * Returns a ptr to an obj in the given cache.
+ * caller must guarantee synchronization
+ * #define for the goto optimization 8-)
+ */
+#define kmem_cache_alloc_one(cachep) \
+({ \
+ slab_t *slabp; \
+ \
+ /* Get slab alloc is to come from. */ \
+ { \
+ struct list_head* p = cachep->firstnotfull; \
+ if (p == &cachep->slabs) \
+ goto alloc_new_slab; \
+ slabp = list_entry(p,slab_t, list); \
+ } \
+ kmem_cache_alloc_one_tail(cachep, slabp); \
+})
+
+#ifdef CONFIG_SMP
+void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags)
{
- if (slabp->s_nextp->s_inuse == cachep->c_num) {
- kmem_slab_unlink(slabp);
- kmem_slab_link_free(cachep, slabp);
+ int batchcount = cachep->batchcount;
+ cpucache_t* cc = cc_data(cachep);
+
+ spin_lock(&cachep->spinlock);
+ while (batchcount--) {
+ /* Get slab alloc is to come from. */
+ struct list_head *p = cachep->firstnotfull;
+ slab_t *slabp;
+
+ if (p == &cachep->slabs)
+ break;
+ slabp = list_entry(p,slab_t, list);
+ cc_entry(cc)[cc->avail++] =
+ kmem_cache_alloc_one_tail(cachep, slabp);
}
- cachep->c_freep = slabp;
+ spin_unlock(&cachep->spinlock);
+
+ if (cc->avail)
+ return cc_entry(cc)[--cc->avail];
+ return NULL;
}
+#endif
-/* Returns a ptr to an obj in the given cache. */
-static inline void *
-__kmem_cache_alloc(kmem_cache_t *cachep, int flags)
+static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags)
{
- kmem_slab_t *slabp;
- kmem_bufctl_t *bufp;
- void *objp;
- unsigned long save_flags;
+ unsigned long save_flags;
+ void* objp;
- /* Sanity check. */
- if (!cachep)
- goto nul_ptr;
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
+ kmem_cache_alloc_head(cachep, flags);
try_again:
- /* Get slab alloc is to come from. */
- slabp = cachep->c_freep;
-
- /* Magic is a sanity check _and_ says if we need a new slab. */
- if (slabp->s_magic != SLAB_MAGIC_ALLOC)
- goto alloc_new_slab;
- /* DMA requests are 'rare' - keep out of the critical path. */
- if (flags & SLAB_DMA)
- goto search_dma;
-try_again_dma:
- SLAB_STATS_INC_ALLOCED(cachep);
- SLAB_STATS_INC_ACTIVE(cachep);
- SLAB_STATS_SET_HIGH(cachep);
- slabp->s_inuse++;
- bufp = slabp->s_freep;
- slabp->s_freep = bufp->buf_nextp;
- if (slabp->s_freep) {
-ret_obj:
- if (!slabp->s_index) {
- bufp->buf_slabp = slabp;
- objp = ((void*)bufp) - cachep->c_offset;
-finished:
- /* The lock is not needed by the red-zone or poison ops, and the
- * obj has been removed from the slab. Should be safe to drop
- * the lock here.
- */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
-#if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_RED_ZONE)
- goto red_zone;
-ret_red:
- if ((cachep->c_flags & SLAB_POISON) && kmem_check_poison_obj(cachep, objp))
- kmem_report_alloc_err("Bad poison", cachep);
-#endif /* SLAB_DEBUG_SUPPORT */
- return objp;
+ local_irq_save(save_flags);
+#ifdef CONFIG_SMP
+ {
+ cpucache_t *cc = cc_data(cachep);
+
+ if (cc) {
+ if (cc->avail) {
+ STATS_INC_ALLOCHIT(cachep);
+ objp = cc_entry(cc)[--cc->avail];
+ } else {
+ STATS_INC_ALLOCMISS(cachep);
+ objp = kmem_cache_alloc_batch(cachep,flags);
+ if (!objp)
+ goto alloc_new_slab_nolock;
+ }
+ } else {
+ spin_lock(&cachep->spinlock);
+ objp = kmem_cache_alloc_one(cachep);
+ spin_unlock(&cachep->spinlock);
}
- /* Update index ptr. */
- objp = ((bufp-slabp->s_index)*cachep->c_offset) + slabp->s_mem;
- bufp->buf_objp = objp;
- goto finished;
}
- cachep->c_freep = slabp->s_nextp;
- goto ret_obj;
-
-#if SLAB_DEBUG_SUPPORT
-red_zone:
- /* Set alloc red-zone, and check old one. */
- if (xchg((unsigned long *)objp, SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
- kmem_report_alloc_err("Bad front redzone", cachep);
- objp += BYTES_PER_WORD;
- if (xchg((unsigned long *)(objp+cachep->c_org_size), SLAB_RED_MAGIC2) != SLAB_RED_MAGIC1)
- kmem_report_alloc_err("Bad rear redzone", cachep);
- goto ret_red;
-#endif /* SLAB_DEBUG_SUPPORT */
-
-search_dma:
- if (slabp->s_dma || (slabp = kmem_cache_search_dma(cachep))!=kmem_slab_end(cachep))
- goto try_again_dma;
+#else
+ objp = kmem_cache_alloc_one(cachep);
+#endif
+ local_irq_restore(save_flags);
+ return objp;
alloc_new_slab:
- /* Either out of slabs, or magic number corruption. */
- if (slabp == kmem_slab_end(cachep)) {
- /* Need a new slab. Release the lock before calling kmem_cache_grow().
- * This allows objs to be released back into the cache while growing.
+#ifdef CONFIG_SMP
+ spin_unlock(&cachep->spinlock);
+alloc_new_slab_nolock:
+#endif
+ local_irq_restore(save_flags);
+ if (kmem_cache_grow(cachep, flags))
+ /* Someone may have stolen our objs. Doesn't matter, we'll
+ * just come back here again.
*/
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- if (kmem_cache_grow(cachep, flags)) {
- /* Someone may have stolen our objs. Doesn't matter, we'll
- * just come back here again.
- */
- spin_lock_irq(&cachep->c_spinlock);
- goto try_again;
- }
- /* Couldn't grow, but some objs may have been freed. */
- spin_lock_irq(&cachep->c_spinlock);
- if (cachep->c_freep != kmem_slab_end(cachep)) {
- if ((flags & SLAB_ATOMIC) == 0)
- goto try_again;
- }
- } else {
- /* Very serious error - maybe panic() here? */
- kmem_report_alloc_err("Bad slab magic (corrupt)", cachep);
- }
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
-err_exit:
+ goto try_again;
return NULL;
-nul_ptr:
- kmem_report_alloc_err("NULL ptr", NULL);
- goto err_exit;
}
-/* Release an obj back to its cache. If the obj has a constructed state,
- * it should be in this state _before_ it is released.
+/*
+ * Release an obj back to its cache. If the obj has a constructed
+ * state, it should be in this state _before_ it is released.
+ * - caller is responsible for the synchronization
*/
-static inline void
-__kmem_cache_free(kmem_cache_t *cachep, void *objp)
+
+#if DEBUG
+# define CHECK_NR(nr) \
+ do { \
+ if (nr >= max_mapnr) { \
+ printk(KERN_ERR "kfree: out of range ptr %lxh.\n", \
+ (unsigned long)objp); \
+ BUG(); \
+ } \
+ } while (0)
+# define CHECK_PAGE(page) \
+ do { \
+ if (!PageSlab(page)) { \
+ printk(KERN_ERR "kfree: bad ptr %lxh.\n", \
+ (unsigned long)objp); \
+ BUG(); \
+ } \
+ } while (0)
+
+#else
+# define CHECK_NR(nr) do { } while (0)
+# define CHECK_PAGE(nr) do { } while (0)
+#endif
+
+static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
{
- kmem_slab_t *slabp;
- kmem_bufctl_t *bufp;
- unsigned long save_flags;
-
- /* Basic sanity checks. */
- if (!cachep || !objp)
- goto null_addr;
-
-#if SLAB_DEBUG_SUPPORT
- /* A verify func is called without the cache-lock held. */
- if (cachep->c_flags & SLAB_DEBUG_INITIAL)
- goto init_state_check;
-finished_initial:
-
- if (cachep->c_flags & SLAB_RED_ZONE)
- goto red_zone;
-return_red:
-#endif /* SLAB_DEBUG_SUPPORT */
-
- spin_lock_irqsave(&cachep->c_spinlock, save_flags);
-
- if (SLAB_BUFCTL(cachep->c_flags))
- goto bufctl;
- bufp = (kmem_bufctl_t *)(objp+cachep->c_offset);
-
- /* Get slab for the object. */
-#if 0
- /* _NASTY_IF/ELSE_, but avoids a 'distant' memory ref for some objects.
- * Is this worth while? XXX
+ slab_t* slabp;
+
+ CHECK_NR(MAP_NR(objp));
+ CHECK_PAGE(mem_map + MAP_NR(objp));
+ /* reduces memory footprint
+ *
+ if (OPTIMIZE(cachep))
+ slabp = (void*)((unsigned long)objp&(~(PAGE_SIZE-1)));
+ else
*/
- if (cachep->c_flags & SLAB_HIGH_PACK)
- slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(bufp)]);
- else
-#endif
- slabp = bufp->buf_slabp;
-
-check_magic:
- if (slabp->s_magic != SLAB_MAGIC_ALLOC) /* Sanity check. */
- goto bad_slab;
-
-#if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_DEBUG_FREE)
- goto extra_checks;
-passed_extra:
-#endif /* SLAB_DEBUG_SUPPORT */
-
- if (slabp->s_inuse) { /* Sanity check. */
- SLAB_STATS_DEC_ACTIVE(cachep);
- slabp->s_inuse--;
- bufp->buf_nextp = slabp->s_freep;
- slabp->s_freep = bufp;
- if (bufp->buf_nextp) {
- if (slabp->s_inuse) {
- /* (hopefully) The most common case. */
-finished:
-#if SLAB_DEBUG_SUPPORT
- if (cachep->c_flags & SLAB_POISON) {
- if (cachep->c_flags & SLAB_RED_ZONE)
- objp += BYTES_PER_WORD;
- kmem_poison_obj(cachep, objp);
- }
-#endif /* SLAB_DEBUG_SUPPORT */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- return;
- }
- kmem_cache_full_free(cachep, slabp);
- goto finished;
- }
- kmem_cache_one_free(cachep, slabp);
- goto finished;
+ slabp = GET_PAGE_SLAB(mem_map + MAP_NR(objp));
+
+#if DEBUG
+ if (cachep->flags & SLAB_DEBUG_INITIAL)
+ /* Need to call the slab's constructor so the
+		 * caller can verify its state (debugging).
+ * Called without the cache-lock held.
+ */
+ cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ objp -= BYTES_PER_WORD;
+ if (xchg((unsigned long *)objp, RED_MAGIC1) != RED_MAGIC2)
+ /* Either write before start, or a double free. */
+ BUG();
+ if (xchg((unsigned long *)(objp+cachep->objsize -
+ BYTES_PER_WORD), RED_MAGIC1) != RED_MAGIC2)
+ /* Either write past end, or a double free. */
+ BUG();
}
+ if (cachep->flags & SLAB_POISON)
+ kmem_poison_obj(cachep, objp);
+ if (kmem_extra_free_checks(cachep, slabp, objp))
+ return;
+#endif
+ {
+ unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
- /* Don't add to freelist. */
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("free with no active objs", objp, cachep);
- return;
-bufctl:
- /* No 'extra' checks are performed for objs stored this way, finding
- * the obj is check enough.
- */
- slabp = SLAB_GET_PAGE_SLAB(&mem_map[MAP_NR(objp)]);
- bufp = &slabp->s_index[(objp - slabp->s_mem)/cachep->c_offset];
- if (bufp->buf_objp == objp)
- goto check_magic;
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("Either bad obj addr or double free", objp, cachep);
+ slab_bufctl(slabp)[objnr] = slabp->free;
+ slabp->free = objnr;
+ }
+ STATS_DEC_ACTIVE(cachep);
+
+ /* fixup slab chain */
+ if (slabp->inuse-- == cachep->num)
+ goto moveslab_partial;
+ if (!slabp->inuse)
+ goto moveslab_free;
return;
-#if SLAB_DEBUG_SUPPORT
-init_state_check:
- /* Need to call the slab's constructor so the
- * caller can perform a verify of its state (debugging).
+
+moveslab_partial:
+ /* was full.
+	 * Even if the page is now empty, we can set firstnotfull to
+ * slabp: there are no partial slabs in this case
*/
- cachep->c_ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
- goto finished_initial;
-extra_checks:
- if (!kmem_extra_free_checks(cachep, slabp->s_freep, bufp, objp)) {
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- kmem_report_free_err("Double free detected during checks", objp, cachep);
+ {
+ struct list_head *t = cachep->firstnotfull;
+
+ cachep->firstnotfull = &slabp->list;
+ if (slabp->list.next == t)
+ return;
+ list_del(&slabp->list);
+ list_add_tail(&slabp->list, t);
return;
}
- goto passed_extra;
-red_zone:
- /* We do not hold the cache-lock while checking the red-zone.
+moveslab_free:
+ /*
+ * was partial, now empty.
+	 * firstnotfull might point to slabp
+ * FIXME: optimize
*/
- objp -= BYTES_PER_WORD;
- if (xchg((unsigned long *)objp, SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
- /* Either write before start of obj, or a double free. */
- kmem_report_free_err("Bad front redzone", objp, cachep);
- }
- if (xchg((unsigned long *)(objp+cachep->c_org_size+BYTES_PER_WORD), SLAB_RED_MAGIC1) != SLAB_RED_MAGIC2) {
- /* Either write past end of obj, or a double free. */
- kmem_report_free_err("Bad rear redzone", objp, cachep);
+ {
+ struct list_head *t = cachep->firstnotfull->prev;
+
+ list_del(&slabp->list);
+ list_add_tail(&slabp->list, &cachep->slabs);
+ if (cachep->firstnotfull == &slabp->list)
+ cachep->firstnotfull = t->next;
+ return;
}
- goto return_red;
-#endif /* SLAB_DEBUG_SUPPORT */
-
-bad_slab:
- /* Slab doesn't contain the correct magic num. */
- if (slabp->s_magic == SLAB_MAGIC_DESTROYED) {
- /* Magic num says this is a destroyed slab. */
- kmem_report_free_err("free from inactive slab", objp, cachep);
- } else
- kmem_report_free_err("Bad obj addr", objp, cachep);
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
-
-#if 1
-/* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
- BUG();
+}
+
+#ifdef CONFIG_SMP
+static inline void __free_block (kmem_cache_t* cachep,
+ void** objpp, int len)
+{
+ for ( ; len > 0; len--, objpp++)
+ kmem_cache_free_one(cachep, *objpp);
+}
+
+static void free_block (kmem_cache_t* cachep, void** objpp, int len)
+{
+ spin_lock(&cachep->spinlock);
+ __free_block(cachep, objpp, len);
+ spin_unlock(&cachep->spinlock);
+}
#endif
- return;
-null_addr:
- kmem_report_free_err("NULL ptr", objp, cachep);
- return;
+/*
+ * __kmem_cache_free
+ * called with disabled ints
+ */
+static inline void __kmem_cache_free (kmem_cache_t *cachep, void* objp)
+{
+#ifdef CONFIG_SMP
+ cpucache_t *cc = cc_data(cachep);
+
+ CHECK_NR(MAP_NR(objp));
+ CHECK_PAGE(mem_map + MAP_NR(objp));
+ if (cc) {
+ int batchcount;
+ if (cc->avail < cc->limit) {
+ STATS_INC_FREEHIT(cachep);
+ cc_entry(cc)[cc->avail++] = objp;
+ return;
+ }
+ STATS_INC_FREEMISS(cachep);
+ batchcount = cachep->batchcount;
+ cc->avail -= batchcount;
+ free_block(cachep,
+ &cc_entry(cc)[cc->avail],batchcount);
+ cc_entry(cc)[cc->avail++] = objp;
+ return;
+ } else {
+ free_block(cachep, &objp, 1);
+ }
+#else
+ kmem_cache_free_one(cachep, objp);
+#endif
}
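
On SMP every cache now carries a small per-CPU array (cpucache_t): kmem_cache_free() stashes objects locally until 'limit' is reached and then flushes 'batchcount' of them back under the spinlock, while allocation drains the local array before touching the slab lists. A single-threaded model of that avail/limit/batchcount behaviour, with the locked slow path reduced to counters:

#include <stdio.h>

#define LIMIT		8	/* like cc->limit */
#define BATCHCOUNT	4	/* like cachep->batchcount */

static void *entries[LIMIT];	/* like cc_entry(cc)[] */
static int avail;		/* like cc->avail */
static int slow_frees, slow_allocs;

/* stand-ins for the spinlock-protected slab-list path */
static void slow_free(void *obj)
{
	(void)obj;
	slow_frees++;
}

static void *slow_alloc(void)
{
	static char dummy;

	slow_allocs++;
	return &dummy;
}

static void cache_free(void *obj)
{
	int i;

	if (avail < LIMIT) {		/* fast path: keep the object locally */
		entries[avail++] = obj;
		return;
	}
	avail -= BATCHCOUNT;		/* array full: flush one batch */
	for (i = 0; i < BATCHCOUNT; i++)
		slow_free(entries[avail + i]);
	entries[avail++] = obj;
}

static void *cache_alloc(void)
{
	if (avail)			/* fast path: reuse a locally cached object */
		return entries[--avail];
	return slow_alloc();		/* miss: fall back to the slab lists */
}

int main(void)
{
	static char objs[32];
	int i;

	for (i = 0; i < 20; i++)
		cache_free(&objs[i]);
	for (i = 0; i < 5; i++)
		cache_alloc();
	printf("avail=%d, slow frees=%d, slow allocs=%d\n",
	       avail, slow_frees, slow_allocs);
	return 0;
}

In the patch, kmem_tune_cpucache() is what sets 'limit' and 'batchcount' per cache, and drain_cache() empties the per-CPU arrays before a shrink.
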
/**
@@ -1709,34 +1452,19 @@ null_addr:
* Allocate an object from this cache. The flags are only relevant
* if the cache has no available objects.
*/
-void *
-kmem_cache_alloc(kmem_cache_t *cachep, int flags)
+void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
{
return __kmem_cache_alloc(cachep, flags);
}
/**
- * kmem_cache_free - Deallocate an object
- * @cachep: The cache the allocation was from.
- * @objp: The previously allocated object.
- *
- * Free an object which was previously allocated from this
- * cache.
- */
-void
-kmem_cache_free(kmem_cache_t *cachep, void *objp)
-{
- __kmem_cache_free(cachep, objp);
-}
-
-/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
* kmalloc is the normal method of allocating memory
* in the kernel. The @flags argument may be one of:
- *
+ *
* %GFP_BUFFER - XXX
*
* %GFP_ATOMIC - allocation will not sleep. Use inside interrupt handlers.
@@ -1750,371 +1478,539 @@ kmem_cache_free(kmem_cache_t *cachep, void *objp)
*
* %GFP_KSWAPD - Don't use unless you're modifying kswapd.
*/
-void *
-kmalloc(size_t size, int flags)
+void * kmalloc (size_t size, int flags)
{
- cache_sizes_t *csizep = cache_sizes;
+ cache_sizes_t *csizep = cache_sizes;
for (; csizep->cs_size; csizep++) {
if (size > csizep->cs_size)
continue;
- return __kmem_cache_alloc(csizep->cs_cachep, flags);
+ return __kmem_cache_alloc(flags & GFP_DMA ?
+ csizep->cs_dmacachep : csizep->cs_cachep, flags);
}
- printk(KERN_ERR "kmalloc: Size (%lu) too large\n", (unsigned long) size);
+	BUG(); // requested size too large
return NULL;
}
/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
*
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
+ * Free an object which was previously allocated from this
+ * cache.
*/
-void
-kfree(const void *objp)
+void kmem_cache_free (kmem_cache_t *cachep, void *objp)
{
- struct page *page;
- int nr;
-
- if (!objp)
- goto null_ptr;
- nr = MAP_NR(objp);
- if (nr >= max_mapnr)
- goto bad_ptr;
-
- /* Assume we own the page structure - hence no locking.
- * If someone is misbehaving (for example, calling us with a bad
- * address), then access to the page structure can race with the
- * kmem_slab_destroy() code. Need to add a spin_lock to each page
- * structure, which would be useful in threading the gfp() functions....
- */
- page = &mem_map[nr];
- if (PageSlab(page)) {
- kmem_cache_t *cachep;
-
- /* Here, we again assume the obj address is good.
- * If it isn't, and happens to map onto another
- * general cache page which has no active objs, then
- * we race.
- */
- cachep = SLAB_GET_PAGE_CACHE(page);
- if (cachep && (cachep->c_flags & SLAB_CFLGS_GENERAL)) {
- __kmem_cache_free(cachep, (void *)objp);
- return;
- }
- }
-bad_ptr:
- printk(KERN_ERR "kfree: Bad obj %p\n", objp);
-
-#if 1
-/* FORCE A KERNEL DUMP WHEN THIS HAPPENS. SPEAK IN ALL CAPS. GET THE CALL CHAIN. */
-BUG();
+ unsigned long flags;
+#if DEBUG
+ CHECK_NR(MAP_NR(objp));
+ CHECK_PAGE(mem_map + MAP_NR(objp));
+ if (cachep != GET_PAGE_CACHE(mem_map + MAP_NR(objp)))
+ BUG();
#endif
-null_ptr:
- return;
+ local_irq_save(flags);
+ __kmem_cache_free(cachep, objp);
+ local_irq_restore(flags);
}
/**
- * kfree_s - free previously allocated memory
+ * kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
- * @size: size of object which is being freed.
*
- * This function performs the same task as kfree() except
- * that it can use the extra information to speed up deallocation
- * or perform additional tests.
* Don't free memory not originally allocated by kmalloc()
- * or allocated with a different size, or you will run into trouble.
+ * or you will run into trouble.
*/
-void
-kfree_s(const void *objp, size_t size)
+void kfree (const void *objp)
{
- struct page *page;
- int nr;
+ kmem_cache_t *c;
+ unsigned long flags;
if (!objp)
- goto null_ptr;
- nr = MAP_NR(objp);
- if (nr >= max_mapnr)
- goto null_ptr;
- /* See comment in kfree() */
- page = &mem_map[nr];
- if (PageSlab(page)) {
- kmem_cache_t *cachep;
- /* See comment in kfree() */
- cachep = SLAB_GET_PAGE_CACHE(page);
- if (cachep && cachep->c_flags & SLAB_CFLGS_GENERAL) {
- if (size <= cachep->c_org_size) { /* XXX better check */
- __kmem_cache_free(cachep, (void *)objp);
- return;
- }
- }
- }
-null_ptr:
- printk(KERN_ERR "kfree_s: Bad obj %p\n", objp);
- return;
+ return;
+ local_irq_save(flags);
+ CHECK_NR(MAP_NR(objp));
+ CHECK_PAGE(mem_map + MAP_NR(objp));
+ c = GET_PAGE_CACHE(mem_map + MAP_NR(objp));
+ __kmem_cache_free(c, (void*)objp);
+ local_irq_restore(flags);
}
-kmem_cache_t *
-kmem_find_general_cachep(size_t size)
+kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
{
- cache_sizes_t *csizep = cache_sizes;
+ cache_sizes_t *csizep = cache_sizes;
/* This function could be moved to the header file, and
* made inline so consumers can quickly determine what
* cache pointer they require.
*/
- for (; csizep->cs_size; csizep++) {
+ for ( ; csizep->cs_size; csizep++) {
if (size > csizep->cs_size)
continue;
break;
}
- return csizep->cs_cachep;
+ return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * called with local interrupts disabled
+ */
+static void drain_cache (void* __cachep)
+{
+ kmem_cache_t *cachep = __cachep;
+ cpucache_t *cc = cc_data(cachep);
+
+ if (cc && cc->avail) {
+ free_block(cachep, cc_entry(cc), cc->avail);
+ cc->avail = 0;
+ }
}
+typedef struct ccupdate_struct_s
+{
+ kmem_cache_t* cachep;
+ cpucache_t* new[NR_CPUS];
+} ccupdate_struct_t;
+
+/*
+ * called with local interrupts disabled
+ */
+static void ccupdate_callback (void* __new)
+{
+ ccupdate_struct_t* new = __new;
+ cpucache_t *old = cc_data(new->cachep);
+
+ cc_data(new->cachep) = new->new[smp_processor_id()];
+ new->new[smp_processor_id()] = old;
+}
+
+/* called with cache_chain_sem acquired. */
+static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
+{
+ ccupdate_struct_t new;
+ int i;
+
+ /*
+ * These are admin-provided, so we are more graceful.
+ */
+ if (limit < 0)
+ return -EINVAL;
+ if (batchcount < 0)
+ return -EINVAL;
+ if (batchcount > limit)
+ return -EINVAL;
+ if (limit != 0 && !batchcount)
+ return -EINVAL;
+
+ memset(&new.new,0,sizeof(new.new));
+ if (limit) {
+ for (i = 0; i< smp_num_cpus; i++) {
+ cpucache_t* ccnew;
+
+
+ ccnew = kmalloc(sizeof(void*)*limit+
+ sizeof(cpucache_t), GFP_KERNEL);
+ if (!ccnew)
+ goto oom;
+ ccnew->limit = limit;
+ ccnew->avail = 0;
+ new.new[cpu_logical_map(i)] = ccnew;
+ }
+ }
+ new.cachep = cachep;
+ spin_lock_irq(&cachep->spinlock);
+ cachep->batchcount = batchcount;
+ spin_unlock_irq(&cachep->spinlock);
+
+ smp_call_function(ccupdate_callback,&new,1,1);
+ local_irq_disable();
+ ccupdate_callback(&new);
+ local_irq_enable();
+
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpucache_t* ccold = new.new[cpu_logical_map(i)];
+ if (!ccold)
+ continue;
+ local_irq_disable();
+ free_block(cachep, cc_entry(ccold), ccold->avail);
+ local_irq_enable();
+ kfree(ccold);
+ }
+ return 0;
+oom:
+ for (i--; i >= 0; i--)
+ kfree(new.new[cpu_logical_map(i)]);
+ return -ENOMEM;
+}
+
+static void enable_cpucache (kmem_cache_t *cachep)
+{
+ int err;
+ int limit;
+
+ /* FIXME: optimize */
+ if (cachep->objsize > PAGE_SIZE)
+ return;
+ if (cachep->objsize > 1024)
+ limit = 60;
+ else if (cachep->objsize > 256)
+ limit = 124;
+ else
+ limit = 252;
+
+ err = kmem_tune_cpucache(cachep, limit, limit/2);
+ if (err)
+ printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+ cachep->name, -err);
+}
+
+static void enable_all_cpucaches (void)
+{
+ struct list_head* p;
+
+ down(&cache_chain_sem);
+
+ p = &cache_cache.next;
+ do {
+ kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
+
+ enable_cpucache(cachep);
+ p = cachep->next.next;
+ } while (p != &cache_cache.next);
+
+ up(&cache_chain_sem);
+}
+#endif
/**
* kmem_cache_reap - Reclaim memory from caches.
* @gfp_mask: the type of memory required.
*
* Called from try_to_free_page().
- * This function _cannot_ be called within a int, but it
- * can be interrupted.
*/
-void
-kmem_cache_reap(int gfp_mask)
+void kmem_cache_reap (int gfp_mask)
{
- kmem_slab_t *slabp;
- kmem_cache_t *searchp;
- kmem_cache_t *best_cachep;
- unsigned int scan;
- unsigned int reap_level;
-
- if (in_interrupt()) {
- printk("kmem_cache_reap() called within int!\n");
- return;
- }
-
- /* We really need a test semaphore op so we can avoid sleeping when
- * !wait is true.
- */
- down(&cache_chain_sem);
-
- scan = 10;
- reap_level = 0;
+ slab_t *slabp;
+ kmem_cache_t *searchp;
+ kmem_cache_t *best_cachep;
+ unsigned int best_pages;
+ unsigned int best_len;
+ unsigned int scan;
+
+ if (gfp_mask & __GFP_WAIT)
+ down(&cache_chain_sem);
+ else
+ if (down_trylock(&cache_chain_sem))
+ return;
+ scan = REAP_SCANLEN;
+ best_len = 0;
+ best_pages = 0;
best_cachep = NULL;
searchp = clock_searchp;
do {
- unsigned int full_free;
- unsigned int dma_flag;
+ unsigned int pages;
+ struct list_head* p;
+ unsigned int full_free;
/* It's safe to test this without holding the cache-lock. */
- if (searchp->c_flags & SLAB_NO_REAP)
+ if (searchp->flags & SLAB_NO_REAP)
goto next;
- spin_lock_irq(&searchp->c_spinlock);
- if (searchp->c_growing)
+ /* FIXME: is this really a good idea? */
+ if (gfp_mask & GFP_DMA) {
+ if (!(searchp->gfpflags & GFP_DMA))
+ goto next;
+ } else {
+ if (searchp->gfpflags & GFP_DMA)
+ goto next;
+ }
+ spin_lock_irq(&searchp->spinlock);
+ if (searchp->growing)
goto next_unlock;
- if (searchp->c_dflags & SLAB_CFLGS_GROWN) {
- searchp->c_dflags &= ~SLAB_CFLGS_GROWN;
+ if (searchp->dflags & DFLGS_GROWN) {
+ searchp->dflags &= ~DFLGS_GROWN;
goto next_unlock;
}
- /* Sanity check for corruption of static values. */
- if (searchp->c_inuse || searchp->c_magic != SLAB_C_MAGIC) {
- spin_unlock_irq(&searchp->c_spinlock);
- printk(KERN_ERR "kmem_reap: Corrupted cache struct for %s\n", searchp->c_name);
- goto next;
+#ifdef CONFIG_SMP
+ {
+ cpucache_t *cc = cc_data(searchp);
+ if (cc && cc->avail) {
+ __free_block(searchp, cc_entry(cc), cc->avail);
+ cc->avail = 0;
+ }
}
- dma_flag = 0;
- full_free = 0;
+#endif
- /* Count the fully free slabs. There should not be not many,
- * since we are holding the cache lock.
- */
- slabp = searchp->c_lastp;
- while (!slabp->s_inuse && slabp != kmem_slab_end(searchp)) {
- slabp = slabp->s_prevp;
+ full_free = 0;
+ p = searchp->slabs.prev;
+ while (p != &searchp->slabs) {
+ slabp = list_entry(p, slab_t, list);
+ if (slabp->inuse)
+ break;
full_free++;
- if (slabp->s_dma)
- dma_flag++;
+ p = p->prev;
}
- spin_unlock_irq(&searchp->c_spinlock);
-
- if ((gfp_mask & GFP_DMA) && !dma_flag)
- goto next;
-
- if (full_free) {
- if (full_free >= 10) {
- best_cachep = searchp;
- break;
- }
- /* Try to avoid slabs with constructors and/or
- * more than one page per slab (as it can be difficult
- * to get high orders from gfp()).
- */
- if (full_free >= reap_level) {
- reap_level = full_free;
- best_cachep = searchp;
+ /*
+ * Try to avoid slabs with constructors and/or
+ * more than one page per slab (as it can be difficult
+ * to get high orders from gfp()).
+ */
+ pages = full_free * (1<<searchp->gfporder);
+ if (searchp->ctor)
+ pages = (pages*4+1)/5;
+ if (searchp->gfporder)
+ pages = (pages*4+1)/5;
+ if (pages > best_pages) {
+ best_cachep = searchp;
+ best_len = full_free;
+ best_pages = pages;
+ if (full_free >= REAP_PERFECT) {
+ clock_searchp = list_entry(searchp->next.next,
+ kmem_cache_t,next);
+ goto perfect;
}
}
- goto next;
next_unlock:
- spin_unlock_irq(&searchp->c_spinlock);
+ spin_unlock_irq(&searchp->spinlock);
next:
- searchp = searchp->c_nextp;
+ searchp = list_entry(searchp->next.next,kmem_cache_t,next);
} while (--scan && searchp != clock_searchp);
clock_searchp = searchp;
- if (!best_cachep) {
+ if (!best_cachep)
/* couldn't find anything to reap */
goto out;
- }
- spin_lock_irq(&best_cachep->c_spinlock);
- while (!best_cachep->c_growing &&
- !(slabp = best_cachep->c_lastp)->s_inuse &&
- slabp != kmem_slab_end(best_cachep)) {
- if (gfp_mask & GFP_DMA) {
- do {
- if (slabp->s_dma)
- goto good_dma;
- slabp = slabp->s_prevp;
- } while (!slabp->s_inuse && slabp != kmem_slab_end(best_cachep));
-
- /* Didn't found a DMA slab (there was a free one -
- * must have been become active).
- */
- goto dma_fail;
-good_dma:
- }
- if (slabp == best_cachep->c_freep)
- best_cachep->c_freep = slabp->s_nextp;
- kmem_slab_unlink(slabp);
- SLAB_STATS_INC_REAPED(best_cachep);
+ spin_lock_irq(&best_cachep->spinlock);
+perfect:
+ /* free only 80% of the free slabs */
+ best_len = (best_len*4 + 1)/5;
+ for (scan = 0; scan < best_len; scan++) {
+ struct list_head *p;
+
+ if (best_cachep->growing)
+ break;
+ p = best_cachep->slabs.prev;
+ if (p == &best_cachep->slabs)
+ break;
+ slabp = list_entry(p,slab_t,list);
+ if (slabp->inuse)
+ break;
+ list_del(&slabp->list);
+ if (best_cachep->firstnotfull == &slabp->list)
+ best_cachep->firstnotfull = &best_cachep->slabs;
+ STATS_INC_REAPED(best_cachep);
- /* Safe to drop the lock. The slab is no longer linked to the
+ /* Safe to drop the lock. The slab is no longer linked to the
* cache.
*/
- spin_unlock_irq(&best_cachep->c_spinlock);
+ spin_unlock_irq(&best_cachep->spinlock);
kmem_slab_destroy(best_cachep, slabp);
- spin_lock_irq(&best_cachep->c_spinlock);
+ spin_lock_irq(&best_cachep->spinlock);
}
-dma_fail:
- spin_unlock_irq(&best_cachep->c_spinlock);
+ spin_unlock_irq(&best_cachep->spinlock);
out:
up(&cache_chain_sem);
return;
}
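
kmem_cache_reap() scores each candidate by the pages its fully free slabs would return, discounting caches with constructors and multi-page slabs by roughly 20% each, and finally destroys only about 80% of the winner's free slabs. A quick numeric illustration of that scoring; all values are invented:

#include <stdio.h>

/* score a candidate cache roughly the way kmem_cache_reap() does */
static unsigned int reap_score(unsigned int full_free, unsigned int gfporder,
			       int has_ctor)
{
	unsigned int pages = full_free * (1u << gfporder);

	if (has_ctor)
		pages = (pages * 4 + 1) / 5;	/* penalise caches with constructors */
	if (gfporder)
		pages = (pages * 4 + 1) / 5;	/* penalise multi-page slabs */
	return pages;
}

int main(void)
{
	/* cache A: 10 free order-0 slabs, no ctor; cache B: 6 free order-1 slabs, ctor */
	unsigned int a = reap_score(10, 0, 0);
	unsigned int b = reap_score(6, 1, 1);
	unsigned int best_len = 10;	/* free slabs found on the winning cache */

	printf("cache A scores %u pages, cache B scores %u pages\n", a, b);
	printf("only %u of %u free slabs would actually be destroyed\n",
	       (best_len * 4 + 1) / 5, best_len);
	return 0;
}
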
-#if SLAB_SELFTEST
-/* A few v. simple tests */
-static void
-kmem_self_test(void)
+#ifdef CONFIG_PROC_FS
+/* /proc/slabinfo
+ * cache-name num-active-objs total-objs
+ * obj-size num-active-slabs total-slabs
+ * num-pages-per-slab
+ */
+#define FIXUP(t) \
+ do { \
+ if (len <= off) { \
+ off -= len; \
+ len = 0; \
+ } else { \
+ if (len-off > count) \
+ goto t; \
+ } \
+ } while (0)
+
+static int proc_getdata (char*page, char**start, off_t off, int count)
{
- kmem_cache_t *test_cachep;
-
- printk(KERN_INFO "kmem_test() - start\n");
- test_cachep = kmem_cache_create("test-cachep", 16, 0, SLAB_RED_ZONE|SLAB_POISON, NULL, NULL);
- if (test_cachep) {
- char *objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
- if (objp) {
- /* Write in front and past end, red-zone test. */
- *(objp-1) = 1;
- *(objp+16) = 1;
- kmem_cache_free(test_cachep, objp);
-
- /* Mess up poisoning. */
- *objp = 10;
- objp = kmem_cache_alloc(test_cachep, SLAB_KERNEL);
- kmem_cache_free(test_cachep, objp);
-
- /* Mess up poisoning (again). */
- *objp = 10;
- kmem_cache_shrink(test_cachep);
+ struct list_head *p;
+ int len = 0;
+
+ /* Output format version, so at least we can change it without _too_
+ * many complaints.
+ */
+ len += sprintf(page+len, "slabinfo - version: 1.1"
+#if STATS
+ " (statistics)"
+#endif
+#ifdef CONFIG_SMP
+ " (SMP)"
+#endif
+ "\n");
+ FIXUP(got_data);
+
+ down(&cache_chain_sem);
+ p = &cache_cache.next;
+ do {
+ kmem_cache_t *cachep;
+ struct list_head *q;
+ slab_t *slabp;
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs = 0;
+ unsigned long num_slabs;
+ cachep = list_entry(p, kmem_cache_t, next);
+
+ spin_lock_irq(&cachep->spinlock);
+ active_objs = 0;
+ num_slabs = 0;
+ list_for_each(q,&cachep->slabs) {
+ slabp = list_entry(q, slab_t, list);
+ active_objs += slabp->inuse;
+ num_objs += cachep->num;
+ if (slabp->inuse)
+ active_slabs++;
+ else
+ num_slabs++;
}
- }
- printk(KERN_INFO "kmem_test() - finished\n");
+ num_slabs+=active_slabs;
+ num_objs = num_slabs*cachep->num;
+
+ len += sprintf(page+len, "%-17s %6lu %6lu %6u %4lu %4lu %4u",
+ cachep->name, active_objs, num_objs, cachep->objsize,
+ active_slabs, num_slabs, (1<<cachep->gfporder));
+
+#if STATS
+ {
+ unsigned long errors = cachep->errors;
+ unsigned long high = cachep->high_mark;
+ unsigned long grown = cachep->grown;
+ unsigned long reaped = cachep->reaped;
+ unsigned long allocs = cachep->num_allocations;
+
+ len += sprintf(page+len, " : %6lu %7lu %5lu %4lu %4lu",
+ high, allocs, grown, reaped, errors);
+ }
+#endif
+#ifdef CONFIG_SMP
+ {
+ unsigned int batchcount = cachep->batchcount;
+ unsigned int limit;
+
+ if (cc_data(cachep))
+ limit = cc_data(cachep)->limit;
+ else
+ limit = 0;
+ len += sprintf(page+len, " : %4u %4u",
+ limit, batchcount);
+ }
+#endif
+#if STATS && defined(CONFIG_SMP)
+ {
+ unsigned long allochit = atomic_read(&cachep->allochit);
+ unsigned long allocmiss = atomic_read(&cachep->allocmiss);
+ unsigned long freehit = atomic_read(&cachep->freehit);
+ unsigned long freemiss = atomic_read(&cachep->freemiss);
+ len += sprintf(page+len, " : %6lu %6lu %6lu %6lu",
+ allochit, allocmiss, freehit, freemiss);
+ }
+#endif
+ len += sprintf(page+len,"\n");
+ spin_unlock_irq(&cachep->spinlock);
+ FIXUP(got_data_up);
+ p = cachep->next.next;
+ } while (p != &cache_cache.next);
+got_data_up:
+ up(&cache_chain_sem);
+
+got_data:
+ *start = page+off;
+ return len;
}
-#endif /* SLAB_SELFTEST */
-#if defined(CONFIG_PROC_FS)
/**
- * get_slabinfo - generates /proc/slabinfo
- * @buf: the buffer to write it into
+ * slabinfo_read_proc - generates /proc/slabinfo
+ * @page: scratch area, one page long
+ * @start: pointer to the pointer to the output buffer
+ * @off: offset within /proc/slabinfo the caller is interested in
+ * @count: requested len in bytes
+ * @eof: eof marker
+ * @data: unused
*
* The contents of the buffer are
* cache-name
* num-active-objs
* total-objs
+ * object size
* num-active-slabs
* total-slabs
* num-pages-per-slab
+ * + further values on SMP and with statistics enabled
*/
-int
-get_slabinfo(char *buf)
+int slabinfo_read_proc (char *page, char **start, off_t off,
+ int count, int *eof, void *data)
{
- kmem_cache_t *cachep;
- kmem_slab_t *slabp;
- unsigned long active_objs;
- unsigned long save_flags;
- unsigned long num_slabs;
- unsigned long num_objs;
- int len=0;
-#if SLAB_STATS
- unsigned long active_slabs;
-#endif /* SLAB_STATS */
-
- __save_flags(save_flags);
+ int len = proc_getdata(page, start, off, count);
+ len -= (*start-page);
+ if (len <= count)
+ *eof = 1;
+ if (len>count) len = count;
+ if (len<0) len = 0;
+ return len;
+}
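
A consumer of the base "slabinfo - version: 1.1" format only needs the first seven columns; the optional statistics and SMP fields follow after a ':'. A minimal sketch of parsing one such line, where the sample line and its numbers are illustrative:

#include <stdio.h>

int main(void)
{
	/* an invented line in the base "slabinfo - version: 1.1" format */
	const char *line = "inode_cache       5133   5160    480  645  645    1";
	char name[32];
	unsigned long active_objs, num_objs, objsize;
	unsigned long active_slabs, num_slabs, pages_per_slab;

	if (sscanf(line, "%31s %lu %lu %lu %lu %lu %lu",
		   name, &active_objs, &num_objs, &objsize,
		   &active_slabs, &num_slabs, &pages_per_slab) != 7) {
		fprintf(stderr, "unexpected slabinfo format\n");
		return 1;
	}
	printf("%s: %lu of %lu objects (%lu bytes each) in %lu slabs\n",
	       name, active_objs, num_objs, objsize, num_slabs);
	return 0;
}
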
- /* Output format version, so at least we can change it without _too_
- * many complaints.
- */
-#if SLAB_STATS
- len = sprintf(buf, "slabinfo - version: 1.0 (statistics)\n");
-#else
- len = sprintf(buf, "slabinfo - version: 1.0\n");
-#endif /* SLAB_STATS */
+#define MAX_SLABINFO_WRITE 128
+/**
+ * slabinfo_write_proc - SMP tuning for the slab allocator
+ * @file:
+ * @buffer: user buffer
+ * @count: data len
+ * @data: unused
+ */
+int slabinfo_write_proc (struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+#ifdef CONFIG_SMP
+ char kbuf[MAX_SLABINFO_WRITE], *tmp;
+ int limit, batchcount, res;
+ struct list_head *p;
+
+ if (count > MAX_SLABINFO_WRITE)
+ return -EINVAL;
+ if (copy_from_user(&kbuf, buffer, count))
+ return -EFAULT;
+
+ tmp = strchr(kbuf, ' ');
+ if (!tmp)
+ return -EINVAL;
+ *tmp = '\0';
+ tmp++;
+ limit = simple_strtol(tmp, &tmp, 10);
+ while (*tmp == ' ')
+ tmp++;
+ batchcount = simple_strtol(tmp, &tmp, 10);
+
+ /* Find the cache in the chain of caches. */
down(&cache_chain_sem);
- cachep = &cache_cache;
- do {
-#if SLAB_STATS
- active_slabs = 0;
-#endif /* SLAB_STATS */
- num_slabs = active_objs = 0;
- spin_lock_irq(&cachep->c_spinlock);
- for (slabp = cachep->c_firstp; slabp != kmem_slab_end(cachep); slabp = slabp->s_nextp) {
- active_objs += slabp->s_inuse;
- num_slabs++;
-#if SLAB_STATS
- if (slabp->s_inuse)
- active_slabs++;
-#endif /* SLAB_STATS */
- }
- num_objs = cachep->c_num*num_slabs;
-#if SLAB_STATS
- {
- unsigned long errors;
- unsigned long high = cachep->c_high_mark;
- unsigned long grown = cachep->c_grown;
- unsigned long reaped = cachep->c_reaped;
- unsigned long allocs = cachep->c_num_allocations;
- errors = (unsigned long) atomic_read(&cachep->c_errors);
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- len += sprintf(buf+len, "%-16s %6lu %6lu %6lu %4lu %4lu %4lu %6lu %7lu %5lu %4lu %4lu\n",
- cachep->c_name, active_objs, num_objs, cachep->c_offset, active_slabs, num_slabs,
- (1<<cachep->c_gfporder)*num_slabs,
- high, allocs, grown, reaped, errors);
+ res = -EINVAL;
+ list_for_each(p,&cache_chain) {
+ kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
+
+ if (!strcmp(cachep->name, kbuf)) {
+ res = kmem_tune_cpucache(cachep, limit, batchcount);
+ break;
}
-#else
- spin_unlock_irqrestore(&cachep->c_spinlock, save_flags);
- len += sprintf(buf+len, "%-17s %6lu %6lu %6lu\n", cachep->c_name, active_objs, num_objs, cachep->c_offset);
-#endif /* SLAB_STATS */
- } while ((cachep = cachep->c_nextp) != &cache_cache);
+ }
up(&cache_chain_sem);
-
- return len;
+ if (res >= 0)
+ res = count;
+ return res;
+#else
+ return -EINVAL;
+#endif
}
-#endif /* CONFIG_PROC_FS */
+#endif
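
The write handler expects a single line of the form "cache-name limit batchcount", which lets the per-CPU arrays be tuned from user space on SMP. A hypothetical example of issuing such a write; the cache name, the numbers and the availability of write permission on /proc/slabinfo are assumptions, not taken from the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* "<cache-name> <limit> <batchcount>", as parsed by slabinfo_write_proc() */
	const char *cmd = "size-512 124 62\n";
	FILE *f = fopen("/proc/slabinfo", "w");

	if (!f) {
		perror("/proc/slabinfo");
		return 1;
	}
	if (fwrite(cmd, 1, strlen(cmd), f) != strlen(cmd))
		perror("write");
	fclose(f);
	return 0;
}
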
diff --git a/net/atm/Makefile b/net/atm/Makefile
index a43d790b1..c21cdcad1 100644
--- a/net/atm/Makefile
+++ b/net/atm/Makefile
@@ -25,7 +25,7 @@ ifeq ($(CONFIG_NET_SCH_ATM),y)
NEED_IPCOM = ipcommon.o
endif
-O_OBJS += $(NEED_IPCOM)
+OX_OBJS += $(NEED_IPCOM)
ifeq ($(CONFIG_PROC_FS),y)
OX_OBJS += proc.o
diff --git a/net/atm/clip.c b/net/atm/clip.c
index c2b6788c9..ca79e0066 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -274,14 +274,14 @@ static void clip_neigh_error(struct neighbour *neigh,struct sk_buff *skb)
static struct neigh_ops clip_neigh_ops = {
- AF_INET, /* family */
- clip_neigh_destroy, /* destructor */
- clip_neigh_solicit, /* solicit */
- clip_neigh_error, /* error_report */
- dev_queue_xmit, /* output */
- dev_queue_xmit, /* connected_output */
- dev_queue_xmit, /* hh_output */
- dev_queue_xmit /* queue_xmit */
+ family: AF_INET,
+ destructor: clip_neigh_destroy,
+ solicit: clip_neigh_solicit,
+ error_report: clip_neigh_error,
+ output: dev_queue_xmit,
+ connected_output: dev_queue_xmit,
+ hh_output: dev_queue_xmit,
+ queue_xmit: dev_queue_xmit,
};
@@ -384,6 +384,7 @@ static int clip_start_xmit(struct sk_buff *skb,struct net_device *dev)
if (!skb->dst) {
printk(KERN_ERR "clip_start_xmit: skb->dst == NULL\n");
dev_kfree_skb(skb);
+ clip_priv->stats.tx_dropped++;
return 0;
}
if (!skb->dst->neighbour) {
@@ -395,8 +396,10 @@ static int clip_start_xmit(struct sk_buff *skb,struct net_device *dev)
return 0;
}
#endif
-printk("clip_start_xmit: NO NEIGHBOUR !\n");
-return 0;
+ printk(KERN_ERR "clip_start_xmit: NO NEIGHBOUR !\n");
+ dev_kfree_skb(skb);
+ clip_priv->stats.tx_dropped++;
+ return 0;
}
entry = NEIGH2ENTRY(skb->dst->neighbour);
if (!entry->vccs) {
@@ -440,7 +443,6 @@ return 0;
entry->vccs->xoff = 0;
return 0;
}
- if (old) return 0;
spin_lock_irqsave(&clip_priv->xoff_lock,flags);
netif_stop_queue(dev); /* XOFF -> throttle immediately */
barrier();
@@ -482,6 +484,7 @@ int clip_mkip(struct atm_vcc *vcc,int timeout)
clip_vcc->old_pop = vcc->pop;
vcc->push = clip_push;
vcc->pop = clip_pop;
+ skb_queue_head_init(&copy);
skb_migrate(&vcc->recvq,&copy);
/* re-process everything received between connection setup and MKIP */
while ((skb = skb_dequeue(&copy)))
@@ -622,7 +625,7 @@ static int clip_device_event(struct notifier_block *this,unsigned long event,
DPRINTK("clip_device_event NETDEV_UP\n");
(void) to_atmarpd(act_up,PRIV(dev)->number,0);
break;
- case NETDEV_DOWN:
+ case NETDEV_GOING_DOWN:
DPRINTK("clip_device_event NETDEV_DOWN\n");
(void) to_atmarpd(act_down,PRIV(dev)->number,0);
break;
@@ -633,6 +636,7 @@ static int clip_device_event(struct notifier_block *this,unsigned long event,
break;
case NETDEV_REBOOT:
case NETDEV_REGISTER:
+ case NETDEV_DOWN:
DPRINTK("clip_device_event %ld\n",event);
/* ignore */
break;
diff --git a/net/atm/common.c b/net/atm/common.c
index 867085ed8..16a6fb6b4 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -105,6 +105,7 @@ int atm_create(struct socket *sock,int protocol,int family)
vcc->callback = NULL;
memset(&vcc->local,0,sizeof(struct sockaddr_atmsvc));
memset(&vcc->remote,0,sizeof(struct sockaddr_atmsvc));
+ vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
atomic_set(&vcc->tx_inuse,0);
atomic_set(&vcc->rx_inuse,0);
vcc->push = NULL;
@@ -116,6 +117,7 @@ int atm_create(struct socket *sock,int protocol,int family)
init_waitqueue_head(&vcc->sleep);
skb_queue_head_init(&vcc->recvq);
skb_queue_head_init(&vcc->listenq);
+ sk->sleep = &vcc->sleep;
sock->sk = sk;
return 0;
}
@@ -409,6 +411,7 @@ int atm_sendmsg(struct socket *sock,struct msghdr *m,int total_len,
return vcc->reply;
if (!test_bit(ATM_VF_READY,&vcc->flags)) return -EPIPE;
if (!size) return 0;
+ if (size < 0 || size > vcc->qos.txtp.max_sdu) return -EMSGSIZE;
/* verify_area is done by net/socket.c */
eff = (size+3) & ~3; /* align to word boundary */
add_wait_queue(&vcc->sleep,&wait);
@@ -750,8 +753,10 @@ int atm_ioctl(struct socket *sock,unsigned int cmd,unsigned long arg)
}
-int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
+static int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
{
+ int error;
+
/*
* Don't let the QoS change the already connected AAL type nor the
* traffic class.
@@ -760,6 +765,9 @@ int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class ||
qos->txtp.traffic_class != vcc->qos.txtp.traffic_class)
return -EINVAL;
+ error = adjust_tp(&qos->txtp,qos->aal);
+ if (!error) error = adjust_tp(&qos->rxtp,qos->aal);
+ if (error) return error;
if (!vcc->dev->ops->change_qos) return -EOPNOTSUPP;
if (vcc->family == AF_ATMPVC)
return vcc->dev->ops->change_qos(vcc,qos,ATM_MF_SET);
diff --git a/net/atm/common.h b/net/atm/common.h
index faf1866ac..6330ca31c 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -26,7 +26,6 @@ int atm_getsockopt(struct socket *sock,int level,int optname,char *optval,
int atm_connect_vcc(struct atm_vcc *vcc,int itf,short vpi,int vci);
void atm_release_vcc_sk(struct sock *sk,int free_sk);
-int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos);
void atm_shutdown_dev(struct atm_dev *dev);
int atm_proc_init(void);
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
index d7c4a4d3a..707b74fb6 100644
--- a/net/atm/ipcommon.c
+++ b/net/atm/ipcommon.c
@@ -3,6 +3,7 @@
/* Written 1996-2000 by Werner Almesberger, EPFL LRC/ICA */
+#include <linux/module.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -31,7 +32,11 @@ const unsigned char llc_oui[] = {
/*
- * skb_migrate moves the list at FROM to TO, emptying FROM in the process.
+ * skb_migrate appends the list at "from" to "to", emptying "from" in the
+ * process. skb_migrate is atomic with respect to all other skb operations on
+ * "from" and "to". Note that it locks both lists at the same time, so beware
+ * of potential deadlocks.
+ *
* This function should live in skbuff.c or skbuff.h.
*/
@@ -40,18 +45,26 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
{
struct sk_buff *skb;
unsigned long flags;
+ struct sk_buff *skb_from = (struct sk_buff *) from;
+ struct sk_buff *skb_to = (struct sk_buff *) to;
+ struct sk_buff *prev;
spin_lock_irqsave(&from->lock,flags);
- *to = *from;
- from->prev = (struct sk_buff *) from;
- from->next = (struct sk_buff *) from;
+ spin_lock(&to->lock);
+ prev = from->prev;
+ from->next->prev = to->prev;
+ prev->next = skb_to;
+ to->prev->next = from->next;
+ to->prev = from->prev;
+ for (skb = from->next; skb != skb_to; skb = skb->next)
+ skb->list = to;
+ to->qlen += from->qlen;
+ spin_unlock(&to->lock);
+ from->prev = skb_from;
+ from->next = skb_from;
from->qlen = 0;
spin_unlock_irqrestore(&from->lock,flags);
- spin_lock_init(&to->lock);
- for (skb = ((struct sk_buff *) to)->next;
- skb != (struct sk_buff *) from; skb = skb->next) skb->list = to;
- if (to->next == (struct sk_buff *) from)
- to->next = (struct sk_buff *) to;
- to->next->prev = (struct sk_buff *) to;
- to->prev->next = (struct sk_buff *) to;
}
+
+
+EXPORT_SYMBOL(skb_migrate);
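
The new skb_migrate() splices the whole circular queue at "from" onto the tail of "to" under both locks instead of copying the head structure. The pointer surgery is easier to follow on a bare circular doubly-linked list with sentinel heads; this sketch drops the locking and the sk_buff specifics entirely:

#include <stdio.h>

struct node {
	struct node *next, *prev;
	int val;
};

static void list_init(struct node *head)
{
	head->next = head->prev = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* append everything queued on 'from' to the tail of 'to', leaving 'from' empty */
static void list_migrate(struct node *from, struct node *to)
{
	if (from->next == from)
		return;				/* nothing to move */
	from->next->prev = to->prev;		/* first moved node */
	to->prev->next = from->next;		/* old tail of 'to' */
	from->prev->next = to;			/* last moved node */
	to->prev = from->prev;			/* new tail of 'to' */
	from->next = from->prev = from;		/* 'from' is empty again */
}

int main(void)
{
	struct node a, b;
	struct node n1 = { .val = 1 }, n2 = { .val = 2 }, n3 = { .val = 3 };
	struct node *p;

	list_init(&a);
	list_init(&b);
	list_add_tail(&n1, &a);
	list_add_tail(&n2, &a);
	list_add_tail(&n3, &b);
	list_migrate(&a, &b);			/* b now holds 3, 1, 2 */
	for (p = b.next; p != &b; p = p->next)
		printf("%d ", p->val);
	printf("\n");
	return 0;
}
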
diff --git a/net/atm/ipcommon.h b/net/atm/ipcommon.h
index 30a5583b0..bc1675eca 100644
--- a/net/atm/ipcommon.h
+++ b/net/atm/ipcommon.h
@@ -16,8 +16,8 @@
extern struct net_device *clip_devs;
/*
- * Moves all skbs from "from" to "to". The operation is atomic for "from", but
- * not for "to". "to" may only be accessed after skb_migrate finishes.
+ * Appends all skbs from "from" to "to". The operation is atomic with respect
+ * to all other skb operations on "from" or "to".
*/
void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to);
diff --git a/net/atm/lec.c b/net/atm/lec.c
index f9b14dce5..d9921b408 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -332,23 +332,33 @@ lec_send_packet(struct sk_buff *skb, struct net_device *dev)
lec_h->h_dest[0], lec_h->h_dest[1], lec_h->h_dest[2],
lec_h->h_dest[3], lec_h->h_dest[4], lec_h->h_dest[5]);
ATM_SKB(skb2)->vcc = send_vcc;
- atomic_add(skb2->truesize, &send_vcc->tx_inuse);
ATM_SKB(skb2)->iovcnt = 0;
ATM_SKB(skb2)->atm_options = send_vcc->atm_options;
DPRINTK("%s:sending to vpi:%d vci:%d\n", dev->name,
send_vcc->vpi, send_vcc->vci);
- priv->stats.tx_packets++;
- priv->stats.tx_bytes += skb2->len;
- send_vcc->send(send_vcc, skb2);
+ if (atm_may_send(send_vcc, skb2->len)) {
+ atomic_add(skb2->truesize, &send_vcc->tx_inuse);
+ priv->stats.tx_packets++;
+ priv->stats.tx_bytes += skb2->len;
+ send_vcc->send(send_vcc, skb2);
+ } else {
+ priv->stats.tx_dropped++;
+ dev_kfree_skb(skb2);
+ }
}
ATM_SKB(skb)->vcc = send_vcc;
- atomic_add(skb->truesize, &send_vcc->tx_inuse);
ATM_SKB(skb)->iovcnt = 0;
ATM_SKB(skb)->atm_options = send_vcc->atm_options;
- priv->stats.tx_packets++;
- priv->stats.tx_bytes += skb->len;
- send_vcc->send(send_vcc, skb);
+ if (atm_may_send(send_vcc, skb->len)) {
+ atomic_add(skb->truesize, &send_vcc->tx_inuse);
+ priv->stats.tx_packets++;
+ priv->stats.tx_bytes += skb->len;
+ send_vcc->send(send_vcc, skb);
+ } else {
+ priv->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ }
#if 0
/* Should we wait for card's device driver to notify us? */
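The hunk above repeats the same check for skb2 and skb. A condensed sketch of the intended rule, using the hypothetical helper name lane_xmit_one(): charge tx_inuse and transmit only when atm_may_send() says the VCC still has send-buffer room, otherwise drop the frame and account the drop.

#include <linux/atmdev.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Illustrative only: send one skb on an ATM VCC with buffer accounting. */
static int lane_xmit_one(struct atm_vcc *vcc, struct sk_buff *skb,
                         struct net_device_stats *stats)
{
        if (!atm_may_send(vcc, skb->len)) {
                stats->tx_dropped++;            /* no room; don't queue unbounded data */
                dev_kfree_skb(skb);
                return 0;
        }
        atomic_add(skb->truesize, &vcc->tx_inuse);      /* released when the driver frees the skb */
        stats->tx_packets++;
        stats->tx_bytes += skb->len;
        return vcc->send(vcc, skb);
}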
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 1b3e13ad6..68cd8a034 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -239,7 +239,7 @@ void atm_mpoa_disp_qos(char *page, int *len)
while (qos != NULL) {
ip = (unsigned char *)&qos->ipaddr;
sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(ip));
- *len += sprintf(page + *len, "%%u.%u.%u.%u\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n",
+ *len += sprintf(page + *len, "%u.%u.%u.%u\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n",
NIPQUAD(ipaddr),
qos->qos.txtp.max_pcr, qos->qos.txtp.pcr, qos->qos.txtp.min_pcr, qos->qos.txtp.max_cdv, qos->qos.txtp.max_sdu,
qos->qos.rxtp.max_pcr, qos->qos.rxtp.pcr, qos->qos.rxtp.min_pcr, qos->qos.rxtp.max_cdv, qos->qos.rxtp.max_sdu);
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 79ab6e045..b2b186ac4 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -104,7 +104,7 @@ static int svc_addr(char *buf,struct sockaddr_atmsvc *addr)
strcpy(buf,addr->sas_addr.pub);
len = strlen(addr->sas_addr.pub);
buf += len;
- if (*addr->sas_addr.pub) {
+ if (*addr->sas_addr.prv) {
*buf++ = '+';
len++;
}
@@ -233,9 +233,10 @@ static void svc_info(struct atm_vcc *vcc,char *buf)
int i;
if (!vcc->dev)
- sprintf(buf,sizeof(void *) == 4 ? "N/A@%p%6s" : "N/A@%p%2s",
+ sprintf(buf,sizeof(void *) == 4 ? "N/A@%p%10s" : "N/A@%p%2s",
vcc,"");
- else sprintf(buf,"%3d %3d %5d ",vcc->dev->number,vcc->vpi,vcc->vci);
+ else sprintf(buf,"%3d %3d %5d ",vcc->dev->number,vcc->vpi,
+ vcc->vci);
here = strchr(buf,0);
here += sprintf(here,"%-10s ",vcc_state(vcc));
here += sprintf(here,"%s%s",vcc->remote.sas_addr.pub,
@@ -376,7 +377,7 @@ static int atm_svc_info(loff_t pos,char *buf)
int left;
if (!pos)
- return sprintf(buf,"Itf VPI VCI State Remote\n");
+ return sprintf(buf,"Itf VPI VCI State Remote\n");
left = pos-1;
for (dev = atm_devs; dev; dev = dev->next)
for (vcc = dev->vccs; vcc; vcc = vcc->next)
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 0240aa874..dc7998fd1 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -158,9 +158,9 @@ static int sigd_send(struct atm_vcc *vcc,struct sk_buff *skb)
}
-void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type,
struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
- const struct sockaddr_atmsvc *svc)
+ const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply)
{
struct sk_buff *skb;
struct atmsvc_msg *msg;
@@ -173,21 +173,26 @@ void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
msg->type = type;
*(struct atm_vcc **) &msg->vcc = vcc;
*(struct atm_vcc **) &msg->listen_vcc = listen_vcc;
- msg->reply = 0; /* other ISP applications may use this field */
- if (vcc) {
- msg->qos = vcc->qos;
- msg->sap = vcc->sap;
- }
- if (!svc) msg->svc.sas_family = 0;
- else msg->svc = *svc;
+ msg->reply = reply;
+ if (qos) msg->qos = *qos;
+ if (vcc) msg->sap = vcc->sap;
+ if (svc) msg->svc = *svc;
if (vcc) msg->local = vcc->local;
- if (!pvc) memset(&msg->pvc,0,sizeof(msg->pvc));
- else msg->pvc = *pvc;
+ if (pvc) msg->pvc = *pvc;
sigd_put_skb(skb);
if (vcc) set_bit(ATM_VF_REGIS,&vcc->flags);
}
+void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc)
+{
+ sigd_enq2(vcc,type,listen_vcc,pvc,svc,vcc ? &vcc->qos : NULL,0);
+ /* other ISP applications may use "reply" */
+}
+
+
static void purge_vccs(struct atm_vcc *vcc)
{
while (vcc) {
diff --git a/net/atm/signaling.h b/net/atm/signaling.h
index 30d5d51d4..3b933ddb7 100644
--- a/net/atm/signaling.h
+++ b/net/atm/signaling.h
@@ -17,6 +17,14 @@
extern struct atm_vcc *sigd; /* needed in svc_release */
+/*
+ * sigd_enq is a wrapper for sigd_enq2, covering the more common cases, and
+ * avoiding huge lists of null values.
+ */
+
+void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply);
void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
const struct sockaddr_atmsvc *svc);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 70fa063cb..bffe7aac5 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -77,8 +77,7 @@ static void svc_disconnect(struct atm_vcc *vcc)
as_indicate has been answered */
while ((skb = skb_dequeue(&vcc->listenq))) {
DPRINTK("LISTEN REL\n");
- sigd_enq(NULL,as_reject,vcc,NULL,NULL); /* @@@ should include
- the reason */
+ sigd_enq2(NULL,as_reject,vcc,NULL,NULL,&vcc->qos,0);
dev_kfree_skb(skb);
}
clear_bit(ATM_VF_REGIS,&vcc->flags);
@@ -310,8 +309,8 @@ static int svc_accept(struct socket *sock,struct socket *newsock,int flags)
dev_kfree_skb(skb);
old_vcc->backlog_quota++;
if (error) {
- sigd_enq(NULL,as_reject,old_vcc,NULL,NULL);
- /* @@@ should include the reason */
+ sigd_enq2(NULL,as_reject,old_vcc,NULL,NULL,
+ &old_vcc->qos,error);
return error == -EAGAIN ? -EBUSY : error;
}
/* wait should be short, so we ignore the non-blocking flag */
@@ -348,13 +347,9 @@ static int svc_getname(struct socket *sock,struct sockaddr *sockaddr,
int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos)
{
DECLARE_WAITQUEUE(wait,current);
- struct atm_qos save_qos;
vcc->reply = WAITING;
- save_qos = vcc->qos; /* @@@ really gross hack ... */
- vcc->qos = *qos;
- sigd_enq(vcc,as_modify,NULL,NULL,&vcc->local);
- vcc->qos = save_qos;
+ sigd_enq2(vcc,as_modify,NULL,NULL,&vcc->local,qos,0);
add_wait_queue(&vcc->sleep,&wait);
while (vcc->reply == WAITING && !test_bit(ATM_VF_RELEASED,&vcc->flags)
&& sigd) {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b51d1c4e9..79dc3d629 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -27,6 +27,7 @@
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -616,6 +617,12 @@ drop_nolock:
return(0);
}
+/* Need this wrapper because NF_HOOK takes the function address */
+static inline int do_ip_send(struct sk_buff *skb)
+{
+ return ip_send(skb);
+}
+
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
@@ -829,7 +836,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
stats->tx_bytes += skb->len;
stats->tx_packets++;
- ip_send(skb);
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
tunnel->recursion--;
return 0;
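The same three-line pattern appears again in ipip.c and sit.c below. A condensed sketch of what it does, with the hypothetical name tunnel_output(): the freshly encapsulated packet re-enters netfilter at LOCAL_OUT, and because NF_HOOK needs the address of a plain function to call once the hooks accept the packet, the inline ip_send() gets a trivial wrapper.

#include <linux/socket.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/ip.h>

/* NF_HOOK takes a function address, so wrap the inline ip_send(). */
static inline int do_ip_send(struct sk_buff *skb)
{
        return ip_send(skb);
}

/* Illustrative only: run the LOCAL_OUT hooks, then finish output on skb->dst. */
static int tunnel_output(struct sk_buff *skb, struct net_device *out)
{
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, out, do_ip_send);
}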
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4069795fb..e343f34e8 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.34 2000/05/22 08:12:19 davem Exp $
+ * Version: $Id: ipip.c,v 1.35 2000/07/07 01:55:20 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -107,6 +107,7 @@
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -499,6 +500,12 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
return 0;
}
+/* Need this wrapper because NF_HOOK takes the function address */
+static inline int do_ip_send(struct sk_buff *skb)
+{
+ return ip_send(skb);
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -631,7 +638,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
stats->tx_bytes += skb->len;
stats->tx_packets++;
- ip_send(skb);
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
tunnel->recursion--;
return 0;
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 780afc05b..47e7fb01b 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -303,6 +303,7 @@ icmp_error_track(struct sk_buff *skb,
struct ip_conntrack_tuple_hash *h;
IP_NF_ASSERT(iph->protocol == IPPROTO_ICMP);
+ IP_NF_ASSERT(skb->nfct == NULL);
iph = skb->nh.iph;
hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
@@ -350,10 +351,27 @@ icmp_error_track(struct sk_buff *skb,
DEBUGP("icmp_error_track: Can't invert tuple\n");
return NULL;
}
+
+ *ctinfo = IP_CT_RELATED;
+
h = ip_conntrack_find_get(&innertuple, NULL);
if (!h) {
- DEBUGP("icmp_error_track: no match\n");
- return NULL;
+ /* Locally generated ICMPs will match inverted if they
+ haven't been SNAT'ed yet */
+ /* FIXME: NAT code has to handle half-done double NAT --RR */
+ if (hooknum == NF_IP_LOCAL_OUT)
+ h = ip_conntrack_find_get(&origtuple, NULL);
+
+ if (!h) {
+ DEBUGP("icmp_error_track: no match\n");
+ return NULL;
+ }
+ /* Reverse direction from that found */
+ if (DIRECTION(h) != IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ } else {
+ if (DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
}
/* REJECT target does this commonly, so allow locally
@@ -365,10 +383,6 @@ icmp_error_track(struct sk_buff *skb,
return NULL;
}
- *ctinfo = IP_CT_RELATED;
- if (DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
-
/* Update skb to refer to this connection */
skb->nfct = &h->ctrack->infos[*ctinfo];
return h->ctrack;
@@ -816,7 +830,9 @@ ip_ct_gather_frags(struct sk_buff *skb)
unsigned int olddebug = skb->nf_debug;
#endif
if (sk) sock_hold(sk);
+ local_bh_disable();
skb = ip_defrag(skb);
+ local_bh_enable();
if (!skb) {
if (sk) sock_put(sk);
return skb;
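A condensed sketch of the bracketing added to ip_ct_gather_frags() above, under 2.4-era headers: ip_defrag() expects to run with bottom halves off (it normally runs from the receive softirq), so a caller on the LOCAL_OUT path disables them explicitly. The helper name gather_frags() is illustrative only.

#include <linux/interrupt.h>    /* local_bh_disable()/local_bh_enable() */
#include <linux/skbuff.h>
#include <net/ip.h>

static struct sk_buff *gather_frags(struct sk_buff *skb)
{
        local_bh_disable();
        skb = ip_defrag(skb);   /* may consume the skb and return NULL */
        local_bh_enable();
        return skb;
}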
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index c3b1091cf..2e7547c38 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -181,8 +181,9 @@ static int help(const struct iphdr *iph, size_t len,
connection tracking, not packet filtering.
However, it is neccessary for accurate tracking in
this case. */
- DEBUGP("conntrack_ftp: partial `%.*s'\n",
- (int)datalen, data);
+ if (net_ratelimit())
+ printk("conntrack_ftp: partial %u+%u\n",
+ ntohl(tcph->seq), datalen);
return NF_DROP;
case 0: /* no match */
diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c
index 501dd0463..6f0503e05 100644
--- a/net/ipv4/netfilter/ip_fw_compat.c
+++ b/net/ipv4/netfilter/ip_fw_compat.c
@@ -86,7 +86,8 @@ fw_in(unsigned int hooknum,
int ret = FW_BLOCK;
u_int16_t redirpt;
- (*pskb)->nfcache |= NFC_UNKNOWN;
+ /* Assume worst case: any hook could change packet */
+ (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
(*pskb)->ip_summed = CHECKSUM_NONE;
switch (hooknum) {
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 56b08a9ed..a07749ecb 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -467,7 +467,7 @@ helper_cmp(const struct ip_nat_helper *helper,
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
[NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
- [NF_IP_LOCAL_OUT] = NF_IP_PRE_ROUTING
+ [NF_IP_LOCAL_OUT] = NF_IP_POST_ROUTING
};
unsigned int
@@ -663,8 +663,10 @@ void place_in_hashes(struct ip_conntrack *conntrack,
static void
manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
const struct ip_conntrack_manip *manip,
- enum ip_nat_manip_type maniptype)
+ enum ip_nat_manip_type maniptype,
+ __u32 *nfcache)
{
+ *nfcache |= NFC_ALTERED;
find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);
if (maniptype == IP_NAT_MANIP_SRC) {
@@ -718,7 +720,8 @@ do_bindings(struct ip_conntrack *ct,
(*pskb)->nh.iph,
(*pskb)->len,
&info->manips[i].manip,
- info->manips[i].maniptype);
+ info->manips[i].maniptype,
+ &(*pskb)->nfcache);
}
}
helper = info->helper;
@@ -754,7 +757,7 @@ icmp_reply_translation(struct sk_buff *skb,
(even though a "host unreachable" coming from the host
itself is a bit wierd).
- More explanation: some people use NAT for anonomizing.
+ More explanation: some people use NAT for anonymizing.
Also, CERT recommends dropping all packets from private IP
addresses (although ICMP errors from internal links with
such addresses are not too uncommon, as Alan Cox points
@@ -782,11 +785,11 @@ icmp_reply_translation(struct sk_buff *skb,
manip_pkt(inner->protocol, inner,
skb->len - ((void *)inner - (void *)iph),
&info->manips[i].manip,
- !info->manips[i].maniptype);
+ !info->manips[i].maniptype,
+ &skb->nfcache);
/* Outer packet needs to have IP header NATed like
it's a reply. */
- } else if (info->manips[i].direction == dir
- && info->manips[i].hooknum == hooknum) {
+ } else if (info->manips[i].hooknum == hooknum) {
/* Use mapping to map outer packet: 0 give no
per-proto mapping */
DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
@@ -795,7 +798,8 @@ icmp_reply_translation(struct sk_buff *skb,
IP_PARTS(info->manips[i].manip.ip));
manip_pkt(0, iph, skb->len,
&info->manips[i].manip,
- info->manips[i].maniptype);
+ info->manips[i].maniptype,
+ &skb->nfcache);
}
}
READ_UNLOCK(&ip_nat_lock);
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index 12d40f554..a0de5a351 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -123,7 +123,8 @@ mangle_packet(struct sk_buff **pskb,
if (newlen > (*pskb)->len + skb_tailroom(*pskb)) {
struct sk_buff *newskb;
- newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), newlen,
+ newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
+ newlen - (*pskb)->len,
GFP_ATOMIC);
if (!newskb) {
DEBUGP("ftp: oom\n");
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 3c8f4f2d6..11e16e25e 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -60,8 +60,7 @@ ip_nat_fn(unsigned int hooknum,
IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
& __constant_htons(IP_MF|IP_OFFSET)));
- /* FIXME: One day, fill in properly. --RR */
- (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
+ (*pskb)->nfcache |= NFC_UNKNOWN;
/* If we had a hardware checksum before, it's now invalid */
if ((*pskb)->pkt_type != PACKET_LOOPBACK)
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 82e798f71..792ae1552 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -6,6 +6,8 @@
*
* 2000-03-27: Simplified code (thanks to Andi Kleen for clues). (JM)
* 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). (JM)
+ * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
+ * Zander). (JM)
*
*/
#include <linux/module.h>
@@ -391,6 +393,7 @@ static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp)
pm->data_len = data_len;
pm->timestamp_sec = e->skb->stamp.tv_sec;
pm->timestamp_usec = e->skb->stamp.tv_usec;
+ pm->mark = e->skb->nfmark;
pm->hook = e->info->hook;
if (e->info->indev) strcpy(pm->indev_name, e->info->indev->name);
else pm->indev_name[0] = '\0';
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index c739eda3d..2f9c11915 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -127,8 +127,8 @@ int masq_device_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (event == NETDEV_DOWN) {
- /* Device was downed. Search entire table for
+ if (event == NETDEV_DOWN || event == NETDEV_CHANGEADDR) {
+ /* Device was downed/changed (diald). Search entire table for
conntracks which were associated with that device,
and forget them. */
IP_NF_ASSERT(dev->ifindex != 0);
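For context, a skeleton of the notifier this hunk extends, with hypothetical demo_* names: a block registered via register_netdevice_notifier() now has to flush per-interface state on NETDEV_CHANGEADDR as well as NETDEV_DOWN.

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int demo_device_event(struct notifier_block *this,
                             unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        if (event == NETDEV_DOWN || event == NETDEV_CHANGEADDR) {
                /* drop any state keyed on dev->ifindex here */
                printk(KERN_DEBUG "demo: forgetting state for %s\n", dev->name);
        }
        return NOTIFY_DONE;
}

/* registered once with register_netdevice_notifier(&demo_notifier) */
static struct notifier_block demo_notifier = { demo_device_event, NULL, 0 };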
diff --git a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c
index 54e62c000..d7718b557 100644
--- a/net/ipv4/netfilter/ipt_MIRROR.c
+++ b/net/ipv4/netfilter/ipt_MIRROR.c
@@ -41,23 +41,25 @@ static int route_mirror(struct sk_buff *skb)
struct iphdr *iph = skb->nh.iph;
struct rtable *rt;
- if (ip_route_output(&rt, iph->daddr, iph->saddr,
+ /* Backwards */
+ if (ip_route_output(&rt, iph->saddr, iph->daddr,
RT_TOS(iph->tos) | RTO_CONN,
0)) {
- return -EINVAL;
+ return 0;
}
- /* check if the interface we are living by is the same as the one we arrived on */
+ /* check if the interface we are leaving by is the same as the
+ one we arrived on */
if (skb->rx_dev == rt->u.dst.dev) {
/* Drop old route. */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
- return 0;
+ return 1;
}
- else return -EINVAL;
+ return 0;
}
-static int
+static void
ip_rewrite(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
@@ -69,10 +71,27 @@ ip_rewrite(struct sk_buff *skb)
/* Rewrite IP header */
iph->daddr = odaddr;
iph->saddr = osaddr;
-
- return 0;
}
+/* Stolen from ip_finish_output2 */
+static void ip_direct_send(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct hh_cache *hh = dst->hh;
+
+ if (hh) {
+ read_lock_bh(&hh->hh_lock);
+ memcpy(skb->data - 16, hh->hh_data, 16);
+ read_unlock_bh(&hh->hh_lock);
+ skb_push(skb, hh->hh_len);
+ hh->hh_output(skb);
+ } else if (dst->neighbour)
+ dst->neighbour->output(skb);
+ else {
+ printk(KERN_DEBUG "khm in MIRROR\n");
+ kfree(skb);
+ }
+}
static unsigned int ipt_mirror_target(struct sk_buff **pskb,
unsigned int hooknum,
@@ -82,8 +101,12 @@ static unsigned int ipt_mirror_target(struct sk_buff **pskb,
void *userinfo)
{
if ((*pskb)->dst != NULL) {
- if (!ip_rewrite(*pskb) && !route_mirror(*pskb)) {
- ip_send(*pskb);
+ if (route_mirror(*pskb)) {
+ ip_rewrite(*pskb);
+ /* Don't let conntrack code see this packet:
+ it will think we are starting a new
+ connection! --RR */
+ ip_direct_send(*pskb);
return NF_STOLEN;
}
}
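A standalone restatement of the ip_direct_send() fast path added above, under the hypothetical name direct_send(); note that a socket buffer on the final error path would normally be freed with kfree_skb() rather than kfree().

#include <linux/string.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/neighbour.h>

/* Illustrative only: emit an skb straight to the device chosen by its
 * dst entry, using the cached hardware header when one exists. */
static void direct_send(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst ? dst->hh : NULL;

        if (hh) {
                read_lock_bh(&hh->hh_lock);
                memcpy(skb->data - 16, hh->hh_data, 16);        /* cached L2 header */
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                hh->hh_output(skb);
        } else if (dst && dst->neighbour) {
                dst->neighbour->output(skb);    /* let the neighbour layer resolve/queue it */
        } else {
                kfree_skb(skb);                 /* nowhere to send it; drop */
        }
}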
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
index 1cc17398d..ce280b3c2 100644
--- a/net/ipv4/netfilter/ipt_mac.c
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -33,9 +33,11 @@ ipt_mac_checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ /* FORWARD isn't always valid, but it's nice to be able to do --RR */
if (hook_mask
- & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN))) {
- printk("ipt_mac: only valid for PRE_ROUTING or LOCAL_IN.\n");
+ & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+ | (1 << NF_IP_FORWARD))) {
+ printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
return 0;
}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0e823a16c..114b59daa 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.38 2000/05/03 06:37:07 davem Exp $
+ * $Id: sit.c,v 1.39 2000/07/07 01:55:20 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -29,6 +29,7 @@
#include <linux/icmp.h>
#include <asm/uaccess.h>
#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -404,6 +405,12 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len)
return 0;
}
+/* Need this wrapper because NF_HOOK takes the function address */
+static inline int do_ip_send(struct sk_buff *skb)
+{
+ return ip_send(skb);
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -559,7 +566,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
stats->tx_bytes += skb->len;
stats->tx_packets++;
- ip_send(skb);
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ do_ip_send);
tunnel->recursion--;
return 0;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index ce93ab71c..a908812c5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -484,7 +484,6 @@ call_encode(struct rpc_task *task)
req->rq_rvec[0].iov_len = bufsiz;
req->rq_rlen = bufsiz;
req->rq_rnr = 1;
- req->rq_damaged = 0;
/* Zero buffer so we have automatic zero-padding of opaque & string */
memset(task->tk_buffer, 0, bufsiz);
@@ -603,10 +602,7 @@ call_status(struct rpc_task *task)
rpc_sleep_on(&xprt->sending, task, NULL, NULL);
case -ENOMEM:
case -EAGAIN:
- if (req->rq_damaged)
- task->tk_action = call_encode;
- else
- task->tk_action = call_transmit;
+ task->tk_action = call_transmit;
clnt->cl_stats->rpcretrans++;
break;
default:
@@ -664,10 +660,7 @@ call_timeout(struct rpc_task *task)
minor_timeout:
if (!req)
task->tk_action = call_reserve;
- else if (req->rq_damaged) {
- task->tk_action = call_encode;
- clnt->cl_stats->rpcretrans++;
- } else if (!clnt->cl_port) {
+ else if (!clnt->cl_port) {
task->tk_action = call_bind;
clnt->cl_stats->rpcretrans++;
} else if (clnt->cl_xprt->stream && !clnt->cl_xprt->connected) {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9dc2d1247..93a4fbb18 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -97,6 +97,41 @@ static __inline__ void rpc_unlock_swapbuf(void)
}
/*
+ * Disable the timer for a given RPC task. Should be called with
+ * rpc_queue_lock and bh_disabled in order to avoid races within
+ * rpc_run_timer().
+ */
+static inline void
+__rpc_disable_timer(struct rpc_task *task)
+{
+ dprintk("RPC: %4d disabling timer\n", task->tk_pid);
+ task->tk_timeout_fn = NULL;
+ task->tk_timeout = 0;
+}
+
+/*
+ * Run a timeout function.
+ * We use the callback in order to allow __rpc_wake_up_task()
+ * and friends to disable the timer synchronously on SMP systems
+ * without calling del_timer_sync(). The latter could cause a
+ * deadlock if called while we're holding spinlocks...
+ */
+static void
+rpc_run_timer(struct rpc_task *task)
+{
+ void (*callback)(struct rpc_task *);
+
+ spin_lock_bh(&rpc_queue_lock);
+ callback = task->tk_timeout_fn;
+ task->tk_timeout_fn = NULL;
+ spin_unlock_bh(&rpc_queue_lock);
+ if (callback) {
+ dprintk("RPC: %4d running timer\n", task->tk_pid);
+ callback(task);
+ }
+}
+
+/*
* Set up a timer for the current task.
*/
static inline void
@@ -108,17 +143,11 @@ __rpc_add_timer(struct rpc_task *task, rpc_action timer)
dprintk("RPC: %4d setting alarm for %lu ms\n",
task->tk_pid, task->tk_timeout * 1000 / HZ);
- if (timer_pending(&task->tk_timer)) {
- printk(KERN_ERR "RPC: Bug! Overwriting active timer\n");
- del_timer(&task->tk_timer);
- }
- if (!timer)
- timer = __rpc_default_timer;
- init_timer(&task->tk_timer);
- task->tk_timer.expires = jiffies + task->tk_timeout;
- task->tk_timer.data = (unsigned long) task;
- task->tk_timer.function = (void (*)(unsigned long)) timer;
- add_timer(&task->tk_timer);
+ if (timer)
+ task->tk_timeout_fn = timer;
+ else
+ task->tk_timeout_fn = __rpc_default_timer;
+ mod_timer(&task->tk_timer, jiffies + task->tk_timeout);
}
/*
@@ -133,15 +162,16 @@ void rpc_add_timer(struct rpc_task *task, rpc_action timer)
}
/*
- * Delete any timer for the current task.
+ * Delete any timer for the current task. Because we use del_timer_sync(),
+ * this function should never be called while holding rpc_queue_lock.
*/
static inline void
-__rpc_del_timer(struct rpc_task *task)
+rpc_delete_timer(struct rpc_task *task)
{
- dprintk("RPC: %4d deleting timer\n", task->tk_pid);
- if (timer_pending(&task->tk_timer))
- del_timer(&task->tk_timer);
- task->tk_timeout = 0;
+ if (timer_pending(&task->tk_timer)) {
+ dprintk("RPC: %4d deleting timer\n", task->tk_pid);
+ del_timer_sync(&task->tk_timer);
+ }
}
/*
@@ -223,11 +253,11 @@ rpc_remove_wait_queue(struct rpc_task *task)
static inline void
rpc_make_runnable(struct rpc_task *task)
{
- if (task->tk_timeout) {
+ if (task->tk_timeout_fn) {
printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n");
return;
}
- task->tk_flags |= RPC_TASK_RUNNING;
+ task->tk_running = 1;
if (RPC_IS_ASYNC(task)) {
if (RPC_IS_SLEEPING(task)) {
int status;
@@ -238,10 +268,12 @@ rpc_make_runnable(struct rpc_task *task)
} else
task->tk_sleeping = 0;
}
- wake_up(&rpciod_idle);
+ if (waitqueue_active(&rpciod_idle))
+ wake_up(&rpciod_idle);
} else {
task->tk_sleeping = 0;
- wake_up(&task->tk_wait);
+ if (waitqueue_active(&task->tk_wait))
+ wake_up(&task->tk_wait);
}
}
@@ -267,7 +299,8 @@ void rpciod_wake_up(void)
{
if(rpciod_pid==0)
printk(KERN_ERR "rpciod: wot no daemon?\n");
- wake_up(&rpciod_idle);
+ if (waitqueue_active(&rpciod_idle))
+ wake_up(&rpciod_idle);
}
/*
@@ -301,12 +334,14 @@ __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
task->tk_status = status;
} else {
- task->tk_flags &= ~RPC_TASK_RUNNING;
+ task->tk_running = 0;
+ if (task->tk_callback) {
+ printk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid);
+ BUG();
+ }
task->tk_callback = action;
__rpc_add_timer(task, timer);
}
-
- return;
}
void
@@ -330,20 +365,17 @@ rpc_sleep_locked(struct rpc_wait_queue *q, struct rpc_task *task,
*/
spin_lock_bh(&rpc_queue_lock);
__rpc_sleep_on(q, task, action, timer);
- rpc_lock_task(task);
+ __rpc_lock_task(task);
spin_unlock_bh(&rpc_queue_lock);
}
/*
* Wake up a single task -- must be invoked with spin lock held.
- *
- * It would probably suffice to cli/sti the del_timer and remove_wait_queue
- * operations individually.
*/
static void
-__rpc_wake_up(struct rpc_task *task)
+__rpc_wake_up_task(struct rpc_task *task)
{
- dprintk("RPC: %4d __rpc_wake_up (now %ld inh %d)\n",
+ dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n",
task->tk_pid, jiffies, rpc_inhibit);
#ifdef RPC_DEBUG
@@ -362,7 +394,7 @@ __rpc_wake_up(struct rpc_task *task)
if (RPC_IS_RUNNING(task))
return;
- __rpc_del_timer(task);
+ __rpc_disable_timer(task);
/* If the task has been locked, then set tk_wakeup so that
* rpc_unlock_task() wakes us up... */
@@ -374,10 +406,9 @@ __rpc_wake_up(struct rpc_task *task)
if (task->tk_rpcwait != &schedq)
__rpc_remove_wait_queue(task);
- task->tk_flags |= RPC_TASK_CALLBACK;
rpc_make_runnable(task);
- dprintk("RPC: __rpc_wake_up done\n");
+ dprintk("RPC: __rpc_wake_up_task done\n");
}
/*
@@ -388,7 +419,6 @@ __rpc_default_timer(struct rpc_task *task)
{
dprintk("RPC: %d timeout (default timer)\n", task->tk_pid);
task->tk_status = -ETIMEDOUT;
- task->tk_timeout = 0;
rpc_wake_up_task(task);
}
@@ -401,7 +431,7 @@ rpc_wake_up_task(struct rpc_task *task)
if (RPC_IS_RUNNING(task))
return;
spin_lock_bh(&rpc_queue_lock);
- __rpc_wake_up(task);
+ __rpc_wake_up_task(task);
spin_unlock_bh(&rpc_queue_lock);
}
@@ -416,7 +446,7 @@ rpc_wake_up_next(struct rpc_wait_queue *queue)
dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue));
spin_lock_bh(&rpc_queue_lock);
if ((task = queue->task) != 0)
- __rpc_wake_up(task);
+ __rpc_wake_up_task(task);
spin_unlock_bh(&rpc_queue_lock);
return task;
@@ -430,7 +460,7 @@ rpc_wake_up(struct rpc_wait_queue *queue)
{
spin_lock_bh(&rpc_queue_lock);
while (queue->task)
- __rpc_wake_up(queue->task);
+ __rpc_wake_up_task(queue->task);
spin_unlock_bh(&rpc_queue_lock);
}
@@ -445,7 +475,7 @@ rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
spin_lock_bh(&rpc_queue_lock);
while ((task = queue->task) != NULL) {
task->tk_status = status;
- __rpc_wake_up(task);
+ __rpc_wake_up_task(task);
}
spin_unlock_bh(&rpc_queue_lock);
}
@@ -458,7 +488,7 @@ rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
* rpc_queue_lock held.
*/
int
-rpc_lock_task(struct rpc_task *task)
+__rpc_lock_task(struct rpc_task *task)
{
if (!RPC_IS_RUNNING(task))
return ++task->tk_lock;
@@ -470,7 +500,7 @@ rpc_unlock_task(struct rpc_task *task)
{
spin_lock_bh(&rpc_queue_lock);
if (task->tk_lock && !--task->tk_lock && task->tk_wakeup)
- __rpc_wake_up(task);
+ __rpc_wake_up_task(task);
spin_unlock_bh(&rpc_queue_lock);
}
@@ -517,7 +547,6 @@ __rpc_execute(struct rpc_task *task)
/* Define a callback save pointer */
void (*save_callback)(struct rpc_task *);
- task->tk_flags &= ~RPC_TASK_CALLBACK;
/*
* If a callback exists, save it, reset it,
* call it.
@@ -525,11 +554,9 @@ __rpc_execute(struct rpc_task *task)
* another callback set within the callback handler
* - Dave
*/
- if (task->tk_callback) {
- save_callback=task->tk_callback;
- task->tk_callback=NULL;
- save_callback(task);
- }
+ save_callback=task->tk_callback;
+ task->tk_callback=NULL;
+ save_callback(task);
}
/*
@@ -538,6 +565,10 @@ __rpc_execute(struct rpc_task *task)
* by someone else.
*/
if (RPC_IS_RUNNING(task)) {
+ /*
+ * Garbage collection of pending timers...
+ */
+ rpc_delete_timer(task);
if (!task->tk_action)
break;
task->tk_action(task);
@@ -639,7 +670,7 @@ rpc_execute(struct rpc_task *task)
}
task->tk_active = 1;
- task->tk_flags |= RPC_TASK_RUNNING;
+ task->tk_running = 1;
return __rpc_execute(task);
out_release:
rpc_release_task(task);
@@ -758,6 +789,8 @@ rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
{
memset(task, 0, sizeof(*task));
init_timer(&task->tk_timer);
+ task->tk_timer.data = (unsigned long) task;
+ task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer;
task->tk_client = clnt;
task->tk_flags = flags;
task->tk_exit = callback;
@@ -864,8 +897,8 @@ rpc_release_task(struct rpc_task *task)
/* Protect the execution below. */
spin_lock_bh(&rpc_queue_lock);
- /* Delete any running timer */
- __rpc_del_timer(task);
+ /* Disable timer to prevent zombie wakeup */
+ __rpc_disable_timer(task);
/* Remove from any wait queue we're still on */
__rpc_remove_wait_queue(task);
@@ -874,6 +907,9 @@ rpc_release_task(struct rpc_task *task)
spin_unlock_bh(&rpc_queue_lock);
+ /* Synchronously delete any running timer */
+ rpc_delete_timer(task);
+
/* Release resources */
if (task->tk_rqstp)
xprt_release(task);
@@ -921,7 +957,7 @@ rpc_child_exit(struct rpc_task *child)
spin_lock_bh(&rpc_queue_lock);
if ((parent = rpc_find_parent(child)) != NULL) {
parent->tk_status = child->tk_status;
- __rpc_wake_up(parent);
+ __rpc_wake_up_task(parent);
}
spin_unlock_bh(&rpc_queue_lock);
}
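The timer rework above boils down to one indirection, shown here with hypothetical demo_* names: the kernel timer always fires a fixed dispatcher that consumes a per-task callback pointer under the queue lock, so the timeout can be "disabled" by clearing that pointer while the lock is held, and del_timer_sync() is only needed once the lock has been dropped (as in rpc_release_task()).

#include <linux/timer.h>
#include <linux/spinlock.h>
#include <linux/sched.h>        /* jiffies */

struct demo_task {
        struct timer_list       timer;
        void                    (*timeout_fn)(struct demo_task *);
};

static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;

/* Fixed timer handler: claim the callback exactly once, then run it
 * without holding the lock. */
static void demo_run_timer(unsigned long data)
{
        struct demo_task *task = (struct demo_task *) data;
        void (*fn)(struct demo_task *);

        spin_lock_bh(&demo_lock);
        fn = task->timeout_fn;
        task->timeout_fn = NULL;
        spin_unlock_bh(&demo_lock);
        if (fn)
                fn(task);
}

static void demo_init_timer(struct demo_task *task)
{
        init_timer(&task->timer);
        task->timer.data = (unsigned long) task;
        task->timer.function = demo_run_timer;
        task->timeout_fn = NULL;
}

/* Arm: caller holds demo_lock with bottom halves disabled. */
static void demo_add_timer(struct demo_task *task,
                           void (*fn)(struct demo_task *), unsigned long ticks)
{
        task->timeout_fn = fn;
        mod_timer(&task->timer, jiffies + ticks);
}

/* Disable without del_timer_sync(): caller holds demo_lock. */
static void demo_disable_timer(struct demo_task *task)
{
        task->timeout_fn = NULL;
}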
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f0f714ff0..a036faef9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -250,7 +250,8 @@ static int
svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr)
{
mm_segment_t oldfs;
- struct socket *sock = rqstp->rq_sock->sk_sock;
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct socket *sock = svsk->sk_sock;
struct msghdr msg;
int i, buflen, len;
@@ -342,13 +343,16 @@ svc_udp_data_ready(struct sock *sk, int count)
struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);
if (!svsk)
- return;
+ goto out;
dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
svsk, sk, count, svsk->sk_busy);
spin_lock_bh(&svsk->sk_lock);
svsk->sk_data = 1;
svc_sock_enqueue(svsk);
spin_unlock_bh(&svsk->sk_lock);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
}
/*
@@ -459,16 +463,19 @@ svc_tcp_state_change1(struct sock *sk)
if (sk->state != TCP_ESTABLISHED) {
/* Aborted connection, SYN_RECV or whatever... */
- return;
+ goto out;
}
if (!(svsk = (struct svc_sock *) sk->user_data)) {
printk("svc: socket %p: no user data\n", sk);
- return;
+ goto out;
}
spin_lock_bh(&svsk->sk_lock);
svsk->sk_conn++;
svc_sock_enqueue(svsk);
spin_unlock_bh(&svsk->sk_lock);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
}
/*
@@ -484,12 +491,15 @@ svc_tcp_state_change2(struct sock *sk)
if (!(svsk = (struct svc_sock *) sk->user_data)) {
printk("svc: socket %p: no user data\n", sk);
- return;
+ goto out;
}
spin_lock_bh(&svsk->sk_lock);
svsk->sk_close = 1;
svc_sock_enqueue(svsk);
spin_unlock_bh(&svsk->sk_lock);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
}
static void
@@ -497,20 +507,17 @@ svc_tcp_data_ready(struct sock *sk, int count)
{
struct svc_sock * svsk;
- /* Disconnect signalled through data_ready?!? */
- if (sk->state != TCP_ESTABLISHED) {
- svc_tcp_state_change2(sk);
- return;
- }
-
dprintk("svc: socket %p TCP data ready (svsk %p)\n",
sk, sk->user_data);
if (!(svsk = (struct svc_sock *)(sk->user_data)))
- return;
+ goto out;
spin_lock_bh(&svsk->sk_lock);
svsk->sk_data++;
svc_sock_enqueue(svsk);
spin_unlock_bh(&svsk->sk_lock);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
}
/*
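The svcsock changes above all follow one convention, sketched here with a hypothetical demo_data_ready(): a replacement sk->data_ready (or state-change) callback must still wake any process sleeping on sk->sleep before returning, since it displaced the default socket callback that used to do that.

#include <net/sock.h>

static void demo_data_ready(struct sock *sk, int count)
{
        void *owner = sk->user_data;    /* set when the service claimed the socket */

        if (owner) {
                /* queue work for the owning service here */
        }
        /* always wake blocking readers, as the default callback would */
        if (sk->sleep && waitqueue_active(sk->sleep))
                wake_up_interruptible(sk->sleep);
}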
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 7534288db..55c816ce5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -67,7 +67,7 @@
#include <asm/uaccess.h>
/* Following value should be > 32k + RPC overhead */
-#define XPRT_MIN_WRITE_SPACE 35000
+#define XPRT_MIN_WRITE_SPACE (35000 + SOCK_MIN_WRITE_SPACE)
extern spinlock_t rpc_queue_lock;
@@ -175,11 +175,10 @@ xprt_move_iov(struct msghdr *msg, struct iovec *niv, unsigned amount)
msg->msg_iov=niv;
}
-
+
/*
* Write data to socket.
*/
-
static inline int
xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
@@ -288,11 +287,12 @@ xprt_recvmsg(struct rpc_xprt *xprt, struct iovec *iov, int nr, unsigned len, uns
static void
xprt_adjust_cwnd(struct rpc_xprt *xprt, int result)
{
- unsigned long cwnd = xprt->cwnd;
+ unsigned long cwnd;
- spin_lock_bh(&xprt_sock_lock);
if (xprt->nocong)
- goto out;
+ return;
+ spin_lock_bh(&xprt_sock_lock);
+ cwnd = xprt->cwnd;
if (result >= 0) {
if (xprt->cong < cwnd || time_before(jiffies, xprt->congtime))
goto out;
@@ -536,7 +536,7 @@ xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
out_bad:
req = NULL;
out:
- if (req && !rpc_lock_task(req->rq_task))
+ if (req && !__rpc_lock_task(req->rq_task))
req = NULL;
spin_unlock_bh(&rpc_queue_lock);
return req;
@@ -575,6 +575,7 @@ xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied)
dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied);
task->tk_status = copied;
+ req->rq_received = 1;
/* ... and wake up the process. */
rpc_wake_up_task(task);
@@ -589,7 +590,7 @@ static int csum_partial_copy_to_page_cache(struct iovec *iov,
struct sk_buff *skb,
int copied)
{
- __u8 *pkt_data = skb->data + sizeof(struct udphdr);
+ __u8 *pkt_data = skb->h.raw + sizeof(struct udphdr);
__u8 *cur_ptr = iov->iov_base;
__kernel_size_t cur_len = iov->iov_len;
unsigned int csum = skb->csum;
@@ -632,7 +633,7 @@ static int csum_partial_copy_to_page_cache(struct iovec *iov,
* Input handler for RPC replies. Called from a bottom half and hence
* atomic.
*/
-static inline void
+static void
udp_data_ready(struct sock *sk, int len)
{
struct rpc_task *task;
@@ -644,13 +645,13 @@ udp_data_ready(struct sock *sk, int len)
dprintk("RPC: udp_data_ready...\n");
if (!(xprt = xprt_from_sock(sk))) {
printk("RPC: udp_data_ready request not found!\n");
- return;
+ goto out;
}
dprintk("RPC: udp_data_ready client %p\n", xprt);
if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
- return;
+ goto out;
if (xprt->shutdown)
goto dropit;
@@ -674,7 +675,6 @@ udp_data_ready(struct sock *sk, int len)
if ((copied = rovr->rq_rlen) > repsize)
copied = repsize;
- rovr->rq_damaged = 1;
/* Suck it into the iovec, verify checksum if not done by hw. */
if (csum_partial_copy_to_page_cache(rovr->rq_rvec, skb, copied))
goto out_unlock;
@@ -689,6 +689,9 @@ udp_data_ready(struct sock *sk, int len)
dropit:
skb_free_datagram(sk, skb);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
}
/*
@@ -857,11 +860,8 @@ tcp_input_record(struct rpc_xprt *xprt)
req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
if (req) {
task = req->rq_task;
- if (xprt->tcp_copied == sizeof(xprt->tcp_xid) || req->rq_damaged) {
- req->rq_damaged = 1;
- /* Read in the request data */
- result = tcp_read_request(xprt, req, avail);
- }
+ /* Read in the request data */
+ result = tcp_read_request(xprt, req, avail);
rpc_unlock_task(task);
if (result < 0)
return result;
@@ -973,11 +973,11 @@ static void tcp_data_ready(struct sock *sk, int len)
if (!(xprt = xprt_from_sock(sk)))
{
printk("Not a socket with xprt %p\n", sk);
- return;
+ goto out;
}
if (xprt->shutdown)
- return;
+ goto out;
xprt_append_pending(xprt);
@@ -985,6 +985,9 @@ static void tcp_data_ready(struct sock *sk, int len)
dprintk("RPC: state %x conn %d dead %d zapped %d\n",
sk->state, xprt->connected,
sk->dead, sk->zapped);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
}
@@ -994,7 +997,7 @@ tcp_state_change(struct sock *sk)
struct rpc_xprt *xprt;
if (!(xprt = xprt_from_sock(sk)))
- return;
+ goto out;
dprintk("RPC: tcp_state_change client %p...\n", xprt);
dprintk("RPC: state %x conn %d dead %d zapped %d\n",
sk->state, xprt->connected,
@@ -1014,6 +1017,9 @@ tcp_state_change(struct sock *sk)
break;
}
spin_unlock_bh(&xprt_sock_lock);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
}
/*
@@ -1024,8 +1030,9 @@ static void
tcp_write_space(struct sock *sk)
{
struct rpc_xprt *xprt;
+ struct socket *sock;
- if (!(xprt = xprt_from_sock(sk)))
+ if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->socket))
return;
if (xprt->shutdown)
return;
@@ -1042,6 +1049,12 @@ tcp_write_space(struct sock *sk)
if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->sending)
rpc_wake_up_task(xprt->snd_task);
+ if (test_bit(SOCK_NOSPACE, &sock->flags)) {
+ if (sk->sleep && waitqueue_active(sk->sleep)) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ wake_up_interruptible(sk->sleep);
+ }
+ }
out_unlock:
spin_unlock_bh(&xprt_sock_lock);
}
@@ -1071,6 +1084,8 @@ udp_write_space(struct sock *sk)
rpc_wake_up_task(xprt->snd_task);
out_unlock:
spin_unlock_bh(&xprt_sock_lock);
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
}
/*
@@ -1198,6 +1213,9 @@ do_xprt_transmit(struct rpc_task *task)
*/
while (1) {
xprt->write_space = 0;
+ status = -ENOMEM;
+ if (sock_wspace(xprt->inet) < req->rq_slen + SOCK_MIN_WRITE_SPACE)
+ break;
status = xprt_sendmsg(xprt, req);
if (status < 0)
@@ -1225,8 +1243,6 @@ do_xprt_transmit(struct rpc_task *task)
}
rpc_unlock_task(task);
- task->tk_status = status;
-
/* Note: at this point, task->tk_sleeping has not yet been set,
* hence there is no danger of the waking up task being put on
* schedq, and being picked up by a parallel run of rpciod().
@@ -1234,14 +1250,19 @@ do_xprt_transmit(struct rpc_task *task)
rpc_wake_up_task(task);
if (!RPC_IS_RUNNING(task))
goto out_release;
+ if (req->rq_received)
+ goto out_release;
+
+ task->tk_status = status;
switch (status) {
case -ENOMEM:
/* Protect against (udp|tcp)_write_space */
- task->tk_timeout = req->rq_timeout.to_current;
spin_lock_bh(&xprt_sock_lock);
- if (!xprt->write_space)
+ if (!xprt->write_space) {
+ task->tk_timeout = req->rq_timeout.to_current;
rpc_sleep_on(&xprt->sending, task, NULL, NULL);
+ }
spin_unlock_bh(&xprt_sock_lock);
return;
case -EAGAIN:
@@ -1279,6 +1300,7 @@ xprt_receive(struct rpc_task *task)
dprintk("RPC: %4d xprt_receive\n", task->tk_pid);
+ req->rq_received = 0;
task->tk_timeout = 0;
rpc_sleep_locked(&xprt->pending, task, NULL, NULL);
}
@@ -1610,7 +1632,8 @@ xprt_shutdown(struct rpc_xprt *xprt)
rpc_wake_up(&xprt->pending);
rpc_wake_up(&xprt->backlog);
rpc_wake_up(&xprt->reconn);
- wake_up(&xprt->cong_wait);
+ if (waitqueue_active(&xprt->cong_wait))
+ wake_up(&xprt->cong_wait);
}
/*
@@ -1621,7 +1644,8 @@ xprt_clear_backlog(struct rpc_xprt *xprt) {
if (RPCXPRT_CONGESTED(xprt))
return 0;
rpc_wake_up_next(&xprt->backlog);
- wake_up(&xprt->cong_wait);
+ if (waitqueue_active(&xprt->cong_wait))
+ wake_up(&xprt->cong_wait);
return 1;
}
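Finally, a condensed sketch of the transmit guard added to do_xprt_transmit() above, with the hypothetical name demo_can_transmit(): sendmsg is only attempted when the socket can absorb the whole request plus the generic low-water mark; otherwise the task takes the -ENOMEM path and sleeps on the sending queue until the write_space callback fires.

#include <net/sock.h>

/* Illustrative only: is there room for a whole RPC request right now? */
static int demo_can_transmit(struct sock *sk, int req_len)
{
        return sock_wspace(sk) >= req_len + SOCK_MIN_WRITE_SPACE;
}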