diff options
Diffstat (limited to 'arch/i386')
47 files changed, 2862 insertions, 2243 deletions
diff --git a/arch/i386/.cvsignore b/arch/i386/.cvsignore new file mode 100644 index 000000000..002b34149 --- /dev/null +++ b/arch/i386/.cvsignore @@ -0,0 +1 @@ +.kernel_offset.lds diff --git a/arch/i386/Makefile b/arch/i386/Makefile index cc2cd92ab..e13adc089 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -12,6 +12,9 @@ # # Copyright (C) 1994 by Linus Torvalds # +# 19990713 Artur Skawina <skawina@geocities.com> +# Added '-march' and '-mpreferred-stack-boundary' support +# LD=$(CROSS_COMPILE)ld -m elf_i386 CPP=$(CC) -E @@ -23,28 +26,35 @@ CFLAGS_PIPE := -pipe CFLAGS_NSR := -fno-strength-reduce CFLAGS := $(CFLAGS) $(CFLAGS_PIPE) $(CFLAGS_NSR) +# prevent gcc from keeping the stack 16 byte aligned +CFLAGS += $(shell if $(CC) -mpreferred-stack-boundary=2 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-mpreferred-stack-boundary=2"; fi) + ifdef CONFIG_M386 CFLAGS := $(CFLAGS) -m386 -DCPU=386 AFLAGS := $(AFLAGS) -DCPU=386 endif ifdef CONFIG_M486 -CFLAGS := $(CFLAGS) -m486 -DCPU=486 +CFLAGS := $(CFLAGS) -DCPU=486 +CFLAGS += $(shell if $(CC) -march=i486 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-march=i486"; else echo "-m486"; fi) AFLAGS := $(AFLAGS) -DCPU=486 endif ifdef CONFIG_M586 CFLAGS := $(CFLAGS) -DCPU=586 +CFLAGS += $(shell if $(CC) -march=i586 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-march=i586"; fi) AFLAGS := $(AFLAGS) -DCPU=586 endif ifdef CONFIG_M586TSC CFLAGS := $(CFLAGS) -DCPU=586 +CFLAGS += $(shell if $(CC) -march=i586 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-march=i586"; fi) AFLAGS := $(AFLAGS) -DCPU=586 endif ifdef CONFIG_M686 CFLAGS := $(CFLAGS) -DCPU=686 +CFLAGS += $(shell if $(CC) -march=i686 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-march=i686"; fi) AFLAGS := $(AFLAGS) -DCPU=686 endif @@ -101,6 +111,7 @@ archclean: @$(MAKEBOOT) clean archmrproper: + rm -f arch/i386/vmlinux.lds archdep: @$(MAKEBOOT) dep diff --git a/arch/i386/boot/Makefile 
b/arch/i386/boot/Makefile index 64b9377a2..eccc87ddb 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -51,7 +51,7 @@ setup: setup.o setup.o: setup.s $(AS86) -o $@ $< -setup.s: setup.S video.S Makefile $(BOOT_INCL) $(TOPDIR)/include/linux/version.h +setup.s: setup.S video.S Makefile $(BOOT_INCL) $(TOPDIR)/include/linux/version.h $(TOPDIR)/include/linux/compile.h $(CPP) -traditional $(SVGA_MODE) $(RAMDISK) $< -o $@ bsetup: bsetup.o @@ -60,7 +60,7 @@ bsetup: bsetup.o bsetup.o: bsetup.s $(AS86) -o $@ $< -bsetup.s: setup.S video.S Makefile $(BOOT_INCL) $(TOPDIR)/include/linux/version.h +bsetup.s: setup.S video.S Makefile $(BOOT_INCL) $(TOPDIR)/include/linux/version.h $(TOPDIR)/include/linux/compile.h $(CPP) -D__BIG_KERNEL__ -traditional $(SVGA_MODE) $(RAMDISK) $< -o $@ bootsect: bootsect.o diff --git a/arch/i386/boot/bootsect.S b/arch/i386/boot/bootsect.S index d8e2bc6c1..82c6e7459 100644 --- a/arch/i386/boot/bootsect.S +++ b/arch/i386/boot/bootsect.S @@ -98,7 +98,7 @@ go: mov di,#0x4000-12 ! 0x4000 is arbitrary value >= length of * fs = 0, gs is unused. */ -! cx contains 0 from rep movsw above +! cx contains 0 from rep movsd above mov fs,cx mov bx,#0x78 ! fs:bx is parameter table address @@ -106,12 +106,12 @@ go: mov di,#0x4000-12 ! 0x4000 is arbitrary value >= length of seg fs lds si,(bx) ! ds:si is source - mov cl,#6 ! copy 12 bytes + mov cl,#3 ! copy 12 bytes cld push di rep - movsw + movsd pop di pop ds @@ -125,7 +125,7 @@ go: mov di,#0x4000-12 ! 0x4000 is arbitrary value >= length of ! load the setup-sectors directly after the bootblock. ! Note that 'es' is already set up. -! Also cx is 0 from rep movsw above. +! Also cx is 0 from rep movsd above. load_setup: xor ah,ah ! 
reset FDC diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile index 6a3fda06d..aef761602 100644 --- a/arch/i386/boot/compressed/Makefile +++ b/arch/i386/boot/compressed/Makefile @@ -30,7 +30,7 @@ vmlinux: piggy.o $(OBJECTS) bvmlinux: piggy.o $(OBJECTS) $(LD) $(BZLINKFLAGS) -o bvmlinux $(OBJECTS) piggy.o -head.o: head.S $(TOPDIR)/include/linux/tasks.h +head.o: head.S $(CC) $(AFLAGS) -traditional -c head.S piggy.o: $(SYSTEM) diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index ad78c419d..128b53427 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -104,7 +104,7 @@ static long free_mem_end_ptr = 0x90000; #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_END 0x90000 #define LOW_BUFFER_SIZE ( LOW_BUFFER_END - LOW_BUFFER_START ) -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index e45fcda2d..70cfc0724 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -37,6 +37,7 @@ #include <linux/version.h> #include <linux/compile.h> #include <asm/boot.h> +#include <asm/e820.h> ! Signature words to ensure LILO loaded us right #define SIG1 0xAA55 @@ -59,7 +60,7 @@ begbss: entry start start: - jmp start_of_setup + jmp trampoline ! ------------------------ start of header -------------------------------- ! ! SETUP-header, must start at CS:2 (old 0x9020:2) @@ -119,6 +120,8 @@ bootsect_kludge: heap_end_ptr: .word modelist+1024 ! space from here (exclusive) down to ! end of setup code can be used by setup ! for local heap purposes. +trampoline: call start_of_setup + .space 1024 ! ------------------------ end of header ---------------------------------- start_of_setup: @@ -188,9 +191,9 @@ bad_sig: xor bh,bh mov bl,[497] ! get setup sects from boot sector sub bx,#4 ! LILO loads 4 sectors of setup - shl bx,#8 ! 
convert to words + shl bx,#7 ! convert to dwords (1sect=2^7 dwords) mov cx,bx - shr bx,#3 ! convert to segment + shr bx,#2 ! convert to segment add bx,#SYSSEG seg cs mov start_sys_seg,bx @@ -203,7 +206,7 @@ bad_sig: mov ax,#SYSSEG mov ds,ax rep - movsw + movsd mov ax,cs ! aka #SETUPSEG mov ds,ax @@ -245,37 +248,91 @@ loader_panic_mess: loader_ok: ! Get memory size (extended mem, kB) + xor eax, eax + mov dword ptr [0x1e0], eax #ifndef STANDARD_MEMORY_BIOS_CALL - push ebx - xor ebx,ebx ! preload new memory slot with 0k - mov [0x1e0], ebx + mov byte ptr [E820NR], al - mov ax,#0xe801 - int 0x15 - jc oldstylemem +! Try three different memory detection schemes. First, try +! e820h, which lets us assemble a memory map, then try e801h, +! which returns a 32-bit memory size, and finally 88h, which +! returns 0-64m + +! method E820H: +! the memory map from hell. e820h returns memory classified into +! a whole bunch of different types, and allows memory holes and +! everything. We scan through this memory map and build a list +! of the first 32 memory areas, which we return at [E820MAP]. +! + +meme820: + mov edx, #0x534d4150 ! ascii `SMAP' + xor ebx, ebx ! continuation counter + + mov di, #E820MAP ! point into the whitelist + ! so we can have the bios + ! directly write into it. + +jmpe820: + mov eax, #0x0000e820 ! e820, upper word zeroed + mov ecx, #20 ! size of the e820rec + + push ds ! data record. + pop es + int 0x15 ! make the call + jc bail820 ! fall to e801 if it fails + + cmp eax, #0x534d4150 ! check the return is `SMAP' + jne bail820 ! fall to e801 if it fails -! Memory size is in 1 k chunksizes, to avoid confusing loadlin. -! We store the 0xe801 memory size in a completely different place, +! cmp dword ptr [16+di], #1 ! is this usable memory? +! jne again820 + + ! If this is usable memory, we save it by simply advancing di by + ! sizeof(e820rec). + ! +good820: + mov al, byte ptr [E820NR] ! 
up to 32 good entries, that is + cmp al, #E820MAX + jnl bail820 + inc byte ptr [E820NR] + mov ax, di + add ax, #20 + mov di, ax + +again820: + cmp ebx, #0 ! check to see if ebx is + jne jmpe820 ! set to EOF + +bail820: + + +! method E801H: +! memory size is in 1k chunksizes, to avoid confusing loadlin. +! we store the 0xe801 memory size in a completely different place, ! because it will most likely be longer than 16 bits. ! (use 1e0 because that's what Larry Augustine uses in his ! alternative new memory detection scheme, and it's sensible ! to write everything into the same place.) - and ebx, #0xffff ! clear sign extend - shl ebx, 6 ! and go from 64k to 1k chunks - mov [0x1e0],ebx ! store extended memory size +meme801: - and eax, #0xffff ! clear sign extend - add [0x1e0],eax ! and add lower memory into total size. - - ! and fall into the old memory detection code to populate the - ! compatibility slot. + mov ax,#0xe801 + int 0x15 + jc mem88 + + and edx, #0xffff ! clear sign extend + shl edx, 6 ! and go from 64k to 1k chunks + mov [0x1e0],edx ! store extended memory size + + and ecx, #0xffff ! clear sign extend + add [0x1e0],ecx ! and add lower memory into total size. + +! Ye Olde Traditional Methode. Returns the memory size (up to 16mb or +! 64mb, depending on the bios) in ax. +mem88: -oldstylemem: - pop ebx -#else - mov dword ptr [0x1e0], #0 #endif mov ah,#0x88 int 0x15 @@ -404,7 +461,7 @@ no_psmouse: int 0x15 ! ignore return code mov ax,#0x05303 ! 32 bit connect - xor bx,bx + xor ebx,ebx int 0x15 jc no_32_apm_bios ! error @@ -485,9 +542,9 @@ do_move: add bx,#0x100 sub di,di sub si,si - mov cx,#0x800 + mov cx,#0x400 rep - movsw + movsd cmp bx,bp ! we assume start_sys_seg > 0x200, ! so we will perhaps read one page more then ! 
needed, but never overwrite INITSEG because diff --git a/arch/i386/config.in b/arch/i386/config.in index 56b3fd802..136034f6f 100644 --- a/arch/i386/config.in +++ b/arch/i386/config.in @@ -4,6 +4,9 @@ # mainmenu_name "Linux Kernel Configuration" +define_bool CONFIG_X86 y +define_bool CONFIG_ISA y + mainmenu_option next_comment comment 'Code maturity level options' bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL @@ -54,6 +57,7 @@ endmenu mainmenu_option next_comment comment 'General setup' +bool 'BIGMEM support' CONFIG_BIGMEM bool 'Networking support' CONFIG_NET bool 'PCI support' CONFIG_PCI if [ "$CONFIG_PCI" = "y" ]; then @@ -67,11 +71,6 @@ if [ "$CONFIG_PCI" = "y" ]; then if [ "$CONFIG_PCI_GODIRECT" = "y" -o "$CONFIG_PCI_GOANY" = "y" ]; then define_bool CONFIG_PCI_DIRECT y fi - bool ' PCI quirks' CONFIG_PCI_QUIRKS - if [ "$CONFIG_PCI_QUIRKS" = "y" -a "$CONFIG_EXPERIMENTAL" = "y" ]; then - bool ' PCI bridge optimization (experimental)' CONFIG_PCI_OPTIMIZE - fi - bool ' Backward-compatible /proc/pci' CONFIG_PCI_OLD_PROC fi bool 'MCA support' CONFIG_MCA bool 'SGI Visual Workstation support' CONFIG_VISWS @@ -85,6 +84,8 @@ else fi fi +source drivers/pcmcia/Config.in + bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL @@ -92,10 +93,10 @@ tristate 'Kernel support for a.out binaries' CONFIG_BINFMT_AOUT tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC -source drivers/misc/Config.in +source drivers/parport/Config.in -bool 'Advanced Power Management BIOS support' CONFIG_APM -if [ "$CONFIG_APM" = "y" ]; then +tristate 'Advanced Power Management BIOS support' CONFIG_APM +if [ "$CONFIG_APM" != "n" ]; then bool ' Ignore USER SUSPEND' CONFIG_APM_IGNORE_USER_SUSPEND bool ' Enable PM at boot time' CONFIG_APM_DO_ENABLE bool ' Make CPU Idle calls when idle' CONFIG_APM_CPU_IDLE @@ -136,6 +137,9 @@ if [ 
"$CONFIG_NET" = "y" ]; then bool 'Network device support' CONFIG_NETDEVICES if [ "$CONFIG_NETDEVICES" = "y" ]; then source drivers/net/Config.in + if [ "$CONFIG_ATM" = "y" ]; then + source drivers/atm/Config.in + fi fi endmenu fi @@ -167,6 +171,8 @@ source drivers/char/Config.in source drivers/usb/Config.in +source drivers/misc/Config.in + source fs/Config.in if [ "$CONFIG_VT" = "y" ]; then diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 3ff204313..606a51630 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,6 +1,8 @@ # # Automatically generated make config: don't edit # +CONFIG_X86=y +CONFIG_ISA=y # # Code maturity level options @@ -37,6 +39,7 @@ CONFIG_MODULES=y # # General setup # +# CONFIG_BIGMEM is not set CONFIG_NET=y CONFIG_PCI=y # CONFIG_PCI_GOBIOS is not set @@ -44,12 +47,16 @@ CONFIG_PCI=y CONFIG_PCI_GOANY=y CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y -CONFIG_PCI_QUIRKS=y -CONFIG_PCI_OLD_PROC=y # CONFIG_MCA is not set # CONFIG_VISWS is not set CONFIG_X86_IO_APIC=y CONFIG_X86_LOCAL_APIC=y + +# +# PCMCIA/Cardbus support +# +CONFIG_PCMCIA=y +CONFIG_CARDBUS=y CONFIG_SYSVIPC=y # CONFIG_BSD_PROCESS_ACCT is not set CONFIG_SYSCTL=y @@ -70,6 +77,12 @@ CONFIG_BINFMT_MISC=y # CONFIG_I2O_PROC is not set # +# Plug and Play configuration +# +CONFIG_PNP=y +CONFIG_ISAPNP=y + +# # Block devices # CONFIG_BLK_DEV_FD=y @@ -94,7 +107,9 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_BLK_DEV_OFFBOARD is not set # CONFIG_BLK_DEV_AEC6210 is not set # CONFIG_BLK_DEV_HPT34X is not set +# CONFIG_BLK_DEV_HPT366 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_SIS5513 is not set # CONFIG_IDE_CHIPSETS is not set # CONFIG_BLK_CPQ_DA is not set @@ -115,8 +130,9 @@ CONFIG_BLK_DEV_IDE_MODES=y # Networking options # CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set # CONFIG_NETLINK is not set -# CONFIG_FIREWALL is not set +# CONFIG_NETFILTER is not set # CONFIG_FILTER is not set CONFIG_UNIX=y CONFIG_INET=y @@ -132,7 +148,6 @@ CONFIG_INET=y # # (it is safe to leave these 
untouched) # -# CONFIG_INET_RARP is not set CONFIG_SKB_LARGE=y # @@ -170,6 +185,7 @@ CONFIG_SCSI_CONSTANTS=y # CONFIG_SCSI_AHA1542 is not set # CONFIG_SCSI_AHA1740 is not set # CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_IPS is not set # CONFIG_SCSI_ADVANSYS is not set # CONFIG_SCSI_IN2000 is not set # CONFIG_SCSI_AM53C974 is not set @@ -219,6 +235,7 @@ CONFIG_NETDEVICES=y # CONFIG_ARCNET is not set CONFIG_DUMMY=m # CONFIG_EQUALIZER is not set +# CONFIG_NET_SB1000 is not set # # Ethernet (10 or 100Mbit) @@ -240,6 +257,7 @@ CONFIG_EEXPRESS_PRO100=y # CONFIG_NE2K_PCI is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set +# CONFIG_ADAPTEC_STARFIRE is not set # CONFIG_NET_POCKET is not set # CONFIG_FDDI is not set # CONFIG_PPP is not set @@ -250,6 +268,7 @@ CONFIG_EEXPRESS_PRO100=y # Token ring devices # # CONFIG_TR is not set +# CONFIG_NET_FC is not set # # Wan interfaces @@ -263,12 +282,20 @@ CONFIG_EEXPRESS_PRO100=y # CONFIG_X25_ASY is not set # +# PCMCIA network devices +# +# CONFIG_PCMCIA_PCNET is not set +# CONFIG_PCMCIA_3C589 is not set +CONFIG_PCMCIA_RAYCS=y +CONFIG_PCMCIA_NETCARD=y + +# # Amateur Radio support # # CONFIG_HAMRADIO is not set # -# IrDA subsystem support +# IrDA (infrared) support # # CONFIG_IRDA is not set @@ -293,14 +320,12 @@ CONFIG_SERIAL=y # CONFIG_SERIAL_NONSTANDARD is not set CONFIG_UNIX98_PTYS=y CONFIG_UNIX98_PTY_COUNT=256 -CONFIG_MOUSE=y # # Mice # -# CONFIG_ATIXL_BUSMOUSE is not set # CONFIG_BUSMOUSE is not set -# CONFIG_MS_BUSMOUSE is not set +CONFIG_MOUSE=y CONFIG_PSMOUSE=y CONFIG_82C710_MOUSE=y # CONFIG_PC110_PAD is not set @@ -319,6 +344,8 @@ CONFIG_82C710_MOUSE=y # # CONFIG_JOYSTICK is not set # CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set # # Ftape, the floppy tape device driver @@ -331,6 +358,10 @@ CONFIG_82C710_MOUSE=y # CONFIG_USB is not set # +# Misc devices +# + +# # Filesystems # # CONFIG_QUOTA is not set @@ -343,6 +374,7 @@ CONFIG_AUTOFS_FS=y # CONFIG_VFAT_FS is not set 
CONFIG_ISO9660_FS=y # CONFIG_JOLIET is not set +# CONFIG_UDF_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_NTFS_FS is not set # CONFIG_HPFS_FS is not set @@ -358,6 +390,8 @@ CONFIG_EXT2_FS=y # # CONFIG_CODA_FS is not set CONFIG_NFS_FS=y +CONFIG_NFSD=y +# CONFIG_NFSD_SUN is not set CONFIG_SUNRPC=y CONFIG_LOCKD=y # CONFIG_SMB_FS is not set @@ -366,11 +400,13 @@ CONFIG_LOCKD=y # # Partition Types # +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y # CONFIG_BSD_DISKLABEL is not set -# CONFIG_MAC_PARTITION is not set -# CONFIG_SMD_DISKLABEL is not set # CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_SGI_DISKLABEL is not set +# CONFIG_UNIXWARE_DISKLABEL is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_SUN_PARTITION is not set # CONFIG_NLS is not set # diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 0c3f24889..875f52d5a 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -13,8 +13,8 @@ all: kernel.o head.o init_task.o O_TARGET := kernel.o -O_OBJS := process.o signal.o entry.o traps.o irq.o vm86.o \ - ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o +O_OBJS := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ + ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o OX_OBJS := i386_ksyms.o MX_OBJS := @@ -34,8 +34,12 @@ else endif endif -ifdef CONFIG_APM +ifeq ($(CONFIG_APM),y) OX_OBJS += apm.o +else + ifeq ($(CONFIG_APM),m) + MX_OBJS += apm.o + endif endif ifdef CONFIG_SMP @@ -50,7 +54,7 @@ ifdef CONFIG_X86_VISWS_APIC O_OBJS += visws_apic.o endif -head.o: head.S $(TOPDIR)/include/linux/tasks.h +head.o: head.S $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $*.S -o $*.o include $(TOPDIR)/Rules.make diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 7931e8df8..3bafdfcfc 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -273,7 +273,6 @@ static void standby(void); static void set_time(void); static void check_events(void); -static void 
do_apm_timer(unsigned long); static int do_open(struct inode *, struct file *); static int do_release(struct inode *, struct file *); @@ -289,7 +288,7 @@ extern void apm_unregister_callback(int (*)(apm_event_t)); /* * Local variables */ -static asmlinkage struct { +static struct { unsigned long offset; unsigned short segment; } apm_bios_entry; @@ -314,11 +313,9 @@ static int got_clock_diff = 0; static int debug = 0; static int apm_disabled = 0; -static DECLARE_WAIT_QUEUE_HEAD(process_list); +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static struct apm_bios_struct * user_list = NULL; -static struct timer_list apm_timer; - static char driver_version[] = "1.9"; /* no spaces */ #ifdef APM_DEBUG @@ -543,6 +540,50 @@ static int apm_set_power_state(u_short state) return set_power_state(0x0001, state); } +/* + * If no process has been interested in this + * CPU for some time, we want to wake up the + * power management thread - we probably want + * to conserve power. + */ +#define HARD_IDLE_TIMEOUT (HZ/3) + +/* This should wake up kapmd and ask it to slow the CPU */ +#define powermanagement_idle() do { } while (0) + +extern int hlt_counter; + +/* + * This is the idle thing. 
+ */ +void apm_cpu_idle(void) +{ + unsigned int start_idle; + + start_idle = jiffies; + while (1) { + if (!current->need_resched) { + if (jiffies - start_idle < HARD_IDLE_TIMEOUT) { + if (!current_cpu_data.hlt_works_ok) + continue; + if (hlt_counter) + continue; + asm volatile("sti ; hlt" : : : "memory"); + continue; + } + + /* + * Ok, do some power management - we've been idle for too long + */ + powermanagement_idle(); + } + + schedule(); + check_pgt_cache(); + start_idle = jiffies; + } +} + void apm_power_off(void) { /* @@ -756,7 +797,7 @@ static int queue_event(apm_event_t event, struct apm_bios_struct *sender) break; } } - wake_up_interruptible(&process_list); + wake_up_interruptible(&apm_waitqueue); return 1; } @@ -942,15 +983,14 @@ static void check_events(void) } } -static void do_apm_timer(unsigned long unused) +static void apm_event_handler(void) { - int err; - - static int pending_count = 0; + static int pending_count = 0; if (((standbys_pending > 0) || (suspends_pending > 0)) && (apm_bios_info.version > 0x100) && (pending_count-- <= 0)) { + int err; pending_count = 4; err = apm_set_power_state(APM_STATE_BUSY); @@ -961,14 +1001,9 @@ static void do_apm_timer(unsigned long unused) if (!(((standbys_pending > 0) || (suspends_pending > 0)) && (apm_bios_info.version == 0x100))) check_events(); - - init_timer(&apm_timer); - apm_timer.expires = APM_CHECK_TIMEOUT + jiffies; - add_timer(&apm_timer); } -/* Called from sys_idle, must make sure apm_enabled. */ -int apm_do_idle(void) +static int apm_do_idle(void) { #ifdef CONFIG_APM_CPU_IDLE u32 dummy; @@ -979,30 +1014,74 @@ int apm_do_idle(void) if (apm_bios_call_simple(0x5305, 0, 0, &dummy)) return 0; +#ifdef ALWAYS_CALL_BUSY + clock_slowed = 1; +#else clock_slowed = (apm_bios_info.flags & APM_IDLE_SLOWS_CLOCK) != 0; +#endif return 1; #else return 0; #endif } -/* Called from sys_idle, must make sure apm_enabled. 
*/ -void apm_do_busy(void) +static void apm_do_busy(void) { #ifdef CONFIG_APM_CPU_IDLE u32 dummy; - if (apm_enabled -#ifndef ALWAYS_CALL_BUSY - && clock_slowed -#endif - ) { + if (clock_slowed) { (void) apm_bios_call_simple(0x5306, 0, 0, &dummy); clock_slowed = 0; } #endif } +/* + * This is the APM thread main loop. + * + * Check whether we're the only running process to + * decide if we should just power down. + * + * Do this by checking the runqueue: if we're the + * only one, then the current process run_list will + * have both prev and next pointing to the same + * entry (the true idle process) + */ +#define system_idle() (current->run_list.next == current->run_list.prev) + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + apm_enabled = 1; + + add_wait_queue(&apm_waitqueue, &wait); + current->state = TASK_INTERRUPTIBLE; + for (;;) { + /* Nothing to do, just sleep for the timeout */ + schedule_timeout(APM_CHECK_TIMEOUT); + + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. 
+ */ + current->state = TASK_INTERRUPTIBLE; + apm_event_handler(); + if (!system_idle()) + continue; + if (apm_do_idle()) { + unsigned long start = jiffies; + do { + apm_do_idle(); + if (jiffies - start > APM_CHECK_TIMEOUT) + break; + } while (system_idle()); + apm_do_busy(); + apm_event_handler(); + } + } +} + static int check_apm_bios_struct(struct apm_bios_struct *as, const char *func) { if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { @@ -1027,15 +1106,15 @@ static ssize_t do_read(struct file *fp, char *buf, size_t count, loff_t *ppos) if (queue_empty(as)) { if (fp->f_flags & O_NONBLOCK) return -EAGAIN; - add_wait_queue(&process_list, &wait); + add_wait_queue(&apm_waitqueue, &wait); repeat: - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); if (queue_empty(as) && !signal_pending(current)) { schedule(); goto repeat; } current->state = TASK_RUNNING; - remove_wait_queue(&process_list, &wait); + remove_wait_queue(&apm_waitqueue, &wait); } i = count; while ((i >= sizeof(event)) && !queue_empty(as)) { @@ -1069,7 +1148,7 @@ static unsigned int do_poll(struct file *fp, poll_table * wait) as = fp->private_data; if (check_apm_bios_struct(as, "select")) return 0; - poll_wait(fp, &process_list, wait); + poll_wait(fp, &apm_waitqueue, wait); if (!queue_empty(as)) return POLLIN | POLLRDNORM; return 0; @@ -1263,7 +1342,97 @@ int apm_get_info(char *buf, char **start, off_t fpos, int length, int dummy) return p - buf; } -void __init apm_setup(char *str, int *dummy) +static int apm(void *unused) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + unsigned short error; + char * power_stat; + char * bat_stat; + + strcpy(current->comm, "kapmd"); + sigfillset(¤t->blocked); + + if (apm_bios_info.version > 0x100) { + /* + * We only support BIOSs up to version 1.2 + */ + if (apm_bios_info.version > 0x0102) + apm_bios_info.version = 0x0102; + if (apm_driver_version(&apm_bios_info.version) != APM_SUCCESS) { + /* Fall back to an APM 1.0 
connection. */ + apm_bios_info.version = 0x100; + } + } + if (debug) { + printk(KERN_INFO "apm: Connection version %d.%d\n", + (apm_bios_info.version >> 8) & 0xff, + apm_bios_info.version & 0xff ); + + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: power_stat = "off line"; break; + case 1: power_stat = "on line"; break; + case 2: power_stat = "on backup power"; break; + default: power_stat = "unknown"; break; + } + switch (bx & 0xff) { + case 0: bat_stat = "high"; break; + case 1: bat_stat = "low"; break; + case 2: bat_stat = "critical"; break; + case 3: bat_stat = "charging"; break; + default: bat_stat = "unknown"; break; + } + printk(KERN_INFO + "apm: AC %s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_bios_info.version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + +#ifdef CONFIG_APM_DO_ENABLE + if (apm_bios_info.flags & APM_BIOS_DISABLED) { + /* + * This call causes my NEC UltraLite Versa 33/C to hang if it + * is booted with PM disabled but not in the docking station. + * Unfortunate ... 
+ */ + error = apm_enable_power_management(); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + if (((apm_bios_info.flags & APM_BIOS_DISENGAGED) == 0) + && (apm_bios_info.version > 0x0100)) { + if (apm_engage_power_management(0x0001) == APM_SUCCESS) + apm_bios_info.flags &= ~APM_BIOS_DISENGAGED; + } + + apm_mainloop(); + return 0; +} + +static int __init apm_setup(char *str) { int invert; @@ -1283,16 +1452,23 @@ void __init apm_setup(char *str, int *dummy) if (str != NULL) str += strspn(str, ", \t"); } + return 1; } -void __init apm_bios_init(void) +__setup("apm=", apm_setup); + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) { - unsigned short bx; - unsigned short cx; - unsigned short dx; - unsigned short error; - char * power_stat; - char * bat_stat; static struct proc_dir_entry *ent; if (apm_bios_info.version == 0) { @@ -1339,6 +1515,15 @@ void __init apm_bios_init(void) return; } +#ifdef CONFIG_SMP + if (smp_num_cpus > 1) { + printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + if (smp_hack) + smp_hack = 2; + return -1; + } +#endif + /* * Set up a segment that references the real mode segment 0x40 * that extends up to the end of page zero (that we have reserved). 
@@ -1378,92 +1563,6 @@ void __init apm_bios_init(void) (apm_bios_info.dseg_len - 1) & 0xffff); } #endif -#ifdef CONFIG_SMP - if (smp_num_cpus > 1) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); - if (smp_hack) - smp_hack = 2; - return; - } -#endif - if (apm_bios_info.version > 0x100) { - /* - * We only support BIOSs up to version 1.2 - */ - if (apm_bios_info.version > 0x0102) - apm_bios_info.version = 0x0102; - if (apm_driver_version(&apm_bios_info.version) != APM_SUCCESS) { - /* Fall back to an APM 1.0 connection. */ - apm_bios_info.version = 0x100; - } - } - if (debug) { - printk(KERN_INFO "apm: Connection version %d.%d\n", - (apm_bios_info.version >> 8) & 0xff, - apm_bios_info.version & 0xff ); - - error = apm_get_power_status(&bx, &cx, &dx); - if (error) - printk(KERN_INFO "apm: power status not available\n"); - else { - switch ((bx >> 8) & 0xff) { - case 0: power_stat = "off line"; break; - case 1: power_stat = "on line"; break; - case 2: power_stat = "on backup power"; break; - default: power_stat = "unknown"; break; - } - switch (bx & 0xff) { - case 0: bat_stat = "high"; break; - case 1: bat_stat = "low"; break; - case 2: bat_stat = "critical"; break; - case 3: bat_stat = "charging"; break; - default: bat_stat = "unknown"; break; - } - printk(KERN_INFO - "apm: AC %s, battery status %s, battery life ", - power_stat, bat_stat); - if ((cx & 0xff) == 0xff) - printk("unknown\n"); - else - printk("%d%%\n", cx & 0xff); - if (apm_bios_info.version > 0x100) { - printk(KERN_INFO - "apm: battery flag 0x%02x, battery life ", - (cx >> 8) & 0xff); - if (dx == 0xffff) - printk("unknown\n"); - else - printk("%d %s\n", dx & 0x7fff, - (dx & 0x8000) ? - "minutes" : "seconds"); - } - } - } - -#ifdef CONFIG_APM_DO_ENABLE - if (apm_bios_info.flags & APM_BIOS_DISABLED) { - /* - * This call causes my NEC UltraLite Versa 33/C to hang if it - * is booted with PM disabled but not in the docking station. - * Unfortunate ... 
- */ - error = apm_enable_power_management(); - if (error) { - apm_error("enable power management", error); - return; - } - } -#endif - if (((apm_bios_info.flags & APM_BIOS_DISABLED) == 0) - && (apm_bios_info.version > 0x0100)) { - if (apm_engage_power_management(0x0001) == APM_SUCCESS) - apm_bios_info.flags &= ~APM_BIOS_DISENGAGED; - } - - init_timer(&apm_timer); - apm_timer.function = do_apm_timer; - apm_timer.expires = APM_CHECK_TIMEOUT + jiffies; - add_timer(&apm_timer); ent = create_proc_entry("apm", 0, 0); if (ent != NULL) @@ -1471,5 +1570,7 @@ void __init apm_bios_init(void) misc_register(&apm_device); - apm_enabled = 1; + kernel_thread(apm, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD); } + +module_init(apm_init) diff --git a/arch/i386/kernel/bios32.c b/arch/i386/kernel/bios32.c index 91d338b2c..f0c63c938 100644 --- a/arch/i386/kernel/bios32.c +++ b/arch/i386/kernel/bios32.c @@ -75,6 +75,8 @@ * Jan 23, 1999 : More improvements to peer host bridge logic. i450NX fixup. [mj] * * Feb 8, 1999 : Added UM8886BF I/O address fixup. [mj] + * + * August 1999 : New resource management and configuration access stuff. [mj] */ #include <linux/config.h> @@ -85,15 +87,14 @@ #include <linux/ioport.h> #include <linux/malloc.h> #include <linux/smp_lock.h> +#include <linux/irq.h> +#include <linux/spinlock.h> #include <asm/page.h> #include <asm/segment.h> #include <asm/system.h> #include <asm/io.h> #include <asm/smp.h> -#include <asm/spinlock.h> - -#include "irq.h" #undef DEBUG @@ -103,72 +104,6 @@ #define DBG(x...) #endif -/* - * This interrupt-safe spinlock protects all accesses to PCI - * configuration space. - */ - -spinlock_t pci_lock = SPIN_LOCK_UNLOCKED; - -/* - * Generic PCI access -- indirect calls according to detected HW. 
- */ - -struct pci_access { - int pci_present; - int (*read_config_byte)(unsigned char, unsigned char, unsigned char, unsigned char *); - int (*read_config_word)(unsigned char, unsigned char, unsigned char, unsigned short *); - int (*read_config_dword)(unsigned char, unsigned char, unsigned char, unsigned int *); - int (*write_config_byte)(unsigned char, unsigned char, unsigned char, unsigned char); - int (*write_config_word)(unsigned char, unsigned char, unsigned char, unsigned short); - int (*write_config_dword)(unsigned char, unsigned char, unsigned char, unsigned int); -}; - -static int pci_stub(void) -{ - return PCIBIOS_FUNC_NOT_SUPPORTED; -} - -static struct pci_access pci_access_none = { - 0, /* No PCI present */ - (void *) pci_stub, - (void *) pci_stub, - (void *) pci_stub, - (void *) pci_stub, - (void *) pci_stub, - (void *) pci_stub -}; - -static struct pci_access *access_pci = &pci_access_none; - -int pcibios_present(void) -{ - return access_pci->pci_present; -} - -#define PCI_byte_BAD 0 -#define PCI_word_BAD (pos & 1) -#define PCI_dword_BAD (pos & 3) - -#define PCI_STUB(rw,size,type) \ -int pcibios_##rw##_config_##size (u8 bus, u8 dfn, u8 pos, type value) \ -{ \ - int res; \ - unsigned long flags; \ - if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \ - spin_lock_irqsave(&pci_lock, flags); \ - res = access_pci->rw##_config_##size(bus, dfn, pos, value); \ - spin_unlock_irqrestore(&pci_lock, flags); \ - return res; \ -} - -PCI_STUB(read, byte, u8 *) -PCI_STUB(read, word, u16 *) -PCI_STUB(read, dword, u32 *) -PCI_STUB(write, byte, u8) -PCI_STUB(write, word, u16) -PCI_STUB(write, dword, u32) - #define PCI_PROBE_BIOS 1 #define PCI_PROBE_CONF1 2 #define PCI_PROBE_CONF2 4 @@ -176,6 +111,7 @@ PCI_STUB(write, dword, u32) #define PCI_BIOS_SORT 0x200 #define PCI_NO_CHECKS 0x400 #define PCI_NO_PEER_FIXUP 0x800 +#define PCI_ASSIGN_ROMS 0x1000 static unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2; @@ -189,60 +125,53 @@ static 
unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CON * Functions for accessing PCI configuration space with type 1 accesses */ -#define CONFIG_CMD(bus, device_fn, where) (0x80000000 | (bus << 16) | (device_fn << 8) | (where & ~3)) +#define CONFIG_CMD(dev, where) (0x80000000 | (dev->bus->number << 16) | (dev->devfn << 8) | (where & ~3)) -static int pci_conf1_read_config_byte(unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned char *value) +static int pci_conf1_read_config_byte(struct pci_dev *dev, int where, u8 *value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); *value = inb(0xCFC + (where&3)); return PCIBIOS_SUCCESSFUL; } -static int pci_conf1_read_config_word (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned short *value) +static int pci_conf1_read_config_word(struct pci_dev *dev, int where, u16 *value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); *value = inw(0xCFC + (where&2)); return PCIBIOS_SUCCESSFUL; } -static int pci_conf1_read_config_dword (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned int *value) +static int pci_conf1_read_config_dword(struct pci_dev *dev, int where, u32 *value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); *value = inl(0xCFC); return PCIBIOS_SUCCESSFUL; } -static int pci_conf1_write_config_byte (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned char value) +static int pci_conf1_write_config_byte(struct pci_dev *dev, int where, u8 value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); outb(value, 0xCFC + (where&3)); return PCIBIOS_SUCCESSFUL; } -static int pci_conf1_write_config_word (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned short value) +static int pci_conf1_write_config_word(struct pci_dev *dev, int where, u16 
value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); outw(value, 0xCFC + (where&2)); return PCIBIOS_SUCCESSFUL; } -static int pci_conf1_write_config_dword (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned int value) +static int pci_conf1_write_config_dword(struct pci_dev *dev, int where, u32 value) { - outl(CONFIG_CMD(bus,device_fn,where), 0xCF8); + outl(CONFIG_CMD(dev,where), 0xCF8); outl(value, 0xCFC); return PCIBIOS_SUCCESSFUL; } #undef CONFIG_CMD -static struct pci_access pci_direct_conf1 = { - 1, +static struct pci_ops pci_direct_conf1 = { pci_conf1_read_config_byte, pci_conf1_read_config_word, pci_conf1_read_config_dword, @@ -255,86 +184,65 @@ static struct pci_access pci_direct_conf1 = { * Functions for accessing PCI configuration space with type 2 accesses */ -#define IOADDR(devfn, where) ((0xC000 | ((devfn & 0x78) << 5)) + where) -#define FUNC(devfn) (((devfn & 7) << 1) | 0xf0) +#define IOADDR(devfn, where) ((0xC000 | ((devfn & 0x78) << 5)) + where) +#define FUNC(devfn) (((devfn & 7) << 1) | 0xf0) +#define SET(dev) if (dev->devfn) return PCIBIOS_DEVICE_NOT_FOUND; \ + outb(FUNC(dev->devfn), 0xCF8); \ + outb(dev->bus->number, 0xCFA); -static int pci_conf2_read_config_byte(unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned char *value) +static int pci_conf2_read_config_byte(struct pci_dev *dev, int where, u8 *value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 0xCF8); - outb (bus, 0xCFA); - *value = inb(IOADDR(device_fn,where)); + SET(dev); + *value = inb(IOADDR(dev->devfn,where)); outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } -static int pci_conf2_read_config_word (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned short *value) +static int pci_conf2_read_config_word(struct pci_dev *dev, int where, u16 *value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 
0xCF8); - outb (bus, 0xCFA); - *value = inw(IOADDR(device_fn,where)); + SET(dev); + *value = inw(IOADDR(dev->devfn,where)); outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } -static int pci_conf2_read_config_dword (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned int *value) +static int pci_conf2_read_config_dword(struct pci_dev *dev, int where, u32 *value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 0xCF8); - outb (bus, 0xCFA); - *value = inl (IOADDR(device_fn,where)); + SET(dev); + *value = inl (IOADDR(dev->devfn,where)); outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } -static int pci_conf2_write_config_byte (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned char value) +static int pci_conf2_write_config_byte(struct pci_dev *dev, int where, u8 value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 0xCF8); - outb (bus, 0xCFA); - outb (value, IOADDR(device_fn,where)); + SET(dev); + outb (value, IOADDR(dev->devfn,where)); outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } -static int pci_conf2_write_config_word (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned short value) +static int pci_conf2_write_config_word(struct pci_dev *dev, int where, u16 value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 0xCF8); - outb (bus, 0xCFA); - outw (value, IOADDR(device_fn,where)); + SET(dev); + outw (value, IOADDR(dev->devfn,where)); outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } -static int pci_conf2_write_config_dword (unsigned char bus, unsigned char device_fn, - unsigned char where, unsigned int value) +static int pci_conf2_write_config_dword(struct pci_dev *dev, int where, u32 value) { - if (device_fn & 0x80) - return PCIBIOS_DEVICE_NOT_FOUND; - outb (FUNC(device_fn), 0xCF8); - outb (bus, 0xCFA); - outl (value, IOADDR(device_fn,where)); + SET(dev); + outl (value, IOADDR(dev->devfn,where)); 
outb (0, 0xCF8); return PCIBIOS_SUCCESSFUL; } +#undef SET #undef IOADDR #undef FUNC -static struct pci_access pci_direct_conf2 = { - 1, +static struct pci_ops pci_direct_conf2 = { pci_conf2_read_config_byte, pci_conf2_read_config_word, pci_conf2_read_config_dword, @@ -353,9 +261,11 @@ static struct pci_access pci_direct_conf2 = { * This should be close to trivial, but it isn't, because there are buggy * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. */ -__initfunc(int pci_sanity_check(struct pci_access *a)) +static int __init pci_sanity_check(struct pci_ops *o) { - u16 dfn, x; + u16 x; + struct pci_bus bus; /* Fake bus and device */ + struct pci_dev dev; #ifdef CONFIG_VISWS return 1; /* Lithium PCI Bridges are non-standard */ @@ -363,17 +273,19 @@ __initfunc(int pci_sanity_check(struct pci_access *a)) if (pci_probe & PCI_NO_CHECKS) return 1; - for(dfn=0; dfn < 0x100; dfn++) - if ((!a->read_config_word(0, dfn, PCI_CLASS_DEVICE, &x) && + bus.number = 0; + dev.bus = &bus; + for(dev.devfn=0; dev.devfn < 0x100; dev.devfn++) + if ((!o->read_word(&dev, PCI_CLASS_DEVICE, &x) && (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA)) || - (!a->read_config_word(0, dfn, PCI_VENDOR_ID, &x) && + (!o->read_word(&dev, PCI_VENDOR_ID, &x) && (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ))) return 1; DBG("PCI: Sanity check failed\n"); return 0; } -__initfunc(static struct pci_access *pci_check_direct(void)) +static struct pci_ops * __init pci_check_direct(void) { unsigned int tmp; unsigned long flags; @@ -497,7 +409,7 @@ static unsigned long bios32_service(unsigned long service) unsigned long entry; /* %edx */ unsigned long flags; - spin_lock_irqsave(&pci_lock, flags); + __save_flags(flags); __cli(); __asm__("lcall (%%edi)" : "=a" (return_code), "=b" (address), @@ -506,7 +418,7 @@ static unsigned long bios32_service(unsigned long service) : "0" (service), "1" (0), "D" (&bios32_indirect)); - spin_unlock_irqrestore(&pci_lock, flags); + 
__restore_flags(flags); switch (return_code) { case 0: @@ -528,7 +440,7 @@ static struct { static int pci_bios_present; -__initfunc(static int check_pcibios(void)) +static int __init check_pcibios(void) { u32 signature, eax, ebx, ecx; u8 status, major_ver, minor_ver, hw_mech, last_bus; @@ -602,8 +514,8 @@ static int pci_bios_find_class (unsigned int class_code, unsigned short index, #endif -__initfunc(static int pci_bios_find_device (unsigned short vendor, unsigned short device_id, - unsigned short index, unsigned char *bus, unsigned char *device_fn)) +static int __init pci_bios_find_device (unsigned short vendor, unsigned short device_id, + unsigned short index, unsigned char *bus, unsigned char *device_fn) { unsigned short bx; unsigned short ret; @@ -624,11 +536,10 @@ __initfunc(static int pci_bios_find_device (unsigned short vendor, unsigned shor return (int) (ret & 0xff00) >> 8; } -static int pci_bios_read_config_byte(unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned char *value) +static int pci_bios_read_config_byte(struct pci_dev *dev, int where, u8 *value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -643,11 +554,10 @@ static int pci_bios_read_config_byte(unsigned char bus, return (int) (ret & 0xff00) >> 8; } -static int pci_bios_read_config_word (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned short *value) +static int pci_bios_read_config_word(struct pci_dev *dev, int where, u16 *value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -662,11 +572,10 @@ static int pci_bios_read_config_word (unsigned char bus, return (int) (ret & 0xff00) >> 8; } -static int pci_bios_read_config_dword (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned int 
*value) +static int pci_bios_read_config_dword(struct pci_dev *dev, int where, u32 *value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -681,11 +590,10 @@ static int pci_bios_read_config_dword (unsigned char bus, return (int) (ret & 0xff00) >> 8; } -static int pci_bios_write_config_byte (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned char value) +static int pci_bios_write_config_byte(struct pci_dev *dev, int where, u8 value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -700,11 +608,10 @@ static int pci_bios_write_config_byte (unsigned char bus, return (int) (ret & 0xff00) >> 8; } -static int pci_bios_write_config_word (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned short value) +static int pci_bios_write_config_word(struct pci_dev *dev, int where, u16 value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -719,11 +626,10 @@ static int pci_bios_write_config_word (unsigned char bus, return (int) (ret & 0xff00) >> 8; } -static int pci_bios_write_config_dword (unsigned char bus, - unsigned char device_fn, unsigned char where, unsigned int value) +static int pci_bios_write_config_dword(struct pci_dev *dev, int where, u32 value) { unsigned long ret; - unsigned long bx = (bus << 8) | device_fn; + unsigned long bx = (dev->bus->number << 8) | dev->devfn; __asm__("lcall (%%esi)\n\t" "jc 1f\n\t" @@ -742,8 +648,7 @@ static int pci_bios_write_config_dword (unsigned char bus, * Function table for BIOS32 access */ -static struct pci_access pci_bios_access = { - 1, +static struct pci_ops pci_bios_access = { pci_bios_read_config_byte, pci_bios_read_config_word, 
pci_bios_read_config_dword, @@ -756,7 +661,7 @@ static struct pci_access pci_bios_access = { * Try to find PCI BIOS. */ -__initfunc(static struct pci_access *pci_find_bios(void)) +static struct pci_ops * __init pci_find_bios(void) { union bios32 *check; unsigned char sum; @@ -855,26 +760,15 @@ static void __init pcibios_sort(void) #endif /* - * Several BIOS'es forget to assign addresses to I/O ranges. - * We try to fix it here, expecting there are free addresses - * starting with 0x5800. Ugly, but until we come with better - * resource management, it's the only simple solution. + * Several BIOS'es forget to assign addresses to I/O ranges. Try to fix it. */ -static int pci_last_io_addr __initdata = 0x5800; - static void __init pcibios_fixup_io_addr(struct pci_dev *dev, int idx) { - unsigned short cmd; unsigned int reg = PCI_BASE_ADDRESS_0 + 4*idx; - unsigned int size, addr, try; - unsigned int bus = dev->bus->number; - unsigned int devfn = dev->devfn; + struct resource *r = &dev->resource[idx]; + unsigned int size = r->end - r->start + 1; - if (!pci_last_io_addr) { - printk("PCI: Unassigned I/O space for %02x:%02x\n", bus, devfn); - return; - } if (((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && idx < 4) || (dev->class >> 8) == PCI_CLASS_DISPLAY_VGA) { /* @@ -888,33 +782,54 @@ static void __init pcibios_fixup_io_addr(struct pci_dev *dev, int idx) */ return; } - pcibios_read_config_word(bus, devfn, PCI_COMMAND, &cmd); - pcibios_write_config_word(bus, devfn, PCI_COMMAND, cmd & ~PCI_COMMAND_IO); - pcibios_write_config_dword(bus, devfn, reg, ~0); - pcibios_read_config_dword(bus, devfn, reg, &size); - size = (~(size & PCI_BASE_ADDRESS_IO_MASK) & 0xffff) + 1; - addr = 0; - if (!size || size > 0x100) - printk("PCI: Unable to handle I/O allocation for %02x:%02x (%04x), tell <mj@ucw.cz>\n", bus, devfn, size); - else { - do { - addr = (pci_last_io_addr + size - 1) & ~(size-1); - pci_last_io_addr = addr + size; - } while (check_region(addr, size)); - printk("PCI: Assigning I/O 
space %04x-%04x to device %02x:%02x\n", addr, addr+size-1, bus, devfn); - pcibios_write_config_dword(bus, devfn, reg, addr | PCI_BASE_ADDRESS_SPACE_IO); - pcibios_read_config_dword(bus, devfn, reg, &try); - if ((try & PCI_BASE_ADDRESS_IO_MASK) != addr) { - addr = 0; - printk("PCI: Address setup failed, got %04x\n", try); - } else - dev->base_address[idx] = try; + /* + * We need to avoid collisions with `mirrored' VGA ports and other strange + * ISA hardware, so we always want the addresses kilobyte aligned. + */ + if (!size || size > 256) { + printk(KERN_ERR "PCI: Cannot assign I/O space to device %s, %d bytes are too much.\n", dev->name, size); + return; + } else { + u32 try; + + r->start = 0; + r->end = size - 1; + if (pci_assign_resource(dev, idx)) { + printk(KERN_ERR "PCI: Unable to find free %d bytes of I/O space for device %s.\n", size, dev->name); + return; + } + printk("PCI: Assigned I/O space %04lx-%04lx to device %s\n", r->start, r->end, dev->name); + pci_read_config_dword(dev, reg, &try); + if ((try & PCI_BASE_ADDRESS_IO_MASK) != r->start) { + r->start = 0; + pci_write_config_dword(dev, reg, 0); + printk(KERN_ERR "PCI: I/O address setup failed, got %04x\n", try); + } } - if (!addr) { - pcibios_write_config_dword(bus, devfn, reg, 0); - dev->base_address[idx] = 0; +} + +/* + * Assign address to expansion ROM. This is a highly experimental feature + * and you must enable it by "pci=rom". It's even not guaranteed to work + * with all cards since the PCI specs allow address decoders to be shared + * between the ROM space and one of the standard regions (sigh!). + */ +static void __init pcibios_fixup_rom_addr(struct pci_dev *dev) +{ + int reg = (dev->hdr_type == 1) ? 
PCI_ROM_ADDRESS1 : PCI_ROM_ADDRESS; + struct resource *r = &dev->resource[PCI_ROM_RESOURCE]; + unsigned long rom_size = r->end - r->start + 1; + + r->start = 0; + r->end = rom_size - 1; + if (pci_assign_resource(dev, PCI_ROM_RESOURCE)) + printk(KERN_ERR "PCI: Unable to find free space for expansion ROM of device %s (0x%lx bytes)\n", + dev->name, rom_size); + else { + DBG("PCI: Assigned address %08lx to expansion ROM of %s (0x%lx bytes)\n", r->start, dev->name, rom_size); + pci_write_config_dword(dev, reg, r->start | PCI_ROM_ADDRESS_ENABLE); + r->flags |= PCI_ROM_ADDRESS_ENABLE; } - pcibios_write_config_word(bus, devfn, PCI_COMMAND, cmd); } /* @@ -929,18 +844,25 @@ static void __init pcibios_fixup_ghosts(struct pci_bus *b) struct pci_dev *d, *e, **z; int mirror = PCI_DEVFN(16,0); int seen_host_bridge = 0; + int i; DBG("PCI: Scanning for ghost devices on bus %d\n", b->number); for(d=b->devices; d && d->devfn < mirror; d=d->sibling) { if ((d->class >> 8) == PCI_CLASS_BRIDGE_HOST) seen_host_bridge++; - for(e=d->next; e; e=e->sibling) - if (e->devfn == d->devfn + mirror && - e->vendor == d->vendor && - e->device == d->device && - e->class == d->class && - !memcmp(e->base_address, d->base_address, sizeof(e->base_address))) - break; + for(e=d->next; e; e=e->sibling) { + if (e->devfn != d->devfn + mirror || + e->vendor != d->vendor || + e->device != d->device || + e->class != d->class) + continue; + for(i=0; i<PCI_NUM_RESOURCES; i++) + if (e->resource[i].start != d->resource[i].start || + e->resource[i].end != d->resource[i].end || + e->resource[i].flags != d->resource[i].flags) + continue; + break; + } if (!e) return; } @@ -966,12 +888,13 @@ static void __init pcibios_fixup_ghosts(struct pci_bus *b) */ static void __init pcibios_fixup_peer_bridges(void) { - struct pci_bus *b = &pci_root; - int i, n, cnt=-1; + struct pci_bus *b = pci_root; + int n, cnt=-1; struct pci_dev *d; + struct pci_ops *ops = pci_root->ops; #ifdef CONFIG_VISWS - pci_scan_peer_bridge(1); + 
pci_scan_bus(1, ops, NULL); return; #endif @@ -981,7 +904,7 @@ static void __init pcibios_fixup_peer_bridges(void) * since it reads bogus values for non-existent busses and * chipsets supporting multiple primary busses use conf1 anyway. */ - if (access_pci == &pci_direct_conf2) + if (ops == &pci_direct_conf2) return; #endif @@ -992,26 +915,31 @@ static void __init pcibios_fixup_peer_bridges(void) while (n <= 0xff) { int found = 0; u16 l; - for(i=0; i<256; i += 8) - if (!pcibios_read_config_word(n, i, PCI_VENDOR_ID, &l) && + struct pci_bus bus; + struct pci_dev dev; + bus.number = n; + bus.ops = ops; + dev.bus = &bus; + for(dev.devfn=0; dev.devfn<256; dev.devfn += 8) + if (!pci_read_config_word(&dev, PCI_VENDOR_ID, &l) && l != 0x0000 && l != 0xffff) { #ifdef CONFIG_PCI_BIOS if (pci_bios_present) { int err, idx = 0; u8 bios_bus, bios_dfn; u16 d; - pcibios_read_config_word(n, i, PCI_DEVICE_ID, &d); - DBG("BIOS test for %02x:%02x (%04x:%04x)\n", n, i, l, d); + pci_read_config_word(&dev, PCI_DEVICE_ID, &d); + DBG("BIOS test for %02x:%02x (%04x:%04x)\n", n, dev.devfn, l, d); while (!(err = pci_bios_find_device(l, d, idx, &bios_bus, &bios_dfn)) && - (bios_bus != n || bios_dfn != i)) + (bios_bus != n || bios_dfn != dev.devfn)) idx++; if (err) break; } #endif - DBG("Found device at %02x:%02x\n", n, i); + DBG("Found device at %02x:%02x\n", n, dev.devfn); found++; - if (!pcibios_read_config_word(n, i, PCI_CLASS_DEVICE, &l) && + if (!pci_read_config_word(&dev, PCI_CLASS_DEVICE, &l) && l == PCI_CLASS_BRIDGE_HOST) cnt++; } @@ -1019,8 +947,9 @@ static void __init pcibios_fixup_peer_bridges(void) break; if (found) { printk("PCI: Discovered primary peer bus %02x\n", n); - b = pci_scan_peer_bridge(n); - n = b->subordinate; + b = pci_scan_bus(n, ops, NULL); + if (b) + n = b->subordinate; } n++; } @@ -1037,6 +966,7 @@ static void __init pci_fixup_i450nx(struct pci_dev *d) */ int pxb, reg; u8 busno, suba, subb; + printk("PCI: Searching for i450NX host bridges on %s\n", d->name); reg = 
0xd0; for(pxb=0; pxb<2; pxb++) { pci_read_config_byte(d, reg++, &busno); @@ -1044,9 +974,9 @@ static void __init pci_fixup_i450nx(struct pci_dev *d) pci_read_config_byte(d, reg++, &subb); DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); if (busno) - pci_scan_peer_bridge(busno); /* Bus A */ + pci_scan_bus(busno, pci_root->ops, NULL); /* Bus A */ if (suba < subb) - pci_scan_peer_bridge(suba+1); /* Bus B */ + pci_scan_bus(suba+1, pci_root->ops, NULL); /* Bus B */ } pci_probe |= PCI_NO_PEER_FIXUP; } @@ -1059,35 +989,44 @@ static void __init pci_fixup_umc_ide(struct pci_dev *d) */ int i; + printk("PCI: Fixing base address flags for device %s\n", d->name); for(i=0; i<4; i++) - d->base_address[i] |= PCI_BASE_ADDRESS_SPACE_IO; + d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; } -struct dev_ex { - u16 vendor, device; - void (*handler)(struct pci_dev *); - char *comment; +struct pci_fixup pcibios_fixups[] = { + { PCI_FIXUP_HEADER, PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx }, + { PCI_FIXUP_HEADER, PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide }, + { 0 } }; -static struct dev_ex __initdata dev_ex_table[] = { - { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx, "Scanning peer host bridges" }, - { PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide, "Working around UM8886BF bugs" } -}; +/* + * Allocate resources for all PCI devices. We need to do that before + * we try to fix up anything. 
+ */ -static void __init pcibios_scan_buglist(struct pci_bus *b) +static void __init pcibios_claim_resources(struct pci_bus *bus) { - struct pci_dev *d; - int i; + struct pci_dev *dev; + int idx; - for(d=b->devices; d; d=d->sibling) - for(i=0; i<sizeof(dev_ex_table)/sizeof(dev_ex_table[0]); i++) { - struct dev_ex *e = &dev_ex_table[i]; - if (e->vendor == d->vendor && e->device == d->device) { - printk("PCI: %02x:%02x [%04x/%04x]: %s\n", - b->number, d->devfn, d->vendor, d->device, e->comment); - e->handler(d); - } + while (bus) { + for (dev=bus->devices; dev; dev=dev->sibling) + for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) { + struct resource *r = &dev->resource[idx]; + struct resource *pr; + if (!r->start) + continue; + pr = pci_find_parent_resource(dev, r); + if (!pr || request_resource(pr, r) < 0) { + printk(KERN_ERR "PCI: Address space collision on region %d of device %s\n", idx, dev->name); + /* We probably should disable the region, shouldn't we? */ + } } + if (bus->children) + pcibios_claim_resources(bus->children); + bus = bus->next; + } } /* @@ -1112,13 +1051,12 @@ static void __init pcibios_fixup_devices(void) */ has_io = has_mem = 0; for(i=0; i<6; i++) { - unsigned long a = dev->base_address[i]; - if (a & PCI_BASE_ADDRESS_SPACE_IO) { + struct resource *r = &dev->resource[i]; + if (r->flags & PCI_BASE_ADDRESS_SPACE_IO) { has_io = 1; - a &= PCI_BASE_ADDRESS_IO_MASK; - if (!a || a == PCI_BASE_ADDRESS_IO_MASK) + if (!r->start || r->start == PCI_BASE_ADDRESS_IO_MASK) pcibios_fixup_io_addr(dev, i); - } else if (a & PCI_BASE_ADDRESS_MEM_MASK) + } else if (r->start) has_mem = 1; } /* @@ -1133,18 +1071,21 @@ static void __init pcibios_fixup_devices(void) ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)) { pci_read_config_word(dev, PCI_COMMAND, &cmd); if (has_io && !(cmd & PCI_COMMAND_IO)) { - printk("PCI: Enabling I/O for device %02x:%02x\n", - dev->bus->number, dev->devfn); + printk("PCI: Enabling I/O for device %s\n", dev->name); cmd |= PCI_COMMAND_IO; 
pci_write_config_word(dev, PCI_COMMAND, cmd); } if (has_mem && !(cmd & PCI_COMMAND_MEMORY)) { - printk("PCI: Enabling memory for device %02x:%02x\n", - dev->bus->number, dev->devfn); + printk("PCI: Enabling memory for device %s\n", dev->name); cmd |= PCI_COMMAND_MEMORY; pci_write_config_word(dev, PCI_COMMAND, cmd); } } + /* + * Assign address to expansion ROM if requested. + */ + if ((pci_probe & PCI_ASSIGN_ROMS) && dev->resource[PCI_ROM_RESOURCE].end) + pcibios_fixup_rom_addr(dev); #if defined(CONFIG_X86_IO_APIC) /* * Recalculate IRQ numbers if we use the I/O APIC @@ -1185,38 +1126,27 @@ static void __init pcibios_fixup_devices(void) } /* - * Arch-dependent fixups. + * Called after each bus is probed, but before its children + * are examined. */ -__initfunc(void pcibios_fixup(void)) -{ - if (!(pci_probe & PCI_NO_PEER_FIXUP)) - pcibios_fixup_peer_bridges(); - pcibios_fixup_devices(); - -#ifdef CONFIG_PCI_BIOS - if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) - pcibios_sort(); -#endif -} - -__initfunc(void pcibios_fixup_bus(struct pci_bus *b)) +void __init pcibios_fixup_bus(struct pci_bus *b) { pcibios_fixup_ghosts(b); - pcibios_scan_buglist(b); } /* * Initialization. Try all known PCI access methods. Note that we support * using both PCI BIOS and direct access: in such cases, we use I/O ports * to access config space, but we still keep BIOS order of cards to be - * compatible with 2.0.X. This should go away in 2.3. + * compatible with 2.0.X. This should go away some day. 
*/ -__initfunc(void pcibios_init(void)) +void __init pcibios_init(void) { - struct pci_access *bios = NULL; - struct pci_access *dir = NULL; + struct pci_ops *bios = NULL; + struct pci_ops *dir = NULL; + struct pci_ops *ops; #ifdef CONFIG_PCI_BIOS if ((pci_probe & PCI_PROBE_BIOS) && ((bios = pci_find_bios()))) { @@ -1229,23 +1159,33 @@ __initfunc(void pcibios_init(void)) dir = pci_check_direct(); #endif if (dir) - access_pci = dir; + ops = dir; else if (bios) - access_pci = bios; + ops = bios; + else { + printk("PCI: No PCI bus detected\n"); + return; + } + + printk("PCI: Probing PCI hardware\n"); + pci_scan_bus(0, ops, NULL); + + if (!(pci_probe & PCI_NO_PEER_FIXUP)) + pcibios_fixup_peer_bridges(); + pcibios_claim_resources(pci_root); + pcibios_fixup_devices(); + +#ifdef CONFIG_PCI_BIOS + if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) + pcibios_sort(); +#endif } -__initfunc(char *pcibios_setup(char *str)) +char * __init pcibios_setup(char *str) { if (!strcmp(str, "off")) { pci_probe = 0; return NULL; - } else if (!strncmp(str, "io=", 3)) { - char *p; - unsigned int x = simple_strtoul(str+3, &p, 16); - if (p && *p) - return str; - pci_last_io_addr = x; - return NULL; } #ifdef CONFIG_PCI_BIOS else if (!strcmp(str, "bios")) { @@ -1272,6 +1212,9 @@ __initfunc(char *pcibios_setup(char *str)) else if (!strcmp(str, "nopeer")) { pci_probe |= PCI_NO_PEER_FIXUP; return NULL; + } else if (!strcmp(str, "rom")) { + pci_probe |= PCI_ASSIGN_ROMS; + return NULL; } return str; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3a5fc93a1..47f23b6b6 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -145,7 +145,30 @@ ENTRY(lcall7) andl $-8192,%ebx # GET_CURRENT movl exec_domain(%ebx),%edx # Get the execution domain movl 4(%edx),%edx # Get the lcall7 handler for the domain + pushl $0x7 call *%edx + addl $4, %esp + popl %eax + jmp ret_from_sys_call + +ENTRY(lcall27) + pushfl # We get a different stack layout with call gates, + 
pushl %eax # which has to be cleaned up later.. + SAVE_ALL + movl EIP(%esp),%eax # due to call gates, this is eflags, not eip.. + movl CS(%esp),%edx # this is eip.. + movl EFLAGS(%esp),%ecx # and this is cs.. + movl %eax,EFLAGS(%esp) # + movl %edx,EIP(%esp) # Now we move them to their "normal" places + movl %ecx,CS(%esp) # + movl %esp,%ebx + pushl %ebx + andl $-8192,%ebx # GET_CURRENT + movl exec_domain(%ebx),%edx # Get the execution domain + movl 4(%edx),%edx # Get the lcall7 handler for the domain + pushl $0x27 + call *%edx + addl $4, %esp popl %eax jmp ret_from_sys_call @@ -153,11 +176,9 @@ ENTRY(lcall7) ALIGN .globl ret_from_fork ret_from_fork: -#ifdef __SMP__ pushl %ebx call SYMBOL_NAME(schedule_tail) addl $4, %esp -#endif /* __SMP__ */ GET_CURRENT(%ebx) jmp ret_from_sys_call @@ -483,7 +504,7 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_uname) .long SYMBOL_NAME(sys_iopl) /* 110 */ .long SYMBOL_NAME(sys_vhangup) - .long SYMBOL_NAME(sys_idle) + .long SYMBOL_NAME(sys_ni_syscall) /* old "idle" system call */ .long SYMBOL_NAME(sys_vm86old) .long SYMBOL_NAME(sys_wait4) .long SYMBOL_NAME(sys_swapoff) /* 115 */ diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index acbc3e325..ac854e721 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -8,11 +8,12 @@ */ .text -#include <linux/tasks.h> +#include <linux/threads.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/desc.h> #define CL_MAGIC_ADDR 0x90020 @@ -330,7 +331,7 @@ ignore_int: * of tasks we can have.. */ #define IDT_ENTRIES 256 -#define GDT_ENTRIES (12+2*NR_TASKS) +#define GDT_ENTRIES (__TSS(NR_CPUS)) .globl SYMBOL_NAME(idt) @@ -519,8 +520,7 @@ ENTRY(empty_zero_page) ALIGN /* - * This contains up to 8192 quadwords depending on NR_TASKS - 64kB of - * gdt entries. Ugh. + * This contains typically 140 quadwords, depending on NR_CPUS. * * NOTE! Make sure the gdt descriptor in head.S matches this if you * change anything. 
@@ -542,7 +542,7 @@ ENTRY(gdt_table) .quad 0x00409a0000000000 /* 0x48 APM CS code */ .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0x58 APM DS data */ - .fill 2*NR_TASKS,8,0 /* space for LDT's and TSS's etc */ + .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ /* * This is to aid debugging, the various locking macros will be putting diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index afcfd274e..61422f372 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -76,7 +76,6 @@ EXPORT_SYMBOL(strlen_user); #ifdef __SMP__ EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(kernel_flag); -EXPORT_SYMBOL(smp_invalidate_needed); EXPORT_SYMBOL(cpu_number_map); EXPORT_SYMBOL(__cpu_logical_map); EXPORT_SYMBOL(smp_num_cpus); @@ -89,6 +88,7 @@ EXPORT_SYMBOL(synchronize_bh); EXPORT_SYMBOL(global_bh_count); EXPORT_SYMBOL(global_bh_lock); EXPORT_SYMBOL(global_irq_holder); +EXPORT_SYMBOL(i386_bh_lock); EXPORT_SYMBOL(__global_cli); EXPORT_SYMBOL(__global_sti); EXPORT_SYMBOL(__global_save_flags); @@ -111,6 +111,7 @@ EXPORT_SYMBOL(mca_isadapter); EXPORT_SYMBOL(mca_mark_as_used); EXPORT_SYMBOL(mca_mark_as_unused); EXPORT_SYMBOL(mca_find_unused_adapter); +EXPORT_SYMBOL(mca_is_adapter_used); #endif #ifdef CONFIG_VT diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c new file mode 100644 index 000000000..ce4082848 --- /dev/null +++ b/arch/i386/kernel/i8259.c @@ -0,0 +1,409 @@ +#include <linux/config.h> +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/malloc.h> +#include <linux/random.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/bitops.h> +#include <asm/smp.h> +#include <asm/pgtable.h> 
+#include <asm/delay.h> +#include <asm/desc.h> + +#include <linux/irq.h> + + +/* + * Intel specific no controller code + * odd that no-controller should be architecture dependent + * but see the ifdef __SMP__ + */ + +static void enable_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } +static void disable_none(unsigned int irq) { } +static void ack_none(unsigned int irq) +{ +#ifdef __SMP__ + /* + * [currently unexpected vectors happen only on SMP and APIC. + * if we want to have non-APIC and non-8259A controllers + * in the future with unexpected vectors, this ack should + * probably be made controller-specific.] + */ + ack_APIC_irq(); +#endif +} + +/* startup is the same as "enable", shutdown is same as "disable" */ +#define shutdown_none disable_none +#define end_none enable_none + +struct hw_interrupt_type no_irq_type = { + "none", + startup_none, + shutdown_none, + enable_none, + disable_none, + ack_none, + end_none +}; + + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ +/* + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. 
+ */ + + +BUILD_COMMON_IRQ() + +#define BI(x,y) \ + BUILD_IRQ(##x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x20-0x30) + */ +BUILD_16_IRQS(0x0) + +#ifdef CONFIG_X86_IO_APIC +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) +#endif + +#undef BUILD_16_IRQS +#undef BI + + +#ifdef __SMP__ +/* + * The following vectors are part of the Linux architecture, there + * is no hardware IRQ pin equivalent for them, they are triggered + * through the ICC by us (IPIs) + */ +BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) +BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) +BUILD_SMP_INTERRUPT(stop_cpu_interrupt,STOP_CPU_VECTOR) +BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) +BUILD_SMP_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) + +/* + * every pentium local APIC has two 'local interrupts', with a + * soft-definable vector attached to both interrupts, one of + * which is a timer interrupt, the other one is error counter + * overflow. 
Linux uses the local APIC timer interrupt to get + * a much simpler SMP time architecture: + */ +BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) + +#endif + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +static void (*interrupt[NR_IRQS])(void) = { + IRQLIST_16(0x0), + +#ifdef CONFIG_X86_IO_APIC + IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd) +#endif +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +static void enable_8259A_irq(unsigned int irq); +void disable_8259A_irq(unsigned int irq); + +/* shutdown is same as "disable" */ +#define end_8259A_irq enable_8259A_irq +#define shutdown_8259A_irq disable_8259A_irq + +static void mask_and_ack_8259A(unsigned int); + +static unsigned int startup_8259A_irq(unsigned int irq) +{ + enable_8259A_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type i8259A_irq_type = { + "XT-PIC", + startup_8259A_irq, + shutdown_8259A_irq, + enable_8259A_irq, + disable_8259A_irq, + mask_and_ack_8259A, + end_8259A_irq +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +static unsigned int cached_irq_mask = 0xffff; + +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21 (__byte(0,cached_irq_mask)) +#define cached_A1 (__byte(1,cached_irq_mask)) + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not connected to any IO-APIC pin, it's + * fed to the CPU IRQ line directly. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. 
+ * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs = 0; + +/* + * These have to be protected by the irq controller spinlock + * before being called. + */ +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + cached_irq_mask |= mask; + if (irq & 8) { + outb(cached_A1,0xA1); + } else { + outb(cached_21,0x21); + } +} + +static void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + cached_irq_mask &= mask; + if (irq & 8) { + outb(cached_A1,0xA1); + } else { + outb(cached_21,0x21); + } +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + + if (irq < 8) + return (inb(0x20) & mask); + return (inb(0xA0) & (mask >> 8)); +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + irq_desc[irq].handler = &i8259A_irq_type; + enable_irq(irq); +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + cached_irq_mask |= 1 << irq; + if (irq & 8) { + inb(0xA1); /* DUMMY */ + outb(cached_A1,0xA1); + outb(0x62,0x20); /* Specific EOI to cascade */ + outb(0x20,0xA0); + } else { + inb(0x21); /* DUMMY */ + outb(cached_21,0x21); + outb(0x20,0x20); + } +} + +#ifndef CONFIG_VISWS +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. 
+ */ + +static void math_error_irq(int cpl, void *dev_id, struct pt_regs *regs) +{ + outb(0,0xF0); + if (ignore_irq13 || !boot_cpu_data.hard_math) + return; + math_error(); +} + +static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL }; + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL}; +#endif + + +void init_ISA_irqs (void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = 0; + irq_desc[i].depth = 0; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + irq_desc[i].handler = &i8259A_irq_type; + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].handler = &no_irq_type; + } + } +} + +void __init init_IRQ(void) +{ + int i; + +#ifndef CONFIG_X86_VISWS_APIC + init_ISA_irqs(); +#else + init_VISWS_APIC_irqs(); +#endif + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < NR_IRQS; i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef __SMP__ + + /* + IRQ0 must be given a fixed assignment and initialized + before init_IRQ_SMP. + */ + set_intr_gate(IRQ0_TRAP_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. 
+ */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for CPU halt */ + set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt); + + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI vector for APIC spurious interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); +#endif + + /* + * Set the clock to HZ Hz, we already have a valid + * vector now: + */ + outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(LATCH & 0xff , 0x40); /* LSB */ + outb(LATCH >> 8 , 0x40); /* MSB */ + +#ifndef CONFIG_VISWS + setup_irq(2, &irq2); + setup_irq(13, &irq13); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +void __init init_IRQ_SMP(void) +{ + int i; + for (i = 0; i < NR_IRQS ; i++) + if (IO_APIC_VECTOR(i) > 0) + set_intr_gate(IO_APIC_VECTOR(i), interrupt[i]); +} +#endif diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 0faa696a4..84fba5106 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -1,5 +1,6 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/init.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -7,7 +8,6 @@ static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; -static struct file * init_fd_array[NR_OPEN] = { NULL, }; static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); @@ -22,4 +22,13 @@ struct mm_struct init_mm = INIT_MM(init_mm); union task_union init_task_union __attribute__((__section__(".data.init_task"))) = { INIT_TASK(init_task_union.task) }; - + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. 
The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; + diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 42ebd9643..34e3ff86f 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -5,6 +5,12 @@ * * Many thanks to Stig Venaas for trying out countless experimental * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> */ #include <linux/sched.h> @@ -13,13 +19,13 @@ #include <linux/delay.h> #include <asm/io.h> -#include "irq.h" +#include <linux/irq.h> /* * volatile is justified in this case, IO-APIC register contents * might change spontaneously, GCC should not cache it */ -#define IO_APIC_BASE ((volatile int *)fix_to_virt(FIX_IO_APIC_BASE)) +#define IO_APIC_BASE(idx) ((volatile int *)__fix_to_virt(FIX_IO_APIC_BASE_0 + idx)) /* * The structure of the IO-APIC: @@ -45,9 +51,10 @@ struct IO_APIC_reg_02 { } __attribute__ ((packed)); /* - * # of IRQ routing registers + * # of IO-APICs and # of IRQ routing registers */ -int nr_ioapic_registers = 0; +int nr_ioapics = 0; +int nr_ioapic_registers[MAX_IO_APICS]; enum ioapic_irq_destination_types { dest_Fixed = 0, @@ -94,6 +101,7 @@ enum mp_irq_source_types { mp_ExtINT = 3 }; +struct mpc_config_ioapic mp_apics[MAX_IO_APICS];/* I/O APIC entries */ int mp_irq_entries = 0; /* # of MP IRQ source entries */ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* MP IRQ source entries */ @@ -108,34 +116,34 @@ int mpc_default_type = 0; /* non-0 if default 
(table-less) * between pins and IRQs. */ -static inline unsigned int io_apic_read(unsigned int reg) +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { - *IO_APIC_BASE = reg; - return *(IO_APIC_BASE+4); + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); } -static inline void io_apic_write(unsigned int reg, unsigned int value) +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { - *IO_APIC_BASE = reg; - *(IO_APIC_BASE+4) = value; + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = value; } /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. */ -static inline void io_apic_modify(unsigned int value) +static inline void io_apic_modify(unsigned int apic, unsigned int value) { - *(IO_APIC_BASE+4) = value; + *(IO_APIC_BASE(apic)+4) = value; } /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC */ -static inline void io_apic_sync(void) +static inline void io_apic_sync(unsigned int apic) { - (void) *(IO_APIC_BASE+4); + (void) *(IO_APIC_BASE(apic)+4); } /* @@ -146,7 +154,7 @@ static inline void io_apic_sync(void) #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) static struct irq_pin_list { - int pin, next; + int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; /* @@ -154,7 +162,7 @@ static struct irq_pin_list { * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static void add_pin_to_irq(unsigned int irq, int pin) +static void add_pin_to_irq(unsigned int irq, int apic, int pin) { static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; @@ -168,6 +176,7 @@ static void add_pin_to_irq(unsigned int irq, int pin) if (++first_free_entry >= PIN_MAP_SIZE) panic("io_apic.c: whoops"); } + entry->apic = apic; entry->pin = pin; } @@ -183,9 +192,9 @@ static void name##_IO_APIC_irq(unsigned int irq) \ pin = entry->pin; \ if (pin == -1) \ break; \ - reg = io_apic_read(0x10 + R + pin*2); \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ - io_apic_modify(reg); \ + io_apic_modify(entry->apic, reg); \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -197,12 +206,12 @@ static void name##_IO_APIC_irq(unsigned int irq) \ * We disable IO-APIC IRQs by setting their 'destination CPU mask' to * zero. Trick by Ramesh Nalluri. */ -DO_ACTION( disable, 1, &= 0x00ffffff, io_apic_sync()) /* destination = 0x00 */ +DO_ACTION( disable, 1, &= 0x00ffffff, io_apic_sync(entry->apic))/* destination = 0x00 */ DO_ACTION( enable, 1, |= 0xff000000, ) /* destination = 0xff */ -DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync()) /* mask = 1 */ +DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync(entry->apic))/* mask = 1 */ DO_ACTION( unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ -static void clear_IO_APIC_pin(unsigned int pin) +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -211,16 +220,17 @@ static void clear_IO_APIC_pin(unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - io_apic_write(0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); } static void clear_IO_APIC (void) { - int pin; + int apic, pin; - for (pin = 0; pin < nr_ioapic_registers; pin++) - 
clear_IO_APIC_pin(pin); + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); } /* @@ -232,50 +242,54 @@ static void clear_IO_APIC (void) int pirq_entries [MAX_PIRQS]; int pirqs_enabled; -void __init ioapic_setup(char *str, int *ints) +static int __init ioapic_setup(char *str) { extern int skip_ioapic_setup; /* defined in arch/i386/kernel/smp.c */ skip_ioapic_setup = 1; + return 1; } -void __init ioapic_pirq_setup(char *str, int *ints) +__setup("noapic", ioapic_setup); + +static int __init ioapic_pirq_setup(char *str) { int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] = -1; - if (!ints) { - pirqs_enabled = 0; - printk("PIRQ redirection, trusting MP-BIOS.\n"); + pirqs_enabled = 1; + printk("PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; - } else { - pirqs_enabled = 1; - printk("PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - printk("... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } + for (i = 0; i < max; i++) { + printk("... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; } + return 1; } +__setup("pirq=", ioapic_pirq_setup); + /* * Find the IRQ entry number of a certain pin. 
*/ -static int __init find_irq_entry(int pin, int type) +static int __init find_irq_entry(int apic, int pin, int type) { int i; for (i = 0; i < mp_irq_entries; i++) if ( (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_dstapic == mp_apics[apic].mpc_apicid) && (mp_irqs[i].mpc_dstirq == pin)) return i; @@ -305,23 +319,28 @@ static int __init find_timer_pin(int type) /* * Find a specific PCI IRQ entry. - * Not an initfunc, possibly needed by modules + * Not an __init, possibly needed by modules */ +static int __init pin_2_irq(int idx, int apic, int pin); int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pci_pin) { - int i; + int apic, i; for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if (IO_APIC_IRQ(mp_irqs[i].mpc_dstirq) && + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_apics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + break; + + if ((apic || IO_APIC_IRQ(mp_irqs[i].mpc_dstirq)) && (mp_bus_id_to_type[lbus] == MP_BUS_PCI) && !mp_irqs[i].mpc_irqtype && (bus == mp_bus_id_to_pci_bus[mp_irqs[i].mpc_srcbus]) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f)) && (pci_pin == (mp_irqs[i].mpc_srcbusirq & 3))) - return mp_irqs[i].mpc_dstirq; + return pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); } return -1; } @@ -491,9 +510,9 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } -static int __init pin_2_irq(int idx, int pin) +static int __init pin_2_irq(int idx, int apic, int pin) { - int irq; + int irq, i; int bus = mp_irqs[idx].mpc_srcbus; /* @@ -513,9 +532,12 @@ static int __init pin_2_irq(int idx, int pin) case MP_BUS_PCI: /* PCI pin */ { /* - * PCI IRQs are 'directly mapped' + * PCI IRQs are mapped in order */ - irq = pin; + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; break; } default: @@ -545,12 +567,14 @@ static int __init pin_2_irq(int idx, int pin) static inline int IO_APIC_irq_trigger(int irq) { - int idx, pin; + int apic, idx, pin; - for (pin = 0; pin < 
nr_ioapic_registers; pin++) { - idx = find_irq_entry(pin,mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx,pin))) - return irq_trigger(idx); + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } } /* * nonexistent IRQs are edge default @@ -582,11 +606,12 @@ static int __init assign_irq_vector(int irq) void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; - int pin, idx, bus, irq, first_notcon = 1; + int apic, pin, idx, irq, first_notcon = 1; printk("init IO_APIC IRQs\n"); - for (pin = 0; pin < nr_ioapic_registers; pin++) { + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { /* * add it to the IO-APIC irq-routing table: @@ -598,13 +623,13 @@ void __init setup_IO_APIC_irqs(void) entry.mask = 0; /* enable IRQ */ entry.dest.logical.logical_dest = 0; /* but no route */ - idx = find_irq_entry(pin,mp_INT); + idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - printk(" IO-APIC pin %d", pin); + printk(" IO-APIC (apicid-pin) %d-%d", mp_apics[apic].mpc_apicid, pin); first_notcon = 0; } else - printk(", %d", pin); + printk(", %d-%d", mp_apics[apic].mpc_apicid, pin); continue; } @@ -617,18 +642,17 @@ void __init setup_IO_APIC_irqs(void) entry.dest.logical.logical_dest = 0xff; } - irq = pin_2_irq(idx,pin); - add_pin_to_irq(irq, pin); + irq = pin_2_irq(idx,apic,pin); + add_pin_to_irq(irq, apic, pin); - if (!IO_APIC_IRQ(irq)) + if (!apic && !IO_APIC_IRQ(irq)) continue; entry.vector = assign_irq_vector(irq); - bus = mp_irqs[idx].mpc_srcbus; - - io_apic_write(0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + } } if (!first_notcon) @@ -638,7 +662,7 @@ void __init 
setup_IO_APIC_irqs(void) /* * Set up a certain pin as ExtINT delivered interrupt */ -void __init setup_ExtINT_pin(unsigned int pin, int irq) +void __init setup_ExtINT_pin(unsigned int apic, unsigned int pin, int irq) { struct IO_APIC_route_entry entry; @@ -662,8 +686,8 @@ void __init setup_ExtINT_pin(unsigned int pin, int irq) entry.polarity = 0; entry.trigger = 0; - io_apic_write(0x10+2*pin, *(((int *)&entry)+0)); - io_apic_write(0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); } void __init UNEXPECTED_IO_APIC(void) @@ -674,17 +698,14 @@ void __init UNEXPECTED_IO_APIC(void) void __init print_IO_APIC(void) { - int i; + int apic, i; struct IO_APIC_reg_00 reg_00; struct IO_APIC_reg_01 reg_01; struct IO_APIC_reg_02 reg_02; printk("number of MP IRQ sources: %d.\n", mp_irq_entries); - printk("number of IO-APIC registers: %d.\n", nr_ioapic_registers); - - *(int *)&reg_00 = io_apic_read(0); - *(int *)&reg_01 = io_apic_read(1); - *(int *)&reg_02 = io_apic_read(2); + for (i = 0; i < nr_ioapics; i++) + printk("number of IO-APIC #%d registers: %d.\n", mp_apics[i].mpc_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -692,6 +713,12 @@ void __init print_IO_APIC(void) */ printk("testing the IO APIC.......................\n"); + for (apic = 0; apic < nr_ioapics; apic++) { + + *(int *)&reg_00 = io_apic_read(apic, 0); + *(int *)&reg_01 = io_apic_read(apic, 1); + *(int *)&reg_02 = io_apic_read(apic, 2); + printk("\nIO APIC #%d......\n", mp_apics[apic].mpc_apicid); printk(".... register #00: %08X\n", *(int *)&reg_00); printk("....... : physical APIC id: %02X\n", reg_00.ID); if (reg_00.__reserved_1 || reg_00.__reserved_2) @@ -706,8 +733,6 @@ void __init print_IO_APIC(void) (reg_01.entries != 0x3F) /* bigger Xeon boards */ ) UNEXPECTED_IO_APIC(); - if (reg_01.entries == 0x0f) - printk("....... [IO-APIC cannot route PCI PIRQ 0-3]\n"); printk(".......
: IO APIC version: %04X\n", reg_01.version); if ( (reg_01.version != 0x10) && /* oldest IO-APICs */ @@ -731,8 +756,8 @@ void __init print_IO_APIC(void) for (i = 0; i <= reg_01.entries; i++) { struct IO_APIC_route_entry entry; - *(((int *)&entry)+0) = io_apic_read(0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(0x11+i*2); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); printk(" %02x %03X %02X ", i, @@ -751,7 +776,7 @@ void __init print_IO_APIC(void) entry.vector ); } - + } printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; @@ -796,9 +821,12 @@ static void __init init_sym_mode(void) */ { struct IO_APIC_reg_01 reg_01; + int i; - *(int *)&reg_01 = io_apic_read(1); - nr_ioapic_registers = reg_01.entries+1; + for (i = 0; i < nr_ioapics; i++) { + *(int *)&reg_01 = io_apic_read(i, 1); + nr_ioapic_registers[i] = reg_01.entries+1; + } } /* @@ -808,7 +836,7 @@ static void __init init_sym_mode(void) } /* - * Not an initfunc, needed by the reboot code + * Not an __init, needed by the reboot code */ void init_pic_mode(void) { @@ -827,55 +855,6 @@ void init_pic_mode(void) printk("...done.\n"); } -char ioapic_OEM_ID [16]; -char ioapic_Product_ID [16]; - -struct ioapic_list_entry { - char * oem_id; - char * product_id; -}; - -struct ioapic_list_entry __initdata ioapic_whitelist [] = { - - { "INTEL " , "PR440FX " }, - { "INTEL " , "82440FX " }, - { "AIR " , "KDI " }, - { 0 , 0 } -}; - -struct ioapic_list_entry __initdata ioapic_blacklist [] = { - - { "OEM00000" , "PROD00000000" }, - { 0 , 0 } -}; - -static int __init in_ioapic_list(struct ioapic_list_entry * table) -{ - for ( ; table->oem_id ; table++) - if ((!strcmp(table->oem_id,ioapic_OEM_ID)) && - (!strcmp(table->product_id,ioapic_Product_ID))) - return 1; - return 0; -} - -static int __init ioapic_whitelisted(void) -{ -/* - * Right now, whitelist everything to see whether the new parsing - * routines
really do work for everybody. - */ -#if 1 - return 1; -#else - return in_ioapic_list(ioapic_whitelist); -#endif -} - -static int __init ioapic_blacklisted(void) -{ - return in_ioapic_list(ioapic_blacklist); -} - static void __init setup_ioapic_id(void) { struct IO_APIC_reg_00 reg_00; @@ -897,15 +876,15 @@ static void __init setup_ioapic_id(void) /* * Set the ID */ - *(int *)&reg_00 = io_apic_read(0); + *(int *)&reg_00 = io_apic_read(0, 0); printk("...changing IO-APIC physical APIC ID to 2...\n"); reg_00.ID = 0x2; - io_apic_write(0, *(int *)&reg_00); + io_apic_write(0, 0, *(int *)&reg_00); /* * Sanity check */ - *(int *)&reg_00 = io_apic_read(0); + *(int *)&reg_00 = io_apic_read(0, 0); if (reg_00.ID != 0x2) panic("could not set ID"); } @@ -978,24 +957,13 @@ static int __init timer_irq_works(void) * better to do it this way as thus we do not have to be aware of * 'pending' interrupts in the IRQ path, except at this point. */ -static inline void self_IPI(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - unsigned int status = desc->status; - - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = status | IRQ_REPLAY; - send_IPI_self(IO_APIC_VECTOR(irq)); - } -} - /* * Edge triggered needs to resend any interrupt - * that was delayed. + * that was delayed but this is now handled in the device + * independent code. */ static void enable_edge_ioapic_irq(unsigned int irq) { - self_IPI(irq); enable_IO_APIC_irq(irq); } @@ -1008,129 +976,52 @@ static void disable_edge_ioapic_irq(unsigned int irq) * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need - * to fake an edge by marking it IRQ_PENDING.. + * return 1 to indicate that is was pending. * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A...
*/ -static void startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_edge_ioapic_irq(unsigned int irq) { + int was_pending = 0; if (irq < 16) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) - irq_desc[irq].status |= IRQ_PENDING; + was_pending = 1; } enable_edge_ioapic_irq(irq); + return was_pending; } #define shutdown_edge_ioapic_irq disable_edge_ioapic_irq +void static ack_edge_ioapic_irq(unsigned int i) +{ + ack_APIC_irq(); +} +void static end_edge_ioapic_irq(unsigned int i){} + /* * Level triggered interrupts can just be masked, * and shutting down and starting up the interrupt - * is the same as enabling and disabling them. + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. */ -#define startup_level_ioapic_irq unmask_IO_APIC_irq -#define shutdown_level_ioapic_irq mask_IO_APIC_irq -#define enable_level_ioapic_irq unmask_IO_APIC_irq -#define disable_level_ioapic_irq mask_IO_APIC_irq - -static void do_edge_ioapic_IRQ(unsigned int irq, struct pt_regs * regs) +static unsigned int startup_level_ioapic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; - struct irqaction * action; - unsigned int status; - - spin_lock(&irq_controller_lock); - - /* - * Edge triggered IRQs can be acknowledged immediately - * and do not need to be masked. - */ - ack_APIC_irq(); - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { - action = desc->action; - status &= ~IRQ_PENDING; - status |= IRQ_INPROGRESS; - } - desc->status = status; - spin_unlock(&irq_controller_lock); - - /* - * If there is no IRQ handler or it was disabled, exit early. - */ - if (!action) - return; - - /* - * Edge triggered interrupts need to remember - * pending events. 
- */ - for (;;) { - handle_IRQ_event(irq, regs, action); - - spin_lock(&irq_controller_lock); - if (!(desc->status & IRQ_PENDING)) - break; - desc->status &= ~IRQ_PENDING; - spin_unlock(&irq_controller_lock); - } - desc->status &= ~IRQ_INPROGRESS; - spin_unlock(&irq_controller_lock); + unmask_IO_APIC_irq(irq); + return 0; /* don't check for pending */ } -static void do_level_ioapic_IRQ(unsigned int irq, struct pt_regs * regs) +#define shutdown_level_ioapic_irq mask_IO_APIC_irq +#define enable_level_ioapic_irq unmask_IO_APIC_irq +#define disable_level_ioapic_irq mask_IO_APIC_irq +#define end_level_ioapic_irq unmask_IO_APIC_irq +void static mask_and_ack_level_ioapic_irq(unsigned int i) { - irq_desc_t *desc = irq_desc + irq; - struct irqaction * action; - unsigned int status; - - spin_lock(&irq_controller_lock); - /* - * In the level triggered case we first disable the IRQ - * in the IO-APIC, then we 'early ACK' the IRQ, then we - * handle it and enable the IRQ when finished. - * - * disable has to happen before the ACK, to avoid IRQ storms. - * So this all has to be within the spinlock. - */ - mask_IO_APIC_irq(irq); - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - - /* - * If the IRQ is disabled for whatever reason, we must - * not enter the IRQ action. 
- */ - action = NULL; - if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { - action = desc->action; - status |= IRQ_INPROGRESS; - } - desc->status = status; - + mask_IO_APIC_irq(i); ack_APIC_irq(); - spin_unlock(&irq_controller_lock); - - /* Exit early if we had no action or it was disabled */ - if (!action) - return; - - handle_IRQ_event(irq, regs, action); - - spin_lock(&irq_controller_lock); - desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED)) - unmask_IO_APIC_irq(irq); - spin_unlock(&irq_controller_lock); } /* @@ -1146,18 +1037,20 @@ static struct hw_interrupt_type ioapic_edge_irq_type = { "IO-APIC-edge", startup_edge_ioapic_irq, shutdown_edge_ioapic_irq, - do_edge_ioapic_IRQ, enable_edge_ioapic_irq, - disable_edge_ioapic_irq + disable_edge_ioapic_irq, + ack_edge_ioapic_irq, + end_edge_ioapic_irq }; static struct hw_interrupt_type ioapic_level_irq_type = { "IO-APIC-level", startup_level_ioapic_irq, shutdown_level_ioapic_irq, - do_level_ioapic_IRQ, enable_level_ioapic_irq, - disable_level_ioapic_irq + disable_level_ioapic_irq, + mask_and_ack_level_ioapic_irq, + end_level_ioapic_irq }; static inline void init_IO_APIC_traps(void) @@ -1227,7 +1120,10 @@ static inline void check_timer(void) if (pin2 != -1) { printk(".. (found pin %d) ...", pin2); - setup_ExtINT_pin(pin2, 0); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_pin(0, pin2, 0); make_8259A_irq(0); } @@ -1238,9 +1134,9 @@ static inline void check_timer(void) * Just in case ... */ if (pin1 != -1) - clear_IO_APIC_pin(pin1); + clear_IO_APIC_pin(0, pin1); if (pin2 != -1) - clear_IO_APIC_pin(pin2); + clear_IO_APIC_pin(0, pin2); make_8259A_irq(0); @@ -1273,29 +1169,8 @@ void __init setup_IO_APIC(void) { init_sym_mode(); - /* - * Determine the range of IRQs handled by the IO-APIC. 
The - * following boards can be fully enabled: - * - * - whitelisted ones - * - those which have no PCI pins connected - * - those for which the user has specified a pirq= parameter - */ - if ( ioapic_whitelisted() || - (nr_ioapic_registers == 16) || - pirqs_enabled) - { - printk("ENABLING IO-APIC IRQs\n"); - io_apic_irqs = ~PIC_IRQS; - } else { - if (ioapic_blacklisted()) - printk(" blacklisted board, DISABLING IO-APIC IRQs\n"); - else - printk(" unlisted board, DISABLING IO-APIC IRQs\n"); - - printk(" see Documentation/IO-APIC.txt to enable them\n"); - io_apic_irqs = 0; - } + printk("ENABLING IO-APIC IRQs\n"); + io_apic_irqs = ~PIC_IRQS; /* * If there are no explicit MP IRQ entries, it's either one of the diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 445a26613..070667cbf 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -54,7 +54,8 @@ static void set_bitmap(unsigned long *bitmap, short base, short extent, int new_ */ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - struct thread_struct * t = &current->tss; + struct thread_struct * t = &current->thread; + struct tss_struct * tss = init_tss + smp_processor_id(); if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32)) return -EINVAL; @@ -65,14 +66,24 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ -#define IO_BITMAP_OFFSET offsetof(struct thread_struct,io_bitmap) - - if (t->bitmap != IO_BITMAP_OFFSET) { - t->bitmap = IO_BITMAP_OFFSET; + if (!t->ioperm) { + /* + * just in case ... + */ memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4); + t->ioperm = 1; + /* + * this activates it in the TSS + */ + tss->bitmap = IO_BITMAP_OFFSET; } - - set_bitmap((unsigned long *)t->io_bitmap, from, num, !turn_on); + + /* + * do it in the per-thread copy and in the TSS ...
+ */ + set_bitmap(t->io_bitmap, from, num, !turn_on); + set_bitmap(tss->io_bitmap, from, num, !turn_on); + return 0; } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index ea218fe45..3106f1966 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -1,3 +1,8 @@ +/* mostly architecture independent + some moved to i8259.c + the beautiful visws architecture code needs to be updated too. + and, finally, the BUILD_IRQ and SMP_BUILD macros in irq.h need fixed. + */ /* * linux/arch/i386/kernel/irq.c * @@ -15,7 +20,6 @@ * Naturally it's not a 1:1 relation, but there are similarities. */ -#include <linux/config.h> #include <linux/ptrace.h> #include <linux/errno.h> #include <linux/kernel_stat.h> @@ -27,20 +31,19 @@ #include <linux/malloc.h> #include <linux/random.h> #include <linux/smp.h> -#include <linux/tasks.h> #include <linux/smp_lock.h> #include <linux/init.h> #include <asm/system.h> #include <asm/io.h> -#include <asm/irq.h> #include <asm/bitops.h> #include <asm/smp.h> #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> +#include <asm/irq.h> +#include <linux/irq.h> -#include "irq.h" unsigned int local_bh_count[NR_CPUS]; unsigned int local_irq_count[NR_CPUS]; @@ -68,297 +71,11 @@ atomic_t nmi_counter; * system. We never hold this lock when we call the actual * IRQ handler. */ -spinlock_t irq_controller_lock; - -/* - * Dummy controller type for unused interrupts - */ -static void do_none(unsigned int irq, struct pt_regs * regs) -{ - /* - * we are careful. While for ISA irqs it's common to happen - * outside of any driver (think autodetection), this is not - * at all nice for PCI interrupts. So we are stricter and - * print a warning when such spurious interrupts happen. - * Spurious interrupts can confuse other drivers if the PCI - * IRQ line is shared. - * - * Such spurious interrupts are either driver bugs, or - * sometimes hw (chipset) bugs. 
- */ - printk("unexpected IRQ vector %d on CPU#%d!\n",irq, smp_processor_id()); - -#ifdef __SMP__ - /* - * [currently unexpected vectors happen only on SMP and APIC. - * if we want to have non-APIC and non-8259A controllers - * in the future with unexpected vectors, this ack should - * probably be made controller-specific.] - */ - ack_APIC_irq(); -#endif -} -static void enable_none(unsigned int irq) { } -static void disable_none(unsigned int irq) { } - -/* startup is the same as "enable", shutdown is same as "disable" */ -#define startup_none enable_none -#define shutdown_none disable_none - -struct hw_interrupt_type no_irq_type = { - "none", - startup_none, - shutdown_none, - do_none, - enable_none, - disable_none -}; - -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - */ - -static void do_8259A_IRQ(unsigned int irq, struct pt_regs * regs); -static void enable_8259A_irq(unsigned int irq); -void disable_8259A_irq(unsigned int irq); - -/* startup is the same as "enable", shutdown is same as "disable" */ -#define startup_8259A_irq enable_8259A_irq -#define shutdown_8259A_irq disable_8259A_irq - -static struct hw_interrupt_type i8259A_irq_type = { - "XT-PIC", - startup_8259A_irq, - shutdown_8259A_irq, - do_8259A_IRQ, - enable_8259A_irq, - disable_8259A_irq -}; - +spinlock_t irq_controller_lock = SPIN_LOCK_UNLOCKED; /* * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }}; - - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) - -/* - * Not all IRQs can be routed through the IO-APIC, eg. 
on certain (older) - * boards the timer interrupt is not connected to any IO-APIC pin, it's - * fed to the CPU IRQ line directly. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs = 0; - -/* - * These have to be protected by the irq controller spinlock - * before being called. - */ -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - cached_irq_mask |= mask; - if (irq & 8) { - outb(cached_A1,0xA1); - } else { - outb(cached_21,0x21); - } -} - -static void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - cached_irq_mask &= mask; - if (irq & 8) { - outb(cached_A1,0xA1); - } else { - outb(cached_21,0x21); - } -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - - if (irq < 8) - return (inb(0x20) & mask); - return (inb(0xA0) & (mask >> 8)); -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - irq_desc[irq].handler = &i8259A_irq_type; - enable_irq(irq); -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! 
- */ -static inline void mask_and_ack_8259A(unsigned int irq) -{ - cached_irq_mask |= 1 << irq; - if (irq & 8) { - inb(0xA1); /* DUMMY */ - outb(cached_A1,0xA1); - outb(0x62,0x20); /* Specific EOI to cascade */ - outb(0x20,0xA0); - } else { - inb(0x21); /* DUMMY */ - outb(cached_21,0x21); - outb(0x20,0x20); - } -} - -static void do_8259A_IRQ(unsigned int irq, struct pt_regs * regs) -{ - struct irqaction * action; - irq_desc_t *desc = irq_desc + irq; - - spin_lock(&irq_controller_lock); - { - unsigned int status; - mask_and_ack_8259A(irq); - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - action = NULL; - if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { - action = desc->action; - status |= IRQ_INPROGRESS; - } - desc->status = status; - } - spin_unlock(&irq_controller_lock); - - /* Exit early if we had no action or it was disabled */ - if (!action) - return; - - handle_IRQ_event(irq, regs, action); - - spin_lock(&irq_controller_lock); - { - unsigned int status = desc->status & ~IRQ_INPROGRESS; - desc->status = status; - if (!(status & IRQ_DISABLED)) - enable_8259A_irq(irq); - } - spin_unlock(&irq_controller_lock); -} - -/* - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - - -BUILD_COMMON_IRQ() - -#define BI(x,y) \ - BUILD_IRQ(##x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x20-0x30) - */ -BUILD_16_IRQS(0x0) - -#ifdef CONFIG_X86_IO_APIC -/* - * The IO-APIC gives us many more interrupt sources. 
Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) -#endif - -#undef BUILD_16_IRQS -#undef BI - - -#ifdef __SMP__ -/* - * The following vectors are part of the Linux architecture, there - * is no hardware IRQ pin equivalent for them, they are triggered - * through the ICC by us (IPIs) - */ -BUILD_SMP_INTERRUPT(reschedule_interrupt) -BUILD_SMP_INTERRUPT(invalidate_interrupt) -BUILD_SMP_INTERRUPT(stop_cpu_interrupt) -BUILD_SMP_INTERRUPT(call_function_interrupt) -BUILD_SMP_INTERRUPT(spurious_interrupt) - -/* - * every pentium local APIC has two 'local interrupts', with a - * soft-definable vector attached to both interrupts, one of - * which is a timer interrupt, the other one is error counter - * overflow. 
Linux uses the local APIC timer interrupt to get - * a much simpler SMP time architecture: - */ -BUILD_SMP_TIMER_INTERRUPT(apic_timer_interrupt) - -#endif - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -static void (*interrupt[NR_IRQS])(void) = { - IRQLIST_16(0x0), - -#ifdef CONFIG_X86_IO_APIC - IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd) -#endif -}; - -#undef IRQ -#undef IRQLIST_16 - +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }}; /* * Special irq handlers. @@ -366,36 +83,6 @@ static void (*interrupt[NR_IRQS])(void) = { void no_action(int cpl, void *dev_id, struct pt_regs *regs) { } -#ifndef CONFIG_VISWS -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. 
- */ - -static void math_error_irq(int cpl, void *dev_id, struct pt_regs *regs) -{ - outb(0,0xF0); - if (ignore_irq13 || !boot_cpu_data.hard_math) - return; - math_error(); -} - -static struct irqaction irq13 = { math_error_irq, 0, 0, "fpu", NULL, NULL }; - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL}; -#endif - /* * Generic, controller-independent functions: */ @@ -438,10 +125,13 @@ int get_irq_list(char *buf) return p - buf; } + /* * Global interrupt locks for SMP. Allow interrupts to come in on any * CPU, yet make cli/sti act globally to protect critical regions.. */ +spinlock_t i386_bh_lock = SPIN_LOCK_UNLOCKED; + #ifdef __SMP__ unsigned char global_irq_holder = NO_PROC_ID; unsigned volatile int global_irq_lock; @@ -461,7 +151,10 @@ atomic_t global_bh_lock; static inline void check_smp_invalidate(int cpu) { if (test_bit(cpu, &smp_invalidate_needed)) { + struct mm_struct *mm = current->mm; clear_bit(cpu, &smp_invalidate_needed); + if (mm) + atomic_set_mask(1 << cpu, &mm->cpu_vm_mask); local_flush_tlb(); } } @@ -471,7 +164,6 @@ static void show(char * str) int i; unsigned long *stack; int cpu = smp_processor_id(); - extern char *get_options(char *str, int *ints); printk("\n%s, CPU %d:\n", str, cpu); printk("irq: %d [%d %d]\n", @@ -481,7 +173,7 @@ static void show(char * str) stack = (unsigned long *) &stack; for (i = 40; i ; i--) { unsigned long x = *++stack; - if (x > (unsigned long) &get_options && x < (unsigned long) &vsprintf) { + if (x > (unsigned long) &get_option && x < (unsigned long) &vsprintf) { printk("<[%08lx]> ", x); } } @@ -782,10 +474,16 @@ void enable_irq(unsigned int irq) spin_lock_irqsave(&irq_controller_lock, flags); switch (irq_desc[irq].depth) { - case 1: - irq_desc[irq].status &= ~IRQ_DISABLED; + case 1: { + unsigned int status = irq_desc[irq].status & ~IRQ_DISABLED; + irq_desc[irq].status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) 
== IRQ_PENDING) { + irq_desc[irq].status = status | IRQ_REPLAY; + hw_resend_irq(irq_desc[irq].handler,irq); + } irq_desc[irq].handler->enable(irq); - /* fall throught */ + /* fall-through */ + } default: irq_desc[irq].depth--; break; @@ -801,7 +499,7 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage void do_IRQ(struct pt_regs regs) +asmlinkage unsigned int do_IRQ(struct pt_regs regs) { /* * We ack quickly, we don't want the irq controller @@ -813,76 +511,81 @@ asmlinkage void do_IRQ(struct pt_regs regs) * 0 return value means that this irq is already being * handled by some other CPU. (or is disabled) */ - int irq = regs.orig_eax & 0xff; /* subtle, see irq.h */ + int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ int cpu = smp_processor_id(); + irq_desc_t *desc; + struct irqaction * action; + unsigned int status; kstat.irqs[cpu][irq]++; - irq_desc[irq].handler->handle(irq, &regs); + desc = irq_desc + irq; + spin_lock(&irq_controller_lock); + irq_desc[irq].handler->ack(irq); + /* + REPLAY is when Linux resends an IRQ that was dropped earlier + WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ /* - * This should be conditional: we should really get - * a return code from the irq handler to tell us - * whether the handler wants us to do software bottom - * half handling or not.. + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have.
*/ - if (1) { - if (bh_active & bh_mask) - do_bottom_half(); + action = NULL; + if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ } -} - -int setup_x86_irq(unsigned int irq, struct irqaction * new) -{ - int shared = 0; - struct irqaction *old, **p; - unsigned long flags; + desc->status = status; + spin_unlock(&irq_controller_lock); /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. + * If there is no IRQ handler or it was disabled, exit early. + Since we set PENDING, if another processor is handling + a different instance of this same irq, the other processor + will take care of it. */ - if (new->flags & SA_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } + if (!action) + return 1; /* - * The following block of code has to be executed atomically + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. 
*/ - spin_lock_irqsave(&irq_controller_lock,flags); - p = &irq_desc[irq].action; - if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&irq_controller_lock,flags); - return -EBUSY; - } - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; + for (;;) { + handle_IRQ_event(irq, &regs, action); + spin_lock(&irq_controller_lock); + + if (!(desc->status & IRQ_PENDING)) + break; + desc->status &= ~IRQ_PENDING; + spin_unlock(&irq_controller_lock); } + desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED)){ + irq_desc[irq].handler->end(irq); + } + spin_unlock(&irq_controller_lock); - *p = new; - - if (!shared) { - irq_desc[irq].depth = 0; - irq_desc[irq].status &= ~IRQ_DISABLED; - irq_desc[irq].handler->startup(irq); + /* + * This should be conditional: we should really get + * a return code from the irq handler to tell us + * whether the handler wants us to do software bottom + * half handling or not..
+ */ + if (1) { + if (bh_active & bh_mask) + do_bottom_half(); } - spin_unlock_irqrestore(&irq_controller_lock,flags); - return 0; + return 1; } int request_irq(unsigned int irq, @@ -911,8 +614,7 @@ int request_irq(unsigned int irq, action->next = NULL; action->dev_id = dev_id; - retval = setup_x86_irq(irq, action); - + retval = setup_irq(irq, action); if (retval) kfree(action); return retval; @@ -920,29 +622,40 @@ int request_irq(unsigned int irq, void free_irq(unsigned int irq, void *dev_id) { - struct irqaction * action, **p; + struct irqaction **p; unsigned long flags; if (irq >= NR_IRQS) return; spin_lock_irqsave(&irq_controller_lock,flags); - for (p = &irq_desc[irq].action; (action = *p) != NULL; p = &action->next) { - if (action->dev_id != dev_id) - continue; + p = &irq_desc[irq].action; + for (;;) { + struct irqaction * action = *p; + if (action) { + struct irqaction **pp = p; + p = &action->next; + if (action->dev_id != dev_id) + continue; - /* Found it - now free it */ - *p = action->next; - kfree(action); - if (!irq_desc[irq].action) { - irq_desc[irq].status |= IRQ_DISABLED; - irq_desc[irq].handler->shutdown(irq); + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!irq_desc[irq].action) { + irq_desc[irq].status |= IRQ_DISABLED; + irq_desc[irq].handler->shutdown(irq); + } + spin_unlock_irqrestore(&irq_controller_lock,flags); + + /* Wait to make sure it's not being used on another CPU */ + while (irq_desc[irq].status & IRQ_INPROGRESS) + barrier(); + kfree(action); + return; } - goto out; + printk("Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&irq_controller_lock,flags); + return; } - printk("Trying to free free IRQ%d\n",irq); -out: - spin_unlock_irqrestore(&irq_controller_lock,flags); } /* @@ -965,7 +678,8 @@ unsigned long probe_irq_on(void) for (i = NR_IRQS-1; i > 0; i--) { if (!irq_desc[i].action) { irq_desc[i].status |= IRQ_AUTODETECT | IRQ_WAITING; - irq_desc[i].handler->startup(i); + 
if(irq_desc[i].handler->startup(i)) + irq_desc[i].status |= IRQ_PENDING; } } spin_unlock_irq(&irq_controller_lock); @@ -1028,102 +742,58 @@ int probe_irq_off(unsigned long unused) return irq_found; } -void init_ISA_irqs (void) +/* this was setup_x86_irq but it seems pretty generic */ +int setup_irq(unsigned int irq, struct irqaction * new) { - int i; - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 0; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - irq_desc[i].handler = &i8259A_irq_type; - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].handler = &no_irq_type; - } - } -} - -__initfunc(void init_IRQ(void)) -{ - int i; + int shared = 0; + struct irqaction *old, **p; + unsigned long flags; -#ifndef CONFIG_X86_VISWS_APIC - init_ISA_irqs(); -#else - init_VISWS_APIC_irqs(); -#endif /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. */ - for (i = 0; i < NR_IRQS; i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. + * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); } -#ifdef __SMP__ - /* - IRQ0 must be given a fixed assignment and initialized - before init_IRQ_SMP. - */ - set_intr_gate(IRQ0_TRAP_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. 
+ * The following block of code has to be executed atomically */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - - /* IPI for CPU halt */ - set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt); - - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI vector for APIC spurious interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); -#endif - request_region(0x20,0x20,"pic1"); - request_region(0xa0,0x20,"pic2"); + spin_lock_irqsave(&irq_controller_lock,flags); + p = &irq_desc[irq].action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&irq_controller_lock,flags); + return -EBUSY; + } - /* - * Set the clock to 100 Hz, we already have a valid - * vector now: - */ - outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff , 0x40); /* LSB */ - outb(LATCH >> 8 , 0x40); /* MSB */ + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } -#ifndef CONFIG_VISWS - setup_x86_irq(2, &irq2); - setup_x86_irq(13, &irq13); -#endif -} + *p = new; -#ifdef CONFIG_X86_IO_APIC -__initfunc(void init_IRQ_SMP(void)) -{ - int i; - for (i = 0; i < NR_IRQS ; i++) - if (IO_APIC_VECTOR(i) > 0) - set_intr_gate(IO_APIC_VECTOR(i), interrupt[i]); + if (!shared) { + irq_desc[irq].depth = 0; + irq_desc[irq].status &= ~IRQ_DISABLED; + irq_desc[irq].handler->startup(irq); + } + spin_unlock_irqrestore(&irq_controller_lock,flags); + return 0; } -#endif diff --git a/arch/i386/kernel/irq.h b/arch/i386/kernel/irq.h deleted file mode 100644 index 1023cd4da..000000000 --- a/arch/i386/kernel/irq.h +++ /dev/null @@ -1,255 +0,0 @@ -#ifndef __irq_h 
-#define __irq_h - -#include <asm/irq.h> - -/* - * Interrupt controller descriptor. This is all we need - * to describe about the low-level hardware. - */ -struct hw_interrupt_type { - const char * typename; - void (*startup)(unsigned int irq); - void (*shutdown)(unsigned int irq); - void (*handle)(unsigned int irq, struct pt_regs * regs); - void (*enable)(unsigned int irq); - void (*disable)(unsigned int irq); -}; - -extern struct hw_interrupt_type no_irq_type; - -/* - * IRQ line status. - */ -#define IRQ_INPROGRESS 1 /* IRQ handler active - do not enter! */ -#define IRQ_DISABLED 2 /* IRQ disabled - do not enter! */ -#define IRQ_PENDING 4 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 8 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 16 /* IRQ is being autodetected */ -#define IRQ_WAITING 32 /* IRQ not yet seen - for autodetection */ - -/* - * This is the "IRQ descriptor", which contains various information - * about the irq, including what kind of hardware handling it has, - * whether it is disabled etc etc. - * - * Pad this out to 32 bytes for cache and indexing reasons. - */ -typedef struct { - unsigned int status; /* IRQ status - IRQ_INPROGRESS, IRQ_DISABLED */ - struct hw_interrupt_type *handler; /* handle/enable/disable functions */ - struct irqaction *action; /* IRQ action list */ - unsigned int depth; /* Disable depth for nested irq disables */ -} irq_desc_t; - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* - * Special IRQ vectors used by the SMP architecture: - * - * (some of the following vectors are 'rare', they might be merged - * into a single vector to save vector space. TLB, reschedule and - * local APIC vectors are performance-critical.) 
- */ -#define RESCHEDULE_VECTOR 0x30 -#define INVALIDATE_TLB_VECTOR 0x31 -#define STOP_CPU_VECTOR 0x40 -#define LOCAL_TIMER_VECTOR 0x41 -#define CALL_FUNCTION_VECTOR 0x50 - -/* - * First APIC vector available to drivers: (vectors 0x51-0xfe) - */ -#define IRQ0_TRAP_VECTOR 0x51 - -/* - * This IRQ should never happen, but we print a message nevertheless. - */ -#define SPURIOUS_APIC_VECTOR 0xff - -extern irq_desc_t irq_desc[NR_IRQS]; -extern int irq_vector[NR_IRQS]; -#define IO_APIC_VECTOR(irq) irq_vector[irq] - -extern void init_IRQ_SMP(void); -extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); -extern int setup_x86_irq(unsigned int, struct irqaction *); - -/* - * Various low-level irq details needed by irq.c, process.c, - * time.c, io_apic.c and smp.c - * - * Interrupt entry/exit code at both C and assembly level - */ - -extern void no_action(int cpl, void *dev_id, struct pt_regs *regs); -extern void mask_irq(unsigned int irq); -extern void unmask_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void ack_APIC_irq(void); -extern void FASTCALL(send_IPI_self(int vector)); -extern void init_VISWS_APIC_irqs(void); -extern void setup_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); -extern void make_8259A_irq(unsigned int irq); -extern void send_IPI(int dest, int vector); -extern void init_pic_mode(void); -extern void print_IO_APIC(void); - -extern unsigned long io_apic_irqs; - -extern char _stext, _etext; - -#define MAX_IRQ_SOURCES 128 -#define MAX_MP_BUSSES 32 -enum mp_bustype { - MP_BUS_ISA, - MP_BUS_EISA, - MP_BUS_PCI -}; -extern int mp_bus_id_to_type [MAX_MP_BUSSES]; -extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; -extern char ioapic_OEM_ID [16]; -extern char ioapic_Product_ID [16]; - -extern spinlock_t irq_controller_lock; - -#ifdef __SMP__ - -#include <asm/atomic.h> - -static inline void irq_enter(int cpu, unsigned int 
irq) -{ - hardirq_enter(cpu); - while (test_bit(0,&global_irq_lock)) { - /* nothing */; - } -} - -static inline void irq_exit(int cpu, unsigned int irq) -{ - hardirq_exit(cpu); -} - -#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) - -#else - -#define irq_enter(cpu, irq) (++local_irq_count[cpu]) -#define irq_exit(cpu, irq) (--local_irq_count[cpu]) - -#define IO_APIC_IRQ(x) (0) - -#endif - -#define __STR(x) #x -#define STR(x) __STR(x) - -#define SAVE_ALL \ - "cld\n\t" \ - "pushl %es\n\t" \ - "pushl %ds\n\t" \ - "pushl %eax\n\t" \ - "pushl %ebp\n\t" \ - "pushl %edi\n\t" \ - "pushl %esi\n\t" \ - "pushl %edx\n\t" \ - "pushl %ecx\n\t" \ - "pushl %ebx\n\t" \ - "movl $" STR(__KERNEL_DS) ",%edx\n\t" \ - "movl %dx,%ds\n\t" \ - "movl %dx,%es\n\t" - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -#define GET_CURRENT \ - "movl %esp, %ebx\n\t" \ - "andl $-8192, %ebx\n\t" - -#ifdef __SMP__ - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_SMP_INTERRUPT(x) \ -asmlinkage void x(void); \ -__asm__( \ -"\n"__ALIGN_STR"\n" \ -SYMBOL_NAME_STR(x) ":\n\t" \ - "pushl $-1\n\t" \ - SAVE_ALL \ - "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ - "jmp ret_from_intr\n"); - -#define BUILD_SMP_TIMER_INTERRUPT(x) \ -asmlinkage void x(struct pt_regs * regs); \ -__asm__( \ -"\n"__ALIGN_STR"\n" \ -SYMBOL_NAME_STR(x) ":\n\t" \ - "pushl $-1\n\t" \ - SAVE_ALL \ - "movl %esp,%eax\n\t" \ - "pushl %eax\n\t" \ - "call "SYMBOL_NAME_STR(smp_##x)"\n\t" \ - "addl $4,%esp\n\t" \ - "jmp ret_from_intr\n"); - -#endif /* __SMP__ */ - -#define BUILD_COMMON_IRQ() \ -__asm__( \ - "\n" __ALIGN_STR"\n" \ - "common_interrupt:\n\t" \ - SAVE_ALL \ - "pushl $ret_from_intr\n\t" \ - "jmp "SYMBOL_NAME_STR(do_IRQ)); - -/* - * subtle. orig_eax is used by the signal code to distinct between - * system calls and interrupted 'random user-space'. Thus we have - * to put a negative value into orig_eax here. 
(the problem is that - * both system calls and IRQs want to have small integer numbers in - * orig_eax, and the syscall code has won the optimization conflict ;) - */ -#define BUILD_IRQ(nr) \ -asmlinkage void IRQ_NAME(nr); \ -__asm__( \ -"\n"__ALIGN_STR"\n" \ -SYMBOL_NAME_STR(IRQ) #nr "_interrupt:\n\t" \ - "pushl $"#nr"-256\n\t" \ - "jmp common_interrupt"); - -/* - * x86 profiling function, SMP safe. We might want to do this in - * assembly totally? - */ -static inline void x86_do_profile (unsigned long eip) -{ - if (prof_buffer) { - eip -= (unsigned long) &_stext; - eip >>= prof_shift; - /* - * Don't ignore out-of-bounds EIP values silently, - * put them into the last histogram slot, so if - * present, they will show up as a sharp peak. - */ - if (eip > prof_len-1) - eip = prof_len-1; - atomic_inc((atomic_t *)&prof_buffer[eip]); - } -} - -#endif diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index 25e8deec4..1c359b4f4 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -2,6 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> */ #include <linux/errno.h> @@ -17,19 +18,31 @@ #include <asm/ldt.h> #include <asm/desc.h> +/* + * read_ldt() is not really atomic - this is not a problem since + * synchronization of reads and writes done to the LDT has to be + * assured by user-space anyway. Writes are atomic, to protect + * the security checks done on new descriptors. + */ static int read_ldt(void * ptr, unsigned long bytecount) { - void * address = current->mm->segments; + int err; unsigned long size; + struct mm_struct * mm = current->mm; + + err = 0; + if (!mm->segments) + goto out; - if (!ptr) - return -EINVAL; - if (!address) - return 0; size = LDT_ENTRIES*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; - return copy_to_user(ptr, address, size) ? 
-EFAULT : size; + + err = size; + if (copy_to_user(ptr, mm->segments, size)) + err = -EFAULT; +out: + return err; } static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) @@ -64,31 +77,30 @@ static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) * you get strange behaviour (the kernel is safe, it's just user * space strangeness). * - * For no good reason except historical, the GDT index of the LDT - * is chosen to follow the index number in the task[] array. + * we have two choices: either we preallocate the LDT descriptor + * and can do a shared modify_ldt(), or we postallocate it and do + * an smp message pass to update it. Currently we are a bit + * un-nice to user-space and reload the LDT only on the next + * schedule. (only an issue on SMP) + * + * the GDT index of the LDT is allocated dynamically, and is + * limited by MAX_LDT_DESCRIPTORS. */ + down(&mm->mmap_sem); if (!mm->segments) { - void * ldt; + error = -ENOMEM; - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - if (!ldt) - goto out; - memset(ldt, 0, LDT_ENTRIES*LDT_ENTRY_SIZE); + mm->segments = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!mm->segments) + goto out_unlock; + + if (atomic_read(&mm->mm_users) > 1) + printk(KERN_WARNING "LDT allocated for cloned task!\n"); /* - * Make sure someone else hasn't allocated it for us ... + * Possibly do an SMP cross-call to other CPUs to reload + * their LDTs? 
*/ - if (!mm->segments) { - int i = current->tarray_ptr - &task[0]; - mm->segments = ldt; - set_ldt_desc(i, ldt, LDT_ENTRIES); - current->tss.ldt = _LDT(i); - load_ldt(i); - if (atomic_read(&mm->count) > 1) - printk(KERN_WARNING - "LDT allocated for cloned task!\n"); - } else { - vfree(ldt); - } + load_LDT(mm); } lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->segments); @@ -127,6 +139,9 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + +out_unlock: + up(&mm->mmap_sem); out: return error; } @@ -135,7 +150,6 @@ asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) { int ret = -ENOSYS; - lock_kernel(); switch (func) { case 0: ret = read_ldt(ptr, bytecount); @@ -147,6 +161,5 @@ asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) ret = write_ldt(ptr, bytecount, 0); break; } - unlock_kernel(); return ret; } diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index 8bfd7fa45..792cc8c0d 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -210,7 +210,19 @@ static void mca_configure_adapter_status(int slot) { /*--------------------------------------------------------------------*/ -__initfunc(void mca_init(void)) +struct resource mca_standard_resources[] = { + { "system control port B (MCA)", 0x60, 0x60 }, + { "arbitration (MCA)", 0x90, 0x90 }, + { "card Select Feedback (MCA)", 0x91, 0x91 }, + { "system Control port A (MCA)", 0x92, 0x92 }, + { "system board setup (MCA)", 0x94, 0x94 }, + { "POS (MCA)", 0x96, 0x97 }, + { "POS (MCA)", 0x100, 0x107 } +}; + +#define MCA_STANDARD_RESOURCES (sizeof(mca_standard_resources)/sizeof(struct resource)) + +void __init mca_init(void) { unsigned int i, j; unsigned long flags; @@ -319,13 +331,8 @@ __initfunc(void mca_init(void)) restore_flags(flags); - request_region(0x60,0x01,"system control port B (MCA)"); - request_region(0x90,0x01,"arbitration (MCA)"); - request_region(0x91,0x01,"card Select Feedback (MCA)"); - request_region(0x92,0x01,"system 
Control port A (MCA)"); - request_region(0x94,0x01,"system board setup (MCA)"); - request_region(0x96,0x02,"POS (MCA)"); - request_region(0x100,0x08,"POS (MCA)"); + for (i = 0; i < MCA_STANDARD_RESOURCES; i++) + request_resource(&ioport_resource, mca_standard_resources + i); #ifdef CONFIG_PROC_FS mca_do_proc_init(); @@ -691,7 +698,7 @@ int get_mca_info(char *buf) /*--------------------------------------------------------------------*/ -__initfunc(void mca_do_proc_init(void)) +void __init mca_do_proc_init(void) { int i; struct proc_dir_entry* node = NULL; diff --git a/arch/i386/kernel/mtrr.c b/arch/i386/kernel/mtrr.c index 084ad431c..f76c68f59 100644 --- a/arch/i386/kernel/mtrr.c +++ b/arch/i386/kernel/mtrr.c @@ -201,6 +201,28 @@ 19990512 Richard Gooch <rgooch@atnf.csiro.au> Minor cleanups. v1.35 + 19990707 Zoltan Boszormenyi <zboszor@mol.hu> + Check whether ARR3 is protected in cyrix_get_free_region() + and mtrr_del(). The code won't attempt to delete or change it + from now on if the BIOS protected ARR3. It silently skips ARR3 + in cyrix_get_free_region() or returns with an error code from + mtrr_del(). + 19990711 Zoltan Boszormenyi <zboszor@mol.hu> + Reset some bits in the CCRs in cyrix_arr_init() to disable SMM + if ARR3 isn't protected. This is needed because if SMM is active + and ARR3 isn't protected then deleting and setting ARR3 again + may lock up the processor. With SMM entirely disabled, it does + not happen. + 19990812 Zoltan Boszormenyi <zboszor@mol.hu> + Rearrange switch() statements so the driver accomodates to + the fact that the AMD Athlon handles its MTRRs the same way + as Intel does. + 19990814 Zoltan Boszormenyi <zboszor@mol.hu> + Double check for Intel in mtrr_add()'s big switch() because + that revision check is only valid for Intel CPUs. + 19990819 Alan Cox <alan@redhat.com> + Tested Zoltan's changes on a pre production Athlon - 100% + success. 
*/ #include <linux/types.h> #include <linux/errno.h> @@ -235,7 +257,7 @@ #include <asm/msr.h> #include <asm/hardirq.h> -#include "irq.h" +#include <linux/irq.h> #define MTRR_VERSION "1.35 (19990512)" @@ -309,6 +331,7 @@ struct set_mtrr_context unsigned long ccr3; }; +static int arr3_protected; /* Put the processor into a state where MTRRs can be safely set */ static void set_mtrr_prepare (struct set_mtrr_context *ctxt) @@ -321,6 +344,8 @@ static void set_mtrr_prepare (struct set_mtrr_context *ctxt) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (boot_cpu_data.x86 >= 6) break; /* Athlon and post-Athlon CPUs */ + /* else fall through */ case X86_VENDOR_CENTAUR: return; /*break;*/ @@ -344,6 +369,7 @@ static void set_mtrr_prepare (struct set_mtrr_context *ctxt) switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: case X86_VENDOR_INTEL: /* Disable MTRRs, and set the default type to uncached */ rdmsr (MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); @@ -365,6 +391,8 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (boot_cpu_data.x86 >= 6) break; /* Athlon and post-Athlon CPUs */ + /* else fall through */ case X86_VENDOR_CENTAUR: __restore_flags (ctxt->flags); return; @@ -376,6 +404,7 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: case X86_VENDOR_INTEL: wrmsr (MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); break; @@ -406,6 +435,9 @@ static unsigned int get_num_var_ranges (void) switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) return 2; /* pre-Athlon CPUs */ + /* else fall through */ case X86_VENDOR_INTEL: rdmsr (MTRRcap_MSR, config, dummy); return (config & 0xff); @@ -416,9 +448,6 @@ static unsigned int get_num_var_ranges (void) /* and Centaur has 8 MCR's */ return 8; /*break;*/ - case X86_VENDOR_AMD: - return 2; - /*break;*/ } 
return 0; } /* End Function get_num_var_ranges */ @@ -430,12 +459,14 @@ static int have_wrcomb (void) switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) return 1; /* pre-Athlon CPUs */ + /* else fall through */ case X86_VENDOR_INTEL: rdmsr (MTRRcap_MSR, config, dummy); return (config & (1<<10)); /*break;*/ case X86_VENDOR_CYRIX: - case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: return 1; /*break;*/ @@ -731,8 +762,8 @@ struct mtrr_var_range /* Get the MSR pair relating to a var range */ -__initfunc(static void get_mtrr_var_range (unsigned int index, - struct mtrr_var_range *vr)) +static void __init get_mtrr_var_range (unsigned int index, + struct mtrr_var_range *vr) { rdmsr (MTRRphysBase_MSR (index), vr->base_lo, vr->base_hi); rdmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi); @@ -741,8 +772,8 @@ __initfunc(static void get_mtrr_var_range (unsigned int index, /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ -__initfunc(static int set_mtrr_var_range_testing (unsigned int index, - struct mtrr_var_range *vr)) +static int __init set_mtrr_var_range_testing (unsigned int index, + struct mtrr_var_range *vr) { unsigned int lo, hi; int changed = FALSE; @@ -764,7 +795,7 @@ __initfunc(static int set_mtrr_var_range_testing (unsigned int index, return changed; } /* End Function set_mtrr_var_range_testing */ -__initfunc(static void get_fixed_ranges(mtrr_type *frs)) +static void __init get_fixed_ranges(mtrr_type *frs) { unsigned long *p = (unsigned long *)frs; int i; @@ -777,7 +808,7 @@ __initfunc(static void get_fixed_ranges(mtrr_type *frs)) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i*2], p[7 + i*2]); } /* End Function get_fixed_ranges */ -__initfunc(static int set_fixed_ranges_testing(mtrr_type *frs)) +static int __init set_fixed_ranges_testing(mtrr_type *frs) { unsigned long *p = (unsigned long *)frs; int changed = FALSE; @@ -819,7 +850,7 @@ struct mtrr_state /* Grab all of the MTRR state for this CPU into 
*state */ -__initfunc(static void get_mtrr_state(struct mtrr_state *state)) +static void __init get_mtrr_state(struct mtrr_state *state) { unsigned int nvrs, i; struct mtrr_var_range *vrs; @@ -842,14 +873,14 @@ __initfunc(static void get_mtrr_state(struct mtrr_state *state)) /* Free resources associated with a struct mtrr_state */ -__initfunc(static void finalize_mtrr_state(struct mtrr_state *state)) +static void __init finalize_mtrr_state(struct mtrr_state *state) { if (state->var_ranges) kfree (state->var_ranges); } /* End Function finalize_mtrr_state */ -__initfunc(static unsigned long set_mtrr_state (struct mtrr_state *state, - struct set_mtrr_context *ctxt)) +static unsigned long __init set_mtrr_state (struct mtrr_state *state, + struct set_mtrr_context *ctxt) /* [SUMMARY] Set the MTRR state for this CPU. <state> The MTRR state information to read. <ctxt> Some relevant CPU context. @@ -948,7 +979,7 @@ static void set_mtrr_smp (unsigned int reg, unsigned long base, /* Some BIOS's are fucked and don't set all MTRRs the same! 
*/ -__initfunc(static void mtrr_state_warn (unsigned long mask)) +static void __init mtrr_state_warn(unsigned long mask) { if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) @@ -1030,6 +1061,7 @@ static int cyrix_get_free_region (unsigned long base, unsigned long size) for (i = 0; i < 7; i++) { cyrix_get_arr (i, &lbase, &lsize, <ype); + if ((i == 3) && arr3_protected) continue; if (lsize < 1) return i; } /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ @@ -1062,13 +1094,30 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return -ENODEV; switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) { /* pre-Athlon CPUs */ + /* Apply the K6 block alignment and size rules + In order + o Uncached or gathering only + o 128K or bigger block + o Power of 2 block + o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << 17) || + (size & ~(size-1))-size || (base & (size-1))) + return -EINVAL; + break; + } /* else fall through */ case X86_VENDOR_INTEL: - /* For Intel PPro stepping <= 7, must be 4 MiB aligned */ - if ( (boot_cpu_data.x86 == 6) && (boot_cpu_data.x86_model == 1) && - (boot_cpu_data.x86_mask <= 7) && ( base & ( (1 << 22) - 1 ) ) ) - { - printk ("mtrr: base(0x%lx) is not 4 MiB aligned\n", base); - return -EINVAL; + /* Double check for Intel, we may run on Athlon. 
*/ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + /* For Intel PPro stepping <= 7, must be 4 MiB aligned */ + if ( (boot_cpu_data.x86 == 6) && (boot_cpu_data.x86_model == 1) && + (boot_cpu_data.x86_mask <= 7) && ( base & ( (1 << 22) - 1 ) ) ) + { + printk ("mtrr: base(0x%lx) is not 4 MiB aligned\n", base); + return -EINVAL; + } } /* Fall through */ case X86_VENDOR_CYRIX: @@ -1105,18 +1154,6 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, return -EINVAL; } break; - case X86_VENDOR_AMD: - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ - if (type > MTRR_TYPE_WRCOMB || size < (1 << 17) || - (size & ~(size-1))-size || (base & (size-1))) - return -EINVAL; - break; default: return -EINVAL; /*break;*/ @@ -1221,6 +1258,15 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) printk ("mtrr: register: %d too big\n", reg); return -EINVAL; } + if (boot_cpu_data.x86_vendor == X86_VENDOR_CYRIX) + { + if ((reg == 3) && arr3_protected) + { + spin_unlock (&main_lock); + printk ("mtrr: ARR3 cannot be changed\n"); + return -EINVAL; + } + } (*get_mtrr) (reg, &lbase, &lsize, <ype); if (lsize < 1) { @@ -1532,7 +1578,7 @@ arr_state_t arr_state[8] __initdata = { unsigned char ccr_state[7] __initdata = { 0, 0, 0, 0, 0, 0, 0 }; -__initfunc(static void cyrix_arr_init_secondary(void)) +static void __init cyrix_arr_init_secondary(void) { struct set_mtrr_context ctxt; int i; @@ -1565,7 +1611,7 @@ __initfunc(static void cyrix_arr_init_secondary(void)) * - (maybe) disable ARR3 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) */ -__initfunc(static void cyrix_arr_init(void)) +static void __init cyrix_arr_init(void) { struct set_mtrr_context ctxt; unsigned char ccr[7]; @@ -1585,22 +1631,22 @@ __initfunc(static void cyrix_arr_init(void)) ccr[5] = getCx86 (CX86_CCR5); ccr[6] = getCx86 (CX86_CCR6); - 
if (ccr[3] & 1) + if (ccr[3] & 1) { ccrc[3] = 1; - else { + arr3_protected = 1; + } else { /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and * access to SMM memory through ARR3 (bit 7). */ -/* if (ccr[1] & 0x80) { ccr[1] &= 0x7f; ccrc[1] |= 0x80; } if (ccr[1] & 0x04) { ccr[1] &= 0xfb; ccrc[1] |= 0x04; } if (ccr[1] & 0x02) { ccr[1] &= 0xfd; ccrc[1] |= 0x02; } -*/ + arr3_protected = 0; if (ccr[6] & 0x02) { ccr[6] &= 0xfd; ccrc[6] = 1; /* Disable write protection of ARR3. */ setCx86 (CX86_CCR6, ccr[6]); } - /* Disable ARR3. */ + /* Disable ARR3. This is safe now that we disabled SMM. */ /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ } /* If we changed CCR1 in memory, change it in the processor, too. */ @@ -1631,7 +1677,7 @@ __initfunc(static void cyrix_arr_init(void)) if ( ccrc[6] ) printk ("mtrr: ARR3 was write protected, unprotected\n"); } /* End Function cyrix_arr_init */ -__initfunc(static void centaur_mcr_init (void)) +static void __init centaur_mcr_init(void) { unsigned i; struct set_mtrr_context ctxt; @@ -1655,11 +1701,17 @@ __initfunc(static void centaur_mcr_init (void)) set_mtrr_done (&ctxt); } /* End Function centaur_mcr_init */ -__initfunc(static void mtrr_setup (void)) +static void __init mtrr_setup(void) { printk ("mtrr: v%s Richard Gooch (rgooch@atnf.csiro.au)\n", MTRR_VERSION); switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) { /* pre-Athlon CPUs */ + get_mtrr = amd_get_mtrr; + set_mtrr_up = amd_set_mtrr_up; + break; + } /* else fall through */ case X86_VENDOR_INTEL: get_mtrr = intel_get_mtrr; set_mtrr_up = intel_set_mtrr_up; @@ -1669,10 +1721,6 @@ __initfunc(static void mtrr_setup (void)) set_mtrr_up = cyrix_set_arr_up; get_free_region = cyrix_get_free_region; break; - case X86_VENDOR_AMD: - get_mtrr = amd_get_mtrr; - set_mtrr_up = amd_set_mtrr_up; - break; case X86_VENDOR_CENTAUR: get_mtrr = centaur_get_mcr; set_mtrr_up = centaur_set_mcr_up; @@ -1685,12 +1733,14 @@ __initfunc(static void mtrr_setup 
(void)) static volatile unsigned long smp_changes_mask __initdata = 0; static struct mtrr_state smp_mtrr_state __initdata = {0, 0}; -__initfunc(void mtrr_init_boot_cpu (void)) +void __init mtrr_init_boot_cpu(void) { if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return; mtrr_setup (); switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) break; /* pre-Athlon CPUs */ case X86_VENDOR_INTEL: get_mtrr_state (&smp_mtrr_state); break; @@ -1703,7 +1753,7 @@ __initfunc(void mtrr_init_boot_cpu (void)) } } /* End Function mtrr_init_boot_cpu */ -__initfunc(static void intel_mtrr_init_secondary_cpu (void)) +static void __init intel_mtrr_init_secondary_cpu(void) { unsigned long mask, count; struct set_mtrr_context ctxt; @@ -1722,11 +1772,14 @@ __initfunc(static void intel_mtrr_init_secondary_cpu (void)) } } /* End Function intel_mtrr_init_secondary_cpu */ -__initfunc(void mtrr_init_secondary_cpu (void)) +void __init mtrr_init_secondary_cpu(void) { if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return; switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Just for robustness: pre-Athlon CPUs cannot do SMP. 
*/ + if (boot_cpu_data.x86 < 6) break; case X86_VENDOR_INTEL: intel_mtrr_init_secondary_cpu (); break; @@ -1746,12 +1799,14 @@ __initfunc(void mtrr_init_secondary_cpu (void)) } /* End Function mtrr_init_secondary_cpu */ #endif /* __SMP__ */ -__initfunc(int mtrr_init(void)) +int __init mtrr_init(void) { if ( !(boot_cpu_data.x86_capability & X86_FEATURE_MTRR) ) return 0; # ifdef __SMP__ switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) break; /* pre-Athlon CPUs */ case X86_VENDOR_INTEL: finalize_mtrr_state (&smp_mtrr_state); mtrr_state_warn (smp_changes_mask); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 08dde1ed7..4937efec2 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -40,24 +40,18 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/desc.h> +#include <asm/mmu_context.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif -#include "irq.h" +#include <linux/irq.h> spinlock_t semaphore_wake_lock = SPIN_LOCK_UNLOCKED; asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -#ifdef CONFIG_APM -extern int apm_do_idle(void); -extern void apm_do_busy(void); -#endif - -static int hlt_counter=0; - -#define HARD_IDLE_TIMEOUT (HZ / 3) +int hlt_counter=0; void disable_hlt(void) { @@ -69,103 +63,39 @@ void enable_hlt(void) hlt_counter--; } -#ifndef __SMP__ - -static void hard_idle(void) -{ - while (!current->need_resched) { - if (boot_cpu_data.hlt_works_ok && !hlt_counter) { -#ifdef CONFIG_APM - /* If the APM BIOS is not enabled, or there - is an error calling the idle routine, we - should hlt if possible. We need to check - need_resched again because an interrupt - may have occurred in apm_do_idle(). 
*/ - start_bh_atomic(); - if (!apm_do_idle() && !current->need_resched) - __asm__("hlt"); - end_bh_atomic(); -#else - __asm__("hlt"); -#endif - } - if (current->need_resched) - break; - schedule(); - } -#ifdef CONFIG_APM - apm_do_busy(); -#endif -} - /* - * The idle loop on a uniprocessor i386.. - */ -static int cpu_idle(void *unused) -{ - int work = 1; - unsigned long start_idle = 0; - - /* endless idle loop with no priority at all */ - current->priority = 0; - current->counter = -100; - init_idle(); - - for (;;) { - if (work) - start_idle = jiffies; - - if (jiffies - start_idle > HARD_IDLE_TIMEOUT) - hard_idle(); - else { - if (boot_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched) - __asm__("hlt"); - } - - work = current->need_resched; - schedule(); - check_pgt_cache(); - } -} - -#else + * Powermanagement idle function, if any.. + */ +void (*acpi_idle)(void) = NULL; /* - * This is being executed in task 0 'user space'. + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) */ - -int cpu_idle(void *unused) +void cpu_idle(void) { /* endless idle loop with no priority at all */ + init_idle(); current->priority = 0; current->counter = -100; - init_idle(); - while(1) { - if (current_cpu_data.hlt_works_ok && !hlt_counter && - !current->need_resched) - __asm__("hlt"); - /* - * although we are an idle CPU, we do not want to - * get into the scheduler unnecessarily. 
- */ - if (current->need_resched) { - schedule(); - check_pgt_cache(); + while (1) { + while (!current->need_resched) { + if (!current_cpu_data.hlt_works_ok) + continue; + if (hlt_counter) + continue; + asm volatile("sti ; hlt" : : : "memory"); } + schedule(); + check_pgt_cache(); + if (acpi_idle) + acpi_idle(); } } -#endif - -asmlinkage int sys_idle(void) -{ - if (current->pid != 0) - return -EPERM; - cpu_idle(NULL); - return 0; -} - /* * This routine reboots the machine by asking the keyboard * controller to pulse the reset-line low. We try that for a while, @@ -176,7 +106,7 @@ static long no_idt[2] = {0, 0}; static int reboot_mode = 0; static int reboot_thru_bios = 0; -__initfunc(void reboot_setup(char *str, int *ints)) +static int __init reboot_setup(char *str) { while(1) { switch (*str) { @@ -198,8 +128,10 @@ __initfunc(void reboot_setup(char *str, int *ints)) else break; } + return 1; } +__setup("reboot=", reboot_setup); /* The following code and data reboots the machine by switching to real mode and jumping to the BIOS reset entry point, as if the CPU has @@ -321,13 +253,9 @@ void machine_restart(char * __unused) pg0[0] = _PAGE_RW | _PAGE_PRESENT; /* - * Use `swapper_pg_dir' as our page directory. We bother with - * `SET_PAGE_DIR' because although might be rebooting, but if we change - * the way we set root page dir in the future, then we wont break a - * seldom used feature ;) + * Use `swapper_pg_dir' as our page directory. */ - - SET_PAGE_DIR(current,swapper_pg_dir); + asm volatile("movl %0,%%cr3": :"r" (__pa(swapper_pg_dir))); /* Write 0x1234 to absolute memory location 0x472. 
The BIOS reads this on booting to tell it to "Bypass memory test (also warm @@ -405,6 +333,7 @@ void show_regs(struct pt_regs * regs) regs->esi, regs->edi, regs->ebp); printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); + __asm__("movl %%cr0, %0": "=r" (cr0)); __asm__("movl %%cr2, %0": "=r" (cr2)); __asm__("movl %%cr3, %0": "=r" (cr3)); @@ -475,11 +404,19 @@ void free_task_struct(struct task_struct *p) free_pages((unsigned long) p, 1); } +/* + * No need to lock the MM as we are the last user + */ void release_segments(struct mm_struct *mm) { - if (mm->segments) { - void * ldt = mm->segments; + void * ldt = mm->segments; + + /* + * free the LDT + */ + if (ldt) { mm->segments = NULL; + clear_LDT(); vfree(ldt); } } @@ -492,10 +429,9 @@ void forget_segments(void) : "r" (0)); /* - * Get the LDT entry from init_task. + * Load the LDT entry of init_task. */ - current->tss.ldt = _LDT(0); - load_ldt(0); + load_LDT(&init_mm); } /* @@ -537,12 +473,9 @@ void exit_thread(void) void flush_thread(void) { - int i; struct task_struct *tsk = current; - for (i=0 ; i<8 ; i++) - tsk->tss.debugreg[i] = 0; - + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); /* * Forget coprocessor state.. */ @@ -552,33 +485,45 @@ void flush_thread(void) void release_thread(struct task_struct *dead_task) { + if (dead_task->mm) { + void * ldt = dead_task->mm->segments; + + // temporary debugging check + if (ldt) { + printk("WARNING: dead process %8s still has LDT? <%p>\n", + dead_task->comm, ldt); + BUG(); + } + } } /* - * If new_mm is NULL, we're being called to set up the LDT descriptor - * for a clone task. Each clone must have a separate entry in the GDT. + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
*/ -void copy_segments(int nr, struct task_struct *p, struct mm_struct *new_mm) +void copy_segments(struct task_struct *p, struct mm_struct *new_mm) { struct mm_struct * old_mm = current->mm; void * old_ldt = old_mm->segments, * ldt = old_ldt; - /* default LDT - use the one from init_task */ - p->tss.ldt = _LDT(0); - if (old_ldt) { - if (new_mm) { - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - new_mm->segments = ldt; - if (!ldt) { - printk(KERN_WARNING "ldt allocation failed\n"); - return; - } - memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); - } - p->tss.ldt = _LDT(nr); - set_ldt_desc(nr, ldt, LDT_ENTRIES); + if (!old_mm->segments) { + /* + * default LDT - use the one from init_task + */ + new_mm->segments = NULL; return; } + + /* + * Completely new LDT, we initialize it from the parent: + */ + ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!ldt) + printk(KERN_WARNING "ldt allocation failed\n"); + else + memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); + new_mm->segments = ldt; + return; } /* @@ -592,31 +537,21 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, { struct pt_regs * childregs; - childregs = ((struct pt_regs *) (2*PAGE_SIZE + (unsigned long) p)) - 1; + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; *childregs = *regs; childregs->eax = 0; childregs->esp = esp; - p->tss.esp = (unsigned long) childregs; - p->tss.esp0 = (unsigned long) (childregs+1); - p->tss.ss0 = __KERNEL_DS; + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); - p->tss.tr = _TSS(nr); - set_tss_desc(nr,&(p->tss)); - p->tss.eip = (unsigned long) ret_from_fork; + p->thread.eip = (unsigned long) ret_from_fork; - savesegment(fs,p->tss.fs); - savesegment(gs,p->tss.gs); - - /* - * a bitmap offset pointing outside of the TSS limit causes a nicely - * controllable SIGSEGV. The first sys_ioperm() call sets up the - * bitmap properly. 
- */ - p->tss.bitmap = sizeof(struct thread_struct); + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); unlazy_fpu(current); - p->tss.i387 = current->tss.i387; + p->thread.i387 = current->thread.i387; return 0; } @@ -632,7 +567,7 @@ int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); + memcpy(fpu,&tsk->thread.i387.hard,sizeof(*fpu)); } return fpvalid; @@ -654,7 +589,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->tss.debugreg[i]; + dump->u_debugreg[i] = current->thread.debugreg[i]; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; @@ -683,11 +618,10 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* * This special macro can be used to load a debugging register */ -#define loaddebug(tsk,register) \ +#define loaddebug(thread,register) \ __asm__("movl %0,%%db" #register \ : /* no output */ \ - :"r" (tsk->tss.debugreg[register])) - + :"r" (thread->debugreg[register])) /* * switch_to(x,yn) should switch tasks from x to y. @@ -712,60 +646,67 @@ void dump_thread(struct pt_regs * regs, struct user * dump) * More important, however, is the fact that this allows us much * more flexibility. */ -void __switch_to(struct task_struct *prev, struct task_struct *next) +extern int cpus_initialized; +void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - /* Do the FPU save and set TS if it wasn't set before.. */ - unlazy_fpu(prev); + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + struct tss_struct *tss = init_tss + smp_processor_id(); + + unlazy_fpu(prev_p); /* - * Reload TR, LDT and the page table pointers.. 
- * - * We need TR for the IO permission bitmask (and - * the vm86 bitmasks in case we ever use enhanced - * v86 mode properly). - * - * We may want to get rid of the TR register some - * day, and copy the bitmaps around by hand. Oh, - * well. In the meantime we have to clear the busy - * bit in the TSS entry, ugh. + * Reload esp0, LDT and the page table pointer: */ - gdt_table[next->tss.tr >> 3].b &= 0xfffffdff; - asm volatile("ltr %0": :"g" (*(unsigned short *)&next->tss.tr)); + tss->esp0 = next->esp0; /* * Save away %fs and %gs. No need to save %es and %ds, as * those are always kernel segments while inside the kernel. */ - asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->tss.fs)); - asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->tss.gs)); - - /* Re-load LDT if necessary */ - if (next->mm->segments != prev->mm->segments) - asm volatile("lldt %0": :"g" (*(unsigned short *)&next->tss.ldt)); - - /* Re-load page tables */ - { - unsigned long new_cr3 = next->tss.cr3; - if (new_cr3 != prev->tss.cr3) - asm volatile("movl %0,%%cr3": :"r" (new_cr3)); - } + asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); + asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* * Restore %fs and %gs. */ - loadsegment(fs,next->tss.fs); - loadsegment(gs,next->tss.gs); + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); /* * Now maybe reload the debug registers */ - if (next->tss.debugreg[7]){ - loaddebug(next,0); - loaddebug(next,1); - loaddebug(next,2); - loaddebug(next,3); - loaddebug(next,6); - loaddebug(next,7); + if (next->debugreg[7]){ + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (prev->ioperm || next->ioperm) { + if (next->ioperm) { + /* + * 4 cachelines copy ... not good, but not that + * bad either. Anyone got something better? + * This only affects processes which use ioperm(). 
+ * [Putting the TSSs into 4k-tlb mapped regions + * and playing VM tricks to switch the IO bitmap + * is not really acceptable.] + */ + memcpy(tss->io_bitmap, next->io_bitmap, + IO_BITMAP_SIZE*sizeof(unsigned long)); + tss->bitmap = IO_BITMAP_OFFSET; + } else + /* + * a bitmap offset pointing outside of the TSS limit + * causes a nicely controllable SIGSEGV if a process + * tries to use a port IO instruction. The first + * sys_ioperm() call sets up the bitmap properly. + */ + tss->bitmap = INVALID_IO_BITMAP_OFFSET; } } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 9935cdf53..e86451291 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -45,7 +45,7 @@ static inline int get_stack_long(struct task_struct *task, int offset) { unsigned char *stack; - stack = (unsigned char *)task->tss.esp0; + stack = (unsigned char *)task->thread.esp0; stack += offset; return (*((int *)stack)); } @@ -61,7 +61,7 @@ static inline int put_stack_long(struct task_struct *task, int offset, { unsigned char * stack; - stack = (unsigned char *) task->tss.esp0; + stack = (unsigned char *) task->thread.esp0; stack += offset; *(unsigned long *) stack = data; return 0; @@ -76,12 +76,12 @@ static int putreg(struct task_struct *child, case FS: if (value && (value & 3) != 3) return -EIO; - child->tss.fs = value; + child->thread.fs = value; return 0; case GS: if (value && (value & 3) != 3) return -EIO; - child->tss.gs = value; + child->thread.gs = value; return 0; case DS: case ES: @@ -112,10 +112,10 @@ static unsigned long getreg(struct task_struct *child, switch (regno >> 2) { case FS: - retval = child->tss.fs; + retval = child->thread.fs; break; case GS: - retval = child->tss.gs; + retval = child->thread.gs; break; case DS: case ES: @@ -229,7 +229,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) addr <= (long) &dummy->u_debugreg[7]){ addr -= (long) &dummy->u_debugreg[0]; addr = addr >> 2; - tmp = child->tss.debugreg[addr]; 
+ tmp = child->thread.debugreg[addr]; }; ret = put_user(tmp,(unsigned long *) data); goto out; @@ -278,7 +278,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) addr -= (long) &dummy->u_debugreg; addr = addr >> 2; - child->tss.debugreg[addr] = data; + child->thread.debugreg[addr] = data; ret = 0; goto out; }; @@ -409,18 +409,18 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. */ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + child->thread.i387.hard.cwd = 0xffff037f; + child->thread.i387.hard.swd = 0xffff0000; + child->thread.i387.hard.twd = 0xffffffff; } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->tss.i387.hard, + __copy_to_user((void *)data, &child->thread.i387.hard, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - save_i387_soft(&child->tss.i387.soft, + save_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif @@ -438,11 +438,11 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->tss.i387.hard, (void *)data, + __copy_from_user(&child->thread.i387.hard, (void *)data, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - restore_i387_soft(&child->tss.i387.soft, + restore_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c new file mode 100644 index 000000000..cf556282d --- /dev/null +++ b/arch/i386/kernel/semaphore.c @@ -0,0 +1,220 @@ +/* + * i386 semaphore implementation. 
+ * + * (C) Copyright 1999 Linus Torvalds + */ +#include <linux/sched.h> + +#include <asm/semaphore.h> + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to aquire the semaphore, while the "sleeping" + * variable is a count of such aquires. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + * + * "sleeping" and the contention routine ordering is + * protected by the semaphore spinlock. + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in <asm/semaphore.h> + * where we want to avoid any extra jumps and calls. + */ + +/* + * Logic: + * - only on a boundary condition do we need to care. When we go + * from a negative count to a non-negative, we wake people up. + * - when we go from a non-negative count to a negative do we + * (a) synchronize with the "sleeper" count and (b) make sure + * that we're on the wakeup list before we synchronize so that + * we cannot lose wakeup events. + */ + +void __up(struct semaphore *sem) +{ + wake_up(&sem->wait); +} + +static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; + +void __down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + tsk->state = TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE; + add_wait_queue_exclusive(&sem->wait, &wait); + + spin_lock_irq(&semaphore_lock); + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock. 
+ */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irq(&semaphore_lock); + + schedule(); + tsk->state = TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE; + spin_lock_irq(&semaphore_lock); + } + spin_unlock_irq(&semaphore_lock); + remove_wait_queue(&sem->wait, &wait); + tsk->state = TASK_RUNNING; + wake_up(&sem->wait); +} + +int __down_interruptible(struct semaphore * sem) +{ + int retval = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + tsk->state = TASK_INTERRUPTIBLE|TASK_EXCLUSIVE; + add_wait_queue_exclusive(&sem->wait, &wait); + + spin_lock_irq(&semaphore_lock); + sem->sleepers ++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * With signals pending, this turns into + * the trylock failure case - we won't be + * sleeping, and we* can't get the lock as + * it has contention. Just correct the count + * and exit. + */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock. The + * "-1" is because we're still hoping to get + * the lock. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irq(&semaphore_lock); + + schedule(); + tsk->state = TASK_INTERRUPTIBLE|TASK_EXCLUSIVE; + spin_lock_irq(&semaphore_lock); + } + spin_unlock_irq(&semaphore_lock); + tsk->state = TASK_RUNNING; + remove_wait_queue(&sem->wait, &wait); + wake_up(&sem->wait); + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. 
+ */ +int __down_trylock(struct semaphore * sem) +{ + int sleepers; + + spin_lock_irq(&semaphore_lock); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock. + */ + if (!atomic_add_negative(sleepers, &sem->count)) + wake_up(&sem->wait); + + spin_unlock_irq(&semaphore_lock); + return 1; +} + + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %ecx contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax when used as a return + * value.. + */ +asm( +".align 4\n" +".globl __down_failed\n" +"__down_failed:\n\t" + "pushl %eax\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __down\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" + "popl %eax\n\t" + "ret" +); + +asm( +".align 4\n" +".globl __down_failed_interruptible\n" +"__down_failed_interruptible:\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __down_interruptible\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" + "ret" +); + +asm( +".align 4\n" +".globl __down_failed_trylock\n" +"__down_failed_trylock:\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __down_trylock\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" + "ret" +); + +asm( +".align 4\n" +".globl __up_wakeup\n" +"__up_wakeup:\n\t" + "pushl %eax\n\t" + "pushl %edx\n\t" + "pushl %ecx\n\t" + "call __up\n\t" + "popl %ecx\n\t" + "popl %edx\n\t" + "popl %eax\n\t" + "ret" +); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index c0721b482..88ba3feeb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -14,6 +14,17 @@ * Bart Hartgers <bart@etpmod.phys.tue.nl>, May 1999. * * Intel Mobile Pentium II detection fix. Sean Gilley, June 1999. + * + * IDT Winchip tweaks, misc clean ups. 
+ * Dave Jones <dave@powertweak.com>, August 1999 + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Better detection of Centaur/IDT WinChip models. + * Bart Hartgers <bart@etpmod.phys.tue.nl>, August 1999. + * + * Memory region support + * David Parsons <orc@pell.chi.il.us>, July-August 1999 */ /* @@ -35,12 +46,11 @@ #include <linux/delay.h> #include <linux/config.h> #include <linux/init.h> -#ifdef CONFIG_APM #include <linux/apm_bios.h> -#endif #ifdef CONFIG_BLK_DEV_RAM #include <linux/blk.h> #endif +#include <linux/bigmem.h> #include <asm/processor.h> #include <linux/console.h> #include <asm/uaccess.h> @@ -49,6 +59,9 @@ #include <asm/smp.h> #include <asm/cobalt.h> #include <asm/msr.h> +#include <asm/desc.h> +#include <asm/e820.h> +#include <asm/dma.h> /* * Machine setup.. @@ -57,6 +70,8 @@ char ignore_irq13 = 0; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +unsigned long mmu_cr4_features __initdata = 0; + /* * Bus types .. 
*/ @@ -74,14 +89,14 @@ unsigned int mca_pentium_flag = 0; */ struct drive_info_struct { char dummy[32]; } drive_info; struct screen_info screen_info; -#ifdef CONFIG_APM struct apm_bios_info apm_bios_info; -#endif struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; }; +struct e820map e820 = { 0 }; + unsigned char aux_device_present; #ifdef CONFIG_BLK_DEV_RAM @@ -91,7 +106,7 @@ extern int rd_image_start; /* starting block # of image */ #endif extern int root_mountflags; -extern int _etext, _edata, _end; +extern int _text, _etext, _edata, _end; extern unsigned long cpu_hz; /* @@ -101,6 +116,8 @@ extern unsigned long cpu_hz; #define SCREEN_INFO (*(struct screen_info *) (PARAM+0)) #define EXT_MEM_K (*(unsigned short *) (PARAM+2)) #define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0)) +#define E820_MAP_NR (*(char*) (PARAM+E820NR)) +#define E820_MAP ((unsigned long *) (PARAM+E820MAP)) #define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40)) #define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80)) #define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0)) @@ -249,12 +266,207 @@ visws_get_board_type_and_rev(void) static char command_line[COMMAND_LINE_SIZE] = { 0, }; char saved_command_line[COMMAND_LINE_SIZE]; -__initfunc(void setup_arch(char **cmdline_p, - unsigned long * memory_start_p, unsigned long * memory_end_p)) +struct resource standard_io_resources[] = { + { "dma1", 0x00, 0x1f, IORESOURCE_BUSY }, + { "pic1", 0x20, 0x3f, IORESOURCE_BUSY }, + { "timer", 0x40, 0x5f, IORESOURCE_BUSY }, + { "keyboard", 0x60, 0x6f, IORESOURCE_BUSY }, + { "dma page reg", 0x80, 0x8f, IORESOURCE_BUSY }, + { "pic2", 0xa0, 0xbf, IORESOURCE_BUSY }, + { "dma2", 0xc0, 0xdf, IORESOURCE_BUSY }, + { "fpu", 0xf0, 0xff, IORESOURCE_BUSY } +}; + +#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource)) + +/* System RAM - interrupted by the 640kB-1M hole */ +#define code_resource (ram_resources[3]) +#define data_resource 
(ram_resources[4]) +static struct resource ram_resources[] = { + { "System RAM", 0x000000, 0x09ffff, IORESOURCE_BUSY }, + { "System RAM", 0x100000, 0x100000, IORESOURCE_BUSY }, + { "Video RAM area", 0x0a0000, 0x0bffff, IORESOURCE_BUSY }, + { "Kernel code", 0x100000, 0 }, + { "Kernel data", 0, 0 } +}; + +/* System ROM resources */ +#define MAXROMS 6 +static struct resource rom_resources[MAXROMS] = { + { "System ROM", 0xF0000, 0xFFFFF, IORESOURCE_BUSY }, + { "Video ROM", 0xc0000, 0xc7fff, IORESOURCE_BUSY } +}; + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static void __init probe_roms(void) +{ + int roms = 1; + unsigned long base; + unsigned char *romstart; + + request_resource(&iomem_resource, rom_resources+0); + + /* Video ROM is standard at C000:0000 - C7FF:0000, check signature */ + for (base = 0xC0000; base < 0xE0000; base += 2048) { + romstart = bus_to_virt(base); + if (!romsignature(romstart)) + continue; + request_resource(&iomem_resource, rom_resources + roms); + roms++; + break; + } + + /* Extension roms at C800:0000 - DFFF:0000 */ + for (base = 0xC8000; base < 0xE0000; base += 2048) { + unsigned long length; + + romstart = bus_to_virt(base); + if (!romsignature(romstart)) + continue; + length = romstart[2] * 512; + if (length) { + unsigned int i; + unsigned char chksum; + + chksum = 0; + for (i = 0; i < length; i++) + chksum += romstart[i]; + + /* Good checksum? 
*/ + if (!chksum) { + rom_resources[roms].start = base; + rom_resources[roms].end = base + length - 1; + rom_resources[roms].name = "Extension ROM"; + rom_resources[roms].flags = IORESOURCE_BUSY; + + request_resource(&iomem_resource, rom_resources + roms); + roms++; + if (roms >= MAXROMS) + return; + } + } + } + + /* Final check for motherboard extension rom at E000:0000 */ + base = 0xE0000; + romstart = bus_to_virt(base); + + if (romsignature(romstart)) { + rom_resources[roms].start = base; + rom_resources[roms].end = base + 65535; + rom_resources[roms].name = "Extension ROM"; + rom_resources[roms].flags = IORESOURCE_BUSY; + + request_resource(&iomem_resource, rom_resources + roms); + } +} + +unsigned long __init memparse(char *ptr, char **retptr) +{ + unsigned long ret; + + ret = simple_strtoul(ptr, retptr, 0); + + if (**retptr == 'K' || **retptr == 'k') { + ret <<= 10; + (*retptr)++; + } + else if (**retptr == 'M' || **retptr == 'm') { + ret <<= 20; + (*retptr)++; + } + return ret; +} /* memparse */ + + +void __init add_memory_region(unsigned long start, + unsigned long size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk("Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} /* add_memory_region */ + + +#define LOWMEMSIZE() ((*(unsigned short *)__va(0x413)) * 1024) + + +void __init setup_memory_region(void) +{ +#define E820_DEBUG 0 +#ifdef E820_DEBUG + int i; +#endif + + /* + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. 
(The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ + if (E820_MAP_NR > 1) { + /* got a memory map; copy it into a safe place. + */ + e820.nr_map = E820_MAP_NR; + if (e820.nr_map > E820MAX) + e820.nr_map = E820MAX; + memcpy(e820.map, E820_MAP, e820.nr_map * sizeof e820.map[0]); +#ifdef E820_DEBUG + for (i=0; i < e820.nr_map; i++) { + printk("e820: %ld @ %08lx ", + (unsigned long)(e820.map[i].size), + (unsigned long)(e820.map[i].addr)); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +#endif + } + else { + /* otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + unsigned long mem_size; + + mem_size = (ALT_MEM_K < EXT_MEM_K) ? EXT_MEM_K : ALT_MEM_K; + + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + } +} /* setup_memory_region */ + + +void __init setup_arch(char **cmdline_p, unsigned long * memory_start_p, unsigned long * memory_end_p) { unsigned long memory_start, memory_end; char c = ' ', *to = command_line, *from = COMMAND_LINE; int len = 0; + int i; + int usermem=0; #ifdef CONFIG_VISWS visws_get_board_type_and_rev(); @@ -263,9 +475,7 @@ __initfunc(void setup_arch(char **cmdline_p, ROOT_DEV = to_kdev_t(ORIG_ROOT_DEV); drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; -#ifdef CONFIG_APM apm_bios_info = APM_BIOS_INFO; -#endif if( SYS_DESC_TABLE.length != 0 ) { MCA_bus = SYS_DESC_TABLE.table[3] &0x2; machine_id = SYS_DESC_TABLE.table[0]; @@ -273,29 +483,26 @@ __initfunc(void setup_arch(char **cmdline_p, BIOS_revision = SYS_DESC_TABLE.table[2]; } aux_device_present = AUX_DEVICE_INFO; - memory_end = (1<<20) + (EXT_MEM_K<<10); -#ifndef STANDARD_MEMORY_BIOS_CALL - { - unsigned long memory_alt_end = 
(1<<20) + (ALT_MEM_K<<10); - /* printk(KERN_DEBUG "Memory sizing: %08x %08x\n", memory_end, memory_alt_end); */ - if (memory_alt_end > memory_end) - memory_end = memory_alt_end; - } -#endif - memory_end &= PAGE_MASK; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); #endif + setup_memory_region(); + if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; memory_start = (unsigned long) &_end; - init_task.mm->start_code = PAGE_OFFSET; - init_task.mm->end_code = (unsigned long) &_etext; - init_task.mm->end_data = (unsigned long) &_edata; - init_task.mm->brk = (unsigned long) &_end; + init_mm.start_code = (unsigned long) &_text; + init_mm.end_code = (unsigned long) &_etext; + init_mm.end_data = (unsigned long) &_edata; + init_mm.brk = (unsigned long) &_end; + + code_resource.start = virt_to_bus(&_text); + code_resource.end = virt_to_bus(&_etext)-1; + data_resource.start = virt_to_bus(&_etext); + data_resource.end = virt_to_bus(&_edata)-1; /* Save unparsed command line copy for /proc/cmdline */ memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); @@ -304,8 +511,10 @@ __initfunc(void setup_arch(char **cmdline_p, for (;;) { /* * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" overrides the BIOS-reported - * memory size + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "mem=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. 
*/ if (c == ' ' && *(const unsigned long *)from == *(const unsigned long *)"mem=") { if (to != command_line) to--; @@ -313,14 +522,29 @@ __initfunc(void setup_arch(char **cmdline_p, from += 9+4; boot_cpu_data.x86_capability &= ~X86_FEATURE_PSE; } else { - memory_end = simple_strtoul(from+4, &from, 0); - if ( *from == 'K' || *from == 'k' ) { - memory_end = memory_end << 10; - from++; - } else if ( *from == 'M' || *from == 'm' ) { - memory_end = memory_end << 20; - from++; + /* If the user specifies memory size, we + * blow away any automatically generated + * size + */ + unsigned long start_at, mem_size; + + if (usermem == 0) { + /* first time in: zap the whitelist + * and reinitialize it with the + * standard low-memory region. + */ + e820.nr_map = 0; + usermem = 1; + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + } + mem_size = memparse(from+4, &from); + if (*from == '@') + start_at = memparse(from+1,&from); + else { + start_at = HIGH_MEMORY; + mem_size -= HIGH_MEMORY; } + add_memory_region(start_at, mem_size, E820_RAM); } } c = *(from++); @@ -333,15 +557,47 @@ __initfunc(void setup_arch(char **cmdline_p, *to = '\0'; *cmdline_p = command_line; -#define VMALLOC_RESERVE (64 << 20) /* 64MB for vmalloc */ +#define VMALLOC_RESERVE (128 << 20) /* 128MB for vmalloc and initrd */ #define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) + memory_end = 0; + for (i=0; i < e820.nr_map; i++) { + /* RAM? */ + if (e820.map[i].type == E820_RAM) { + unsigned long end = e820.map[i].addr + e820.map[i].size; + + if (end > memory_end) + memory_end = end; + } + } + memory_end &= PAGE_MASK; + ram_resources[1].end = memory_end-1; + +#ifdef CONFIG_BIGMEM + bigmem_start = bigmem_end = memory_end; +#endif if (memory_end > MAXMEM) { +#ifdef CONFIG_BIGMEM +#define MAXBIGMEM ((unsigned long)(~(VMALLOC_RESERVE-1))) + bigmem_start = MAXMEM; + bigmem_end = (memory_end < MAXBIGMEM) ? 
memory_end : MAXBIGMEM; +#endif memory_end = MAXMEM; +#ifdef CONFIG_BIGMEM + printk(KERN_NOTICE "%ldMB BIGMEM available.\n", + (bigmem_end-bigmem_start)>>20); +#else printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); +#endif } +#if defined(CONFIG_BIGMEM) && defined(BIGMEM_DEBUG) + else { + memory_end -= memory_end/4; + bigmem_start = memory_end; + } +#endif memory_end += PAGE_OFFSET; *memory_start_p = memory_start; @@ -367,12 +623,20 @@ __initfunc(void setup_arch(char **cmdline_p, } #endif + /* + * Request the standard RAM and ROM resources - + * they eat up PCI memory space + */ + request_resource(&iomem_resource, ram_resources+0); + request_resource(&iomem_resource, ram_resources+1); + request_resource(&iomem_resource, ram_resources+2); + request_resource(ram_resources+1, &code_resource); + request_resource(ram_resources+1, &data_resource); + probe_roms(); + /* request I/O space for devices used on all i[345]86 PCs */ - request_region(0x00,0x20,"dma1"); - request_region(0x40,0x20,"timer"); - request_region(0x80,0x10,"dma page reg"); - request_region(0xc0,0x20,"dma2"); - request_region(0xf0,0x10,"fpu"); + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, standard_io_resources+i); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) @@ -381,13 +645,9 @@ __initfunc(void setup_arch(char **cmdline_p, conswitchp = &dummy_con; #endif #endif - /* - * Check the bugs that will bite us before we get booting - */ - } -__initfunc(static int get_model_name(struct cpuinfo_x86 *c)) +static int __init get_model_name(struct cpuinfo_x86 *c) { unsigned int n, dummy, *v; @@ -415,7 +675,7 @@ __initfunc(static int get_model_name(struct cpuinfo_x86 *c)) return 1; } -__initfunc(static int amd_model(struct cpuinfo_x86 *c)) +static int __init amd_model(struct cpuinfo_x86 *c) { u32 l, h; unsigned long flags; @@ -480,6 +740,19 @@ __initfunc(static int amd_model(struct cpuinfo_x86 *c)) break; } break; + case 6: /* An Athlon. 
We can trust the BIOS probably */ + { + + u32 ecx, edx, dummy; + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + printk("L1 I Cache: %dK L1 D Cache: %dK\n", + ecx>>24, edx>>24); + cpuid(0x80000006, &dummy, &dummy, &ecx, &edx); + printk("L2 Cache: %dK\n", ecx>>16); + c->x86_cache_size = ecx>>16; + break; + } + } return r; } @@ -544,7 +817,7 @@ static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock"; static char cyrix_model_mult1[] __initdata = "12??43"; static char cyrix_model_mult2[] __initdata = "12233445"; -__initfunc(static void cyrix_model(struct cpuinfo_x86 *c)) +static void __init cyrix_model(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -615,6 +888,15 @@ __initfunc(static void cyrix_model(struct cpuinfo_x86 *c)) c->x86_model = (dir1 & 0x20) ? 1 : 2; c->x86_capability&=~X86_FEATURE_TSC; } +#ifdef CONFIG_PCI + /* It isnt really a PCI quirk directly, but the cure is the + same. The MediaGX has deep magic SMM stuff that handles the + SB emulation. It thows away the fifo on disable_dma() which + is wrong and ruins the audio. 
*/ + + printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bug.\n"); + isa_dma_bridge_buggy = 1; +#endif break; case 5: /* 6x86MX/M II */ @@ -640,8 +922,8 @@ __initfunc(static void cyrix_model(struct cpuinfo_x86 *c)) dir0_msn = 0; p = Cx486S_name[0]; break; - break; } + break; default: /* unknown (shouldn't happen, we know everyone ;-) */ dir0_msn = 7; @@ -652,7 +934,99 @@ __initfunc(static void cyrix_model(struct cpuinfo_x86 *c)) return; } -__initfunc(void get_cpu_vendor(struct cpuinfo_x86 *c)) +static void __init centaur_model(struct cpuinfo_x86 *c) +{ + enum { + ECX8=1<<1, + EIERRINT=1<<2, + DPM=1<<3, + DMCE=1<<4, + DSTPCLK=1<<5, + ELINEAR=1<<6, + DSMC=1<<7, + DTLOCK=1<<8, + EDCTLB=1<<8, + EMMX=1<<9, + DPDC=1<<11, + EBRPRED=1<<12, + DIC=1<<13, + DDC=1<<14, + DNA=1<<15, + ERETSTK=1<<16, + E2MMX=1<<19, + EAMD3D=1<<20, + }; + + char *name; + u32 fcr_set=0; + u32 fcr_clr=0; + u32 lo,hi,newlo; + u32 aa,bb,cc,dd; + + switch(c->x86_model) { + case 4: + name="C6"; + fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK; + fcr_clr=DPDC; + break; + case 8: + switch(c->x86_mask) { + default: + name="2"; + break; + case 7 ... 9: + name="2A"; + break; + case 10 ... 15: + name="2B"; + break; + } + fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D; + fcr_clr=DPDC; + break; + case 9: + name="3"; + fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D; + fcr_clr=DPDC; + break; + case 10: + name="4"; + /* no info on the WC4 yet */ + break; + default: + name="??"; + } + + /* get FCR */ + rdmsr(0x107, lo, hi); + + newlo=(lo|fcr_set) & (~fcr_clr); + + if (newlo!=lo) { + printk("Centaur FCR was 0x%X now 0x%X\n", lo, newlo ); + wrmsr(0x107, newlo, hi ); + } else { + printk("Centaur FCR is 0x%X\n",lo); + } + + /* Emulate MTRRs using Centaur's MCR. */ + c->x86_capability |= X86_FEATURE_MTRR; + /* Report CX8 */ + c->x86_capability |= X86_FEATURE_CX8; + /* Set 3DNow! on Winchip 2 and above. 
*/ + if (c->x86_model >=8) + c->x86_capability |= X86_FEATURE_AMD3D; + /* See if we can find out some more. */ + cpuid(0x80000000,&aa,&bb,&cc,&dd); + if (aa>=0x80000005) { /* Yes, we can. */ + cpuid(0x80000005,&aa,&bb,&cc,&dd); + /* Add L1 data and code cache sizes. */ + c->x86_cache_size = (cc>>24)+(dd>>24); + } + sprintf( c->x86_model_id, "WinChip %s", name ); +} + +void __init get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -701,18 +1075,20 @@ static struct cpu_model_info cpu_models[] __initdata = { "K5", "K5", NULL, NULL, "K6", "K6", "K6-2", "K6-3", NULL, NULL, NULL, NULL, NULL, NULL }}, + { X86_VENDOR_AMD, 6, + { "Athlon", "Athlon", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL }}, { X86_VENDOR_UMC, 4, { NULL, "U5D", "U5S", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }}, - { X86_VENDOR_CENTAUR, 5, - { NULL, NULL, NULL, NULL, "C6", NULL, NULL, NULL, "C6-2", NULL, NULL, - NULL, NULL, NULL, NULL, NULL }}, { X86_VENDOR_NEXGEN, 5, { "Nx586", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }}, }; -__initfunc(void identify_cpu(struct cpuinfo_x86 *c)) +void __init identify_cpu(struct cpuinfo_x86 *c) { int i; char *p = NULL; @@ -733,6 +1109,11 @@ __initfunc(void identify_cpu(struct cpuinfo_x86 *c)) if (c->x86_vendor == X86_VENDOR_AMD && amd_model(c)) return; + + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + centaur_model(c); + return; + } if (c->cpuid_level > 0 && c->x86_vendor == X86_VENDOR_INTEL) { @@ -809,7 +1190,6 @@ __initfunc(void identify_cpu(struct cpuinfo_x86 *c)) p = "Celeron (Dixon)"; } } - } if (p) { @@ -824,7 +1204,7 @@ __initfunc(void identify_cpu(struct cpuinfo_x86 *c)) * Perform early boot up checks for a valid TSC. 
See arch/i386/kernel/time.c */ -__initfunc(void dodgy_tsc(void)) +void __init dodgy_tsc(void) { get_cpu_vendor(&boot_cpu_data); @@ -841,7 +1221,7 @@ static char *cpu_vendor_names[] __initdata = { "Intel", "Cyrix", "AMD", "UMC", "NexGen", "Centaur" }; -__initfunc(void print_cpu_info(struct cpuinfo_x86 *c)) +void __init print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -859,22 +1239,7 @@ __initfunc(void print_cpu_info(struct cpuinfo_x86 *c)) printk("%s", c->x86_model_id); if (c->x86_mask || c->cpuid_level>=0) - printk(" stepping %02x", c->x86_mask); - - if(c->x86_vendor == X86_VENDOR_CENTAUR) - { - u32 hv,lv; - rdmsr(0x107, lv, hv); - printk("\nCentaur FSR was 0x%X ",lv); - lv|=(1<<8); - lv|=(1<<7); - /* lv|=(1<<6); - may help too if the board can cope */ - printk("now 0x%X", lv); - wrmsr(0x107, lv, hv); - /* Emulate MTRRs using Centaur's MCR. */ - c->x86_capability |= X86_FEATURE_MTRR; - } - printk("\n"); + printk(" stepping %02x\n", c->x86_mask); } /* @@ -909,7 +1274,7 @@ int get_cpuinfo(char * buffer) c->x86 + '0', c->x86_model, c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); - + if (c->x86_mask || c->cpuid_level >= 0) p += sprintf(p, "stepping\t: %d\n", c->x86_mask); else @@ -925,14 +1290,20 @@ int get_cpuinfo(char * buffer) p += sprintf(p, "cache size\t: %d KB\n", c->x86_cache_size); /* Modify the capabilities according to chip type */ - if (c->x86_vendor == X86_VENDOR_CYRIX) { + switch (c->x86_vendor) { + + case X86_VENDOR_CYRIX: x86_cap_flags[24] = "cxmmx"; - } else if (c->x86_vendor == X86_VENDOR_AMD) { - x86_cap_flags[16] = "fcmov"; - x86_cap_flags[31] = "3dnow"; + break; + + case X86_VENDOR_AMD: if (c->x86 == 5 && c->x86_model == 6) x86_cap_flags[10] = "sep"; - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + x86_cap_flags[16] = "fcmov"; + x86_cap_flags[31] = "3dnow"; + break; + + case X86_VENDOR_INTEL: x86_cap_flags[6] = "pae"; x86_cap_flags[9] = "apic"; x86_cap_flags[14] = "mca"; @@ -940,6 +1311,16 @@ int get_cpuinfo(char * buffer) x86_cap_flags[17] = "pse36"; x86_cap_flags[18] = "psn"; x86_cap_flags[24] = "osfxsr"; + break; + + case X86_VENDOR_CENTAUR: + if (c->x86_model >=8) /* Only Winchip2 and above */ + x86_cap_flags[31] = "3dnow"; + break; + + default: + /* Unknown CPU manufacturer. Transmeta ? :-) */ + break; } sep_bug = c->x86_vendor == X86_VENDOR_INTEL && @@ -978,3 +1359,64 @@ int get_cpuinfo(char * buffer) } return p - buffer; } + +int cpus_initialized = 0; +unsigned long cpu_initialized = 0; + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. 
+ */ +void cpu_init (void) +{ + int nr = smp_processor_id(); + struct tss_struct * t = &init_tss[nr]; + + if (test_and_set_bit(nr,&cpu_initialized)) { + printk("CPU#%d already initialized!\n", nr); + for (;;) __sti(); + } + cpus_initialized++; + printk("Initializing CPU#%d\n", nr); + + if (boot_cpu_data.x86_capability & X86_FEATURE_PSE) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + __asm__ __volatile__("lgdt %0": "=m" (gdt_descr)); + __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + t->esp0 = current->thread.esp0; + set_tss_desc(nr,t); + gdt_table[__TSS(nr)].b &= 0xfffffdff; + load_TR(nr); + load_LDT(&init_mm); + + /* + * Clear all 6 debug registers: + */ + +#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); + + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + +#undef CD + + /* + * Force FPU initialization: + */ + current->flags &= ~PF_USEDFPU; + current->used_math = 0; + stts(); +} diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 32e7c4c56..cc9a992da 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -155,7 +155,7 @@ static inline int restore_i387_hard(struct _fpstate *buf) { struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); + return __copy_from_user(&tsk->thread.i387.hard, buf, sizeof(*buf)); } static inline int restore_i387(struct _fpstate *buf) @@ -167,7 +167,7 @@ static inline int restore_i387(struct _fpstate *buf) if (boot_cpu_data.hard_math) err = restore_i387_hard(buf); else - err = restore_i387_soft(¤t->tss.i387.soft, buf); + err = restore_i387_soft(¤t->thread.i387.soft, buf); #endif current->used_math = 1; return err; @@ -308,8 +308,8 @@ static inline int save_i387_hard(struct _fpstate * 
buf) struct task_struct *tsk = current; unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf))) + tsk->thread.i387.hard.status = tsk->thread.i387.hard.swd; + if (__copy_to_user(buf, &tsk->thread.i387.hard, sizeof(*buf))) return -1; return 1; } @@ -328,7 +328,7 @@ static int save_i387(struct _fpstate *buf) return save_i387_hard(buf); #else return boot_cpu_data.hard_math ? save_i387_hard(buf) - : save_i387_soft(¤t->tss.i387.soft, buf); + : save_i387_soft(¤t->thread.i387.soft, buf); #endif } @@ -354,8 +354,8 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, err |= __put_user(regs->edx, &sc->edx); err |= __put_user(regs->ecx, &sc->ecx); err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->tss.trap_no, &sc->trapno); - err |= __put_user(current->tss.error_code, &sc->err); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->eip, &sc->eip); err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); err |= __put_user(regs->eflags, &sc->eflags); @@ -370,7 +370,7 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, /* non-iBCS2 extensions.. 
*/ err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->tss.cr2, &sc->cr2); + err |= __put_user(current->thread.cr2, &sc->cr2); return err; } @@ -687,12 +687,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) case SIGQUIT: case SIGILL: case SIGTRAP: case SIGABRT: case SIGFPE: case SIGSEGV: - lock_kernel(); - if (current->binfmt - && current->binfmt->core_dump - && current->binfmt->core_dump(signr, regs)) + if (do_coredump(signr, regs)) exit_code |= 0x80; - unlock_kernel(); /* FALLTHRU */ default: diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index f092d0905..f44234eb7 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -42,7 +42,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> -#include "irq.h" +#include <linux/irq.h> #define JIFFIE_TIMEOUT 100 @@ -104,7 +104,7 @@ int smp_found_config=0; /* Have we found an SMP box */ unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */ unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */ -int smp_num_cpus = 1; /* Total count of live CPUs */ +int smp_num_cpus = 0; /* Total count of live CPUs */ int smp_threads_ready=0; /* Set when the idlers are all forked */ volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */ volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */ @@ -128,6 +128,8 @@ volatile unsigned long ipi_count; /* Number of IPIs delivered */ const char lk_lockmsg[] = "lock from interrupt context at %p\n"; int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, }; +extern int nr_ioapics; +extern struct mpc_config_ioapic mp_apics [MAX_IO_APICS]; extern int mp_irq_entries; extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; @@ -162,14 +164,22 @@ int skip_ioapic_setup = 0; /* 1 if "noapic" boot option passed */ * SMP mode to <NUM>. 
*/ -void __init smp_setup(char *str, int *ints) +static int __init nosmp(char *str) { - if (ints && ints[0] > 0) - max_cpus = ints[1]; - else - max_cpus = 0; + max_cpus = 0; + return 1; +} + +__setup("nosmp", nosmp); + +static int __init maxcpus(char *str) +{ + get_option(&str, &max_cpus); + return 1; } +__setup("maxcpus=", maxcpus); + void ack_APIC_irq(void) { /* Clear the IPI */ @@ -225,6 +235,7 @@ static char *mpc_family(int family,int model) return n; } + /* * Read the MPC */ @@ -257,12 +268,10 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } memcpy(str,mpc->mpc_oem,8); str[8]=0; - memcpy(ioapic_OEM_ID,str,9); printk("OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); str[12]=0; - memcpy(ioapic_Product_ID,str,13); printk("Product ID: %s ",str); printk("APIC at: 0x%lX\n",mpc->mpc_lapic); @@ -367,11 +376,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) printk("I/O APIC #%d Version %d at 0x%lX.\n", m->mpc_apicid,m->mpc_apicver, m->mpc_apicaddr); - /* - * we use the first one only currently - */ - if (ioapics == 1) - mp_ioapic_addr = m->mpc_apicaddr; + mp_apics [nr_ioapics] = *m; + if (++nr_ioapics > MAX_IO_APICS) + --nr_ioapics; } mpt+=sizeof(*m); count+=sizeof(*m); @@ -403,9 +410,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } } } - if (ioapics > 1) + if (ioapics > MAX_IO_APICS) { - printk("Warning: Multiple IO-APICs not yet supported.\n"); + printk("Warning: Max I/O APICs exceeded (max %d, found %d).\n", MAX_IO_APICS, ioapics); printk("Warning: switching to non APIC mode.\n"); skip_ioapic_setup=1; } @@ -637,6 +644,8 @@ void __init init_smp_config (void) #endif } + + /* * Trampoline 80x86 program as an array. 
*/ @@ -722,7 +731,11 @@ void __init enable_local_APIC(void) value = apic_read(APIC_SPIV); value |= (1<<8); /* Enable APIC (bit==1) */ +#if 0 value &= ~(1<<9); /* Enable focus processor (bit==0) */ +#else + value |= (1<<9); /* Disable focus processor (bit==1) */ +#endif value |= 0xff; /* Set spurious IRQ vector to 0xff */ apic_write(APIC_SPIV,value); @@ -771,18 +784,22 @@ unsigned long __init init_smp_mappings(unsigned long memory_start) #ifdef CONFIG_X86_IO_APIC { - unsigned long ioapic_phys; - - if (smp_found_config) { - ioapic_phys = mp_ioapic_addr; - } else { - ioapic_phys = __pa(memory_start); - memset((void *)memory_start, 0, PAGE_SIZE); - memory_start += PAGE_SIZE; + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_apics[i].mpc_apicaddr; + } else { + ioapic_phys = __pa(memory_start); + memset((void *)memory_start, 0, PAGE_SIZE); + memory_start += PAGE_SIZE; + } + set_fixmap(idx,ioapic_phys); + printk("mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; } - set_fixmap(FIX_IO_APIC_BASE,ioapic_phys); - printk("mapped IOAPIC to %08lx (%08lx)\n", - fix_to_virt(FIX_IO_APIC_BASE), ioapic_phys); } #endif @@ -870,7 +887,7 @@ void __init smp_callin(void) int cpucount = 0; -extern int cpu_idle(void * unused); +extern int cpu_idle(void); /* * Activate a secondary processor. @@ -882,10 +899,11 @@ int __init start_secondary(void *unused) * booting is too fragile that we want to limit the * things done here to the most necessary things. */ + cpu_init(); smp_callin(); while (!atomic_read(&smp_commenced)) /* nothing */ ; - return cpu_idle(NULL); + return cpu_idle(); } /* @@ -896,15 +914,6 @@ int __init start_secondary(void *unused) */ void __init initialize_secondary(void) { - struct thread_struct * p = &current->tss; - - /* - * Load up the LDT and the task register.
- */ - asm volatile("lldt %%ax": :"a" (p->ldt)); - asm volatile("ltr %%ax": :"a" (p->tr)); - stts(); - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. @@ -914,7 +923,7 @@ void __init initialize_secondary(void) "movl %0,%%esp\n\t" "jmp *%1" : - :"r" (p->esp),"r" (p->eip)); + :"r" (current->thread.esp),"r" (current->thread.eip)); } extern struct { @@ -922,6 +931,14 @@ extern struct { unsigned short ss; } stack_start; +static int __init fork_by_hand(void) +{ + struct pt_regs regs; + /* don't care about the eip and regs settings since we'll never + reschedule the forked task. */ + return do_fork(CLONE_VM|CLONE_PID, 0, &regs); +} + static void __init do_boot_cpu(int i) { unsigned long cfg; @@ -931,13 +948,17 @@ static void __init do_boot_cpu(int i) int timeout, num_starts, j; unsigned long start_eip; - /* - * We need an idle process for each processor. - */ - kernel_thread(start_secondary, NULL, CLONE_PID); cpucount++; + /* We can't use kernel_thread since we must _avoid_ to reschedule + the child. */ + if (fork_by_hand() < 0) + panic("failed fork for CPU %d", i); - idle = task[cpucount]; + /* + * We remove it from the pidhash and the runqueue + * once we got the process: + */ + idle = init_task.prev_task; if (!idle) panic("No idle process for CPU %d", i); @@ -945,7 +966,11 @@ static void __init do_boot_cpu(int i) __cpu_logical_map[cpucount] = i; cpu_number_map[i] = cpucount; idle->has_cpu = 1; /* we schedule the first task manually */ - idle->tss.eip = (unsigned long) start_secondary; + idle->thread.eip = (unsigned long) start_secondary; + + del_from_runqueue(idle); + unhash_process(idle); + init_tasks[cpucount] = idle; /* start_eip had better be page-aligned!
*/ start_eip = setup_trampoline(); @@ -1179,7 +1204,6 @@ void __init smp_boot_cpus(void) /* Must be done before other processors booted */ mtrr_init_boot_cpu (); #endif - init_idle(); /* * Initialize the logical to physical CPU number mapping * and the per-CPU profiling counter/multiplier @@ -1210,6 +1234,8 @@ void __init smp_boot_cpus(void) cpu_number_map[boot_cpu_id] = 0; + init_idle(); + /* * If we couldnt find an SMP configuration at boot time, * get out of here now! @@ -1222,6 +1248,7 @@ void __init smp_boot_cpus(void) io_apic_irqs = 0; #endif cpu_online_map = cpu_present_map; + smp_num_cpus = 1; goto smp_done; } @@ -1356,27 +1383,23 @@ void __init smp_boot_cpus(void) */ SMP_PRINTK(("Before bogomips.\n")); - if (cpucount==0) - { + if (!cpucount) { printk(KERN_ERR "Error: only one processor found.\n"); cpu_online_map = (1<<hard_smp_processor_id()); - } - else - { - unsigned long bogosum=0; - for(i=0;i<32;i++) - { + } else { + unsigned long bogosum = 0; + for(i = 0; i < 32; i++) if (cpu_online_map&(1<<i)) bogosum+=cpu_data[i].loops_per_sec; - } printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", cpucount+1, (bogosum+2500)/500000, ((bogosum+2500)/5000)%100); SMP_PRINTK(("Before bogocount - setting activated=1.\n")); - smp_activated=1; - smp_num_cpus=cpucount+1; + smp_activated = 1; } + smp_num_cpus = cpucount + 1; + if (smp_b_stepping) printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); SMP_PRINTK(("Boot done.\n")); @@ -1392,6 +1415,11 @@ void __init smp_boot_cpus(void) #endif smp_done: + /* + * now we know the other CPUs have fired off and we know our + * APIC ID, so we can go init the TSS and stuff: + */ + cpu_init(); } @@ -1571,8 +1599,7 @@ static inline void send_IPI_single(int dest, int vector) * bad as in the early days of SMP, so we might ease some of the * paranoia here. 
*/ - -void smp_flush_tlb(void) +static void flush_tlb_others(unsigned int cpumask) { int cpu = smp_processor_id(); int stuck; @@ -1582,17 +1609,9 @@ void smp_flush_tlb(void) * it's important that we do not generate any APIC traffic * until the AP CPUs have booted up! */ - if (cpu_online_map) { - /* - * The assignment is safe because it's volatile so the - * compiler cannot reorder it, because the i586 has - * strict memory ordering and because only the kernel - * lock holder may issue a tlb flush. If you break any - * one of those three change this to an atomic bus - * locked or. - */ - - smp_invalidate_needed = cpu_online_map; + cpumask &= cpu_online_map; + if (cpumask) { + atomic_set_mask(cpumask, &smp_invalidate_needed); /* * Processors spinning on some lock with IRQs disabled @@ -1615,8 +1634,13 @@ void smp_flush_tlb(void) /* * Take care of "crossing" invalidates */ - if (test_bit(cpu, &smp_invalidate_needed)) - clear_bit(cpu, &smp_invalidate_needed); + if (test_bit(cpu, &smp_invalidate_needed)) { + struct mm_struct *mm = current->mm; + clear_bit(cpu, &smp_invalidate_needed); + if (mm) + atomic_set_mask(1 << cpu, &mm->cpu_vm_mask); + local_flush_tlb(); + } --stuck; if (!stuck) { printk("stuck on TLB IPI wait (CPU#%d)\n",cpu); @@ -1625,12 +1649,57 @@ void smp_flush_tlb(void) } __restore_flags(flags); } +} - /* - * Flush the local TLB - */ +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). 
+ */ +void flush_tlb_current_task(void) +{ + unsigned long vm_mask = 1 << current->processor; + struct mm_struct *mm = current->mm; + unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask; + + mm->cpu_vm_mask = vm_mask; + flush_tlb_others(cpu_mask); local_flush_tlb(); +} + +void flush_tlb_mm(struct mm_struct * mm) +{ + unsigned long vm_mask = 1 << current->processor; + unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask; + mm->cpu_vm_mask = 0; + if (current->active_mm == mm) { + mm->cpu_vm_mask = vm_mask; + local_flush_tlb(); + } + flush_tlb_others(cpu_mask); +} + +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + unsigned long vm_mask = 1 << current->processor; + struct mm_struct *mm = vma->vm_mm; + unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask; + + mm->cpu_vm_mask = 0; + if (current->active_mm == mm) { + __flush_tlb_one(va); + mm->cpu_vm_mask = vm_mask; + } + flush_tlb_others(cpu_mask); +} + +void flush_tlb_all(void) +{ + flush_tlb_others(~(1 << current->processor)); + local_flush_tlb(); } @@ -1853,13 +1922,24 @@ asmlinkage void smp_reschedule_interrupt(void) } /* - * Invalidate call-back + * Invalidate call-back. + * + * Mark the CPU as a VM user if there is a active + * thread holding on to an mm at this time. This + * allows us to optimize CPU cross-calls even in the + * presense of lazy TLB handling. 
*/ asmlinkage void smp_invalidate_interrupt(void) { - if (test_and_clear_bit(smp_processor_id(), &smp_invalidate_needed)) - local_flush_tlb(); + struct task_struct *tsk = current; + unsigned int cpu = tsk->processor; + if (test_and_clear_bit(cpu, &smp_invalidate_needed)) { + struct mm_struct *mm = tsk->mm; + if (mm) + atomic_set_mask(1 << cpu, &mm->cpu_vm_mask); + local_flush_tlb(); + } ack_APIC_irq(); } diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 2ab29d479..9d18999a0 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -59,7 +59,7 @@ /* * for x86_do_profile() */ -#include "irq.h" +#include <linux/irq.h> unsigned long cpu_hz; /* Detected as we calibrate the TSC */ @@ -547,7 +547,7 @@ static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, 0, "timer", NUL #define CALIBRATE_LATCH (5 * LATCH) #define CALIBRATE_TIME (5 * 1000020/HZ) -__initfunc(static unsigned long calibrate_tsc(void)) +static unsigned long __init calibrate_tsc(void) { /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -612,7 +612,7 @@ bad_ctc: return 0; } -__initfunc(void time_init(void)) +void __init time_init(void) { xtime.tv_sec = get_cmos_time(); xtime.tv_usec = 0; @@ -681,8 +681,8 @@ __initfunc(void time_init(void)) co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */ - setup_x86_irq(CO_IRQ_TIMER, &irq0); + setup_irq(CO_IRQ_TIMER, &irq0); #else - setup_x86_irq(0, &irq0); + setup_irq(0, &irq0); #endif } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index cce35ac80..f3e6f75aa 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -20,6 +20,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/delay.h> +#include <linux/spinlock.h> #ifdef CONFIG_MCA #include <linux/mca.h> @@ -29,7 +30,6 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/io.h> -#include 
<asm/spinlock.h> #include <asm/atomic.h> #include <asm/debugreg.h> #include <asm/desc.h> @@ -42,12 +42,14 @@ #include <asm/lithium.h> #endif -#include "irq.h" +#include <linux/irq.h> asmlinkage int system_call(void); asmlinkage void lcall7(void); +asmlinkage void lcall27(void); -struct desc_struct default_ldt = { 0, 0 }; +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 } }; /* * The IDT has to be page-aligned to simplify the Pentium @@ -65,10 +67,10 @@ static inline void console_verbose(void) #define DO_ERROR(trapnr, signr, str, name, tsk) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ - tsk->tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ - force_sig(signr, tsk); \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ die_if_no_fixup(str,regs,error_code); \ + force_sig(signr, tsk); \ } #define DO_VM86_ERROR(trapnr, signr, str, name, tsk) \ @@ -80,8 +82,8 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ goto out; \ /* else fall through */ \ } \ - tsk->tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ force_sig(signr, tsk); \ die_if_kernel(str,regs,error_code); \ out: \ @@ -143,10 +145,8 @@ static void show_registers(struct pt_regs *regs) regs->esi, regs->edi, regs->ebp, esp); printk("ds: %04x es: %04x ss: %04x\n", regs->xds & 0xffff, regs->xes & 0xffff, ss); - store_TR(i); - printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", - current->comm, current->pid, 0xffff & i, 4096+(unsigned long)current); - + printk("Process %s (pid: %d, stackpage=%08lx)", + current->comm, current->pid, 4096+(unsigned long)current); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
@@ -201,6 +201,9 @@ void die(const char * str, struct pt_regs * regs, long err) spin_lock_irq(&die_lock); printk("%s: %04lx\n", str, err & 0xffff); show_registers(regs); + +spin_lock_irq(&die_lock); + spin_unlock_irq(&die_lock); do_exit(SIGSEGV); } @@ -249,8 +252,8 @@ asmlinkage void cache_flush_denied(struct pt_regs * regs, long error_code) return; } die_if_kernel("cache flush denied",regs,error_code); - current->tss.error_code = error_code; - current->tss.trap_no = 19; + current->thread.error_code = error_code; + current->thread.trap_no = 19; force_sig(SIGSEGV, current); } @@ -262,8 +265,8 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) if (!(regs->xcs & 3)) goto gp_in_kernel; - current->tss.error_code = error_code; - current->tss.trap_no = 13; + current->thread.error_code = error_code; + current->thread.trap_no = 13; force_sig(SIGSEGV, current); return; @@ -354,11 +357,17 @@ asmlinkage void do_debug(struct pt_regs * regs, long error_code) unsigned int condition; struct task_struct *tsk = current; + __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg[7]) + goto clear_dr7; + } + if (regs->eflags & VM_MASK) goto debug_vm86; - __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); - /* Mask out spurious TF errors due to lazy TF clearing */ if (condition & DR_STEP) { /* @@ -374,19 +383,13 @@ asmlinkage void do_debug(struct pt_regs * regs, long error_code) goto clear_TF; } - /* Mast out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->tss.debugreg[7]) - goto clear_dr7; - } - /* If this is a kernel mode trap, we need to reset db7 to allow us to continue sanely */ if ((regs->xcs & 3) == 0) goto clear_dr7; /* Ok, finally something we can handle */ - tsk->tss.trap_no = 1; - tsk->tss.error_code = error_code; + 
tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; force_sig(SIGTRAP, tsk); return; @@ -422,8 +425,8 @@ void math_error(void) */ task = current; save_fpu(task); - task->tss.trap_no = 16; - task->tss.error_code = 0; + task->thread.trap_no = 16; + task->thread.error_code = 0; force_sig(SIGFPE, task); } @@ -453,7 +456,7 @@ asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); + __asm__("frstor %0": :"m" (current->thread.i387)); else { /* @@ -479,13 +482,14 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -__initfunc(void trap_init_f00f_bug(void)) +void __init trap_init_f00f_bug(void) { unsigned long page; pgd_t * pgd; pmd_t * pmd; pte_t * pte; +return; /* * Allocate a new page in virtual address space, * move the IDT into it and write protect this page. @@ -570,12 +574,12 @@ __asm__ __volatile__ ("movw %3,0(%2)\n\t" \ void set_tss_desc(unsigned int n, void *addr) { - _set_tssldt_desc(gdt_table+FIRST_TSS_ENTRY+(n<<1), (int)addr, 235, 0x89); + _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89); } void set_ldt_desc(unsigned int n, void *addr, unsigned int size) { - _set_tssldt_desc(gdt_table+FIRST_LDT_ENTRY+(n<<1), (int)addr, ((size << 3) - 1), 0x82); + _set_tssldt_desc(gdt_table+__LDT(n), (int)addr, ((size << 3)-1), 0x82); } #ifdef CONFIG_X86_VISWS_APIC @@ -672,7 +676,7 @@ void __init trap_init(void) { if (readl(0x0FFFD9) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) EISA_bus = 1; - set_call_gate(&default_ldt,lcall7); + set_trap_gate(0,&divide_error); set_trap_gate(1,&debug); set_trap_gate(2,&nmi); @@ -693,14 +697,22 @@ void __init trap_init(void) set_trap_gate(17,&alignment_check); set_system_gate(SYSCALL_VECTOR,&system_call); - /* set up GDT task & ldt entries */ - set_tss_desc(0, &init_task.tss); - set_ldt_desc(0, &default_ldt, 1); + /* + * default LDT is a single-entry callgate to
lcall7 for iBCS + * and a callgate to lcall27 for Solaris/x86 binaries + */ + set_call_gate(&default_ldt[0],lcall7); + set_call_gate(&default_ldt[4],lcall27); + + /* + * on SMP we do not yet know which CPU is on which TSS, + * so we delay this until smp_init(). (the CPU is already + * in a reasonable state, otherwise we wouldnt have gotten so far :) + */ +#ifndef __SMP__ + cpu_init(); +#endif - /* Clear NT, so that we won't have troubles with that later on */ - __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); - load_TR(0); - load_ldt(0); #ifdef CONFIG_X86_VISWS_APIC superio_init(); lithium_init(); diff --git a/arch/i386/kernel/visws_apic.c b/arch/i386/kernel/visws_apic.c index c12054689..de79fe61e 100644 --- a/arch/i386/kernel/visws_apic.c +++ b/arch/i386/kernel/visws_apic.c @@ -23,7 +23,6 @@ #include <linux/malloc.h> #include <linux/random.h> #include <linux/smp.h> -#include <linux/tasks.h> #include <linux/smp_lock.h> #include <linux/init.h> @@ -103,7 +102,7 @@ static struct hw_interrupt_type cobalt_irq_type = { /* - * Not an initfunc, needed by the reboot code + * Not an __init, needed by the reboot code */ void init_pic_mode(void) { diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index d181dc699..65dd7e9da 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -48,8 +48,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->tss.v86flags)) -#define VEFLAGS (current->tss.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) +#define VEFLAGS (current->thread.v86flags) #define set_flags(X,new,mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -65,25 +65,27 @@ asmlinkage struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { + struct tss_struct *tss; struct pt_regs *ret; unsigned long tmp; lock_kernel(); - if (!current->tss.vm86_info) { + if (!current->thread.vm86_info) { printk("no 
vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->tss.v86mask); - tmp = copy_to_user(&current->tss.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(&current->tss.vm86_info->regs.VM86_REGS_PART2, + set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); + tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2, &regs->VM86_REGS_PART2, VM86_REGS_SIZE2); - tmp += put_user(current->tss.screen_bitmap,&current->tss.vm86_info->screen_bitmap); + tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); do_exit(SIGSEGV); } - current->tss.esp0 = current->tss.saved_esp0; - current->tss.saved_esp0 = 0; + tss = init_tss + smp_processor_id(); + tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; + current->thread.saved_esp0 = 0; ret = KVM86->regs32; unlock_kernel(); return ret; @@ -138,7 +140,7 @@ asmlinkage int sys_vm86old(struct vm86_struct * v86) lock_kernel(); tsk = current; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, @@ -148,7 +150,7 @@ asmlinkage int sys_vm86old(struct vm86_struct * v86) goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); info.regs32 = (struct pt_regs *) &v86; - tsk->tss.vm86_info = v86; + tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -188,7 +190,7 @@ asmlinkage int sys_vm86(unsigned long subfunction, struct vm86plus_struct * v86) /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2,
&v86->regs.VM86_REGS_PART2, @@ -198,7 +200,7 @@ asmlinkage int sys_vm86(unsigned long subfunction, struct vm86plus_struct * v86) goto out; info.regs32 = (struct pt_regs *) &subfunction; info.vm86plus.is_vm86pus = 1; - tsk->tss.vm86_info = (struct vm86_struct *)v86; + tsk->thread.vm86_info = (struct vm86_struct *)v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -209,6 +211,7 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { + struct tss_struct *tss; /* * make sure the vm86() system call doesn't try to do anything silly */ @@ -231,16 +234,16 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk switch (info->cpu_type) { case CPU_286: - tsk->tss.v86mask = 0; + tsk->thread.v86mask = 0; break; case CPU_386: - tsk->tss.v86mask = NT_MASK | IOPL_MASK; + tsk->thread.v86mask = NT_MASK | IOPL_MASK; break; case CPU_486: - tsk->tss.v86mask = AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK; break; default: - tsk->tss.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; break; } @@ -248,10 +251,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk * Save old state, set default return value (%eax) to 0 */ info->regs32->eax = 0; - tsk->tss.saved_esp0 = tsk->tss.esp0; - tsk->tss.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.saved_esp0 = tsk->thread.esp0; + tss = init_tss + smp_processor_id(); + tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; - tsk->tss.screen_bitmap = info->screen_bitmap; + tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk); unlock_kernel(); @@ -295,7 +299,7 @@ static inline void clear_TF(struct kernel_vm86_regs * regs) static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, 
current->tss.v86mask); + set_flags(VEFLAGS, eflags, current->thread.v86mask); set_flags(regs->eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); @@ -303,7 +307,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { - set_flags(VFLAGS, flags, current->tss.v86mask); + set_flags(VFLAGS, flags, current->thread.v86mask); set_flags(regs->eflags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); @@ -315,7 +319,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) if (VEFLAGS & VIF_MASK) flags |= IF_MASK; - return flags | (VEFLAGS & current->tss.v86mask); + return flags | (VEFLAGS & current->thread.v86mask); } static inline int is_revectored(int nr, struct revectored_struct * bitmap) @@ -447,8 +451,8 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno spin_unlock_irqrestore(&current->sigmask_lock, flags); } send_sig(SIGTRAP, current, 1); - current->tss.trap_no = trapno; - current->tss.error_code = error_code; + current->thread.trap_no = trapno; + current->thread.error_code = error_code; return 0; } diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index c2cb3e5a6..a6f8dff09 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -6,7 +6,7 @@ $(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $< -o $*.o L_TARGET = lib.a -L_OBJS = checksum.o old-checksum.o semaphore.o delay.o \ +L_OBJS = checksum.o old-checksum.o delay.o \ usercopy.o getuser.o putuser.o include $(TOPDIR)/Rules.make diff --git a/arch/i386/lib/semaphore.S b/arch/i386/lib/semaphore.S deleted file mode 100644 index 3f6e27fcc..000000000 --- a/arch/i386/lib/semaphore.S +++ /dev/null @@ -1,51 +0,0 @@ -/* - * linux/arch/i386/lib/semaphore.S - * - * Copyright (C) 1996 Linus Torvalds - */ - -#include <linux/linkage.h> - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a
simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - */ -ENTRY(__down_failed) - pushl %eax /* save %eax */ - pushl %edx /* save %edx */ - pushl %ecx /* save %ecx (and argument) */ - call SYMBOL_NAME(__down) - popl %ecx /* restore %ecx (count on __down not changing it) */ - popl %edx /* restore %edx */ - popl %eax /* restore %eax */ - ret - -/* Don't save/restore %eax, because that will be our return value */ -ENTRY(__down_failed_interruptible) - pushl %edx /* save %edx */ - pushl %ecx /* save %ecx (and argument) */ - call SYMBOL_NAME(__down_interruptible) - popl %ecx /* restore %ecx (count on __down_interruptible not changing it) */ - popl %edx /* restore %edx */ - ret - -/* Don't save/restore %eax, because that will be our return value */ -ENTRY(__down_failed_trylock) - pushl %edx /* save %edx */ - pushl %ecx /* save %ecx (and argument) */ - call SYMBOL_NAME(__down_trylock) - popl %ecx /* restore %ecx (count on __down_trylock not changing it) */ - popl %edx /* restore %edx */ - ret - -ENTRY(__up_wakeup) - pushl %eax /* save %eax */ - pushl %edx /* save %edx */ - pushl %ecx /* save %ecx (and argument) */ - call SYMBOL_NAME(__up) - popl %ecx /* restore %ecx (count on __up not changing it) */ - popl %edx /* restore %edx */ - popl %eax /* restore %eax */ - ret diff --git a/arch/i386/math-emu/div_Xsig.S b/arch/i386/math-emu/div_Xsig.S index fd83732fc..1d7f9823c 100644 --- a/arch/i386/math-emu/div_Xsig.S +++ b/arch/i386/math-emu/div_Xsig.S @@ -55,7 +55,7 @@ Local storage in a static area: Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 */ - .align 2,0 + .align 4,0 FPU_accum_3: .long 0 FPU_accum_2: diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c index 1c5d30e1d..9e1484836 100644 --- a/arch/i386/math-emu/fpu_entry.c +++ b/arch/i386/math-emu/fpu_entry.c @@ -283,8 +283,8 @@ do_another_FPU_instruction: FPU_EIP = FPU_ORIG_EIP; /* 
Point to current FPU instruction. */ RE_ENTRANT_CHECK_OFF; - current->tss.trap_no = 16; - current->tss.error_code = 0; + current->thread.trap_no = 16; + current->thread.error_code = 0; send_sig(SIGFPE, current, 1); return; } @@ -662,8 +662,8 @@ static int valid_prefix(u_char *Byte, u_char **fpu_eip, void math_abort(struct info * info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; - current->tss.trap_no = 16; - current->tss.error_code = 0; + current->thread.trap_no = 16; + current->thread.error_code = 0; send_sig(signal,current,1); RE_ENTRANT_CHECK_OFF; __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h index 1571b2f38..3cda85f65 100644 --- a/arch/i386/math-emu/fpu_system.h +++ b/arch/i386/math-emu/fpu_system.h @@ -33,7 +33,7 @@ #define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ == (1 << 10)) -#define I387 (current->tss.i387) +#define I387 (current->thread.i387) #define FPU_info (I387.soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->___cs)) diff --git a/arch/i386/math-emu/reg_round.S b/arch/i386/math-emu/reg_round.S index 9ab32e999..f6c11eb7d 100644 --- a/arch/i386/math-emu/reg_round.S +++ b/arch/i386/math-emu/reg_round.S @@ -95,7 +95,7 @@ /* Not re-entrant, so we can gain speed by putting local storage in a static area: */ .data - .align 2,0 + .align 4,0 FPU_bits_lost: .byte 0 FPU_denormal: diff --git a/arch/i386/math-emu/reg_u_div.S b/arch/i386/math-emu/reg_u_div.S index 36630de7d..e19ab011b 100644 --- a/arch/i386/math-emu/reg_u_div.S +++ b/arch/i386/math-emu/reg_u_div.S @@ -52,7 +52,7 @@ Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 Overflow flag: ovfl_flag */ - .align 2,0 + .align 4,0 FPU_accum_3: .long 0 FPU_accum_2: diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile index cee7d4e6d..d60bc1969 100644 --- a/arch/i386/mm/Makefile +++ b/arch/i386/mm/Makefile @@ -10,4 +10,8 @@ O_TARGET := mm.o O_OBJS := init.o fault.o ioremap.o extable.o 
+ifeq ($(CONFIG_BIGMEM),y) +O_OBJS += bigmem.o +endif + include $(TOPDIR)/Rules.make diff --git a/arch/i386/mm/bigmem.c b/arch/i386/mm/bigmem.c new file mode 100644 index 000000000..8da077927 --- /dev/null +++ b/arch/i386/mm/bigmem.c @@ -0,0 +1,33 @@ +/* + * BIGMEM IA32 code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + */ + +#include <linux/mm.h> +#include <linux/bigmem.h> + +unsigned long bigmem_start, bigmem_end; + +/* NOTE: fixmap_init alloc all the fixmap pagetables contigous on the + physical space so we can cache the place of the first one and move + around without checking the pgd every time. */ +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + +void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; + if (boot_cpu_data.x86_capability & X86_FEATURE_PGE) + pgprot_val(kmap_prot) |= _PAGE_GLOBAL; +} diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c3e423b21..1f7879005 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -109,7 +109,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) * If we're in an interrupt or have no user * context, we must not take the fault.. 
*/ - if (in_interrupt() || mm == &init_mm) + if (in_interrupt() || !mm) goto no_context; down(&mm->mmap_sem); @@ -177,7 +177,7 @@ good_area: if (regs->eflags & VM_MASK) { unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->tss.screen_bitmap |= 1 << bit; + tsk->thread.screen_bitmap |= 1 << bit; } up(&mm->mmap_sem); return; @@ -191,9 +191,9 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGSEGV, tsk); return; } @@ -243,9 +243,9 @@ no_context: else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); - __asm__("movl %%cr3,%0" : "=r" (page)); - printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n", - tsk->tss.cr3, page); + printk(" printing eip:\n"); + printk("%08lx\n", regs->eip); + asm("movl %%cr3,%0":"=r" (page)); page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); if (page & 1) { @@ -275,9 +275,9 @@ do_sigbus: * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGBUS, tsk); /* Kernel mode? 
Handle exceptions or die */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 703b8ca87..05684997f 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -2,6 +2,8 @@ * linux/arch/i386/mm/init.c * * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ #include <linux/config.h> @@ -20,6 +22,7 @@ #ifdef CONFIG_BLK_DEV_INITRD #include <linux/blk.h> #endif +#include <linux/bigmem.h> #include <asm/processor.h> #include <asm/system.h> @@ -27,6 +30,10 @@ #include <asm/pgtable.h> #include <asm/dma.h> #include <asm/fixmap.h> +#include <asm/e820.h> + +static unsigned long totalram = 0; +static unsigned long totalbig = 0; extern void show_net_buffers(void); extern unsigned long init_smp_mappings(unsigned long); @@ -148,6 +155,7 @@ void show_mem(void) { int i,free = 0,total = 0,reserved = 0; int shared = 0, cached = 0; + int bigmem = 0; printk("Mem-info:\n"); show_free_areas(); @@ -155,6 +163,8 @@ void show_mem(void) i = max_mapnr; while (i-- > 0) { total++; + if (PageBIGMEM(mem_map+i)) + bigmem++; if (PageReserved(mem_map+i)) reserved++; else if (PageSwapCache(mem_map+i)) @@ -165,6 +175,7 @@ void show_mem(void) shared += page_count(mem_map+i) - 1; } printk("%d pages of RAM\n",total); + printk("%d pages of BIGMEM\n",bigmem); printk("%d reserved pages\n",reserved); printk("%d pages shared\n",shared); printk("%d pages swap cached\n",cached); @@ -181,34 +192,6 @@ extern unsigned long free_area_init(unsigned long, unsigned long); extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define 
X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. - */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ @@ -264,7 +247,7 @@ void set_fixmap (enum fixed_addresses idx, unsigned long phys) * This routines also unmaps the page at virtual kernel address 0, so * that we can trap those pesky NULL-reference errors in the kernel. */ -__initfunc(unsigned long paging_init(unsigned long start_mem, unsigned long end_mem)) +unsigned long __init paging_init(unsigned long start_mem, unsigned long end_mem) { pgd_t * pg_dir; pte_t * pg_table; @@ -341,7 +324,12 @@ __initfunc(unsigned long paging_init(unsigned long start_mem, unsigned long end_ #endif local_flush_tlb(); +#ifndef CONFIG_BIGMEM return free_area_init(start_mem, end_mem); +#else + kmap_init(); /* run after fixmap_init */ + return free_area_init(start_mem, bigmem_end + PAGE_OFFSET); +#endif } /* @@ -350,7 +338,7 @@ __initfunc(unsigned long paging_init(unsigned long start_mem, unsigned long end_ * before and after the test are here to work-around some nasty CPU bugs. */ -__initfunc(void test_wp_bit(void)) +void __init test_wp_bit(void) { unsigned char tmp_reg; unsigned long old = pg0[0]; @@ -358,7 +346,6 @@ __initfunc(void test_wp_bit(void)) printk("Checking if this processor honours the WP bit even in supervisor mode... 
"); pg0[0] = pte_val(mk_pte(PAGE_OFFSET, PAGE_READONLY)); local_flush_tlb(); - current->mm->mmap->vm_start += PAGE_SIZE; __asm__ __volatile__( "jmp 1f; 1:\n" "movb %0,%1\n" @@ -370,7 +357,6 @@ __initfunc(void test_wp_bit(void)) :"memory"); pg0[0] = old; local_flush_tlb(); - current->mm->mmap->vm_start -= PAGE_SIZE; if (boot_cpu_data.wp_works_ok < 0) { boot_cpu_data.wp_works_ok = 0; printk("No.\n"); @@ -381,7 +367,7 @@ __initfunc(void test_wp_bit(void)) printk(".\n"); } -__initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem)) +void __init mem_init(unsigned long start_mem, unsigned long end_mem) { unsigned long start_low_mem = PAGE_SIZE; int codepages = 0; @@ -389,11 +375,21 @@ __initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem)) int datapages = 0; int initpages = 0; unsigned long tmp; - unsigned long endbase; + int i, avail; end_mem &= PAGE_MASK; +#ifdef CONFIG_BIGMEM + bigmem_start = PAGE_ALIGN(bigmem_start); + bigmem_end &= PAGE_MASK; +#endif high_memory = (void *) end_mem; +#ifndef CONFIG_BIGMEM max_mapnr = num_physpages = MAP_NR(end_mem); +#else + max_mapnr = num_physpages = PHYSMAP_NR(bigmem_end); + /* cache the bigmem_mapnr */ + bigmem_mapnr = PHYSMAP_NR(bigmem_start); +#endif /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -413,22 +409,50 @@ __initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem)) #endif start_mem = PAGE_ALIGN(start_mem); - /* - * IBM messed up *AGAIN* in their thinkpad: 0xA0000 -> 0x9F000. - * They seem to have done something stupid with the floppy - * controller as well.. - * The amount of available base memory is in WORD 40:13. 
+ /* walk the whitelist, unreserving good memory */ - endbase = PAGE_OFFSET + ((*(unsigned short *)__va(0x413) * 1024) & PAGE_MASK); - while (start_low_mem < endbase) { - clear_bit(PG_reserved, &mem_map[MAP_NR(start_low_mem)].flags); - start_low_mem += PAGE_SIZE; - } + for (avail = i = 0; i < e820.nr_map; i++) { + unsigned long addr, end, size; - while (start_mem < end_mem) { - clear_bit(PG_reserved, &mem_map[MAP_NR(start_mem)].flags); - start_mem += PAGE_SIZE; + if (e820.map[i].type != E820_RAM) /* not usable memory */ + continue; + addr = e820.map[i].addr; + size = e820.map[i].size; + + /* Silently ignore memory regions starting above 4gb */ + if (addr != e820.map[i].addr) + continue; + + printk("memory region: %luk @ %08lx\n", size >> 10, addr ); + + /* Make sure we don't get fractional pages */ + end = PAGE_OFFSET + ((addr + size) & PAGE_MASK); + addr= PAGE_OFFSET + PAGE_ALIGN(addr); + + for ( ; addr < end; addr += PAGE_SIZE) { + + /* this little bit of grossness is for dealing + * with memory borrowing for system bookkeeping + * (smp stacks, zero page, kernel code, etc) + * without having to go back and edit the e820 + * map to compensate. + * + * if we're in low memory (<1024k), we need to + * avoid the smp stack and zero page. + * if we're in high memory, we need to avoid + * the kernel code. + * in any case, we don't want to hack mem_map + * entries above end_mem. 
+ */ + if ( (addr < start_low_mem) + || (addr >= (HIGH_MEMORY + PAGE_OFFSET)&& addr <= start_mem) + || (addr > end_mem) ) + continue; + + clear_bit(PG_reserved, &mem_map[MAP_NR(addr)].flags); + } } + for (tmp = PAGE_OFFSET ; tmp < end_mem ; tmp += PAGE_SIZE) { if (tmp >= MAX_DMA_ADDRESS) clear_bit(PG_DMA, &mem_map[MAP_NR(tmp)].flags); @@ -449,22 +473,35 @@ __initfunc(void mem_init(unsigned long start_mem, unsigned long end_mem)) continue; } set_page_count(mem_map+MAP_NR(tmp), 1); + totalram += PAGE_SIZE; #ifdef CONFIG_BLK_DEV_INITRD - if (!initrd_start || (tmp < initrd_start || tmp >= - initrd_end)) + if (!initrd_start || (tmp < initrd_start || tmp >= initrd_end)) #endif free_page(tmp); } - printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", +#ifdef CONFIG_BIGMEM + for (tmp = bigmem_start; tmp < bigmem_end; tmp += PAGE_SIZE) { + clear_bit(PG_reserved, &mem_map[PHYSMAP_NR(tmp)].flags); + set_bit(PG_BIGMEM, &mem_map[PHYSMAP_NR(tmp)].flags); + atomic_set(&mem_map[PHYSMAP_NR(tmp)].count, 1); + free_page(tmp + PAGE_OFFSET); + totalbig += PAGE_SIZE; + } + totalram += totalbig; +#endif + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %dk bigmem)\n", (unsigned long) nr_free_pages << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codepages << (PAGE_SHIFT-10), reservedpages << (PAGE_SHIFT-10), datapages << (PAGE_SHIFT-10), - initpages << (PAGE_SHIFT-10)); + initpages << (PAGE_SHIFT-10), + (int) (totalbig >> 10) + ); if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + } void free_initmem(void) @@ -476,28 +513,18 @@ void free_initmem(void) mem_map[MAP_NR(addr)].flags &= ~(1 << PG_reserved); set_page_count(mem_map+MAP_NR(addr), 1); free_page(addr); + totalram += PAGE_SIZE; } printk ("Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10); } void si_meminfo(struct sysinfo *val) { - int i; - - i = max_mapnr; - val->totalram = 0; + val->totalram = totalram; val->sharedram = 
0; val->freeram = nr_free_pages << PAGE_SHIFT; val->bufferram = atomic_read(&buffermem); - while (i-- > 0) { - if (PageReserved(mem_map+i)) - continue; - val->totalram++; - if (!page_count(mem_map+i)) - continue; - val->sharedram += page_count(mem_map+i) - 1; - } - val->totalram <<= PAGE_SHIFT; - val->sharedram <<= PAGE_SHIFT; + val->totalbig = totalbig; + val->freebig = nr_free_bigpages << PAGE_SHIFT; return; } diff --git a/arch/i386/vmlinux.lds b/arch/i386/vmlinux.lds deleted file mode 100644 index ecf90c27c..000000000 --- a/arch/i386/vmlinux.lds +++ /dev/null @@ -1,69 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) -SECTIONS -{ - . = 0xC0000000 + 0x100000; - _text = .; /* Text and read-only data */ - .text : { - *(.text) - *(.fixup) - *(.gnu.warning) - } = 0x9090 - .text.lock : { *(.text.lock) } /* out-of-line lock text */ - .rodata : { *(.rodata) } - .kstrtab : { *(.kstrtab) } - - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } - __stop___ex_table = .; - - __start___ksymtab = .; /* Kernel symbol table */ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; - - _etext = .; /* End of text section */ - - .data : { /* Data */ - *(.data) - CONSTRUCTORS - } - - _edata = .; /* End of data section */ - - . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .text.init : { *(.text.init) } - .data.init : { *(.data.init) } - . = ALIGN(4096); - __init_end = .; - - . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } - - . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } - - - __bss_start = .; /* BSS */ - .bss : { - *(.bss) - } - _end = . ; - - /* Stabs debugging sections. 
*/ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } -} diff --git a/arch/i386/vmlinux.lds.S b/arch/i386/vmlinux.lds.S index 347a058a3..9a9ff07e7 100644 --- a/arch/i386/vmlinux.lds.S +++ b/arch/i386/vmlinux.lds.S @@ -42,15 +42,21 @@ SECTIONS __init_begin = .; .text.init : { *(.text.init) } .data.init : { *(.data.init) } + . = ALIGN(16); + __setup_start = .; + .setup.init : { *(.setup.init) } + __setup_end = .; + __initcall_start = .; + .initcall.init : { *(.initcall.init) } + __initcall_end = .; . = ALIGN(4096); __init_end = .; - . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } - . = ALIGN(4096); .data.page_aligned : { *(.data.idt) } + . = ALIGN(32); + .data.cacheline_aligned : { *(.data.cacheline_aligned) } __bss_start = .; /* BSS */ .bss : { |