Diffstat (limited to 'arch')
45 files changed, 16677 insertions, 1 deletion
diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 1b878067a..b1aebc71a 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -55,7 +55,7 @@ zlilo: $(CONFIGURE) zImage LOWLDFLAGS =-qmagic -Ttext 0xfe0 HIGHLDFLAGS =-qmagic -Ttext 0xfffe0 -tools/system: boot/head.o init/main.o tools/version.o linuxsubdirs +tools/system: boot/head.o init/main.o init/init.o tools/version.o linuxsubdirs $(LD) $(LOWLDFLAGS) boot/head.o init/main.o tools/version.o \ $(ARCHIVES) \ $(FILESYSTEMS) \ diff --git a/arch/i386/bios32.c b/arch/i386/bios32.c new file mode 100644 index 000000000..311dd111e --- /dev/null +++ b/arch/i386/bios32.c @@ -0,0 +1,476 @@ +/* + * bios32.c - BIOS32, PCI BIOS functions. + * + * Sponsored by + * iX Multiuser Multitasking Magazine + * Hannover, Germany + * hm@ix.de + * + * Copyright 1993, 1994 Drew Eckhardt + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * Drew@Colorado.EDU + * +1 (303) 786-7975 + * + * For more information, please consult + * + * PCI BIOS Specification Revision + * PCI Local Bus Specification + * PCI System Design Guide + * + * PCI Special Interest Group + * M/S HF3-15A + * 5200 N.E. Elam Young Parkway + * Hillsboro, Oregon 97124-6497 + * +1 (503) 696-2000 + * +1 (800) 433-5177 + * + * Manuals are $25 each or $50 for all three, plus $7 shipping + * within the United States, $35 abroad. + * + * + * CHANGELOG : + * Jun 17, 1994 : Modified to accommodate the broken pre-PCI BIOS SPECIFICATION + * Revision 2.0 present on <thys@dennis.ee.up.ac.za>'s ASUS mainboard. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/bios32.h> +#include <linux/pci.h> + +#include <asm/segment.h> + +/* + * It would seem some PCI bioses are buggy, so we don't actually use these + * routines unless we need to.. + */ +#ifdef CONFIG_SCSI_NCR53C7xx + #define CONFIG_PCI +#else + #undef CONFIG_PCI +#endif + +#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX +#define PCIBIOS_PCI_BIOS_PRESENT 0xb101 +#define PCIBIOS_FIND_PCI_DEVICE 0xb102 +#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103 +#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106 +#define PCIBIOS_READ_CONFIG_BYTE 0xb108 +#define PCIBIOS_READ_CONFIG_WORD 0xb109 +#define PCIBIOS_READ_CONFIG_DWORD 0xb10a +#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b +#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c +#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d + +/* BIOS32 signature: "_32_" */ +#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) + +/* PCI signature: "PCI " */ +#define PCI_SIGNATURE (('P' << 0) + ('C' << 8) + ('I' << 16) + (' ' << 24)) + +/* PCI service signature: "$PCI" */ +#define PCI_SERVICE (('$' << 0) + ('P' << 8) + ('C' << 16) + ('I' << 24)) + +/* + * This is the standard structure used to identify the entry point + * to the BIOS32 Service Directory, as documented in + * Standard BIOS 32-bit Service Directory Proposal + * Revision 0.4 May 24, 1993 + * Phoenix Technologies Ltd. + * Norwood, MA + * and the PCI BIOS specification. + */ + +union bios32 { + struct { + unsigned long signature; /* _32_ */ + unsigned long entry; /* 32 bit physical address */ + unsigned char revision; /* Revision level, 0 */ + unsigned char length; /* Length in paragraphs should be 01 */ + unsigned char checksum; /* All bytes must add up to zero */ + unsigned char reserved[5]; /* Must be zero */ + } fields; + char chars[16]; +}; + +/* + * Physical address of the service directory. 
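A hedged aside on how this structure is validated: the four-character signatures above pack into little-endian dwords, and a candidate directory is accepted only when all of its bytes (length is counted in 16-byte paragraphs) sum to zero modulo 256. A minimal C sketch of that check, mirroring the scan done by bios32_init() further down; the helper name is illustrative, not part of this tree:

	static int bios32_directory_ok(union bios32 *check)
	{
		int i, length;
		unsigned char sum;

		if (check->fields.signature != BIOS32_SIGNATURE)
			return 0;			/* no "_32_" marker */
		length = check->fields.length * 16;	/* paragraphs -> bytes */
		if (!length)
			return 0;
		sum = 0;
		for (i = 0; i < length; i++)		/* checksum the whole block */
			sum += check->chars[i];
		return sum == 0;			/* valid only if it wraps to zero */
	}
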
I don't know if we're + * allowed to have more than one of these or not, so just in case + * we'll make bios32_init() take a memory start parameter and store + * the array there. + */ + +static unsigned long bios32_entry = 0; +static struct { + unsigned long address; + unsigned short segment; +} bios32_indirect = { 0, KERNEL_CS }; + +#ifdef CONFIG_PCI +/* + * Returns the entry point for the given service, NULL on error + */ + +static unsigned long bios32_service(unsigned long service) +{ + unsigned char return_code; /* %al */ + unsigned long address; /* %ebx */ + unsigned long length; /* %ecx */ + unsigned long entry; /* %edx */ + + __asm__("lcall (%%edi)" + : "=a" (return_code), + "=b" (address), + "=c" (length), + "=d" (entry) + : "0" (service), + "1" (0), + "D" (&bios32_indirect)); + + switch (return_code) { + case 0: + return address + entry; + case 0x80: /* Not present */ + printk("bios32_service(%ld) : not present\n", service); + return 0; + default: /* Shouldn't happen */ + printk("bios32_service(%ld) : returned 0x%x, mail drew@colorado.edu\n", + service, return_code); + return 0; + } +} + +static long pcibios_entry = 0; +static struct { + unsigned long address; + unsigned short segment; +} pci_indirect = { 0, KERNEL_CS }; + +void NCR53c810_test(void); + +static unsigned long pcibios_init(unsigned long memory_start, unsigned long memory_end) +{ + unsigned long signature; + unsigned char present_status; + unsigned char major_revision; + unsigned char minor_revision; + int pack; + + if ((pcibios_entry = bios32_service(PCI_SERVICE))) { + pci_indirect.address = pcibios_entry; + + __asm__("lcall (%%edi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:\tshl $8, %%eax\n\t" + "movw %%bx, %%ax" + : "=d" (signature), + "=a" (pack) + : "1" (PCIBIOS_PCI_BIOS_PRESENT), + "D" (&pci_indirect) + : "bx", "cx"); + + present_status = (pack >> 16) & 0xff; + major_revision = (pack >> 8) & 0xff; + minor_revision = pack & 0xff; + if (present_status || (signature != PCI_SIGNATURE)) { + printk ("pcibios_init : %s : BIOS32 Service Directory says PCI BIOS is present,\n" + " but PCI_BIOS_PRESENT subfunction fails with present status of 0x%x\n" + " and signature of 0x%08lx (%c%c%c%c). mail drew@Colorado.EDU\n", + (signature == PCI_SIGNATURE) ? "WARNING" : "ERROR", + present_status, signature, + (char) (signature >> 0), (char) (signature >> 8), + (char) (signature >> 16), (char) (signature >> 24)); + + if (signature != PCI_SIGNATURE) + pcibios_entry = 0; + } + if (pcibios_entry) { + printk ("pcibios_init : PCI BIOS revision %x.%02x entry at 0x%lx\n", + major_revision, minor_revision, pcibios_entry); + } + } + +#if 0 + NCR53c810_test(); +#endif + return memory_start; +} + +int pcibios_present(void) +{ + return pcibios_entry ? 
1 : 0; +} + +int pcibios_find_class_code (unsigned long class_code, unsigned short index, + unsigned char *bus, unsigned char *device_fn) +{ + unsigned long bx; + unsigned long ret; + + __asm__ ("lcall (%%edi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=b" (bx), + "=a" (ret) + : "1" (PCIBIOS_FIND_PCI_CLASS_CODE), + "c" (class_code), + "S" ((int) index), + "D" (&pci_indirect)); + *bus = (bx >> 8) & 0xff; + *device_fn = bx & 0xff; + return (int) (ret & 0xff00) >> 8; +} + + +int pcibios_find_device (unsigned short vendor, unsigned short device_id, + unsigned short index, unsigned char *bus, unsigned char *device_fn) +{ + unsigned short bx; + unsigned short ret; + + __asm__("lcall (%%edi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=b" (bx), + "=a" (ret) + : "1" (PCIBIOS_FIND_PCI_DEVICE), + "c" (device_id), + "d" (vendor), + "S" ((int) index), + "D" (&pci_indirect)); + *bus = (bx >> 8) & 0xff; + *device_fn = bx & 0xff; + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_read_config_byte(unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned char *value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=c" (*value), + "=a" (ret) + : "1" (PCIBIOS_READ_CONFIG_BYTE), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_read_config_word (unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned short *value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=c" (*value), + "=a" (ret) + : "1" (PCIBIOS_READ_CONFIG_WORD), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_read_config_dword (unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned long *value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=c" (*value), + "=a" (ret) + : "1" (PCIBIOS_READ_CONFIG_DWORD), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_write_config_byte (unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned char value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (ret) + : "0" (PCIBIOS_WRITE_CONFIG_BYTE), + "c" (value), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_write_config_word (unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned short value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (ret) + : "0" (PCIBIOS_WRITE_CONFIG_WORD), + "c" (value), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 0xff00) >> 8; +} + +int pcibios_write_config_dword (unsigned char bus, + unsigned char device_fn, unsigned char where, unsigned long value) +{ + unsigned long ret; + unsigned long bx = (bus << 8) | device_fn; + + __asm__("lcall (%%esi)\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (ret) + : "0" (PCIBIOS_WRITE_CONFIG_DWORD), + "c" (value), + "b" (bx), + "D" ((long) where), + "S" (&pci_indirect)); + return (int) (ret & 
0xff00) >> 8; +} + +void NCR53c810_test(void) +{ + unsigned char bus, device_fn; + unsigned short index; + int ret; + unsigned char row, col; + unsigned long val; + + for (index = 0; index < 4; ++index) { + ret = pcibios_find_device ( + (unsigned short) PCI_VENDOR_ID_NCR, + (unsigned short) PCI_DEVICE_ID_NCR_53C810, + index, &bus, &device_fn); + if (ret) + break; + printk ("ncr53c810 : at PCI bus %d, device %d, function %d.", + bus, ((device_fn & 0xf8) >> 3), (device_fn & 7)); + for (row = 0; row < 0x3c; row += 0x10) { + printk ("\n reg 0x%02x ", row); + for (col = 0; col < 0x10; col += 4) { + if (!(ret = pcibios_read_config_dword (bus, device_fn, row+col, &val))) + printk ("0x%08lx ", val); + else + printk ("error 0x%02x ", ret); + } + } + printk ("\n"); + } +} + +char *pcibios_strerror (int error) +{ + static char buf[80]; + + switch (error) { + case PCIBIOS_SUCCESSFUL: + return "SUCCESSFUL"; + + case PCIBIOS_FUNC_NOT_SUPPORTED: + return "FUNC_NOT_SUPPORTED"; + + case PCIBIOS_BAD_VENDOR_ID: + return "SUCCESSFUL"; + + case PCIBIOS_DEVICE_NOT_FOUND: + return "DEVICE_NOT_FOUND"; + + case PCIBIOS_BAD_REGISTER_NUMBER: + return "BAD_REGISTER_NUMBER"; + + default: + sprintf (buf, "UNKNOWN RETURN 0x%x", error); + return buf; + } +} + +#endif + +unsigned long bios32_init(unsigned long memory_start, unsigned long memory_end) +{ + union bios32 *check; + unsigned char sum; + int i, length; + + /* + * Follow the standard procedure for locating the BIOS32 Service + * directory by scanning the permissible address range from + * 0xe0000 through 0xfffff for a valid BIOS32 structure. + * + * The PCI BIOS doesn't seem to work too well on many machines, + * so we disable this unless it's really needed (NCR SCSI driver) + */ + + for (check = (union bios32 *) 0xe0000; check <= (union bios32 *) 0xffff0; ++check) { + if (check->fields.signature != BIOS32_SIGNATURE) + continue; + length = check->fields.length * 16; + if (!length) + continue; + sum = 0; + for (i = 0; i < length ; ++i) + sum += check->chars[i]; + if (sum != 0) + continue; + if (check->fields.revision != 0) { + printk("bios32_init : unsupported revision %d at 0x%p, mail drew@colorado.edu\n", + check->fields.revision, check); + continue; + } + printk ("bios32_init : BIOS32 Service Directory structure at 0x%p\n", check); + if (!bios32_entry) { + bios32_indirect.address = bios32_entry = check->fields.entry; + printk ("bios32_init : BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); + } else { + printk ("bios32_init : multiple entries, mail drew@colorado.edu\n"); + /* + * Jeremy Fitzhardinge reports at least one PCI BIOS + * with two different service directories, and as both + * worked for him, we'll just mention the fact, and + * not actually disallow it.. + */ +#if 0 + return memory_start; +#endif + } + } +#ifdef CONFIG_PCI + if (bios32_entry) { + memory_start = pcibios_init (memory_start, memory_end); + } +#endif + return memory_start; +} diff --git a/arch/i386/bootsect.S b/arch/i386/bootsect.S new file mode 100644 index 000000000..f6a0d3158 --- /dev/null +++ b/arch/i386/bootsect.S @@ -0,0 +1,460 @@ +! +! SYS_SIZE is the number of clicks (16 bytes) to be loaded. +! 0x7F00 is 0x7F000 bytes = 508kB, more than enough for current +! versions of linux which compress the kernel +! +#include <linux/config.h> +SYSSIZE = DEF_SYSSIZE +! +! bootsect.s Copyright (C) 1991, 1992 Linus Torvalds +! modified by Drew Eckhardt +! modified by Bruce Evans (bde) +! +! bootsect.s is loaded at 0x7c00 by the bios-startup routines, and moves +! 
itself out of the way to address 0x90000, and jumps there. +! +! bde - should not jump blindly, there may be systems with only 512K low +! memory. Use int 0x12 to get the top of memory, etc. +! +! It then loads 'setup' directly after itself (0x90200), and the system +! at 0x10000, using BIOS interrupts. +! +! NOTE! currently system is at most (8*65536-4096) bytes long. This should +! be no problem, even in the future. I want to keep it simple. This 508 kB +! kernel size should be enough, especially as this doesn't contain the +! buffer cache as in minix (and especially now that the kernel is +! compressed :-) +! +! The loader has been made as simple as possible, and continuous +! read errors will result in a unbreakable loop. Reboot by hand. It +! loads pretty fast by getting whole tracks at a time whenever possible. + +.text + +SETUPSECS = 4 ! nr of setup-sectors +BOOTSEG = 0x07C0 ! original address of boot-sector +INITSEG = DEF_INITSEG ! we move boot here - out of the way +SETUPSEG = DEF_SETUPSEG ! setup starts here +SYSSEG = DEF_SYSSEG ! system loaded at 0x10000 (65536). + +! ROOT_DEV & SWAP_DEV are now written by "build". +ROOT_DEV = 0 +SWAP_DEV = 0 +#ifndef SVGA_MODE +#define SVGA_MODE ASK_VGA +#endif +#ifndef RAMDISK +#define RAMDISK 0 +#endif +#ifndef CONFIG_ROOT_RDONLY +#define CONFIG_ROOT_RDONLY 0 +#endif + +! ld86 requires an entry symbol. This may as well be the usual one. +.globl _main +_main: +#if 0 /* hook for debugger, harmless unless BIOS is fussy (old HP) */ + int 3 +#endif + mov ax,#BOOTSEG + mov ds,ax + mov ax,#INITSEG + mov es,ax + mov cx,#256 + sub si,si + sub di,di + cld + rep + movsw + jmpi go,INITSEG + +! ax and es already contain INITSEG + +go: mov di,#0x4000-12 ! 0x4000 is arbitrary value >= length of + ! bootsect + length of setup + room for stack + ! 12 is disk parm size + +! bde - changed 0xff00 to 0x4000 to use debugger at 0x6400 up (bde). We +! wouldn't have to worry about this if we checked the top of memory. Also +! my BIOS can be configured to put the wini drive tables in high memory +! instead of in the vector table. The old stack might have clobbered the +! drive table. + + mov ds,ax + mov ss,ax ! put stack at INITSEG:0x4000-12. + mov sp,di +/* + * Many BIOS's default disk parameter tables will not + * recognize multi-sector reads beyond the maximum sector number + * specified in the default diskette parameter tables - this may + * mean 7 sectors in some cases. + * + * Since single sector reads are slow and out of the question, + * we must take care of this by creating new parameter tables + * (for the first disk) in RAM. We will set the maximum sector + * count to 36 - the most we will encounter on an ED 2.88. + * + * High doesn't hurt. Low does. + * + * Segments are as follows: ds=es=ss=cs - INITSEG, + * fs = 0, gs is unused. + */ + +! cx contains 0 from rep movsw above + + mov fs,cx + mov bx,#0x78 ! fs:bx is parameter table address + push ds + seg fs + lds si,(bx) ! ds:si is source + + mov cl,#6 ! copy 12 bytes + cld + push di + + rep + movsw + + pop di + pop ds + + movb 4(di),*36 ! patch sector count + + seg fs + mov (bx),di + seg fs + mov 2(bx),es + +! load the setup-sectors directly after the bootblock. +! Note that 'es' is already set up. +! Also cx is 0 from rep movsw above. + +load_setup: + xor ah,ah ! reset FDC + xor dl,dl + int 0x13 + + xor dx, dx ! drive 0, head 0 + mov cl,#0x02 ! sector 2, track 0 + mov bx,#0x0200 ! address = 512, in INITSEG + mov ah,#0x02 ! service 2, nr of sectors + mov al,setup_sects ! 
(assume all on head 0, track 0) + int 0x13 ! read it + jnc ok_load_setup ! ok - continue + + push ax ! dump error code + call print_nl + mov bp, sp + call print_hex + pop ax + + jmp load_setup + +ok_load_setup: + +! Get disk drive parameters, specifically nr of sectors/track + +#if 0 + +! bde - the Phoenix BIOS manual says function 0x08 only works for fixed +! disks. It doesn't work for one of my BIOS's (1987 Award). It was +! fatal not to check the error code. + + xor dl,dl + mov ah,#0x08 ! AH=8 is get drive parameters + int 0x13 + xor ch,ch +#else + +! It seems that there is no BIOS call to get the number of sectors. Guess +! 36 sectors if sector 36 can be read, 18 sectors if sector 18 can be read, +! 15 if sector 15 can be read. Otherwise guess 9. + + mov si,#disksizes ! table of sizes to try + +probe_loop: + lodsb + cbw ! extend to word + mov sectors, ax + cmp si,#disksizes+4 + jae got_sectors ! if all else fails, try 9 + xchg ax, cx ! cx = track and sector + xor dx, dx ! drive 0, head 0 + xor bl, bl + mov bh,setup_sects + inc bh + shl bh,#1 ! address after setup (es = cs) + mov ax,#0x0201 ! service 2, 1 sector + int 0x13 + jc probe_loop ! try next value + +#endif + +got_sectors: + +! Restore es + + mov ax,#INITSEG + mov es,ax + +! Print some inane message + + mov ah,#0x03 ! read cursor pos + xor bh,bh + int 0x10 + + mov cx,#9 + mov bx,#0x0007 ! page 0, attribute 7 (normal) + mov bp,#msg1 + mov ax,#0x1301 ! write string, move cursor + int 0x10 + +! ok, we've written the message, now +! we want to load the system (at 0x10000) + + mov ax,#SYSSEG + mov es,ax ! segment of 0x010000 + call read_it + call kill_motor + call print_nl + +! After that we check which root-device to use. If the device is +! defined (!= 0), nothing is done and the given device is used. +! Otherwise, one of /dev/fd0H2880 (2,32) or /dev/PS0 (2,28) or /dev/at0 (2,8), +! depending on the number of sectors we pretend to know we have. + + seg cs + mov ax,root_dev + or ax,ax + jne root_defined + seg cs + mov bx,sectors + mov ax,#0x0208 ! /dev/ps0 - 1.2Mb + cmp bx,#15 + je root_defined + mov al,#0x1c ! /dev/PS0 - 1.44Mb + cmp bx,#18 + je root_defined + mov al,#0x20 ! /dev/fd0H2880 - 2.88Mb + cmp bx,#36 + je root_defined + mov al,#0 ! /dev/fd0 - autodetect +root_defined: + seg cs + mov root_dev,ax + +! after that (everything loaded), we jump to +! the setup-routine loaded directly after +! the bootblock: + + jmpi 0,SETUPSEG + +! This routine loads the system at address 0x10000, making sure +! no 64kB boundaries are crossed. We try to load it as fast as +! possible, loading whole tracks whenever we can. +! +! in: es - starting address segment (normally 0x1000) +! +sread: .word 0 ! sectors read of current track +head: .word 0 ! current head +track: .word 0 ! current track + +read_it: + mov al,setup_sects + inc al + mov sread,al + mov ax,es + test ax,#0x0fff +die: jne die ! es must be at 64kB boundary + xor bx,bx ! bx is starting address within segment +rp_read: + mov ax,es + sub ax,#SYSSEG + cmp ax,syssize ! have we loaded all yet? 
+ jbe ok1_read + ret +ok1_read: + mov ax,sectors + sub ax,sread + mov cx,ax + shl cx,#9 + add cx,bx + jnc ok2_read + je ok2_read + xor ax,ax + sub ax,bx + shr ax,#9 +ok2_read: + call read_track + mov cx,ax + add ax,sread + cmp ax,sectors + jne ok3_read + mov ax,#1 + sub ax,head + jne ok4_read + inc track +ok4_read: + mov head,ax + xor ax,ax +ok3_read: + mov sread,ax + shl cx,#9 + add bx,cx + jnc rp_read + mov ax,es + add ah,#0x10 + mov es,ax + xor bx,bx + jmp rp_read + +read_track: + pusha + pusha + mov ax, #0xe2e ! loading... message 2e = . + mov bx, #7 + int 0x10 + popa + + mov dx,track + mov cx,sread + inc cx + mov ch,dl + mov dx,head + mov dh,dl + and dx,#0x0100 + mov ah,#2 + + push dx ! save for error dump + push cx + push bx + push ax + + int 0x13 + jc bad_rt + add sp, #8 + popa + ret + +bad_rt: push ax ! save error code + call print_all ! ah = error, al = read + + + xor ah,ah + xor dl,dl + int 0x13 + + + add sp, #10 + popa + jmp read_track + +/* + * print_all is for debugging purposes. + * It will print out all of the registers. The assumption is that this is + * called from a routine, with a stack frame like + * dx + * cx + * bx + * ax + * error + * ret <- sp + * +*/ + +print_all: + mov cx, #5 ! error code + 4 registers + mov bp, sp + +print_loop: + push cx ! save count left + call print_nl ! nl for readability + + cmp cl, #5 + jae no_reg ! see if register name is needed + + mov ax, #0xe05 + 'A - 1 + sub al, cl + int 0x10 + + mov al, #'X + int 0x10 + + mov al, #': + int 0x10 + +no_reg: + add bp, #2 ! next register + call print_hex ! print it + pop cx + loop print_loop + ret + +print_nl: + mov ax, #0xe0d ! CR + int 0x10 + mov al, #0xa ! LF + int 0x10 + ret + +/* + * print_hex is for debugging purposes, and prints the word + * pointed to by ss:bp in hexadecimal. +*/ + +print_hex: + mov cx, #4 ! 4 hex digits + mov dx, (bp) ! load word into dx +print_digit: + rol dx, #4 ! rotate so that lowest 4 bits are used + mov ax, #0xe0f ! ah = request, al = mask for nybble + and al, dl + add al, #0x90 ! convert al to ascii hex (four instructions) + daa + adc al, #0x40 + daa + int 0x10 + loop print_digit + ret + + +/* + * This procedure turns off the floppy drive motor, so + * that we enter the kernel in a known state, and + * don't have to worry about it later. + */ +kill_motor: + push dx + mov dx,#0x3f2 + xor al, al + outb + pop dx + ret + +sectors: + .word 0 + +disksizes: + .byte 36,18,15,9 + +msg1: + .byte 13,10 + .ascii "Loading" + +.org 497 +setup_sects: + .byte SETUPSECS +root_flags: + .word CONFIG_ROOT_RDONLY +syssize: + .word SYSSIZE +swap_dev: + .word SWAP_DEV +ram_size: + .word RAMDISK +vid_mode: + .word SVGA_MODE +root_dev: + .word ROOT_DEV +boot_flag: + .word 0xAA55 diff --git a/arch/i386/dummy.c b/arch/i386/dummy.c new file mode 100644 index 000000000..dd2410a7a --- /dev/null +++ b/arch/i386/dummy.c @@ -0,0 +1,11 @@ +/* + * This file handles Systemcalls not available for all CPUs. + * + * Written by Ralf Baechle, + * Copyright (C) 1994 by Waldorf GMBH + */ + +/* + * Nothing yet for i386... + */ + diff --git a/arch/i386/head.S b/arch/i386/head.S new file mode 100644 index 000000000..e720c14d0 --- /dev/null +++ b/arch/i386/head.S @@ -0,0 +1,349 @@ +/* + * linux/boot/head.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * head.S contains the 32-bit startup code. 
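Looking back at the sector-count probe in bootsect.S above: with no reliable BIOS call for floppy geometry, the loader reads the highest sector of each candidate format and keeps the first that succeeds. A hedged C rendering of the same fallback logic; bios_read_sector() is a hypothetical stand-in for the INT 0x13 read:

	static int disksizes[] = { 36, 18, 15, 9 };	/* 2.88M, 1.44M, 1.2M, fallback */

	static int probe_sectors_per_track(void)
	{
		int i;

		for (i = 0; i < 3; i++) {
			/* if the last sector of this format reads cleanly,
			   the medium has at least this many sectors/track */
			if (bios_read_sector(0 /* drive */, 0 /* head */,
					     0 /* track */, disksizes[i]))
				return disksizes[i];
		}
		return disksizes[3];	/* all probes failed: assume 9 */
	}
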
+ */ + +.text +.globl _idt,_gdt, +.globl _swapper_pg_dir,_pg0 +.globl _empty_bad_page +.globl _empty_bad_page_table +.globl _empty_zero_page +.globl _floppy_track_buffer + +#include <linux/tasks.h> +#include <linux/segment.h> +#define ASSEMBLER +#include <linux/fd.h> + +#define CL_MAGIC_ADDR 0x90020 +#define CL_MAGIC 0xA33F +#define CL_BASE_ADDR 0x90000 +#define CL_OFFSET 0x90022 + +/* + * swapper_pg_dir is the main page directory, address 0x00001000 (or at + * address 0x00101000 for a compressed boot). + */ +startup_32: + cld + movl $(KERNEL_DS),%eax + mov %ax,%ds + mov %ax,%es + mov %ax,%fs + mov %ax,%gs + lss _stack_start,%esp +/* + * Clear BSS first so that there are no surprises... + */ + xorl %eax,%eax + movl $__edata,%edi + movl $__end,%ecx + subl %edi,%ecx + cld + rep + stosb +/* + * start system 32-bit setup. We need to re-do some of the things done + * in 16-bit mode for the "real" operations. + */ + call setup_idt + xorl %eax,%eax +1: incl %eax # check that A20 really IS enabled + movl %eax,0x000000 # loop forever if it isn't + cmpl %eax,0x100000 + je 1b +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ + pushl $0 + popfl +/* + * Copy bootup parameters out of the way. First 2kB of + * _empty_zero_page is for boot parameters, second 2kB + * is for the command line. + */ + movl $0x90000,%esi + movl $_empty_zero_page,%edi + movl $512,%ecx + cld + rep + movsl + xorl %eax,%eax + movl $512,%ecx + rep + stosl + cmpw $(CL_MAGIC),CL_MAGIC_ADDR + jne 1f + movl $_empty_zero_page+2048,%edi + movzwl CL_OFFSET,%esi + addl $(CL_BASE_ADDR),%esi + movl $2048,%ecx + rep + movsb +1: +/* check if it is 486 or 386. */ +/* + * XXX - this does a lot of unnecessary setup. Alignment checks don't + * apply at our cpl of 0 and the stack ought to be aligned already, and + * we don't need to preserve eflags. + */ + movl %esp,%edi # save stack pointer + andl $0xfffffffc,%esp # align stack to avoid AC fault + movl $3,_x86 + pushfl # push EFLAGS + popl %eax # get EFLAGS + movl %eax,%ecx # save original EFLAGS + xorl $0x40000,%eax # flip AC bit in EFLAGS + pushl %eax # copy to EFLAGS + popfl # set EFLAGS + pushfl # get new EFLAGS + popl %eax # put it in eax + xorl %ecx,%eax # change in flags + andl $0x40000,%eax # check if AC bit changed + je is386 + movl $4,_x86 + movl %ecx,%eax + xorl $0x200000,%eax # check ID flag + pushl %eax + popfl # if we are on a straight 486DX, SX, or + pushfl # 487SX we can't change it + popl %eax + xorl %ecx,%eax + andl $0x200000,%eax + je is486 +isnew: pushl %ecx # restore original EFLAGS + popfl + movl $1, %eax # Use the CPUID instruction to + .byte 0x0f, 0xa2 # check the processor type + andl $0xf00, %eax # Set _x86 with the family + shrl $8, %eax # returned. 
+ movl %eax, _x86 + movl %edi,%esp # restore esp + movl %cr0,%eax # 486+ + andl $0x80000011,%eax # Save PG,PE,ET + orl $0x50022,%eax # set AM, WP, NE and MP + jmp 2f +is486: pushl %ecx # restore original EFLAGS + popfl + movl %edi,%esp # restore esp + movl %cr0,%eax # 486 + andl $0x80000011,%eax # Save PG,PE,ET + orl $0x50022,%eax # set AM, WP, NE and MP + jmp 2f +is386: pushl %ecx # restore original EFLAGS + popfl + movl %edi,%esp # restore esp + movl %cr0,%eax # 386 + andl $0x80000011,%eax # Save PG,PE,ET + orl $2,%eax # set MP +2: movl %eax,%cr0 + call check_x87 + call setup_paging + lgdt gdt_descr + lidt idt_descr + ljmp $(KERNEL_CS),$1f +1: movl $(KERNEL_DS),%eax # reload all the segment registers + mov %ax,%ds # after changing gdt. + mov %ax,%es + mov %ax,%fs + mov %ax,%gs + lss _stack_start,%esp + xorl %eax,%eax + lldt %ax + pushl %eax # These are the parameters to main :-) + pushl %eax + pushl %eax + cld # gcc2 wants the direction flag cleared at all times + call _start_kernel +L6: + jmp L6 # main should never return here, but + # just in case, we know what happens. + +/* + * We depend on ET to be correct. This checks for 287/387. + */ +check_x87: + movl $0,_hard_math + clts + fninit + fstsw %ax + cmpb $0,%al + je 1f + movl %cr0,%eax /* no coprocessor: have to set bits */ + xorl $4,%eax /* set EM */ + movl %eax,%cr0 + ret +.align 2 +1: movl $1,_hard_math + .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ + ret + +/* + * setup_idt + * + * sets up a idt with 256 entries pointing to + * ignore_int, interrupt gates. It doesn't actually load + * idt - that can be done only after paging has been enabled + * and the kernel moved to 0xC0000000. Interrupts + * are enabled elsewhere, when we can be relatively + * sure everything is ok. + */ +setup_idt: + lea ignore_int,%edx + movl $(KERNEL_CS << 16),%eax + movw %dx,%ax /* selector = 0x0010 = cs */ + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + + lea _idt,%edi + mov $256,%ecx +rp_sidt: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + dec %ecx + jne rp_sidt + ret + + +/* + * Setup_paging + * + * This routine sets up paging by setting the page bit + * in cr0. The page tables are set up, identity-mapping + * the first 4MB. The rest are initialized later. + * + * (ref: added support for up to 32mb, 17Apr92) -- Rik Faith + * (ref: update, 25Sept92) -- croutons@crunchy.uucp + * (ref: 92.10.11 - Linus Torvalds. Corrected 16M limit - no upper memory limit) + */ +.align 2 +setup_paging: + movl $1024*2,%ecx /* 2 pages - swapper_pg_dir+1 page table */ + xorl %eax,%eax + movl $_swapper_pg_dir,%edi /* swapper_pg_dir is at 0x1000 */ + cld;rep;stosl +/* Identity-map the kernel in low 4MB memory for ease of transition */ + movl $_pg0+7,_swapper_pg_dir /* set present bit/user r/w */ +/* But the real place is at 0xC0000000 */ + movl $_pg0+7,_swapper_pg_dir+3072 /* set present bit/user r/w */ + movl $_pg0+4092,%edi + movl $0x03ff007,%eax /* 4Mb - 4096 + 7 (r/w user,p) */ + std +1: stosl /* fill the page backwards - more efficient :-) */ + subl $0x1000,%eax + jge 1b + cld + movl $_swapper_pg_dir,%eax + movl %eax,%cr3 /* cr3 - page directory start */ + movl %cr0,%eax + orl $0x80000000,%eax + movl %eax,%cr0 /* set paging (PG) bit */ + ret /* this also flushes the prefetch-queue */ + +/* + * page 0 is made non-existent, so that kernel NULL pointer references get + * caught. 
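For reference, the constants in setup_paging above are raw page-table entries: bits 12 and up hold the physical frame address and the low 12 bits are flags. A small sketch decoding the fill value; the macro names are illustrative, not from this tree:

	#define PTE_PRESENT	0x001	/* page is mapped */
	#define PTE_RW		0x002	/* writable */
	#define PTE_USER	0x004	/* reachable from user mode */

	/* 0x03ff007 = frame 0x3ff000 (the last 4kB page below 4MB) | 7;
	   setup_paging fills pg0 backwards, subtracting 0x1000 per entry,
	   so entry 0 ends up mapping physical address 0 with the same
	   present/rw/user flags. */
	unsigned long pte = 0x3ff000 | PTE_PRESENT | PTE_RW | PTE_USER;
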
Thus the swapper page directory has been moved to 0x1000 + * + * XXX Actually, the swapper page directory is at 0x1000 plus 1 megabyte, + * with the introduction of the compressed boot code. Theoretically, + * the original design of overlaying the startup code with the swapper + * page directory is still possible --- it would reduce the size of the kernel + * by 2-3k. This would be a good thing to do at some point..... + */ +.org 0x1000 +_swapper_pg_dir: +/* + * The page tables are initialized to only 4MB here - the final page + * tables are set up later depending on memory size. + */ +.org 0x2000 +_pg0: + +.org 0x3000 +_empty_bad_page: + +.org 0x4000 +_empty_bad_page_table: + +.org 0x5000 +_empty_zero_page: + +.org 0x6000 +/* + * floppy_track_buffer is used to buffer one track of floppy data: it + * has to be separate from the tmp_floppy area, as otherwise a single- + * sector read/write can mess it up. It can contain one full cylinder (sic) of + * data (36*2*512 bytes). + */ +_floppy_track_buffer: + .fill 512*2*MAX_BUFFER_SECTORS,1,0 + +/* This is the default interrupt "handler" :-) */ +int_msg: + .asciz "Unknown interrupt\n" +.align 2 +ignore_int: + cld + pushl %eax + pushl %ecx + pushl %edx + push %ds + push %es + push %fs + movl $(KERNEL_DS),%eax + mov %ax,%ds + mov %ax,%es + mov %ax,%fs + pushl $int_msg + call _printk + popl %eax + pop %fs + pop %es + pop %ds + popl %edx + popl %ecx + popl %eax + iret + +/* + * The interrupt descriptor table has room for 256 idt's + */ +.align 4 +.word 0 +idt_descr: + .word 256*8-1 # idt contains 256 entries + .long 0xc0000000+_idt + +.align 4 +_idt: + .fill 256,8,0 # idt is uninitialized + +.align 4 +.word 0 +gdt_descr: + .word (8+2*NR_TASKS)*8-1 + .long 0xc0000000+_gdt + +/* + * This gdt setup gives the kernel a 1GB address space at virtual + * address 0xC0000000 - space enough for expansion, I hope. + */ +.align 4 +_gdt: + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* not used */ + .quad 0xc0c39a000000ffff /* 0x10 kernel 1GB code at 0xC0000000 */ + .quad 0xc0c392000000ffff /* 0x18 kernel 1GB data at 0xC0000000 */ + .quad 0x00cbfa000000ffff /* 0x23 user 3GB code at 0x00000000 */ + .quad 0x00cbf2000000ffff /* 0x2b user 3GB data at 0x00000000 */ + .quad 0x0000000000000000 /* not used */ + .quad 0x0000000000000000 /* not used */ + .fill 2*NR_TASKS,8,0 /* space for LDT's and TSS's etc */ diff --git a/arch/i386/ioport.c b/arch/i386/ioport.c new file mode 100644 index 000000000..c61690e3c --- /dev/null +++ b/arch/i386/ioport.c @@ -0,0 +1,194 @@ +/* + * linux/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> + +static unsigned long ioport_registrar[IO_BITMAP_SIZE] = {0, /* ... */}; + +#define _IODEBUG + +#ifdef IODEBUG +static char * ios(unsigned long l) +{ + static char str[33] = { '\0' }; + int i; + unsigned long mask; + + for (i = 0, mask = 0x80000000; i < 32; ++i, mask >>= 1) + str[i] = (l & mask) ? '1' : '0'; + return str; +} + +static void dump_io_bitmap(void) +{ + int i, j; + int numl = sizeof(current->tss.io_bitmap) >> 2; + + for (i = j = 0; j < numl; ++i) + { + printk("%4d [%3x]: ", 64*i, 64*i); + printk("%s ", ios(current->tss.io_bitmap[j++])); + if (j < numl) + printk("%s", ios(current->tss.io_bitmap[j++])); + printk("\n"); + } +} +#endif + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. 
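A worked example of the word-splitting in the routine below (numbers illustrative):

	/* set_bitmap(map, 5, 40, 1):
	 *   low_index = 5, length = 5 + 40 = 45
	 *   head: map[0] |= ~0 << 5;        27 bits (5..31), length -> 13
	 *   body: no full 32-bit word left, the while-loop is skipped
	 *   tail: map[1] |= ~(~0 << 13);    13 bits (0..12)
	 * total 27 + 13 = 40 bits set, covering bit 5 through bit 44. */
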
*/ +asmlinkage void set_bitmap(unsigned long *bitmap, + short base, short extent, int new_value) +{ + int mask; + unsigned long *bitmap_base = bitmap + (base >> 5); + unsigned short low_index = base & 0x1f; + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0 << low_index); + if (length < 32) + mask &= ~(~0 << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= 32; + } + + mask = (new_value ? ~0 : 0); + while (length >= 32) { + *bitmap_base++ = mask; + length -= 32; + } + + if (length > 0) { + mask = ~(~0 << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + +/* Check for set bits in BITMAP starting at BASE, going to EXTENT. */ +asmlinkage int check_bitmap(unsigned long *bitmap, short base, short extent) +{ + int mask; + unsigned long *bitmap_base = bitmap + (base >> 5); + unsigned short low_index = base & 0x1f; + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0 << low_index); + if (length < 32) + mask &= ~(~0 << length); + if (*bitmap_base++ & mask) + return 1; + length -= 32; + } + while (length >= 32) { + if (*bitmap_base++ != 0) + return 1; + length -= 32; + } + + if (length > 0) { + mask = ~(~0 << length); + if (*bitmap_base++ & mask) + return 1; + } + return 0; +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + if (from + num <= from) + return -EINVAL; + if (from + num > IO_BITMAP_SIZE*32) + return -EINVAL; + if (!suser()) + return -EPERM; + +#ifdef IODEBUG + printk("io: from=%d num=%d %s\n", from, num, (turn_on ? "on" : "off")); +#endif + set_bitmap((unsigned long *)current->tss.io_bitmap, from, num, !turn_on); + return 0; +} + +unsigned int *stack; + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ +asmlinkage int sys_iopl(long ebx,long ecx,long edx, + long esi, long edi, long ebp, long eax, long ds, + long es, long fs, long gs, long orig_eax, + long eip,long cs,long eflags,long esp,long ss) +{ + unsigned int level = ebx; + + if (level > 3) + return -EINVAL; + if (!suser()) + return -EPERM; + *(&eflags) = (eflags & 0xffffcfff) | (level << 12); + return 0; +} + + +void snarf_region(unsigned int from, unsigned int num) +{ + if (from > IO_BITMAP_SIZE*32) + return; + if (from + num > IO_BITMAP_SIZE*32) + num = IO_BITMAP_SIZE*32 - from; + set_bitmap(ioport_registrar, from, num, 1); + return; +} + +void release_region(unsigned int from, unsigned int num) +{ + if (from > IO_BITMAP_SIZE*32) + return; + if (from + num > IO_BITMAP_SIZE*32) + num = IO_BITMAP_SIZE*32 - from; + set_bitmap(ioport_registrar, from, num, 0); + return; +} + +int check_region(unsigned int from, unsigned int num) +{ + if (from > IO_BITMAP_SIZE*32) + return 0; + if (from + num > IO_BITMAP_SIZE*32) + num = IO_BITMAP_SIZE*32 - from; + return check_bitmap(ioport_registrar, from, num); +} + +/* Called from init/main.c to reserve IO ports. 
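By way of example (a hedged sketch): a command line of reserve=0x300,32 reaches the routine below, once get_options() in this commit's main.c has parsed it, as ints[0] = 2, ints[1] = 0x300, ints[2] = 32, so the loop calls snarf_region(0x300, 32) and a later check_region() on that range reports it busy:

	int ints[11] = { 2, 0x300, 32 };	/* as filled in by get_options() */

	reserve_setup("", ints);		/* -> snarf_region(0x300, 32) */
	if (check_region(0x300, 32))
		printk("ports 0x300-0x31f already reserved\n");
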
*/ +void reserve_setup(char *str, int *ints) +{ + int i; + + for (i = 1; i < ints[0]; i += 2) + snarf_region(ints[i], ints[i+1]); +} diff --git a/arch/i386/irq.c b/arch/i386/irq.c new file mode 100644 index 000000000..2de16db53 --- /dev/null +++ b/arch/i386/irq.c @@ -0,0 +1,354 @@ +/* + * linux/kernel/irq.c + * + * Copyright (C) 1992 Linus Torvalds + * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQ's should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + */ + +/* + * IRQ's are in fact implemented a bit like signal handlers for the kernel. + * The same sigaction struct is used, and with similar semantics (ie there + * is a SA_INTERRUPT flag etc). Naturally it's not a 1:1 relation, but there + * are similarities. + * + * sa_handler(int irq_NR) is the default function called (0 if no). + * sa_mask is horribly ugly (I won't even mention it) + * sa_flags contains various info: SA_INTERRUPT etc + * sa_restorer is the unused + */ + +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/interrupt.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/irq.h> + +#define CR0_NE 32 + +static unsigned char cache_21 = 0xff; +static unsigned char cache_A1 = 0xff; + +unsigned long intr_count = 0; +unsigned long bh_active = 0; +unsigned long bh_mask = 0xFFFFFFFF; +struct bh_struct bh_base[32]; + +void disable_irq(unsigned int irq_nr) +{ + unsigned long flags; + unsigned char mask; + + mask = 1 << (irq_nr & 7); + save_flags(flags); + if (irq_nr < 8) { + cli(); + cache_21 |= mask; + outb(cache_21,0x21); + restore_flags(flags); + return; + } + cli(); + cache_A1 |= mask; + outb(cache_A1,0xA1); + restore_flags(flags); +} + +void enable_irq(unsigned int irq_nr) +{ + unsigned long flags; + unsigned char mask; + + mask = ~(1 << (irq_nr & 7)); + save_flags(flags); + if (irq_nr < 8) { + cli(); + cache_21 &= mask; + outb(cache_21,0x21); + restore_flags(flags); + return; + } + cli(); + cache_A1 &= mask; + outb(cache_A1,0xA1); + restore_flags(flags); +} + +/* + * do_bottom_half() runs at normal kernel priority: all interrupts + * enabled. do_bottom_half() is atomic with respect to itself: a + * bottom_half handler need not be re-entrant. + */ +asmlinkage void do_bottom_half(void) +{ + unsigned long active; + unsigned long mask, left; + struct bh_struct *bh; + + bh = bh_base; + active = bh_active & bh_mask; + for (mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) { + if (mask & active) { + void (*fn)(void *); + bh_active &= ~mask; + fn = bh->routine; + if (!fn) + goto bad_bh; + fn(bh->data); + } + } + return; +bad_bh: + printk ("irq.c:bad bottom half entry\n"); +} + +/* + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that do all + * the operations that are needed to keep the AT interrupt-controller + * happy. They are also written to be fast - and to disable interrupts + * as little as humanly possible. + * + * NOTE! These macros expand to three different handlers for each line: one + * complete handler that does all the fancy stuff (including signal handling), + * and one fast handler that is meant for simple IRQ's that want to be + * atomic. 
The specific handler is chosen depending on the SA_INTERRUPT + * flag when installing a handler. Finally, one "bad interrupt" handler, that + * is used when no handler is present. + */ +BUILD_IRQ(FIRST,0,0x01) +BUILD_IRQ(FIRST,1,0x02) +BUILD_IRQ(FIRST,2,0x04) +BUILD_IRQ(FIRST,3,0x08) +BUILD_IRQ(FIRST,4,0x10) +BUILD_IRQ(FIRST,5,0x20) +BUILD_IRQ(FIRST,6,0x40) +BUILD_IRQ(FIRST,7,0x80) +BUILD_IRQ(SECOND,8,0x01) +BUILD_IRQ(SECOND,9,0x02) +BUILD_IRQ(SECOND,10,0x04) +BUILD_IRQ(SECOND,11,0x08) +BUILD_IRQ(SECOND,12,0x10) +BUILD_IRQ(SECOND,13,0x20) +BUILD_IRQ(SECOND,14,0x40) +BUILD_IRQ(SECOND,15,0x80) + +/* + * Pointers to the low-level handlers: first the general ones, then the + * fast ones, then the bad ones. + */ +static void (*interrupt[16])(void) = { + IRQ0_interrupt, IRQ1_interrupt, IRQ2_interrupt, IRQ3_interrupt, + IRQ4_interrupt, IRQ5_interrupt, IRQ6_interrupt, IRQ7_interrupt, + IRQ8_interrupt, IRQ9_interrupt, IRQ10_interrupt, IRQ11_interrupt, + IRQ12_interrupt, IRQ13_interrupt, IRQ14_interrupt, IRQ15_interrupt +}; + +static void (*fast_interrupt[16])(void) = { + fast_IRQ0_interrupt, fast_IRQ1_interrupt, + fast_IRQ2_interrupt, fast_IRQ3_interrupt, + fast_IRQ4_interrupt, fast_IRQ5_interrupt, + fast_IRQ6_interrupt, fast_IRQ7_interrupt, + fast_IRQ8_interrupt, fast_IRQ9_interrupt, + fast_IRQ10_interrupt, fast_IRQ11_interrupt, + fast_IRQ12_interrupt, fast_IRQ13_interrupt, + fast_IRQ14_interrupt, fast_IRQ15_interrupt +}; + +static void (*bad_interrupt[16])(void) = { + bad_IRQ0_interrupt, bad_IRQ1_interrupt, + bad_IRQ2_interrupt, bad_IRQ3_interrupt, + bad_IRQ4_interrupt, bad_IRQ5_interrupt, + bad_IRQ6_interrupt, bad_IRQ7_interrupt, + bad_IRQ8_interrupt, bad_IRQ9_interrupt, + bad_IRQ10_interrupt, bad_IRQ11_interrupt, + bad_IRQ12_interrupt, bad_IRQ13_interrupt, + bad_IRQ14_interrupt, bad_IRQ15_interrupt +}; + +/* + * Initial irq handlers. + */ +static struct sigaction irq_sigaction[16] = { + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL } +}; + +int get_irq_list(char *buf) +{ + int i, len = 0; + struct sigaction * sa = irq_sigaction; + + for (i = 0 ; i < 16 ; i++, sa++) { + if (!sa->sa_handler) + continue; + len += sprintf(buf+len, "%2d: %8d %c %s\n", + i, kstat.interrupts[i], + (sa->sa_flags & SA_INTERRUPT) ? '+' : ' ', + (char *) sa->sa_mask); + } + return len; +} + +/* + * do_IRQ handles IRQ's that have been installed without the + * SA_INTERRUPT flag: it uses the full signal-handling return + * and runs with other interrupts enabled. All relatively slow + * IRQ's should use this format: notably the keyboard/timer + * routines. + */ +asmlinkage void do_IRQ(int irq, struct pt_regs * regs) +{ + struct sigaction * sa = irq + irq_sigaction; + + kstat.interrupts[irq]++; + sa->sa_handler((int) regs); +} + +/* + * do_fast_IRQ handles IRQ's that don't need the fancy interrupt return + * stuff - the handler is also running with interrupts disabled unless + * it explicitly enables them later. + */ +asmlinkage void do_fast_IRQ(int irq) +{ + struct sigaction * sa = irq + irq_sigaction; + + kstat.interrupts[irq]++; + sa->sa_handler(irq); +} + +/* + * Using "struct sigaction" is slightly silly, but there + * are historical reasons and it works well, so.. 
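As a usage sketch (the driver name and handler are hypothetical): a driver claims a line through request_irq(), and because the devname pointer rides along in sa_mask it later shows up in get_irq_list() output:

	static void mydev_interrupt(int irq)
	{
		/* SA_INTERRUPT: installed as a "fast" handler, ints disabled */
	}

	static int mydev_init(void)
	{
		if (request_irq(5, mydev_interrupt, SA_INTERRUPT, "mydev"))
			return -EBUSY;		/* IRQ 5 already claimed */
		/* ... and free_irq(5) on shutdown ... */
		return 0;
	}
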
+ */ +static int irqaction(unsigned int irq, struct sigaction * new_sa) +{ + struct sigaction * sa; + unsigned long flags; + + if (irq > 15) + return -EINVAL; + sa = irq + irq_sigaction; + if (sa->sa_handler) + return -EBUSY; + if (!new_sa->sa_handler) + return -EINVAL; + save_flags(flags); + cli(); + *sa = *new_sa; + if (sa->sa_flags & SA_INTERRUPT) + set_intr_gate(0x20+irq,fast_interrupt[irq]); + else + set_intr_gate(0x20+irq,interrupt[irq]); + if (irq < 8) { + cache_21 &= ~(1<<irq); + outb(cache_21,0x21); + } else { + cache_21 &= ~(1<<2); + cache_A1 &= ~(1<<(irq-8)); + outb(cache_21,0x21); + outb(cache_A1,0xA1); + } + restore_flags(flags); + return 0; +} + +int request_irq(unsigned int irq, void (*handler)(int), + unsigned long flags, const char * devname) +{ + struct sigaction sa; + + sa.sa_handler = handler; + sa.sa_flags = flags; + sa.sa_mask = (unsigned long) devname; + sa.sa_restorer = NULL; + return irqaction(irq,&sa); +} + +void free_irq(unsigned int irq) +{ + struct sigaction * sa = irq + irq_sigaction; + unsigned long flags; + + if (irq > 15) { + printk("Trying to free IRQ%d\n",irq); + return; + } + if (!sa->sa_handler) { + printk("Trying to free free IRQ%d\n",irq); + return; + } + save_flags(flags); + cli(); + if (irq < 8) { + cache_21 |= 1 << irq; + outb(cache_21,0x21); + } else { + cache_A1 |= 1 << (irq-8); + outb(cache_A1,0xA1); + } + set_intr_gate(0x20+irq,bad_interrupt[irq]); + sa->sa_handler = NULL; + sa->sa_flags = 0; + sa->sa_mask = 0; + sa->sa_restorer = NULL; + restore_flags(flags); +} + +/* + * Note that on a 486, we don't want to do a SIGFPE on a irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ +static void math_error_irq(int cpl) +{ + outb(0,0xF0); + if (ignore_irq13 || !hard_math) + return; + math_error(); +} + +static void no_action(int cpl) { } + +void init_IRQ(void) +{ + int i; + + for (i = 0; i < 16 ; i++) + set_intr_gate(0x20+i,bad_interrupt[i]); + if (request_irq(2, no_action, SA_INTERRUPT, "cascade")) + printk("Unable to get IRQ2 for cascade\n"); + if (request_irq(13,math_error_irq, 0, "math error")) + printk("Unable to get IRQ13 for math-error handler\n"); + + /* initialize the bottom half routines. 
*/ + for (i = 0; i < 32; i++) { + bh_base[i].routine = NULL; + bh_base[i].data = NULL; + } + bh_active = 0; + intr_count = 0; +} diff --git a/arch/i386/ldt.c b/arch/i386/ldt.c new file mode 100644 index 000000000..dd0e477d4 --- /dev/null +++ b/arch/i386/ldt.c @@ -0,0 +1,103 @@ +/* + * linux/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/ldt.h> + +static int read_ldt(void * ptr, unsigned long bytecount) +{ + int error; + void * address = current->ldt; + unsigned long size; + + if (!ptr) + return -EINVAL; + size = LDT_ENTRIES*LDT_ENTRY_SIZE; + if (!address) { + address = &default_ldt; + size = sizeof(default_ldt); + } + if (size > bytecount) + size = bytecount; + error = verify_area(VERIFY_WRITE, ptr, size); + if (error) + return error; + memcpy_tofs(ptr, address, size); + return size; +} + +static int write_ldt(void * ptr, unsigned long bytecount) +{ + struct modify_ldt_ldt_s ldt_info; + unsigned long *lp; + unsigned long base, limit; + int error, i; + + if (bytecount != sizeof(ldt_info)) + return -EINVAL; + error = verify_area(VERIFY_READ, ptr, sizeof(ldt_info)); + if (error) + return error; + + memcpy_fromfs(&ldt_info, ptr, sizeof(ldt_info)); + + if (ldt_info.contents == 3 || ldt_info.entry_number >= LDT_ENTRIES) + return -EINVAL; + + limit = ldt_info.limit; + base = ldt_info.base_addr; + if (ldt_info.limit_in_pages) + limit *= PAGE_SIZE; + + limit += base; + if (limit < base || limit >= 0xC0000000) + return -EINVAL; + + if (!current->ldt) { + for (i=1 ; i<NR_TASKS ; i++) { + if (task[i] == current) { + if (!(current->ldt = (struct desc_struct*) vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE))) + return -ENOMEM; + set_ldt_desc(gdt+(i<<1)+FIRST_LDT_ENTRY, current->ldt, LDT_ENTRIES); + load_ldt(i); + } + } + } + + lp = (unsigned long *) &current->ldt[ldt_info.entry_number]; + /* Allow LDTs to be cleared by the user.
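The bit-packing just below builds a standard i386 segment descriptor; as a worked example with illustrative values, a present, DPL-3, 32-bit, byte-granular data segment at base 0x1000 with limit 0xfff:

	struct modify_ldt_ldt_s ex;

	ex.base_addr       = 0x1000;
	ex.limit           = 0xfff;	/* byte granularity */
	ex.seg_32bit       = 1;
	ex.contents        = 0;		/* expand-up data */
	ex.read_exec_only  = 0;		/* writable */
	ex.limit_in_pages  = 0;
	ex.seg_not_present = 0;

	/* Fed through the packing below this yields the dword pair
	 *   lp[0] = 0x10000fff   (base 15:0 | limit 15:0)
	 *   lp[1] = 0x0040f200   (base 31:24, D=1, limit 19:16,
	 *                         P=1, DPL=3, S=1, type = r/w data,
	 *                         base 23:16)
	 */
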
*/ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + *lp = 0; + *(lp+1) = 0; + return 0; + } + *lp = ((ldt_info.base_addr & 0x0000ffff) << 16) | + (ldt_info.limit & 0x0ffff); + *(lp+1) = (ldt_info.base_addr & 0xff000000) | + ((ldt_info.base_addr & 0x00ff0000)>>16) | + (ldt_info.limit & 0xf0000) | + (ldt_info.contents << 10) | + ((ldt_info.read_exec_only ^ 1) << 9) | + (ldt_info.seg_32bit << 22) | + (ldt_info.limit_in_pages << 23) | + ((ldt_info.seg_not_present ^1) << 15) | + 0x7000; + return 0; +} + +asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) +{ + if (func == 0) + return read_ldt(ptr, bytecount); + if (func == 1) + return write_ldt(ptr, bytecount); + return -ENOSYS; +} diff --git a/arch/i386/main.c b/arch/i386/main.c new file mode 100644 index 000000000..49606d4de --- /dev/null +++ b/arch/i386/main.c @@ -0,0 +1,481 @@ +/* + * linux/init/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <stdarg.h> + +#include <asm/system.h> +#include <asm/io.h> + +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/tty.h> +#include <linux/head.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/utsname.h> +#include <linux/ioport.h> + +extern unsigned long * prof_buffer; +extern unsigned long prof_len; +extern char edata, end; +extern char *linux_banner; +asmlinkage void lcall7(void); +struct desc_struct default_ldt; + +/* + * we need this inline - forking from kernel space will result + * in NO COPY ON WRITE (!!!), until an execve is executed. This + * is no problem, but for the stack. This is handled by not letting + * main() use the stack at all after fork(). Thus, no function + * calls - which means inline code for fork too, as otherwise we + * would use the stack upon exit from 'fork()'. + * + * Actually only pause and fork are needed inline, so that there + * won't be any messing with the stack from main(), but we define + * some others too. 
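For context, the _syscallN macros referenced below come from <linux/unistd.h> and expand to small int 0x80 stubs; approximately the following (a sketch, details vary between kernel versions):

	/* rough expansion of: static inline _syscall0(int,fork) */
	static inline int fork(void)
	{
		long __res;

		__asm__ volatile ("int $0x80"	/* syscall number in eax */
			: "=a" (__res)
			: "0" (__NR_fork));
		if (__res >= 0)
			return (int) __res;	/* result comes back in eax */
		errno = -__res;
		return -1;
	}

Since the whole call inlines down to a single trap instruction, nothing is pushed on the stack, which is what makes fork() and pause() safe to use here before the child has a stack of its own.
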
+ */ +#define __NR__exit __NR_exit +static inline _syscall0(int,idle) +static inline _syscall0(int,fork) +static inline _syscall0(int,pause) +static inline _syscall0(int,setup) +static inline _syscall0(int,sync) +static inline _syscall0(pid_t,setsid) +static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) +static inline _syscall1(int,dup,int,fd) +static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) +static inline _syscall3(int,open,const char *,file,int,flag,int,mode) +static inline _syscall1(int,close,int,fd) +static inline _syscall1(int,_exit,int,exitcode) +static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) + +static inline pid_t wait(int * wait_stat) +{ + return waitpid(-1,wait_stat,0); +} + +static char printbuf[1024]; + +extern int console_loglevel; + +extern char empty_zero_page[PAGE_SIZE]; +extern void init(void); +extern void init_IRQ(void); +extern void init_modules(void); +extern long console_init(long, long); +extern long kmalloc_init(long,long); +extern long blk_dev_init(long,long); +extern long chr_dev_init(long,long); +extern void sock_init(void); +extern long rd_init(long mem_start, int length); +unsigned long net_dev_init(unsigned long, unsigned long); +extern long bios32_init(long, long); + +extern void hd_setup(char *str, int *ints); +extern void bmouse_setup(char *str, int *ints); +extern void eth_setup(char *str, int *ints); +extern void xd_setup(char *str, int *ints); +extern void mcd_setup(char *str, int *ints); +extern void st_setup(char *str, int *ints); +extern void st0x_setup(char *str, int *ints); +extern void tmc8xx_setup(char *str, int *ints); +extern void t128_setup(char *str, int *ints); +extern void pas16_setup(char *str, int *ints); +extern void generic_NCR5380_setup(char *str, int *intr); +extern void aha152x_setup(char *str, int *ints); +extern void aha1542_setup(char *str, int *ints); +extern void aha274x_setup(char *str, int *ints); +extern void scsi_luns_setup(char *str, int *ints); +extern void sound_setup(char *str, int *ints); +#ifdef CONFIG_SBPCD +extern void sbpcd_setup(char *str, int *ints); +#endif CONFIG_SBPCD +#ifdef CONFIG_CDU31A +extern void cdu31a_setup(char *str, int *ints); +#endif CONFIG_CDU31A +void ramdisk_setup(char *str, int *ints); + +#ifdef CONFIG_SYSVIPC +extern void ipc_init(void); +#endif +#ifdef CONFIG_SCSI +extern unsigned long scsi_dev_init(unsigned long, unsigned long); +#endif + +/* + * This is set up by the setup-routine at boot-time + */ +#define PARAM empty_zero_page +#define EXT_MEM_K (*(unsigned short *) (PARAM+2)) +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80)) +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0)) +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) +#define RAMDISK_SIZE (*(unsigned short *) (PARAM+0x1F8)) +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC)) +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF)) + +/* + * Boot command-line arguments + */ +void copy_options(char * to, char * from); +void parse_options(char *line); +#define MAX_INIT_ARGS 8 +#define MAX_INIT_ENVS 8 +#define COMMAND_LINE ((char *) (PARAM+2048)) +#define COMMAND_LINE_SIZE 256 + +extern void time_init(void); + +static unsigned long memory_start = 0; /* After mem_init, stores the */ + /* amount of free user memory */ +static unsigned long memory_end = 0; +static unsigned long low_memory_start = 0; + +static char term[21]; +int rows, cols; + +static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +static char * 
envp_init[MAX_INIT_ENVS+2] = { "HOME=/", term, NULL, }; + +static char * argv_rc[] = { "/bin/sh", NULL }; +static char * envp_rc[] = { "HOME=/", term, NULL }; + +static char * argv[] = { "-/bin/sh",NULL }; +static char * envp[] = { "HOME=/usr/root", term, NULL }; + +struct drive_info_struct { char dummy[32]; } drive_info; +struct screen_info screen_info; + +unsigned char aux_device_present; +int ramdisk_size; +int root_mountflags = 0; + +static char fpu_error = 0; + +static char command_line[COMMAND_LINE_SIZE] = { 0, }; + +char *get_options(char *str, int *ints) +{ + char *cur = str; + int i=1; + + while (cur && isdigit(*cur) && i <= 10) { + ints[i++] = simple_strtoul(cur,NULL,0); + if ((cur = strchr(cur,',')) != NULL) + cur++; + } + ints[0] = i-1; + return(cur); +} + +struct { + char *str; + void (*setup_func)(char *, int *); +} bootsetups[] = { + { "reserve=", reserve_setup }, + { "ramdisk=", ramdisk_setup }, +#ifdef CONFIG_INET + { "ether=", eth_setup }, +#endif +#ifdef CONFIG_SCSI + { "max_scsi_luns=", scsi_luns_setup }, +#endif +#ifdef CONFIG_BLK_DEV_HD + { "hd=", hd_setup }, +#endif +#ifdef CONFIG_CHR_DEV_ST + { "st=", st_setup }, +#endif +#ifdef CONFIG_BUSMOUSE + { "bmouse=", bmouse_setup }, +#endif +#ifdef CONFIG_SCSI_SEAGATE + { "st0x=", st0x_setup }, + { "tmc8xx=", tmc8xx_setup }, +#endif +#ifdef CONFIG_SCSI_T128 + { "t128=", t128_setup }, +#endif +#ifdef CONFIG_SCSI_PAS16 + { "pas16=", pas16_setup }, +#endif +#ifdef CONFIG_SCSI_GENERIC_NCR5380 + { "ncr5380=", generic_NCR5380_setup }, +#endif +#ifdef CONFIG_SCSI_AHA152X + { "aha152x=", aha152x_setup}, +#endif +#ifdef CONFIG_SCSI_AHA1542 + { "aha1542=", aha1542_setup}, +#endif +#ifdef CONFIG_SCSI_AHA274X + { "aha274x=", aha274x_setup}, +#endif +#ifdef CONFIG_BLK_DEV_XD + { "xd=", xd_setup }, +#endif +#ifdef CONFIG_MCD + { "mcd=", mcd_setup }, +#endif +#ifdef CONFIG_SOUND + { "sound=", sound_setup }, +#endif +#ifdef CONFIG_SBPCD + { "sbpcd=", sbpcd_setup }, +#endif CONFIG_SBPCD +#ifdef CONFIG_CDU31A + { "cdu31a=", cdu31a_setup }, +#endif CONFIG_CDU31A + { 0, 0 } +}; + +void ramdisk_setup(char *str, int *ints) +{ + if (ints[0] > 0 && ints[1] >= 0) + ramdisk_size = ints[1]; +} + +static int checksetup(char *line) +{ + int i = 0; + int ints[11]; + + while (bootsetups[i].str) { + int n = strlen(bootsetups[i].str); + if (!strncmp(line,bootsetups[i].str,n)) { + bootsetups[i].setup_func(get_options(line+n,ints), ints); + return 1; + } + i++; + } + return 0; +} + +unsigned long loops_per_sec = 1; + +static void calibrate_delay(void) +{ + int ticks; + + printk("Calibrating delay loop.. 
"); + while (loops_per_sec <<= 1) { + ticks = jiffies; + __delay(loops_per_sec); + ticks = jiffies - ticks; + if (ticks >= HZ) { + __asm__("mull %1 ; divl %2" + :"=a" (loops_per_sec) + :"d" (HZ), + "r" (ticks), + "0" (loops_per_sec) + :"dx"); + printk("ok - %lu.%02lu BogoMips\n", + loops_per_sec/500000, + (loops_per_sec/5000) % 100); + return; + } + } + printk("failed\n"); +} + +/* + * parse machine depended options + */ +int parse_machine_options(char *line) +{ + if (!strcmp(line,"no-hlt")) { + hlt_works_ok = 0; + return 1; + } + if (!strcmp(line,"no387")) { + hard_math = 0; + __asm__("movl %%cr0,%%eax\n\t" + "orl $0xE,%%eax\n\t" + "movl %%eax,%%cr0\n\t" : : : "ax"); + return 1; + } +} + +static void copro_timeout(void) +{ + fpu_error = 1; + timer_table[COPRO_TIMER].expires = jiffies+100; + timer_active |= 1<<COPRO_TIMER; + printk("387 failed: trying to reset\n"); + send_sig(SIGFPE, last_task_used_math, 1); + outb_p(0,0xf1); + outb_p(0,0xf0); +} + +static void check_fpu(void) +{ + static double x = 4195835.0; + static double y = 3145727.0; + unsigned short control_word; + int i; + + if (!hard_math) { +#ifndef CONFIG_MATH_EMULATION + printk("No coprocessor found and no math emulation present.\n"); + printk("Giving up.\n"); + for (;;) ; +#endif + return; + } + /* + * check if exception 16 works correctly.. This is truly evil + * code: it disables the high 8 interrupts to make sure that + * the irq13 doesn't happen. But as this will lead to a lockup + * if no exception16 arrives, it depends on the fact that the + * high 8 interrupts will be re-enabled by the next timer tick. + * So the irq13 will happen eventually, but the exception 16 + * should get there first.. + */ + printk("Checking 386/387 coupling... "); + timer_table[COPRO_TIMER].expires = jiffies+50; + timer_table[COPRO_TIMER].fn = copro_timeout; + timer_active |= 1<<COPRO_TIMER; + __asm__("clts ; fninit ; fnstcw %0 ; fwait":"=m" (*&control_word)); + control_word &= 0xffc0; + __asm__("fldcw %0 ; fwait": :"m" (*&control_word)); + outb_p(inb_p(0x21) | (1 << 2), 0x21); + __asm__("fldz ; fld1 ; fdiv %st,%st(1) ; fwait"); + timer_active &= ~(1<<COPRO_TIMER); + if (fpu_error) + return; + if (!ignore_irq13) { + printk("Ok, fpu using old IRQ13 error reporting\n"); + return; + } + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&i) + : "m" (*&x), "m" (*&y)); + if (!i) { + printk("Ok, fpu using exception 16 error reporting.\n"); + return; + + } + printk("Ok, FDIV bug i%c86 system\n", '0'+x86); +} + +static void check_hlt(void) +{ + printk("Checking 'hlt' instruction... "); + if (!hlt_works_ok) { + printk("disabled\n"); + return; + } + __asm__ __volatile__("hlt ; hlt ; hlt ; hlt"); + printk("Ok.\n"); +} + +static void check_bugs(void) +{ + check_fpu(); + check_hlt(); +} + +asmlinkage void start_kernel(void) +{ +/* + * Interrupts are still disabled. 
Do necessary setups, then + * enable them + */ + set_call_gate(&default_ldt,lcall7); + ROOT_DEV = ORIG_ROOT_DEV; + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + aux_device_present = AUX_DEVICE_INFO; + memory_end = (1<<20) + (EXT_MEM_K<<10); + memory_end &= PAGE_MASK; + ramdisk_size = RAMDISK_SIZE; + copy_options(command_line,COMMAND_LINE); +#ifdef CONFIG_MAX_16M + if (memory_end > 16*1024*1024) + memory_end = 16*1024*1024; +#endif + if (MOUNT_ROOT_RDONLY) + root_mountflags |= MS_RDONLY; + if ((unsigned long)&end >= (1024*1024)) { + memory_start = (unsigned long) &end; + low_memory_start = PAGE_SIZE; + } else { + memory_start = 1024*1024; + low_memory_start = (unsigned long) &end; + } + low_memory_start = PAGE_ALIGN(low_memory_start); + memory_start = paging_init(memory_start,memory_end); + if (strncmp((char*)0x0FFFD9, "EISA", 4) == 0) + EISA_bus = 1; + trap_init(); + init_IRQ(); + sched_init(); + parse_options(command_line); + init_modules(); +#ifdef CONFIG_PROFILE + prof_buffer = (unsigned long *) memory_start; + prof_len = (unsigned long) &end; + prof_len >>= 2; + memory_start += prof_len * sizeof(unsigned long); +#endif + memory_start = console_init(memory_start,memory_end); + memory_start = bios32_init(memory_start,memory_end); + memory_start = kmalloc_init(memory_start,memory_end); + memory_start = chr_dev_init(memory_start,memory_end); + memory_start = blk_dev_init(memory_start,memory_end); + sti(); + calibrate_delay(); +#ifdef CONFIG_SCSI + memory_start = scsi_dev_init(memory_start,memory_end); +#endif +#ifdef CONFIG_INET + memory_start = net_dev_init(memory_start,memory_end); +#endif + memory_start = inode_init(memory_start,memory_end); + memory_start = file_table_init(memory_start,memory_end); + memory_start = name_cache_init(memory_start,memory_end); + mem_init(low_memory_start,memory_start,memory_end); + buffer_init(); + time_init(); + sock_init(); +#ifdef CONFIG_SYSVIPC + ipc_init(); +#endif + sti(); + check_bugs(); + + system_utsname.machine[1] = '0' + x86; + printk(linux_banner); + + move_to_user_mode(); + if (!fork()) /* we count on this going ok */ + init(); +/* + * task[0] is meant to be used as an "idle" task: it may not sleep, but + * it might do some general things like count free pages or it could be + * used to implement a reasonable LRU algorithm for the paging routines: + * anything that can be useful, but shouldn't take time from the real + * processes. + * + * Right now task[0] just does a infinite idle loop. + */ + for(;;) + idle(); +} diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile new file mode 100644 index 000000000..5063d60c2 --- /dev/null +++ b/arch/i386/mm/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the linux memory manager. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... 
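+#
+# (Illustrative note, not part of the original patch: 'make mm.o'
+# compiles every file in OBJS through the .c.o suffix rule below and
+# then links the objects relocatably with 'ld -r'; 'make dep' rebuilds
+# .depend, which is included whenever it exists.)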
+ +.c.o: + $(CC) $(CFLAGS) -c $< +.s.o: + $(AS) -o $*.o $< +.c.s: + $(CC) $(CFLAGS) -S $< + +OBJS = memory.o swap.o mmap.o mprotect.o kmalloc.o vmalloc.o + +mm.o: $(OBJS) + $(LD) -r -o mm.o $(OBJS) + +dep: + $(CPP) -M *.c > .depend + +# +# include a dependency file if one exists +# +ifeq (.depend,$(wildcard .depend)) +include .depend +endif diff --git a/arch/i386/mm/kmalloc.c b/arch/i386/mm/kmalloc.c new file mode 100644 index 000000000..018f8db8f --- /dev/null +++ b/arch/i386/mm/kmalloc.c @@ -0,0 +1,362 @@ +/* + * linux/mm/kmalloc.c + * + * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. + * + * Written by R.E. Wolff Sept/Oct '93. + * + */ + +/* + * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple + * pages. So for 'page' throughout, read 'area'. + */ + +#include <linux/mm.h> +#include <asm/system.h> +#include <linux/delay.h> + +#define GFP_LEVEL_MASK 0xf + +/* I want this low enough for a while to catch errors. + I want this number to be increased in the near future: + loadable device drivers should use this function to get memory */ + +#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10) + + +/* This defines how many times we should try to allocate a free page before + giving up. Normally this shouldn't happen at all. */ +#define MAX_GET_FREE_PAGE_TRIES 4 + + +/* Private flags. */ + +#define MF_USED 0xffaa0055 +#define MF_FREE 0x0055ffaa + + +/* + * Much care has gone into making these routines in this file reentrant. + * + * The fancy bookkeeping of nbytesmalloced and the like are only used to + * report them to the user (oooohhhhh, aaaaahhhhh....) are not + * protected by cli(). (If that goes wrong. So what?) + * + * These routines restore the interrupt status to allow calling with ints + * off. + */ + +/* + * A block header. This is in front of every malloc-block, whether free or not. + */ +struct block_header { + unsigned long bh_flags; + union { + unsigned long ubh_length; + struct block_header *fbh_next; + } vp; +}; + + +#define bh_length vp.ubh_length +#define bh_next vp.fbh_next +#define BH(p) ((struct block_header *)(p)) + + +/* + * The page descriptor is at the front of every page that malloc has in use. + */ +struct page_descriptor { + struct page_descriptor *next; + struct block_header *firstfree; + int order; + int nfree; +}; + + +#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) + + +/* + * A size descriptor describes a specific class of malloc sizes. + * Each class of sizes has its own freelist. 
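+ *
+ * Worked example (illustration, not part of the original patch): a
+ * kmalloc(100, priority) request needs 100 + 8 bytes once the
+ * block_header is added, so get_order() below picks the 128-byte
+ * class; that class packs nblocks = 31 blocks per area, and
+ * 31*128 + sizeof(struct page_descriptor) = 3984 bytes fits in one
+ * 4096-byte page.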
+ */ +struct size_descriptor { + struct page_descriptor *firstfree; + int size; + int nblocks; + + int nmallocs; + int nfrees; + int nbytesmalloced; + int npages; + unsigned long gfporder; /* number of pages in the area required */ +}; + +/* + * For now it is unsafe to allocate bucket sizes between n & n=16 where n is + * 4096 * any power of two + */ + +struct size_descriptor sizes[] = { + { NULL, 32,127, 0,0,0,0, 0}, + { NULL, 64, 63, 0,0,0,0, 0 }, + { NULL, 128, 31, 0,0,0,0, 0 }, + { NULL, 252, 16, 0,0,0,0, 0 }, + { NULL, 508, 8, 0,0,0,0, 0 }, + { NULL,1020, 4, 0,0,0,0, 0 }, + { NULL,2040, 2, 0,0,0,0, 0 }, + { NULL,4096-16, 1, 0,0,0,0, 0 }, + { NULL,8192-16, 1, 0,0,0,0, 1 }, + { NULL,16384-16, 1, 0,0,0,0, 2 }, + { NULL,32768-16, 1, 0,0,0,0, 3 }, + { NULL,65536-16, 1, 0,0,0,0, 4 }, + { NULL,131072-16, 1, 0,0,0,0, 5 }, + { NULL, 0, 0, 0,0,0,0, 0 } +}; + + +#define NBLOCKS(order) (sizes[order].nblocks) +#define BLOCKSIZE(order) (sizes[order].size) +#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) + + +long kmalloc_init (long start_mem,long end_mem) +{ + int order; + +/* + * Check the static info array. Things will blow up terribly if it's + * incorrect. This is a late "compile time" check..... + */ +for (order = 0;BLOCKSIZE(order);order++) + { + if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) > + AREASIZE(order)) + { + printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n", + NBLOCKS (order) * BLOCKSIZE(order) + + sizeof (struct page_descriptor), + (int) AREASIZE(order), + BLOCKSIZE (order)); + panic ("This only happens if someone messes with kmalloc"); + } + } +return start_mem; +} + + + +int get_order (int size) +{ + int order; + + /* Add the size of the header */ + size += sizeof (struct block_header); + for (order = 0;BLOCKSIZE(order);order++) + if (size <= BLOCKSIZE (order)) + return order; + return -1; +} + +void * kmalloc (size_t size, int priority) +{ + unsigned long flags; + int order,tries,i,sz; + struct block_header *p; + struct page_descriptor *page; + +/* Sanity check... */ + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("kmalloc called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + +order = get_order (size); +if (order < 0) + { + printk ("kmalloc of too large a block (%d bytes).\n",size); + return (NULL); + } + +save_flags(flags); + +/* It seems VERY unlikely to me that it would be possible that this + loop will get executed more than once. */ +tries = MAX_GET_FREE_PAGE_TRIES; +while (tries --) + { + /* Try to allocate a "recently" freed memory block */ + cli (); + if ((page = sizes[order].firstfree) && + (p = page->firstfree)) + { + if (p->bh_flags == MF_FREE) + { + page->firstfree = p->bh_next; + page->nfree--; + if (!page->nfree) + { + sizes[order].firstfree = page->next; + page->next = NULL; + } + restore_flags(flags); + + sizes [order].nmallocs++; + sizes [order].nbytesmalloced += size; + p->bh_flags = MF_USED; /* As of now this block is officially in use */ + p->bh_length = size; + return p+1; /* Pointer arithmetic: increments past header */ + } + printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p); + return (NULL); + } + restore_flags(flags); + + + /* Now we're in trouble: We need to get a new free page..... 
*/ + + sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */ + + /* This can be done with ints on: This is private to this invocation */ + page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); + if (!page) { + static unsigned long last = 0; + if (last + 10*HZ < jiffies) { + last = jiffies; + printk ("Couldn't get a free page.....\n"); + } + return NULL; + } +#if 0 + printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz); +#endif + sizes[order].npages++; + + /* Loop for all but last block: */ + for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next) + { + p->bh_flags = MF_FREE; + p->bh_next = BH ( ((long)p)+sz); + } + /* Last block: */ + p->bh_flags = MF_FREE; + p->bh_next = NULL; + + page->order = order; + page->nfree = NBLOCKS(order); + page->firstfree = BH(page+1); +#if 0 + printk ("%d blocks per page\n",page->nfree); +#endif + /* Now we're going to muck with the "global" freelist for this size: + this should be uninterruptible */ + cli (); + /* + * sizes[order].firstfree used to be NULL, otherwise we wouldn't be + * here, but you never know.... + */ + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + restore_flags(flags); + } + +/* Pray that printk won't cause this to happen again :-) */ + +printk ("Hey. This is very funny. I tried %d times to allocate a whole\n" + "new page for an object only %d bytes long, but some other process\n" + "beat me to actually allocating it. Also note that this 'error'\n" + "message is soooo very long to catch your attention. I'd appreciate\n" + "it if you'd be so kind as to report what conditions caused this to\n" + "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n" + "(Executive summary: This can't happen)\n", + MAX_GET_FREE_PAGE_TRIES, + size); +return NULL; +} + + +void kfree_s (void *ptr,int size) +{ +unsigned long flags; +int order; +register struct block_header *p=((struct block_header *)ptr) -1; +struct page_descriptor *page,*pg2; + +page = PAGE_DESC (p); +order = page->order; +if ((order < 0) || + (order > sizeof (sizes)/sizeof (sizes[0])) || + (((long)(page->next)) & ~PAGE_MASK) || + (p->bh_flags != MF_USED)) + { + printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", + p, page->next, page->order); + return; + } +if (size && + size != p->bh_length) + { + printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n", + p,size,p->bh_length); + return; + } +size = p->bh_length; +p->bh_flags = MF_FREE; /* As of now this block is officially free */ +save_flags(flags); +cli (); +p->bh_next = page->firstfree; +page->firstfree = p; +page->nfree ++; + +if (page->nfree == 1) + { /* Page went from full to one free block: put it on the freelist */ + if (page->next) + { + printk ("Page %p already on freelist dazed and confused....\n", page); + } + else + { + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + } + } + +/* If page is completely free, free it */ +if (page->nfree == NBLOCKS (page->order)) + { +#if 0 + printk ("Freeing page %08x.\n", (long)page); +#endif + if (sizes[order].firstfree == page) + { + sizes[order].firstfree = page->next; + } + else + { + for (pg2=sizes[order].firstfree; + (pg2 != NULL) && (pg2->next != page); + pg2=pg2->next) + /* Nothing */; + if (pg2 != NULL) + pg2->next = page->next; + else + printk ("Ooops. 
page %p doesn't show on freelist.\n", page); + } +/* FIXME: I'm sure we should do something with npages here (like npages--) */ + free_pages ((long)page, sizes[order].gfporder); + } +restore_flags(flags); + +/* FIXME: ?? Are these increment & decrement operations guaranteed to be + * atomic? Could an IRQ not occur between the read & the write? + * Maybe yes on a x86 with GCC...?? + */ +sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */ +sizes[order].nbytesmalloced -= size; +} diff --git a/arch/i386/mm/memory.c b/arch/i386/mm/memory.c new file mode 100644 index 000000000..3e5a67041 --- /dev/null +++ b/arch/i386/mm/memory.c @@ -0,0 +1,1320 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + */ + +#include <linux/config.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> + +#include <asm/system.h> +#include <asm/segment.h> + +/* + * Define this if things work differently on a i386 and a i486: + * it will (on a i486) warn about kernel memory accesses that are + * done without a 'verify_area(VERIFY_WRITE,..)' + */ +#undef CONFIG_TEST_VERIFY_AREA + +unsigned long high_memory = 0; + +extern unsigned long pg0[1024]; /* page table for 0-4MB for everybody */ + +extern void sound_mem_init(void); +extern void die_if_kernel(char *,struct pt_regs *,long); +extern void show_net_buffers(void); + +/* + * The free_area_list arrays point to the queue heads of the free areas + * of different sizes + */ +int nr_swap_pages = 0; +int nr_free_pages = 0; +struct mem_list free_area_list[NR_MEM_LISTS]; +unsigned char * free_area_map[NR_MEM_LISTS]; + +#define copy_page(from,to) \ +__asm__("cld ; rep ; movsl": :"S" (from),"D" (to),"c" (1024):"cx","di","si") + +unsigned short * mem_map = NULL; + +#define CODE_SPACE(addr,p) ((addr) < (p)->end_code) + +/* + * oom() prints a message (so that the user knows why the process died), + * and gives the process an untrappable SIGKILL. 
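+ *
+ * (Illustrative aside, not in the original patch: oom() below does
+ * both halves on purpose. Clearing the blocked bit makes sure a task
+ * sleeping with SIGKILL masked still gets it, and resetting
+ * sa_handler to NULL restores the default "kill" disposition so the
+ * signal cannot be caught.)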
+ */ +void oom(struct task_struct * task) +{ + printk("\nOut of memory.\n"); + task->sigaction[SIGKILL-1].sa_handler = NULL; + task->blocked &= ~(1<<(SIGKILL-1)); + send_sig(SIGKILL,task,1); +} + +static void free_one_table(unsigned long * page_dir) +{ + int j; + unsigned long pg_table = *page_dir; + unsigned long * page_table; + + if (!pg_table) + return; + *page_dir = 0; + if (pg_table >= high_memory || !(pg_table & PAGE_PRESENT)) { + printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table); + return; + } + if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED) + return; + page_table = (unsigned long *) (pg_table & PAGE_MASK); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) { + unsigned long pg = *page_table; + + if (!pg) + continue; + *page_table = 0; + if (pg & PAGE_PRESENT) + free_page(PAGE_MASK & pg); + else + swap_free(pg); + } + free_page(PAGE_MASK & pg_table); +} + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. Note that + * unlike 'free_page_tables()', this function still leaves a valid + * page-table-tree in memory: it just removes the user pages. The two + * functions are similar, but there is a fundamental difference. + */ +void clear_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) + panic("task[0] (swapper) doesn't support exec()\n"); + pg_dir = tsk->tss.cr3; + page_dir = (unsigned long *) pg_dir; + if (!page_dir || page_dir == swapper_pg_dir) { + printk("Trying to clear kernel page-directory: not good\n"); + return; + } + if (mem_map[MAP_NR(pg_dir)] > 1) { + unsigned long * new_pg; + + if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) { + oom(tsk); + return; + } + for (i = 768 ; i < 1024 ; i++) + new_pg[i] = page_dir[i]; + free_page(pg_dir); + tsk->tss.cr3 = (unsigned long) new_pg; + return; + } + for (i = 0 ; i < 768 ; i++,page_dir++) + free_one_table(page_dir); + invalidate(); + return; +} + +/* + * This function frees up all page tables of a process when it exits. + */ +void free_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) { + printk("task[0] (swapper) killed: unable to recover\n"); + panic("Trying to free up swapper memory space"); + } + pg_dir = tsk->tss.cr3; + if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) { + printk("Trying to free kernel page-directory: not good\n"); + return; + } + tsk->tss.cr3 = (unsigned long) swapper_pg_dir; + if (tsk == current) + __asm__ __volatile__("movl %0,%%cr3": :"a" (tsk->tss.cr3)); + if (mem_map[MAP_NR(pg_dir)] > 1) { + free_page(pg_dir); + return; + } + page_dir = (unsigned long *) pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++) + free_one_table(page_dir); + free_page(pg_dir); + invalidate(); +} + +/* + * clone_page_tables() clones the page table for a process - both + * processes will have the exact same pages in memory. There are + * probably races in the memory management with cloning, but we'll + * see.. + */ +int clone_page_tables(struct task_struct * tsk) +{ + unsigned long pg_dir; + + pg_dir = current->tss.cr3; + mem_map[MAP_NR(pg_dir)]++; + tsk->tss.cr3 = pg_dir; + return 0; +} + +/* + * copy_page_tables() just copies the whole process memory range: + * note the special handling of RESERVED (ie kernel) pages, which + * means that they are always shared by all processes. 
+ */ +int copy_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long old_pg_dir, *old_page_dir; + unsigned long new_pg_dir, *new_page_dir; + + if (!(new_pg_dir = get_free_page(GFP_KERNEL))) + return -ENOMEM; + old_pg_dir = current->tss.cr3; + tsk->tss.cr3 = new_pg_dir; + old_page_dir = (unsigned long *) old_pg_dir; + new_page_dir = (unsigned long *) new_pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) { + int j; + unsigned long old_pg_table, *old_page_table; + unsigned long new_pg_table, *new_page_table; + + old_pg_table = *old_page_dir; + if (!old_pg_table) + continue; + if (old_pg_table >= high_memory || !(old_pg_table & PAGE_PRESENT)) { + printk("copy_page_tables: bad page table: " + "probable memory corruption\n"); + *old_page_dir = 0; + continue; + } + if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) { + *new_page_dir = old_pg_table; + continue; + } + if (!(new_pg_table = get_free_page(GFP_KERNEL))) { + free_page_tables(tsk); + return -ENOMEM; + } + old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table); + new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) { + unsigned long pg; + pg = *old_page_table; + if (!pg) + continue; + if (!(pg & PAGE_PRESENT)) { + *new_page_table = swap_duplicate(pg); + continue; + } + if (pg > high_memory || (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)) { + *new_page_table = pg; + continue; + } + if (pg & PAGE_COW) + pg &= ~PAGE_RW; + if (delete_from_swap_cache(pg)) + pg |= PAGE_DIRTY; + *new_page_table = pg; + *old_page_table = pg; + mem_map[MAP_NR(pg)]++; + } + *new_page_dir = new_pg_table | PAGE_TABLE; + } + invalidate(); + return 0; +} + +/* + * a more complete version of free_page_tables which performs with page + * granularity. + */ +int unmap_page_range(unsigned long from, unsigned long size) +{ + unsigned long page, page_dir; + unsigned long *page_table, *dir; + unsigned long poff, pcnt, pc; + + if (from & ~PAGE_MASK) { + printk("unmap_page_range called with wrong alignment\n"); + return -EINVAL; + } + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + for ( ; size > 0; ++dir, size -= pcnt, + pcnt = (size > PTRS_PER_PAGE ? 
PTRS_PER_PAGE : size)) { + if (!(page_dir = *dir)) { + poff = 0; + continue; + } + if (!(page_dir & PAGE_PRESENT)) { + printk("unmap_page_range: bad page directory."); + continue; + } + page_table = (unsigned long *)(PAGE_MASK & page_dir); + if (poff) { + page_table += poff; + poff = 0; + } + for (pc = pcnt; pc--; page_table++) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (PAGE_PRESENT & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + } + if (pcnt == PTRS_PER_PAGE) { + *dir = 0; + free_page(PAGE_MASK & page_dir); + } + } + invalidate(); + return 0; +} + +int zeromap_page_range(unsigned long from, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) { + printk("zeromap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + mask |= ZERO_PAGE; + } + if (from & ~PAGE_MASK) { + printk("zeromap_page_range: from = %08lx\n",from); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (!(PAGE_PRESENT & *dir)) { + /* clear page needed here? SRB. */ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -ENOMEM; + } + if (PAGE_PRESENT & *dir) { + free_page((unsigned long) page_table); + page_table = (unsigned long *)(PAGE_MASK & *dir++); + } else + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + page_table += poff; + poff = 0; + for (size -= pcnt; pcnt-- ;) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (page & PAGE_PRESENT) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + *page_table++ = mask; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) { + printk("remap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + } + if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) { + printk("remap_page_range: from = %08lx, to=%08lx\n",from,to); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.cr3,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (!(PAGE_PRESENT & *dir)) { + /* clearing page here, needed? SRB. 
*/ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -1; + } + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } + else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + if (poff) { + page_table += poff; + poff = 0; + } + + for (size -= pcnt; pcnt-- ;) { + if ((page = *page_table) != 0) { + *page_table = 0; + if (PAGE_PRESENT & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + + /* + * the first condition should return an invalid access + * when the page is referenced. current assumptions + * cause it to be treated as demand allocation in some + * cases. + */ + if (!mask) + *page_table++ = 0; /* not present */ + else if (to >= high_memory) + *page_table++ = (to | mask); + else if (!mem_map[MAP_NR(to)]) + *page_table++ = 0; /* not present */ + else { + *page_table++ = (to | mask); + if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) { + ++current->mm->rss; + mem_map[MAP_NR(to)]++; + } + } + to += PAGE_SIZE; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * This function puts a page in memory at the wanted address. + * It returns the physical address of the page gotten, 0 if + * out of memory (either when trying to access page-table or + * page.) + */ +unsigned long put_page(struct task_struct * tsk,unsigned long page, + unsigned long address,int prot) +{ + unsigned long *page_table; + + if ((prot & (PAGE_MASK|PAGE_PRESENT)) != PAGE_PRESENT) + printk("put_page: prot = %08x\n",prot); + if (page >= high_memory) { + printk("put_page: trying to put page %08lx at %08lx\n",page,address); + return 0; + } + page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if ((*page_table) & PAGE_PRESENT) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + printk("put_page: bad page directory entry\n"); + oom(tsk); + *page_table = BAD_PAGETABLE | PAGE_TABLE; + return 0; + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_page: page already exists\n"); + *page_table = 0; + invalidate(); + } + *page_table = page | prot; +/* no need for invalidate */ + return page; +} + +/* + * The previous function doesn't work very well if you also want to mark + * the page dirty: exec.c wants this, as it has earlier changed the page, + * and we want the dirty-status to be correct (for VM). Thus the same + * routine, but this time we mark it dirty too. 
+ */ +unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) +{ + unsigned long tmp, *page_table; + + if (page >= high_memory) + printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); + if (mem_map[MAP_NR(page)] != 1) + printk("mem_map disagrees with %08lx at %08lx\n",page,address); + page_table = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *page_table) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + if (!(tmp = get_free_page(GFP_KERNEL))) + return 0; + if (PAGE_PRESENT & *page_table) { + free_page(tmp); + page_table = (unsigned long *) (PAGE_MASK & *page_table); + } else { + *page_table = tmp | PAGE_TABLE; + page_table = (unsigned long *) tmp; + } + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_dirty_page: page already exists\n"); + *page_table = 0; + invalidate(); + } + *page_table = page | (PAGE_DIRTY | PAGE_PRIVATE); +/* no need for invalidate */ + return page; +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + */ +void do_wp_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long *pde, pte, old_page, prot; + unsigned long new_page; + + new_page = __get_free_page(GFP_KERNEL); + pde = PAGE_DIR_OFFSET(vma->vm_task->tss.cr3,address); + pte = *pde; + if (!(pte & PAGE_PRESENT)) + goto end_wp_page; + if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory) + goto bad_wp_pagetable; + pte &= PAGE_MASK; + pte += PAGE_PTR(address); + old_page = *(unsigned long *) pte; + if (!(old_page & PAGE_PRESENT)) + goto end_wp_page; + if (old_page >= high_memory) + goto bad_wp_page; + if (old_page & PAGE_RW) + goto end_wp_page; + vma->vm_task->mm->min_flt++; + prot = (old_page & ~PAGE_MASK) | PAGE_RW | PAGE_DIRTY; + old_page &= PAGE_MASK; + if (mem_map[MAP_NR(old_page)] != 1) { + if (new_page) { + if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED) + ++vma->vm_task->mm->rss; + copy_page(old_page,new_page); + *(unsigned long *) pte = new_page | prot; + free_page(old_page); + invalidate(); + return; + } + free_page(old_page); + oom(vma->vm_task); + *(unsigned long *) pte = BAD_PAGE | prot; + invalidate(); + return; + } + *(unsigned long *) pte |= PAGE_RW | PAGE_DIRTY; + invalidate(); + if (new_page) + free_page(new_page); + return; +bad_wp_page: + printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); + *(unsigned long *) pte = BAD_PAGE | PAGE_SHARED; + send_sig(SIGKILL, vma->vm_task, 1); + goto end_wp_page; +bad_wp_pagetable: + printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n",address,pte); + *pde = BAD_PAGETABLE | PAGE_TABLE; + send_sig(SIGKILL, vma->vm_task, 1); +end_wp_page: + if (new_page) + free_page(new_page); + return; +} + +/* + * Ugly, ugly, but the goto's result in better assembly.. + */ +int verify_area(int type, const void * addr, unsigned long size) +{ + struct vm_area_struct * vma; + unsigned long start = (unsigned long) addr; + + /* If the current user space is mapped to kernel space (for the + * case where we use a fake user buffer with get_fs/set_fs()) we + * don't expect to find the address in the user vm map. 
+ */ + if (get_fs() == get_ds()) + return 0; + + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > start) + break; + } + if (vma->vm_start <= start) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + +good_area: + if (!wp_works_ok && type == VERIFY_WRITE) + goto check_wp_fault_by_hand; + for (;;) { + struct vm_area_struct * next; + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (type != VERIFY_READ && !(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + if (vma->vm_end - start >= size) + return 0; + next = vma->vm_next; + if (!next || vma->vm_end != next->vm_start) + goto bad_area; + vma = next; + } + +check_wp_fault_by_hand: + size--; + size += start & ~PAGE_MASK; + size >>= PAGE_SHIFT; + start &= PAGE_MASK; + + for (;;) { + if (!(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + do_wp_page(vma, start, PAGE_PRESENT); + if (!size) + return 0; + size--; + start += PAGE_SIZE; + if (start < vma->vm_end) + continue; + vma = vma->vm_next; + if (!vma || vma->vm_start != start) + break; + } + +bad_area: + return -EFAULT; +} + +static inline void get_empty_page(struct task_struct * tsk, unsigned long address) +{ + unsigned long tmp; + + if (!(tmp = get_free_page(GFP_KERNEL))) { + oom(tsk); + tmp = BAD_PAGE; + } + if (!put_page(tsk,tmp,address,PAGE_PRIVATE)) + free_page(tmp); +} + +/* + * try_to_share() checks the page at address "address" in the task "p", + * to see if it exists, and if it is clean. If so, share it with the current + * task. + * + * NOTE! This assumes we have checked that p != current, and that they + * share the same inode and can generally otherwise be shared. + */ +static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area, + unsigned long from_address, struct vm_area_struct * from_area, + unsigned long newpage) +{ + unsigned long from; + unsigned long to; + unsigned long from_page; + unsigned long to_page; + + from_page = (unsigned long)PAGE_DIR_OFFSET(from_area->vm_task->tss.cr3,from_address); + to_page = (unsigned long)PAGE_DIR_OFFSET(to_area->vm_task->tss.cr3,to_address); +/* is there a page-directory at from? */ + from = *(unsigned long *) from_page; + if (!(from & PAGE_PRESENT)) + return 0; + from &= PAGE_MASK; + from_page = from + PAGE_PTR(from_address); + from = *(unsigned long *) from_page; +/* is the page present? */ + if (!(from & PAGE_PRESENT)) + return 0; +/* if it is private, it must be clean to be shared */ + if (from & PAGE_DIRTY) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } +/* is the page reasonable at all? */ + if (from >= high_memory) + return 0; + if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED) + return 0; +/* is the destination ok? */ + to = *(unsigned long *) to_page; + if (!(to & PAGE_PRESENT)) + return 0; + to &= PAGE_MASK; + to_page = to + PAGE_PTR(to_address); + if (*(unsigned long *) to_page) + return 0; +/* do we copy? */ + if (newpage) { + if (in_swap_cache(from)) { /* implies PAGE_DIRTY */ + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } + copy_page((from & PAGE_MASK), newpage); + *(unsigned long *) to_page = newpage | to_area->vm_page_prot; + return 1; + } +/* do a final swap-cache test before sharing them.. 
*/ + if (in_swap_cache(from)) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + from |= PAGE_DIRTY; + *(unsigned long *) from_page = from; + delete_from_swap_cache(from); + invalidate(); + } + mem_map[MAP_NR(from)]++; +/* fill in the 'to' field, checking for COW-stuff */ + to = (from & (PAGE_MASK | PAGE_DIRTY)) | to_area->vm_page_prot; + if (to & PAGE_COW) + to &= ~PAGE_RW; + *(unsigned long *) to_page = to; +/* Check if we need to do anything at all to the 'from' field */ + if (!(from & PAGE_RW)) + return 1; + if (!(from_area->vm_page_prot & PAGE_COW)) + return 1; +/* ok, need to mark it read-only, so invalidate any possible old TB entry */ + from &= ~PAGE_RW; + *(unsigned long *) from_page = from; + invalidate(); + return 1; +} + +/* + * share_page() tries to find a process that could share a page with + * the current one. + * + * We first check if it is at all feasible by checking inode->i_count. + * It should be >1 if there are other tasks sharing this inode. + */ +static int share_page(struct vm_area_struct * area, unsigned long address, + unsigned long error_code, unsigned long newpage) +{ + struct inode * inode; + struct task_struct ** p; + unsigned long offset; + unsigned long from_address; + unsigned long give_page; + + if (!area || !(inode = area->vm_inode) || inode->i_count < 2) + return 0; + /* do we need to copy or can we just share? */ + give_page = 0; + if ((area->vm_page_prot & PAGE_COW) && (error_code & PAGE_RW)) { + if (!newpage) + return 0; + give_page = newpage; + } + offset = address - area->vm_start + area->vm_offset; + for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) { + struct vm_area_struct * mpnt; + if (!*p) + continue; + if (area->vm_task == *p) + continue; + /* Now see if there is something in the VMM that + we can share pages with */ + for (mpnt = (*p)->mm->mmap; mpnt; mpnt = mpnt->vm_next) { + /* must be same inode */ + if (mpnt->vm_inode != inode) + continue; + /* offsets must be mutually page-aligned */ + if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK) + continue; + /* the other area must actually cover the wanted page.. */ + from_address = offset + mpnt->vm_start - mpnt->vm_offset; + if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end) + continue; + /* .. NOW we can actually try to use the same physical page */ + if (!try_to_share(address, area, from_address, mpnt, give_page)) + continue; + /* free newpage if we never used it.. */ + if (give_page || !newpage) + return 1; + free_page(newpage); + return 1; + } + } + return 0; +} + +/* + * fill in an empty page-table if none exists. 
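+ *
+ * (Remark, not in the original patch: note the check, allocate,
+ * re-check sequence in the body. get_free_page() may sleep, so a
+ * second task can install a table in the same slot in the meantime;
+ * the directory entry is therefore tested again after the allocation
+ * and the fresh page is freed if we lost the race.)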
+ */ +static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address) +{ + unsigned long page; + unsigned long *p; + + p = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *p) + return *p; + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + page = get_free_page(GFP_KERNEL); + p = PAGE_DIR_OFFSET(tsk->tss.cr3,address); + if (PAGE_PRESENT & *p) { + free_page(page); + return *p; + } + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + if (page) { + *p = page | PAGE_TABLE; + return *p; + } + oom(current); + *p = BAD_PAGETABLE | PAGE_TABLE; + return 0; +} + +static inline void do_swap_page(struct vm_area_struct * vma, + unsigned long address, unsigned long * pge, unsigned long entry) +{ + unsigned long page; + + if (vma->vm_ops && vma->vm_ops->swapin) + page = vma->vm_ops->swapin(vma, entry); + else + page = swap_in(entry); + if (*pge != entry) { + free_page(page); + return; + } + page = page | vma->vm_page_prot; + if (mem_map[MAP_NR(page)] > 1 && (page & PAGE_COW)) + page &= ~PAGE_RW; + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->maj_flt; + *pge = page; + return; +} + +void do_no_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long page, entry, prot; + + page = get_empty_pgtable(vma->vm_task,address); + if (!page) + return; + page &= PAGE_MASK; + page += PAGE_PTR(address); + entry = *(unsigned long *) page; + if (entry & PAGE_PRESENT) + return; + if (entry) { + do_swap_page(vma, address, (unsigned long *) page, entry); + return; + } + address &= PAGE_MASK; + + if (!vma->vm_ops || !vma->vm_ops->nopage) { + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->min_flt; + get_empty_page(vma->vm_task,address); + return; + } + page = get_free_page(GFP_KERNEL); + if (share_page(vma, address, error_code, page)) { + ++vma->vm_task->mm->min_flt; + ++vma->vm_task->mm->rss; + return; + } + if (!page) { + oom(current); + put_page(vma->vm_task, BAD_PAGE, address, PAGE_PRIVATE); + return; + } + ++vma->vm_task->mm->maj_flt; + ++vma->vm_task->mm->rss; + prot = vma->vm_page_prot; + /* + * The fourth argument is "no_share", which tells the low-level code + * to copy, not share the page even if sharing is possible. It's + * essentially an early COW detection ("moo at 5 AM"). + */ + page = vma->vm_ops->nopage(vma, address, page, (error_code & PAGE_RW) && (prot & PAGE_COW)); + if (share_page(vma, address, error_code, 0)) { + free_page(page); + return; + } + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. + */ + if (error_code & PAGE_RW) { + prot |= PAGE_DIRTY; /* can't be COW-shared: see "no_share" above */ + } else if ((prot & PAGE_COW) && mem_map[MAP_NR(page)] > 1) + prot &= ~PAGE_RW; + if (put_page(vma->vm_task, page, address, prot)) + return; + free_page(page); + oom(current); +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. 
+ */ +asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct vm_area_struct * vma; + unsigned long address; + unsigned long page; + + /* get the address */ + __asm__("movl %%cr2,%0":"=r" (address)); + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > address) + break; + } + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + vma->vm_offset -= vma->vm_start - (address & PAGE_MASK); + vma->vm_start = (address & PAGE_MASK); +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + current->screen_bitmap |= 1 << bit; + } + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (error_code & PAGE_PRESENT) { + if (!(vma->vm_page_prot & (PAGE_RW | PAGE_COW))) + goto bad_area; +#ifdef CONFIG_TEST_VERIFY_AREA + if (regs->cs == KERNEL_CS) + printk("WP fault at %08x\n", regs->eip); +#endif + do_wp_page(vma, address, error_code); + return; + } + do_no_page(vma, address, error_code); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + if (error_code & PAGE_USER) { + current->tss.cr2 = address; + current->tss.error_code = error_code; + current->tss.trap_no = 14; + send_sig(SIGSEGV, current, 1); + return; + } +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (wp_works_ok < 0 && address == TASK_SIZE && (error_code & PAGE_PRESENT)) { + wp_works_ok = 1; + pg0[0] = PAGE_SHARED; + invalidate(); + printk("This processor honours the WP bit even when in supervisor mode. Good.\n"); + return; + } + if ((unsigned long) (address-TASK_SIZE) < PAGE_SIZE) { + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + pg0[0] = PAGE_SHARED; + } else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at virtual address %08lx\n",address); + __asm__("movl %%cr3,%0" : "=r" (page)); + printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n", + current->tss.cr3, page); + page = ((unsigned long *) page)[address >> 22]; + printk(KERN_ALERT "*pde = %08lx\n", page); + if (page & PAGE_PRESENT) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = ((unsigned long *) page)[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = %08lx\n", page); + } + die_if_kernel("Oops", regs, error_code); + do_exit(SIGKILL); +} + +/* + * BAD_PAGE is the page that is used for page faults when linux + * is out-of-memory. Older versions of linux just did a + * do_exit(), but using this instead means there is less risk + * for a process dying in kernel mode, possibly leaving a inode + * unused etc.. + * + * BAD_PAGETABLE is the accompanying page-table: it is initialized + * to point to BAD_PAGE entries. + * + * ZERO_PAGE is a special page that is used for zero-initialized + * data and COW. 
+ */ +unsigned long __bad_pagetable(void) +{ + extern char empty_bad_page_table[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (BAD_PAGE + PAGE_TABLE), + "D" ((long) empty_bad_page_table), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_bad_page_table; +} + +unsigned long __bad_page(void) +{ + extern char empty_bad_page[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (0), + "D" ((long) empty_bad_page), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_bad_page; +} + +unsigned long __zero_page(void) +{ + extern char empty_zero_page[PAGE_SIZE]; + + __asm__ __volatile__("cld ; rep ; stosl": + :"a" (0), + "D" ((long) empty_zero_page), + "c" (PTRS_PER_PAGE) + :"di","cx"); + return (unsigned long) empty_zero_page; +} + +void show_mem(void) +{ + int i,free = 0,total = 0,reserved = 0; + int shared = 0; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); + i = high_memory >> PAGE_SHIFT; + while (i-- > 0) { + total++; + if (mem_map[i] & MAP_PAGE_RESERVED) + reserved++; + else if (!mem_map[i]) + free++; + else + shared += mem_map[i]-1; + } + printk("%d pages of RAM\n",total); + printk("%d free pages\n",free); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + show_buffers(); +#ifdef CONFIG_NET + show_net_buffers(); +#endif +} + +extern unsigned long free_area_init(unsigned long, unsigned long); + +/* + * paging_init() sets up the page tables - note that the first 4MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +unsigned long paging_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned long * pg_dir; + unsigned long * pg_table; + unsigned long tmp; + unsigned long address; + +/* + * Physical page 0 is special; it's not touched by Linux since BIOS + * and SMM (for laptops with [34]86/SL chips) may need it. It is read + * and write protected to detect null pointer references in the + * kernel. + */ +#if 0 + memset((void *) 0, 0, PAGE_SIZE); +#endif + start_mem = PAGE_ALIGN(start_mem); + address = 0; + pg_dir = swapper_pg_dir; + while (address < end_mem) { + tmp = *(pg_dir + 768); /* at virtual addr 0xC0000000 */ + if (!tmp) { + tmp = start_mem | PAGE_TABLE; + *(pg_dir + 768) = tmp; + start_mem += PAGE_SIZE; + } + *pg_dir = tmp; /* also map it in at 0x0000000 for init */ + pg_dir++; + pg_table = (unsigned long *) (tmp & PAGE_MASK); + for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) { + if (address < end_mem) + *pg_table = address | PAGE_SHARED; + else + *pg_table = 0; + address += PAGE_SIZE; + } + } + invalidate(); + return free_area_init(start_mem, end_mem); +} + +void mem_init(unsigned long start_low_mem, + unsigned long start_mem, unsigned long end_mem) +{ + int codepages = 0; + int reservedpages = 0; + int datapages = 0; + unsigned long tmp; + extern int etext; + + cli(); + end_mem &= PAGE_MASK; + high_memory = end_mem; + + /* mark usable pages in the mem_map[] */ + start_low_mem = PAGE_ALIGN(start_low_mem); + start_mem = PAGE_ALIGN(start_mem); + + /* + * IBM messed up *AGAIN* in their thinkpad: 0xA0000 -> 0x9F000. + * They seem to have done something stupid with the floppy + * controller as well.. 
+ */ + while (start_low_mem < 0x9f000) { + mem_map[MAP_NR(start_low_mem)] = 0; + start_low_mem += PAGE_SIZE; + } + + while (start_mem < high_memory) { + mem_map[MAP_NR(start_mem)] = 0; + start_mem += PAGE_SIZE; + } +#ifdef CONFIG_SOUND + sound_mem_init(); +#endif + for (tmp = 0 ; tmp < high_memory ; tmp += PAGE_SIZE) { + if (mem_map[MAP_NR(tmp)]) { + if (tmp >= 0xA0000 && tmp < 0x100000) + reservedpages++; + else if (tmp < (unsigned long) &etext) + codepages++; + else + datapages++; + continue; + } + mem_map[MAP_NR(tmp)] = 1; + free_page(tmp); + } + tmp = nr_free_pages << PAGE_SHIFT; + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n", + tmp >> 10, + high_memory >> 10, + codepages << (PAGE_SHIFT-10), + reservedpages << (PAGE_SHIFT-10), + datapages << (PAGE_SHIFT-10)); +/* test if the WP bit is honoured in supervisor mode */ + wp_works_ok = -1; + pg0[0] = PAGE_READONLY; + invalidate(); + __asm__ __volatile__("movb 0,%%al ; movb %%al,0": : :"ax", "memory"); + pg0[0] = 0; + invalidate(); + if (wp_works_ok < 0) + wp_works_ok = 0; +#ifdef CONFIG_TEST_VERIFY_AREA + wp_works_ok = 0; +#endif + return; +} + +void si_meminfo(struct sysinfo *val) +{ + int i; + + i = high_memory >> PAGE_SHIFT; + val->totalram = 0; + val->sharedram = 0; + val->freeram = nr_free_pages << PAGE_SHIFT; + val->bufferram = buffermem; + while (i-- > 0) { + if (mem_map[i] & MAP_PAGE_RESERVED) + continue; + val->totalram++; + if (!mem_map[i]) + continue; + val->sharedram += mem_map[i]-1; + } + val->totalram <<= PAGE_SHIFT; + val->sharedram <<= PAGE_SHIFT; + return; +} + + +/* + * This handles a generic mmap of a disk file. + */ +static unsigned long file_mmap_nopage(struct vm_area_struct * area, unsigned long address, + unsigned long page, int no_share) +{ + struct inode * inode = area->vm_inode; + unsigned int block; + int nr[8]; + int i, *p; + + address &= PAGE_MASK; + block = address - area->vm_start + area->vm_offset; + block >>= inode->i_sb->s_blocksize_bits; + i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; + p = nr; + do { + *p = bmap(inode,block); + i--; + block++; + p++; + } while (i > 0); + return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share); +} + +struct vm_operations_struct file_mmap = { + NULL, /* open */ + NULL, /* close */ + file_mmap_nopage, /* nopage */ + NULL, /* wppage */ + NULL, /* share */ + NULL, /* unmap */ +}; diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c new file mode 100644 index 000000000..fbbea985c --- /dev/null +++ b/arch/i386/mm/mmap.c @@ -0,0 +1,470 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +static int anon_map(struct inode *, struct file *, struct vm_area_struct *); + +/* + * description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. 
The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ + +int do_mmap(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long off) +{ + int mask, error; + struct vm_area_struct * vma; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len) + return -EINVAL; + + /* offset overflow? */ + if (off + len < off) + return -EINVAL; + + /* + * do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + + if (file != NULL) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & 2)) + return -EACCES; + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & 1)) + return -EACCES; + break; + + default: + return -EINVAL; + } + if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0)) + return -ETXTBSY; + } else if ((flags & MAP_TYPE) == MAP_SHARED) + return -EINVAL; + + /* + * obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + + if (flags & MAP_FIXED) { + if (addr & ~PAGE_MASK) + return -EINVAL; + if (len > TASK_SIZE || addr > TASK_SIZE - len) + return -EINVAL; + } else { + addr = get_unmapped_area(len); + if (!addr) + return -ENOMEM; + } + + /* + * determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + mask = PAGE_PRESENT; + if (prot & (PROT_READ | PROT_EXEC)) + mask |= PAGE_READONLY; + if (prot & PROT_WRITE) + if ((flags & MAP_TYPE) == MAP_PRIVATE) + mask |= PAGE_COPY; + else + mask |= PAGE_SHARED; + + vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct), + GFP_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_task = current; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_page_prot = mask; + vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC); + vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE); + + if (file) { + if (file->f_mode & 1) + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_SHARED) { + vma->vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & 2)) + vma->vm_flags &= ~VM_MAYWRITE; + } + } else + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_ops = NULL; + vma->vm_offset = off; + vma->vm_inode = NULL; + vma->vm_pte = 0; + + do_munmap(addr, len); /* Clear old maps */ + + if (file) + error = file->f_op->mmap(file->f_inode, file, vma); + else + error = anon_map(NULL, NULL, vma); + + if (error) { + kfree(vma); + return error; + } + insert_vm_struct(current, vma); + merge_segments(current->mm->mmap); + return addr; +} + +/* + * Get an address range which is currently unmapped. + * For mmap() without MAP_FIXED and shmat() with addr=0. + * Return value 0 means ENOMEM. 
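+ *
+ * Usage sketch (illustration, not from the original patch), as in
+ * do_mmap() above:
+ *
+ *	addr = get_unmapped_area(len);
+ *	if (!addr)
+ *		return -ENOMEM;
+ *
+ * The loop below is a first-fit scan over the gaps between the
+ * address-sorted vm_area_structs, clamped to the range
+ * SHM_RANGE_START..SHM_RANGE_END.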
+ */ +unsigned long get_unmapped_area(unsigned long len) +{ + struct vm_area_struct * vmm; + unsigned long gap_start = 0, gap_end; + + for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) { + if (gap_start < SHM_RANGE_START) + gap_start = SHM_RANGE_START; + if (!vmm || ((gap_end = vmm->vm_start) > SHM_RANGE_END)) + gap_end = SHM_RANGE_END; + gap_start = PAGE_ALIGN(gap_start); + gap_end &= PAGE_MASK; + if ((gap_start <= gap_end) && (gap_end - gap_start >= len)) + return gap_start; + if (!vmm) + return 0; + gap_start = vmm->vm_end; + } +} + +asmlinkage int sys_mmap(unsigned long *buffer) +{ + int error; + unsigned long flags; + struct file * file = NULL; + + error = verify_area(VERIFY_READ, buffer, 6*sizeof(long)); + if (error) + return error; + flags = get_fs_long(buffer+3); + if (!(flags & MAP_ANONYMOUS)) { + unsigned long fd = get_fs_long(buffer+4); + if (fd >= NR_OPEN || !(file = current->files->fd[fd])) + return -EBADF; + } + return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1), + get_fs_long(buffer+2), flags, get_fs_long(buffer+5)); +} + +/* + * Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. 
+ */ +void unmap_fixup(struct vm_area_struct *area, + unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + if (addr < area->vm_start || addr >= area->vm_end || + end <= area->vm_start || end > area->vm_end || + end < addr) + { + printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", + area->vm_start, area->vm_end, addr, end); + return; + } + + /* Unmapping the whole area */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_inode) + iput(area->vm_inode); + return; + } + + /* Work out to one of the ends */ + if (addr >= area->vm_start && end == area->vm_end) + area->vm_end = addr; + if (addr == area->vm_start && end <= area->vm_end) { + area->vm_offset += (end - area->vm_start); + area->vm_start = end; + } + + /* Unmapping a hole */ + if (addr > area->vm_start && end < area->vm_end) + { + /* Add end mapping -- leave beginning for below */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + + if (!mpnt) + return; + *mpnt = *area; + mpnt->vm_offset += (end - area->vm_start); + mpnt->vm_start = end; + if (mpnt->vm_inode) + mpnt->vm_inode->i_count++; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + insert_vm_struct(current, mpnt); + } + + /* construct whatever mapping is needed */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + if (!mpnt) + return; + *mpnt = *area; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + if (area->vm_ops && area->vm_ops->close) { + area->vm_end = area->vm_start; + area->vm_ops->close(area); + } + insert_vm_struct(current, mpnt); +} + +asmlinkage int sys_munmap(unsigned long addr, size_t len) +{ + return do_munmap(addr, len); +} + +/* + * Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine <jeremy@sw.oz.au> + */ +int do_munmap(unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, **npp, *free; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return 0; + + /* + * Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + npp = ¤t->mm->mmap; + free = NULL; + for (mpnt = *npp; mpnt != NULL; mpnt = *npp) { + unsigned long end = addr+len; + + if ((addr < mpnt->vm_start && end <= mpnt->vm_start) || + (addr >= mpnt->vm_end && end > mpnt->vm_end)) + { + npp = &mpnt->vm_next; + continue; + } + + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + } + + if (free == NULL) + return 0; + + /* + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. + * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + */ + while (free) { + unsigned long st, end; + + mpnt = free; + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? 
+/*
+ * Munmap is split into 2 main parts -- this part, which finds out
+ * what needs doing, and the areas themselves, which do the work.
+ * This now handles partial unmappings.
+ * Jeremy Fitzhardine <jeremy@sw.oz.au>
+ */
+int do_munmap(unsigned long addr, size_t len)
+{
+	struct vm_area_struct *mpnt, **npp, *free;
+
+	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
+		return -EINVAL;
+
+	if ((len = PAGE_ALIGN(len)) == 0)
+		return 0;
+
+	/*
+	 * Check if this memory area is ok - put it on the temporary
+	 * list if so.  The checks here are pretty simple --
+	 * every area affected in some way (by any overlap) is put
+	 * on the list.  If nothing is put on, nothing is affected.
+	 */
+	npp = &current->mm->mmap;
+	free = NULL;
+	for (mpnt = *npp; mpnt != NULL; mpnt = *npp) {
+		unsigned long end = addr+len;
+
+		if ((addr < mpnt->vm_start && end <= mpnt->vm_start) ||
+		    (addr >= mpnt->vm_end && end > mpnt->vm_end))
+		{
+			npp = &mpnt->vm_next;
+			continue;
+		}
+
+		*npp = mpnt->vm_next;
+		mpnt->vm_next = free;
+		free = mpnt;
+	}
+
+	if (free == NULL)
+		return 0;
+
+	/*
+	 * Ok - we have the memory areas we should free on the 'free' list,
+	 * so release them, and unmap the page range.
+	 * If one of the segments is only being partially unmapped,
+	 * it will put new vm_area_struct(s) into the address space.
+	 */
+	while (free) {
+		unsigned long st, end;
+
+		mpnt = free;
+		free = free->vm_next;
+
+		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
+		end = addr+len;
+		end = end > mpnt->vm_end ? mpnt->vm_end : end;
+
+		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
+			mpnt->vm_ops->unmap(mpnt, st, end-st);
+		else
+			unmap_fixup(mpnt, st, end-st);
+
+		kfree(mpnt);
+	}
+
+	unmap_page_range(addr, len);
+	return 0;
+}
+
+/* This is used for a general mmap of a disk file */
+int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
+{
+	extern struct vm_operations_struct file_mmap;
+
+	if (vma->vm_page_prot & PAGE_RW)	/* only PAGE_COW or read-only supported right now */
+		return -EINVAL;
+	if (!inode->i_sb || !S_ISREG(inode->i_mode))	/* check i_sb before dereferencing it below */
+		return -EACCES;
+	if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
+		return -EINVAL;
+	if (!inode->i_op || !inode->i_op->bmap)
+		return -ENOEXEC;
+	if (!IS_RDONLY(inode)) {
+		inode->i_atime = CURRENT_TIME;
+		inode->i_dirt = 1;
+	}
+	vma->vm_inode = inode;
+	inode->i_count++;
+	vma->vm_ops = &file_mmap;
+	return 0;
+}
+
+/*
+ * Insert vm structure into process list sorted by address.
+ */
+void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp)
+{
+	struct vm_area_struct **p, *mpnt;
+
+	p = &t->mm->mmap;
+	while ((mpnt = *p) != NULL) {
+		if (mpnt->vm_start > vmp->vm_start)
+			break;
+		if (mpnt->vm_end > vmp->vm_start)
+			printk("insert_vm_struct: overlapping memory areas\n");
+		p = &mpnt->vm_next;
+	}
+	vmp->vm_next = mpnt;
+	*p = vmp;
+}
+
+/*
+ * Merge a list of memory segments if possible.
+ * Redundant vm_area_structs are freed.
+ * This assumes that the list is ordered by address.
+ */
+void merge_segments(struct vm_area_struct *mpnt)
+{
+	struct vm_area_struct *prev, *next;
+
+	if (mpnt == NULL)
+		return;
+
+	for(prev = mpnt, mpnt = mpnt->vm_next;
+	    mpnt != NULL;
+	    prev = mpnt, mpnt = next)
+	{
+		next = mpnt->vm_next;
+
+		/*
+		 * To share, we must have the same inode, operations..
+		 */
+		if (mpnt->vm_inode != prev->vm_inode)
+			continue;
+		if (mpnt->vm_pte != prev->vm_pte)
+			continue;
+		if (mpnt->vm_ops != prev->vm_ops)
+			continue;
+		if (mpnt->vm_page_prot != prev->vm_page_prot ||
+		    mpnt->vm_flags != prev->vm_flags)
+			continue;
+		if (prev->vm_end != mpnt->vm_start)
+			continue;
+		/*
+		 * and if we have an inode, the offsets must be contiguous..
+		 */
+		if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) {
+			if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset)
+				continue;
+		}
+
+		/*
+		 * merge prev with mpnt and set up pointers so the new
+		 * big segment can possibly merge with the next one.
+		 * The old unused mpnt is freed.
+		 */
+		prev->vm_end = mpnt->vm_end;
+		prev->vm_next = mpnt->vm_next;
+		if (mpnt->vm_ops && mpnt->vm_ops->close) {
+			mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start;
+			mpnt->vm_start = mpnt->vm_end;
+			mpnt->vm_ops->close(mpnt);
+		}
+		if (mpnt->vm_inode)
+			mpnt->vm_inode->i_count--;
+		kfree_s(mpnt, sizeof(*mpnt));
+		mpnt = prev;
+	}
+}
+
+/*
+ * Map memory not associated with any file into a process
+ * address space.  Adjacent memory is merged.
+ */ +static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma) +{ + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -ENOMEM; + return 0; +} diff --git a/arch/i386/mm/mprotect.c b/arch/i386/mm/mprotect.c new file mode 100644 index 000000000..99252183b --- /dev/null +++ b/arch/i386/mm/mprotect.c @@ -0,0 +1,230 @@ +/* + * linux/mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +#define CHG_MASK (PAGE_MASK | PAGE_ACCESSED | PAGE_DIRTY | PAGE_PWT | PAGE_PCD) + +static void change_protection(unsigned long start, unsigned long end, int prot) +{ + unsigned long *page_table, *dir; + unsigned long page, offset; + int nr; + + dir = PAGE_DIR_OFFSET(current->tss.cr3, start); + offset = (start >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + nr = (end - start) >> PAGE_SHIFT; + while (nr > 0) { + page = *dir; + dir++; + if (!(page & PAGE_PRESENT)) { + nr = nr - PTRS_PER_PAGE + offset; + offset = 0; + continue; + } + page_table = offset + (unsigned long *) (page & PAGE_MASK); + offset = PTRS_PER_PAGE - offset; + if (offset > nr) + offset = nr; + nr = nr - offset; + do { + page = *page_table; + if (page & PAGE_PRESENT) + *page_table = (page & CHG_MASK) | prot; + ++page_table; + } while (--offset); + } + return; +} + +static inline int mprotect_fixup_all(struct vm_area_struct * vma, + int newflags, int prot) +{ + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + return 0; +} + +static inline int mprotect_fixup_start(struct vm_area_struct * vma, + unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_start = end; + n->vm_end = end; + vma->vm_offset += vma->vm_start - n->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_end(struct vm_area_struct * vma, + unsigned long start, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_end = start; + n->vm_start = start; + n->vm_offset += n->vm_start - vma->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * left, * right; + + left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!left) + return -ENOMEM; + right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!right) { + kfree(left); + return -ENOMEM; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + vma->vm_start = start; + vma->vm_end = end; + right->vm_start = end; + vma->vm_offset += vma->vm_start - left->vm_start; + right->vm_offset += right->vm_start 
- left->vm_start; + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + if (vma->vm_inode) + vma->vm_inode->i_count += 2; + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + insert_vm_struct(current, left); + insert_vm_struct(current, right); + return 0; +} + +static int mprotect_fixup(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned int newflags) +{ + int prot, error; + + if (newflags == vma->vm_flags) + return 0; + prot = PAGE_PRESENT; + if (newflags & (VM_READ | VM_EXEC)) + prot |= PAGE_READONLY; + if (newflags & VM_WRITE) + if (newflags & VM_SHARED) + prot |= PAGE_SHARED; + else + prot |= PAGE_COPY; + + if (start == vma->vm_start) + if (end == vma->vm_end) + error = mprotect_fixup_all(vma, newflags, prot); + else + error = mprotect_fixup_start(vma, end, newflags, prot); + else if (end == vma->vm_end) + error = mprotect_fixup_end(vma, start, newflags, prot); + else + error = mprotect_fixup_middle(vma, start, end, newflags, prot); + + if (error) + return error; + + change_protection(start, end, prot); + return 0; +} + +asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long end, tmp; + struct vm_area_struct * vma, * next; + int error; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) + return -EINVAL; + if (end == start) + return 0; + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + return -EFAULT; + if (vma->vm_end > start) + break; + } + if (vma->vm_start > start) + return -EFAULT; + + for ( ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= start < vma->vm_end. */ + + newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + break; + } + + if (vma->vm_end >= end) { + error = mprotect_fixup(vma, start, end, newflags); + break; + } + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, start, tmp, newflags); + if (error) + break; + start = tmp; + vma = next; + if (!vma || vma->vm_start != start) { + error = -EFAULT; + break; + } + } + merge_segments(current->mm->mmap); + return error; +} diff --git a/arch/i386/mm/swap.c b/arch/i386/mm/swap.c new file mode 100644 index 000000000..f7a1f54b3 --- /dev/null +++ b/arch/i386/mm/swap.c @@ -0,0 +1,1017 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file should contain most things doing the swapping from/to disk. 
+ * Started 18.12.91
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/fs.h>
+
+#include <asm/system.h> /* for cli()/sti() */
+#include <asm/bitops.h>
+
+#define MAX_SWAPFILES 8
+
+#define SWP_USED	1
+#define SWP_WRITEOK	3
+
+#define SWP_TYPE(entry) (((entry) & 0xfe) >> 1)
+#define SWP_OFFSET(entry) ((entry) >> PAGE_SHIFT)
+#define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << PAGE_SHIFT))
+
+int min_free_pages = 20;
+
+static int nr_swapfiles = 0;
+static struct wait_queue * lock_queue = NULL;
+
+static struct swap_info_struct {
+	unsigned long flags;
+	struct inode * swap_file;
+	unsigned int swap_device;
+	unsigned char * swap_map;
+	unsigned char * swap_lockmap;
+	int pages;
+	int lowest_bit;
+	int highest_bit;
+	unsigned long max;
+} swap_info[MAX_SWAPFILES];
+
+extern int shm_swap (int);
+
+unsigned long *swap_cache;
+
+#ifdef SWAP_CACHE_INFO
+unsigned long swap_cache_add_total = 0;
+unsigned long swap_cache_add_success = 0;
+unsigned long swap_cache_del_total = 0;
+unsigned long swap_cache_del_success = 0;
+unsigned long swap_cache_find_total = 0;
+unsigned long swap_cache_find_success = 0;
+
+extern inline void show_swap_cache_info(void)
+{
+	printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n",
+		swap_cache_add_total, swap_cache_add_success,
+		swap_cache_del_total, swap_cache_del_success,
+		swap_cache_find_total, swap_cache_find_success);
+}
+#endif
+
+extern inline int add_to_swap_cache(unsigned long addr, unsigned long entry)
+{
+	struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
+
+#ifdef SWAP_CACHE_INFO
+	swap_cache_add_total++;
+#endif
+	if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
+		__asm__ __volatile__ (
+			"xchgl %0,%1\n"
+			: "=m" (swap_cache[addr >> PAGE_SHIFT]),
+			  "=r" (entry)
+			: "0" (swap_cache[addr >> PAGE_SHIFT]),
+			  "1" (entry));
+		if (entry) {
+			printk("swap_cache: replacing non-NULL entry\n");
+		}
+#ifdef SWAP_CACHE_INFO
+		swap_cache_add_success++;
+#endif
+		return 1;
+	}
+	return 0;
+}
+
+static unsigned long init_swap_cache(unsigned long mem_start,
+	unsigned long mem_end)
+{
+	unsigned long swap_cache_size;
+
+	mem_start = (mem_start + 15) & ~15;
+	swap_cache = (unsigned long *) mem_start;
+	swap_cache_size = mem_end >> PAGE_SHIFT;
+	memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
+	return (unsigned long) (swap_cache + swap_cache_size);
+}
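The three SWP_* macros pack a swap location into a single word shaped like a non-present pte: bit 0 (the hardware PAGE_PRESENT bit) stays clear, bits 1-7 carry the index into swap_info[], and everything above PAGE_SHIFT is the page slot within that swap area. A standalone round-trip check (user-space C, with PAGE_SHIFT hard-wired to the i386 value of 12 for the sketch):

#include <assert.h>

#define PAGE_SHIFT 12	/* i386 page size assumed for this sketch */
#define SWP_TYPE(entry) (((entry) & 0xfe) >> 1)
#define SWP_OFFSET(entry) ((entry) >> PAGE_SHIFT)
#define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << PAGE_SHIFT))

int main(void)
{
	unsigned long entry = SWP_ENTRY(3UL, 1234UL);

	assert(SWP_TYPE(entry) == 3);		/* index into swap_info[]   */
	assert(SWP_OFFSET(entry) == 1234);	/* page slot in that file   */
	assert((entry & 1) == 0);		/* PAGE_PRESENT stays clear */
	return 0;
}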
+void rw_swap_page(int rw, unsigned long entry, char * buf)
+{
+	unsigned long type, offset;
+	struct swap_info_struct * p;
+
+	type = SWP_TYPE(entry);
+	if (type >= nr_swapfiles) {
+		printk("Internal error: bad swap-device\n");
+		return;
+	}
+	p = &swap_info[type];
+	offset = SWP_OFFSET(entry);
+	if (offset >= p->max) {
+		printk("rw_swap_page: weirdness\n");
+		return;
+	}
+	if (!(p->flags & SWP_USED)) {
+		printk("Trying to swap to unused swap-device\n");
+		return;
+	}
+	while (set_bit(offset,p->swap_lockmap))
+		sleep_on(&lock_queue);
+	if (rw == READ)
+		kstat.pswpin++;
+	else
+		kstat.pswpout++;
+	if (p->swap_device) {
+		ll_rw_page(rw,p->swap_device,offset,buf);
+	} else if (p->swap_file) {
+		struct inode *swapf = p->swap_file;
+		unsigned int zones[8];
+		int i;
+		if (swapf->i_op->bmap == NULL
+			&& swapf->i_op->smap != NULL){
+			/*
+			 * With MS-DOS, we use msdos_smap(), which returns
+			 * a sector number (not a cluster or block number).
+			 * It is a patch to enable the UMSDOS project.
+			 * Other people are working on a better solution.
+			 *
+			 * It looks like ll_rw_swap_file() defines its
+			 * operation size (sector size) based on PAGE_SIZE
+			 * and the number of blocks to read, so using bmap
+			 * or smap should work even if smap will require
+			 * more blocks.
+			 */
+			int j;
+			unsigned int block = offset << 3;
+
+			for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
+				if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
+					printk("rw_swap_page: bad swap file\n");
+					return;
+				}
+			}
+		} else {
+			int j;
+			unsigned int block = offset
+				<< (12 - swapf->i_sb->s_blocksize_bits);
+
+			for (i=0, j=0; j< PAGE_SIZE ; i++, j += swapf->i_sb->s_blocksize)
+				if (!(zones[i] = bmap(swapf,block++))) {
+					printk("rw_swap_page: bad swap file\n");
+					return;
+				}
+		}
+		ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
+	} else
+		printk("rw_swap_page: no swap file or device\n");
+	if (offset && !clear_bit(offset,p->swap_lockmap))
+		printk("rw_swap_page: lock already cleared\n");
+	wake_up(&lock_queue);
+}
+
+unsigned int get_swap_page(void)
+{
+	struct swap_info_struct * p;
+	unsigned int offset, type;
+
+	p = swap_info;
+	for (type = 0 ; type < nr_swapfiles ; type++,p++) {
+		if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
+			continue;
+		for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
+			if (p->swap_map[offset])
+				continue;
+			p->swap_map[offset] = 1;
+			nr_swap_pages--;
+			if (offset == p->highest_bit)
+				p->highest_bit--;
+			p->lowest_bit = offset;
+			return SWP_ENTRY(type,offset);
+		}
+	}
+	return 0;
+}
+
+unsigned long swap_duplicate(unsigned long entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+
+	if (!entry)
+		return 0;
+	offset = SWP_OFFSET(entry);
+	type = SWP_TYPE(entry);
+	if (type == SHM_SWP_TYPE)
+		return entry;
+	if (type >= nr_swapfiles) {
+		printk("Trying to duplicate nonexistent swap-page\n");
+		return 0;
+	}
+	p = type + swap_info;
+	if (offset >= p->max) {
+		printk("swap_duplicate: weirdness\n");
+		return 0;
+	}
+	if (!p->swap_map[offset]) {
+		printk("swap_duplicate: trying to duplicate unused page\n");
+		return 0;
+	}
+	p->swap_map[offset]++;
+	return entry;
+}
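Taken together, swap_map[] implements a per-slot use count: get_swap_page() starts a slot at 1, swap_duplicate() bumps it when another pte comes to reference the same slot (e.g. across fork()), and swap_free() below drops it, returning the slot to the pool when the count reaches zero. A toy model of that lifecycle (hypothetical stand-alone C; one 8-slot map stands in for a swap file):

#include <assert.h>

static unsigned char swap_map[8];	/* one use count per slot */
static int nr_swap_pages = 8;

static int toy_get_swap_page(void)	/* cf. get_swap_page() */
{
	int i;

	for (i = 0; i < 8; i++)
		if (!swap_map[i]) {
			swap_map[i] = 1;
			nr_swap_pages--;
			return i;
		}
	return -1;
}

static void toy_swap_free(int i)	/* cf. swap_free() */
{
	if (!--swap_map[i])
		nr_swap_pages++;
}

int main(void)
{
	int slot = toy_get_swap_page();

	swap_map[slot]++;		/* cf. swap_duplicate() at fork */
	toy_swap_free(slot);		/* first reference dropped      */
	assert(swap_map[slot] == 1);	/* slot still in use            */
	toy_swap_free(slot);		/* last reference dropped       */
	assert(nr_swap_pages == 8);	/* slot is free again           */
	return 0;
}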
printk("swap_free: lock already cleared\n"); + wake_up(&lock_queue); +} + +unsigned long swap_in(unsigned long entry) +{ + unsigned long page; + + if (!(page = get_free_page(GFP_KERNEL))) { + oom(current); + return BAD_PAGE; + } + read_swap_page(entry, (char *) page); + if (add_to_swap_cache(page, entry)) + return page | PAGE_PRESENT; + swap_free(entry); + return page | PAGE_DIRTY | PAGE_PRESENT; +} + +static inline int try_to_swap_out(unsigned long * table_ptr) +{ + unsigned long page, entry; + + page = *table_ptr; + if (!(PAGE_PRESENT & page)) + return 0; + if (page >= high_memory) + return 0; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + return 0; + + if ((PAGE_DIRTY & page) && delete_from_swap_cache(page)) { + *table_ptr &= ~PAGE_ACCESSED; + return 0; + } + if (PAGE_ACCESSED & page) { + *table_ptr &= ~PAGE_ACCESSED; + return 0; + } + if (PAGE_DIRTY & page) { + page &= PAGE_MASK; + if (mem_map[MAP_NR(page)] != 1) + return 0; + if (!(entry = get_swap_page())) + return 0; + *table_ptr = entry; + invalidate(); + write_swap_page(entry, (char *) page); + free_page(page); + return 1; + } + if ((entry = find_in_swap_cache(page))) { + if (mem_map[MAP_NR(page)] != 1) { + *table_ptr |= PAGE_DIRTY; + printk("Aiee.. duplicated cached swap-cache entry\n"); + return 0; + } + *table_ptr = entry; + invalidate(); + free_page(page & PAGE_MASK); + return 1; + } + page &= PAGE_MASK; + *table_ptr = 0; + invalidate(); + free_page(page); + return 1 + mem_map[MAP_NR(page)]; +} + +/* + * A new implementation of swap_out(). We do not swap complete processes, + * but only a small number of blocks, before we continue with the next + * process. The number of blocks actually swapped is determined on the + * number of page faults, that this process actually had in the last time, + * so we won't swap heavily used processes all the time ... + * + * Note: the priority argument is a hint on much CPU to waste with the + * swap block search, not a hint, of how much blocks to swap with + * each process. + * + * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de + */ + +/* + * These are the minimum and maximum number of pages to swap from one process, + * before proceeding to the next: + */ +#define SWAP_MIN 4 +#define SWAP_MAX 32 + +/* + * The actual number of pages to swap is determined as: + * SWAP_RATIO / (number of recent major page faults) + */ +#define SWAP_RATIO 128 + +static int swap_out_process(struct task_struct * p) +{ + unsigned long address; + unsigned long offset; + unsigned long *pgdir; + unsigned long pg_table; + + /* + * Go through process' page directory. + */ + address = p->mm->swap_address; + pgdir = (address >> PGDIR_SHIFT) + (unsigned long *) p->tss.cr3; + offset = address & ~PGDIR_MASK; + address &= PGDIR_MASK; + for ( ; address < TASK_SIZE ; + pgdir++, address = address + PGDIR_SIZE, offset = 0) { + pg_table = *pgdir; + if (pg_table >= high_memory) + continue; + if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED) + continue; + if (!(PAGE_PRESENT & pg_table)) { + printk("swap_out_process (%s): bad page-table at vm %08lx: %08lx\n", + p->comm, address + offset, pg_table); + *pgdir = 0; + continue; + } + pg_table &= 0xfffff000; + + /* + * Go through this page table. 
+static int swap_out(unsigned int priority)
+{
+	static int swap_task;
+	int loop;
+	int counter;
+	struct task_struct *p;
+
+	counter = NR_TASKS * 2 >> priority;
+	for(; counter >= 0; counter--, swap_task++) {
+		/*
+		 * Check that swap_task is suitable for swapping.  If not, look for
+		 * the next suitable process.
+		 */
+		loop = 0;
+		while(1) {
+			if (swap_task >= NR_TASKS) {
+				swap_task = 1;
+				if (loop)
+					/* all processes are unswappable or already swapped out */
+					return 0;
+				loop = 1;
+			}
+
+			p = task[swap_task];
+			if (p && p->mm->swappable && p->mm->rss)
+				break;
+
+			swap_task++;
+		}
+
+		/*
+		 * Determine the number of pages to swap from this process.
+		 */
+		if (!p->mm->swap_cnt) {
+			p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt;
+			p->mm->old_maj_flt = p->mm->maj_flt;
+
+			if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) {
+				p->mm->dec_flt = SWAP_RATIO / SWAP_MIN;
+				p->mm->swap_cnt = SWAP_MIN;
+			} else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX)
+				p->mm->swap_cnt = SWAP_MAX;
+			else
+				p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt;
+		}
+		if (swap_out_process(p)) {
+			if ((--p->mm->swap_cnt) == 0)
+				swap_task++;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int try_to_free_page(int priority)
+{
+	int i=6;
+
+	while (i--) {
+		if (priority != GFP_NOBUFFER && shrink_buffers(i))
+			return 1;
+		if (shm_swap(i))
+			return 1;
+		if (swap_out(i))
+			return 1;
+	}
+	return 0;
+}
+
+static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
+{
+	entry->prev = head;
+	entry->next = head->next;
+	entry->next->prev = entry;
+	head->next = entry;
+}
+
+static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
+{
+	entry->next->prev = entry->prev;
+	entry->prev->next = entry->next;
+}
+
+/*
+ * Free_page() adds the page to the free lists.  This is optimized for
+ * fast normal cases (no error jumps taken normally).
+ *
+ * The way to optimize jumps for gcc-2.2.2 is to:
+ *  - select the "normal" case and put it inside the if () { XXX }
+ *  - no else-statements if you can avoid them
+ *
+ * With the above two rules, you get a straight-line execution path
+ * for the normal case, giving better asm-code.
+ */
+
+/*
+ * Buddy system. Hairy.
You really aren't expected to understand this + */ +static inline void free_pages_ok(unsigned long addr, unsigned long order) +{ + unsigned long index = addr >> (PAGE_SHIFT + 1 + order); + unsigned long mask = PAGE_MASK << order; + + addr &= mask; + nr_free_pages += 1 << order; + while (order < NR_MEM_LISTS-1) { + if (!change_bit(index, free_area_map[order])) + break; + remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask))); + order++; + index >>= 1; + mask <<= 1; + addr &= mask; + } + add_mem_queue(free_area_list+order, (struct mem_list *) addr); +} + +static inline void check_free_buffers(unsigned long addr) +{ + struct buffer_head * bh; + + bh = buffer_pages[MAP_NR(addr)]; + if (bh) { + struct buffer_head *tmp = bh; + do { + if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff) + refile_buffer(tmp); + tmp = tmp->b_this_page; + } while (tmp != bh); + } +} + +void free_pages(unsigned long addr, unsigned long order) +{ + if (addr < high_memory) { + unsigned long flag; + unsigned short * map = mem_map + MAP_NR(addr); + if (*map) { + if (!(*map & MAP_PAGE_RESERVED)) { + save_flags(flag); + cli(); + if (!--*map) { + free_pages_ok(addr, order); + delete_from_swap_cache(addr); + } + restore_flags(flag); + if (*map == 1) + check_free_buffers(addr); + } + return; + } + printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr); + printk("PC = %08lx\n",*(((unsigned long *)&addr)-1)); + return; + } +} + +/* + * Some ugly macros to speed up __get_free_pages().. + */ +#define RMQUEUE(order) \ +do { struct mem_list * queue = free_area_list+order; \ + unsigned long new_order = order; \ + do { struct mem_list *next = queue->next; \ + if (queue != next) { \ + (queue->next = next->next)->prev = queue; \ + mark_used((unsigned long) next, new_order); \ + nr_free_pages -= 1 << order; \ + restore_flags(flags); \ + EXPAND(next, order, new_order); \ + return (unsigned long) next; \ + } new_order++; queue++; \ + } while (new_order < NR_MEM_LISTS); \ +} while (0) + +static inline int mark_used(unsigned long addr, unsigned long order) +{ + return change_bit(addr >> (PAGE_SHIFT+1+order), free_area_map[order]); +} + +#define EXPAND(addr,low,high) \ +do { unsigned long size = PAGE_SIZE << high; \ + while (high > low) { \ + high--; size >>= 1; cli(); \ + add_mem_queue(free_area_list+high, addr); \ + mark_used((unsigned long) addr, high); \ + restore_flags(flags); \ + addr = (struct mem_list *) (size + (unsigned long) addr); \ + } mem_map[MAP_NR((unsigned long) addr)] = 1; \ +} while (0) + +unsigned long __get_free_pages(int priority, unsigned long order) +{ + unsigned long flags; + int reserved_pages; + + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + reserved_pages = 5; + if (priority != GFP_NFS) + reserved_pages = min_free_pages; + save_flags(flags); +repeat: + cli(); + if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { + RMQUEUE(order); + restore_flags(flags); + return 0; + } + restore_flags(flags); + if (priority != GFP_BUFFER && try_to_free_page(priority)) + goto repeat; + return 0; +} + +/* + * Yes, I know this is ugly. Don't tell me. 
+ */ +unsigned long __get_dma_pages(int priority, unsigned long order) +{ + unsigned long list = 0; + unsigned long result; + unsigned long limit = 16*1024*1024; + + /* if (EISA_bus) limit = ~0UL; */ + if (priority != GFP_ATOMIC) + priority = GFP_BUFFER; + for (;;) { + result = __get_free_pages(priority, order); + if (result < limit) /* covers failure as well */ + break; + *(unsigned long *) result = list; + list = result; + } + while (list) { + unsigned long tmp = list; + list = *(unsigned long *) list; + free_pages(tmp, order); + } + return result; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + unsigned long order, flags; + unsigned long total = 0; + + printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + save_flags(flags); + cli(); + for (order=0 ; order < NR_MEM_LISTS; order++) { + struct mem_list * tmp; + unsigned long nr = 0; + for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) { + nr ++; + } + total += nr * (4 << order); + printk("%lu*%ukB ", nr, 4 << order); + } + restore_flags(flags); + printk("= %lukB)\n", total); +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +/* + * Trying to stop swapping from a file is fraught with races, so + * we repeat quite a bit here when we have to pause. swapoff() + * isn't exactly timing-critical, so who cares? + */ +static int try_to_unuse(unsigned int type) +{ + int nr, pgt, pg; + unsigned long page, *ppage; + unsigned long tmp = 0; + struct task_struct *p; + + nr = 0; + +/* + * When we have to sleep, we restart the whole algorithm from the same + * task we stopped in. That at least rids us of all races. 
+ */ +repeat: + for (; nr < NR_TASKS ; nr++) { + p = task[nr]; + if (!p) + continue; + for (pgt = 0 ; pgt < PTRS_PER_PAGE ; pgt++) { + ppage = pgt + ((unsigned long *) p->tss.cr3); + page = *ppage; + if (!page) + continue; + if (!(page & PAGE_PRESENT) || (page >= high_memory)) + continue; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + continue; + ppage = (unsigned long *) (page & PAGE_MASK); + for (pg = 0 ; pg < PTRS_PER_PAGE ; pg++,ppage++) { + page = *ppage; + if (!page) + continue; + if (page & PAGE_PRESENT) { + if (!(page = in_swap_cache(page))) + continue; + if (SWP_TYPE(page) != type) + continue; + *ppage |= PAGE_DIRTY; + delete_from_swap_cache(*ppage); + continue; + } + if (SWP_TYPE(page) != type) + continue; + if (!tmp) { + if (!(tmp = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + goto repeat; + } + read_swap_page(page, (char *) tmp); + if (*ppage == page) { + *ppage = tmp | (PAGE_DIRTY | PAGE_PRIVATE); + ++p->mm->rss; + swap_free(page); + tmp = 0; + } + goto repeat; + } + } + } + free_page(tmp); + return 0; +} + +asmlinkage int sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * inode; + unsigned int type; + int i; + + if (!suser()) + return -EPERM; + i = namei(specialfile,&inode); + if (i) + return i; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + if (p->swap_file) { + if (p->swap_file == inode) + break; + } else { + if (!S_ISBLK(inode->i_mode)) + continue; + if (p->swap_device == inode->i_rdev) + break; + } + } + iput(inode); + if (type >= nr_swapfiles) + return -EINVAL; + p->flags = SWP_USED; + i = try_to_unuse(type); + if (i) { + p->flags = SWP_WRITEOK; + return i; + } + nr_swap_pages -= p->pages; + iput(p->swap_file); + p->swap_file = NULL; + p->swap_device = 0; + vfree(p->swap_map); + p->swap_map = NULL; + free_page((long) p->swap_lockmap); + p->swap_lockmap = NULL; + p->flags = 0; + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage int sys_swapon(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * swap_inode; + unsigned int type; + int i,j; + int error; + + if (!suser()) + return -EPERM; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) + return -EPERM; + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->max = 1; + error = namei(specialfile,&swap_inode); + if (error) + goto bad_swap; + p->swap_file = swap_inode; + error = -EBUSY; + if (swap_inode->i_count != 1) + goto bad_swap; + error = -EINVAL; + if (S_ISBLK(swap_inode->i_mode)) { + p->swap_device = swap_inode->i_rdev; + p->swap_file = NULL; + iput(swap_inode); + error = -ENODEV; + if (!p->swap_device) + goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_device == swap_info[i].swap_device) + goto bad_swap; + } + } else if (!S_ISREG(swap_inode->i_mode)) + goto bad_swap; + p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); + if (!p->swap_lockmap) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); + if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,p->swap_lockmap)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + p->max = i+1; + j++; + } + } + if (!j) { + printk("Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map = (unsigned char *) vmalloc(p->max); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < p->max ; i++) { + if (test_bit(i,p->swap_lockmap)) + p->swap_map[i] = 0; + else + p->swap_map[i] = 0x80; + } + p->swap_map[0] = 0x80; + memset(p->swap_lockmap,0,PAGE_SIZE); + p->flags = SWP_WRITEOK; + p->pages = j; + nr_swap_pages += j; + printk("Adding Swap: %dk swap-space\n",j<<2); + return 0; +bad_swap: + free_page((long) p->swap_lockmap); + vfree(p->swap_map); + iput(p->swap_file); + p->swap_device = 0; + p->swap_file = NULL; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->flags = 0; + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i, j; + + val->freeswap = val->totalswap = 0; + for (i = 0; i < nr_swapfiles; i++) { + if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (j = 0; j < swap_info[i].max; ++j) + switch (swap_info[i].swap_map[j]) { + case 128: + continue; + case 0: + ++val->freeswap; + default: + ++val->totalswap; + } + } + val->freeswap <<= PAGE_SHIFT; + val->totalswap <<= PAGE_SHIFT; + return; +} + +/* + * set up the free-area data structures: + * - mark all pages MAP_PAGE_RESERVED + * - mark all memory queues empty + * - clear the memory bitmaps + */ +unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned short * p; + unsigned long mask = PAGE_MASK; + int i; + + /* + * select nr of pages we try to keep free for important stuff + * with a minimum of 16 pages. 
This is totally arbitrary.
+	 */
+	i = end_mem >> (PAGE_SHIFT+6);
+	if (i < 16)
+		i = 16;
+	min_free_pages = i;
+	start_mem = init_swap_cache(start_mem, end_mem);
+	mem_map = (unsigned short *) start_mem;
+	p = mem_map + MAP_NR(end_mem);
+	start_mem = (unsigned long) p;
+	while (p > mem_map)
+		*--p = MAP_PAGE_RESERVED;
+
+	for (i = 0 ; i < NR_MEM_LISTS ; i++, mask <<= 1) {
+		unsigned long bitmap_size;
+		free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
+		end_mem = (end_mem + ~mask) & mask;
+		bitmap_size = end_mem >> (PAGE_SHIFT + i);
+		bitmap_size = (bitmap_size + 7) >> 3;
+		free_area_map[i] = (unsigned char *) start_mem;
+		memset((void *) start_mem, 0, bitmap_size);
+		start_mem += bitmap_size;
+	}
+	return start_mem;
+}
diff --git a/arch/i386/mm/vmalloc.c b/arch/i386/mm/vmalloc.c
new file mode 100644
index 000000000..0dbd16d54
--- /dev/null
+++ b/arch/i386/mm/vmalloc.c
@@ -0,0 +1,202 @@
+/*
+ *  linux/mm/vmalloc.c
+ *
+ *  Copyright (C) 1993  Linus Torvalds
+ */
+
+#include <asm/system.h>
+#include <linux/config.h>
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/malloc.h>
+#include <asm/segment.h>
+
+struct vm_struct {
+	unsigned long flags;
+	void * addr;
+	unsigned long size;
+	struct vm_struct * next;
+};
+
+static struct vm_struct * vmlist = NULL;
+
+/* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts.  That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leave a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET	(8*1024*1024)
+
+static inline void set_pgdir(unsigned long dindex, unsigned long value)
+{
+	struct task_struct * p;
+
+	p = &init_task;
+	do {
+		((unsigned long *) p->tss.cr3)[dindex] = value;
+		p = p->next_task;
+	} while (p != &init_task);
+}
+
+static int free_area_pages(unsigned long dindex, unsigned long index, unsigned long nr)
+{
+	unsigned long page, *pte;
+
+	if (!(PAGE_PRESENT & (page = swapper_pg_dir[dindex])))
+		return 0;
+	page &= PAGE_MASK;
+	pte = index + (unsigned long *) page;
+	do {
+		unsigned long pg = *pte;
+		*pte = 0;
+		if (pg & PAGE_PRESENT)
+			free_page(pg);
+		pte++;
+	} while (--nr);
+	pte = (unsigned long *) page;
+	for (nr = 0 ; nr < 1024 ; nr++, pte++)
+		if (*pte)
+			return 0;
+	set_pgdir(dindex,0);
+	mem_map[MAP_NR(page)] = 1;
+	free_page(page);
+	invalidate();
+	return 0;
+}
+
+static int alloc_area_pages(unsigned long dindex, unsigned long index, unsigned long nr)
+{
+	unsigned long page, *pte;
+
+	page = swapper_pg_dir[dindex];
+	if (!page) {
+		page = get_free_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		if (swapper_pg_dir[dindex]) {
+			free_page(page);
+			page = swapper_pg_dir[dindex];
+		} else {
+			mem_map[MAP_NR(page)] = MAP_PAGE_RESERVED;
+			set_pgdir(dindex, page | PAGE_SHARED);
+		}
+	}
+	page &= PAGE_MASK;
+	pte = index + (unsigned long *) page;
+	*pte = PAGE_SHARED;		/* remove a race with vfree() */
+	do {
+		unsigned long pg = get_free_page(GFP_KERNEL);
+
+		if (!pg)
+			return -ENOMEM;
+		*pte = pg | PAGE_SHARED;
+		pte++;
+	} while (--nr);
+	invalidate();
+	return 0;
+}
+
+static int do_area(void * addr, unsigned long size,
+	int (*area_fn)(unsigned long,unsigned long,unsigned long))
+{
+	unsigned long nr, dindex, index;
+
+	nr = size >> PAGE_SHIFT;
+	dindex = (TASK_SIZE + (unsigned long) addr) >> 22;
+	index = (((unsigned long) addr) >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	while (nr > 0) {
+		unsigned long i = PTRS_PER_PAGE - index;
+
+		if (i > nr)
+			i = nr;
+		nr -= i;
+		if (area_fn(dindex, index, i))
+			return -1;
+		index = 0;
+		dindex++;
+	}
+	return 0;
+}
+
+void vfree(void * addr)
+{
+	struct vm_struct **p, *tmp;
+
+	if (!addr)
+		return;
+	if ((PAGE_SIZE-1) & (unsigned long) addr) {
+		printk("Trying to vfree() bad address (%p)\n", addr);
+		return;
+	}
+	for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
+		if (tmp->addr == addr) {
+			*p = tmp->next;
+			do_area(tmp->addr, tmp->size, free_area_pages);
+			kfree(tmp);
+			return;
+		}
+	}
+	printk("Trying to vfree() nonexistent vm area (%p)\n", addr);
+}
+
+void * vmalloc(unsigned long size)
+{
+	void * addr;
+	struct vm_struct **p, *tmp, *area;
+
+	size = PAGE_ALIGN(size);
+	if (!size || size > high_memory)
+		return NULL;
+	area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		return NULL;
+	addr = (void *) ((high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1));
+	area->size = size + PAGE_SIZE;
+	area->next = NULL;
+	for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
+		if (size + (unsigned long) addr < (unsigned long) tmp->addr)
+			break;
+		addr = (void *) (tmp->size + (unsigned long) tmp->addr);
+	}
+	area->addr = addr;
+	area->next = *p;
+	*p = area;
+	if (do_area(addr, size, alloc_area_pages)) {
+		vfree(addr);
+		return NULL;
+	}
+	return addr;
+}
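Two details of vmalloc() above are worth a worked example: every area is recorded one page larger than requested, so each allocation trails an unmapped guard page, and the vmlist walk is a first-fit search over a list kept sorted by address. A simplified user-space model of that placement loop (hypothetical; the real base and sizes come from high_memory and PAGE_SIZE):

#include <stdio.h>

#define PAGE 4096UL

struct area { unsigned long addr, size; struct area *next; };

/* first fit over a sorted area list, adding a one-page guard,
 * echoing the loop in vmalloc() above */
static unsigned long place(struct area *list, unsigned long base,
			   unsigned long size)
{
	unsigned long addr = base;
	struct area *tmp;

	size += PAGE;			/* trailing guard page */
	for (tmp = list; tmp; tmp = tmp->next) {
		if (addr + size < tmp->addr)
			break;		/* fits in the gap before tmp */
		addr = tmp->addr + tmp->size;
	}
	return addr;
}

int main(void)
{
	struct area b = { 0x800000 + 3*PAGE, 2*PAGE, 0 };
	struct area a = { 0x800000, 2*PAGE, &b };

	/* one page + guard needs two pages; the one-page gap after
	 * area a is too small, so the request lands after area b */
	printf("%#lx\n", place(&a, 0x800000, PAGE));
	return 0;
}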
+int vread(char *buf, char *addr, int count)
+{
+	struct vm_struct **p, *tmp;
+	char *vaddr, *buf_start = buf;
+	int n;
+
+	for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
+		vaddr = (char *) tmp->addr;
+		while (addr < vaddr) {
+			if (count == 0)
+				goto finished;
+			put_fs_byte('\0', buf++), addr++, count--;
+		}
+		n = tmp->size - PAGE_SIZE;
+		if (addr > vaddr)
+			n -= addr - vaddr;
+		while (--n >= 0) {
+			if (count == 0)
+				goto finished;
+			put_fs_byte(*addr++, buf++), count--;
+		}
+	}
+finished:
+	return buf - buf_start;
+}
diff --git a/arch/i386/ptrace.c b/arch/i386/ptrace.c
new file mode 100644
index 000000000..cade04750
--- /dev/null
+++ b/arch/i386/ptrace.c
@@ -0,0 +1,517 @@
+/* ptrace.c */
+/* By Ross Biro 1/23/92 */
+/* edited by Linus Torvalds */
+
+#include <linux/head.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+
+#include <asm/segment.h>
+#include <asm/system.h>
+#include <linux/debugreg.h>
+
+/*
+ * Does not yet catch signals sent when the child dies
+ * in exit.c or in signal.c.
+ */
+
+/* determines which flags the user has access to: */
+/* 1 = access, 0 = no access */
+#define FLAG_MASK 0x00044dd5
+
+/* sets the trap flag. */
+#define TRAP_FLAG 0x100
+
+/*
+ * This is the number to subtract from the top of the stack
+ * to find the local frame.
+ */
+#define MAGICNUMBER 68
+
+/* change a pid into a task struct. */
+static inline struct task_struct * get_task(int pid)
+{
+	int i;
+
+	for (i = 1; i < NR_TASKS; i++) {
+		if (task[i] != NULL && (task[i]->pid == pid))
+			return task[i];
+	}
+	return NULL;
+}
+
+/*
+ * This routine will get a word off of the process's privileged stack.
+ * The offset is how far from the base addr as stored in the TSS.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline int get_stack_long(struct task_struct *task, int offset)
+{
+	unsigned char *stack;
+
+	stack = (unsigned char *)task->tss.esp0;
+	stack += offset;
+	return (*((int *)stack));
+}
+
+/*
+ * This routine will put a word on the process's privileged stack.
+ * The offset is how far from the base addr as stored in the TSS.
+ * This routine assumes that all the privileged stacks are in our
+ * data space.
+ */
+static inline int put_stack_long(struct task_struct *task, int offset,
+	unsigned long data)
+{
+	unsigned char * stack;
+
+	stack = (unsigned char *) task->tss.esp0;
+	stack += offset;
+	*(unsigned long *) stack = data;
+	return 0;
+}
+
+/*
+ * This routine gets a long from any process space by following the page
+ * tables. NOTE! You should check that the long isn't on a page boundary,
+ * and that it is in the task area before calling this: this routine does
+ * no checking.
+ */
+static unsigned long get_long(struct vm_area_struct * vma, unsigned long addr)
+{
+	unsigned long page;
+
+repeat:
+	page = *PAGE_DIR_OFFSET(vma->vm_task->tss.cr3, addr);
+	if (page & PAGE_PRESENT) {
+		page &= PAGE_MASK;
+		page += PAGE_PTR(addr);
+		page = *((unsigned long *) page);
+	}
+	if (!(page & PAGE_PRESENT)) {
+		do_no_page(vma, addr, 0);
+		goto repeat;
+	}
+/* this is a hack for non-kernel-mapped video buffers and similar */
+	if (page >= high_memory)
+		return 0;
+	page &= PAGE_MASK;
+	page += addr & ~PAGE_MASK;
+	return *(unsigned long *) page;
+}
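get_long() above (and put_long() below) are the kernel half of PTRACE_PEEKDATA/POKEDATA; the user half goes through ptrace(2) one word at a time. A minimal tracer sketch (hypothetical stand-alone program; the text address and the /bin/true path are illustrative, and error handling is elided):

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	long word;
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);	/* sets PF_PTRACED */
		execl("/bin/true", "true", (char *) 0);
		_exit(1);
	}
	wait(0);	/* child stops with SIGTRAP on exec */
	/* kernel path: sys_ptrace() -> read_long() -> get_long() */
	word = ptrace(PTRACE_PEEKDATA, pid, (void *) 0x8048000, 0);
	printf("word at 0x8048000: %#lx\n", word);
	ptrace(PTRACE_CONT, pid, 0, 0);
	return 0;
}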
+/*
+ * This routine puts a long into any process space by following the page
+ * tables. NOTE! You should check that the long isn't on a page boundary,
+ * and that it is in the task area before calling this: this routine does
+ * no checking.
+ *
+ * Now keeps R/W state of the page so that a text page stays read-only
+ * even if a debugger scribbles breakpoints into it.  -M.U-
+ */
+static void put_long(struct vm_area_struct * vma, unsigned long addr,
+	unsigned long data)
+{
+	unsigned long page, pte = 0;
+	int readonly = 0;
+
+repeat:
+	page = *PAGE_DIR_OFFSET(vma->vm_task->tss.cr3, addr);
+	if (page & PAGE_PRESENT) {
+		page &= PAGE_MASK;
+		page += PAGE_PTR(addr);
+		pte = page;
+		page = *((unsigned long *) page);
+	}
+	if (!(page & PAGE_PRESENT)) {
+		do_no_page(vma, addr, 0 /* PAGE_RW */);
+		goto repeat;
+	}
+	if (!(page & PAGE_RW)) {
+		if (!(page & PAGE_COW))
+			readonly = 1;
+		do_wp_page(vma, addr, PAGE_RW | PAGE_PRESENT);
+		goto repeat;
+	}
+/* this is a hack for non-kernel-mapped video buffers and similar */
+	if (page >= high_memory)
+		return;
+/* we're bypassing pagetables, so we have to set the dirty bit ourselves */
+	*(unsigned long *) pte |= (PAGE_DIRTY|PAGE_COW);
+	page &= PAGE_MASK;
+	page += addr & ~PAGE_MASK;
+	*(unsigned long *) page = data;
+	if (readonly) {
+		*(unsigned long *) pte &=~ (PAGE_RW|PAGE_COW);
+		invalidate();
+	}
+}
+
+static struct vm_area_struct * find_vma(struct task_struct * tsk, unsigned long addr)
+{
+	struct vm_area_struct * vma;
+
+	addr &= PAGE_MASK;
+	for (vma = tsk->mm->mmap ; ; vma = vma->vm_next) {
+		if (!vma)
+			return NULL;
+		if (vma->vm_end > addr)
+			break;
+	}
+	if (vma->vm_start <= addr)
+		return vma;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return NULL;
+	if (vma->vm_end - addr > tsk->rlim[RLIMIT_STACK].rlim_cur)
+		return NULL;
+	vma->vm_offset -= vma->vm_start - addr;
+	vma->vm_start = addr;
+	return vma;
+}
+
+/*
+ * This routine checks the page boundaries, and that the offset is
+ * within the task area. It then calls get_long() to read a long.
+ */
+static int read_long(struct task_struct * tsk, unsigned long addr,
+	unsigned long * result)
+{
+	struct vm_area_struct * vma = find_vma(tsk, addr);
+
+	if (!vma)
+		return -EIO;
+	if ((addr & ~PAGE_MASK) > PAGE_SIZE-sizeof(long)) {
+		unsigned long low,high;
+		struct vm_area_struct * vma_high = vma;
+
+		if (addr + sizeof(long) >= vma->vm_end) {
+			vma_high = vma->vm_next;
+			if (!vma_high || vma_high->vm_start != vma->vm_end)
+				return -EIO;
+		}
+		low = get_long(vma, addr & ~(sizeof(long)-1));
+		high = get_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1));
+		switch (addr & (sizeof(long)-1)) {
+		case 1:
+			low >>= 8;
+			low |= high << 24;
+			break;
+		case 2:
+			low >>= 16;
+			low |= high << 16;
+			break;
+		case 3:
+			low >>= 24;
+			low |= high << 8;
+			break;
+		}
+		*result = low;
+	} else
+		*result = get_long(vma, addr);
+	return 0;
+}
+
+/*
+ * This routine checks the page boundaries, and that the offset is
+ * within the task area. It then calls put_long() to write a long.
+ */ +static int write_long(struct task_struct * tsk, unsigned long addr, + unsigned long data) +{ + struct vm_area_struct * vma = find_vma(tsk, addr); + + if (!vma) + return -EIO; + if ((addr & ~PAGE_MASK) > PAGE_SIZE-sizeof(long)) { + unsigned long low,high; + struct vm_area_struct * vma_high = vma; + + if (addr + sizeof(long) >= vma->vm_end) { + vma_high = vma->vm_next; + if (!vma_high || vma_high->vm_start != vma->vm_end) + return -EIO; + } + low = get_long(vma, addr & ~(sizeof(long)-1)); + high = get_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1)); + switch (addr & (sizeof(long)-1)) { + case 0: /* shouldn't happen, but safety first */ + low = data; + break; + case 1: + low &= 0x000000ff; + low |= data << 8; + high &= ~0xff; + high |= data >> 24; + break; + case 2: + low &= 0x0000ffff; + low |= data << 16; + high &= ~0xffff; + high |= data >> 16; + break; + case 3: + low &= 0x00ffffff; + low |= data << 24; + high &= ~0xffffff; + high |= data >> 8; + break; + } + put_long(vma, addr & ~(sizeof(long)-1),low); + put_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1),high); + } else + put_long(vma, addr, data); + return 0; +} + +asmlinkage int sys_ptrace(long request, long pid, long addr, long data) +{ + struct task_struct *child; + struct user * dummy; + int i; + + dummy = NULL; + + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->flags & PF_PTRACED) + return -EPERM; + /* set the ptrace bit in the process flags. */ + current->flags |= PF_PTRACED; + return 0; + } + if (pid == 1) /* you may not mess with init */ + return -EPERM; + if (!(child = get_task(pid))) + return -ESRCH; + if (request == PTRACE_ATTACH) { + if (child == current) + return -EPERM; + if ((!child->dumpable || + (current->uid != child->euid) || + (current->uid != child->uid) || + (current->gid != child->egid) || + (current->gid != child->gid)) && !suser()) + return -EPERM; + /* the same process cannot be attached many times */ + if (child->flags & PF_PTRACED) + return -EPERM; + child->flags |= PF_PTRACED; + if (child->p_pptr != current) { + REMOVE_LINKS(child); + child->p_pptr = current; + SET_LINKS(child); + } + send_sig(SIGSTOP, child, 1); + return 0; + } + if (!(child->flags & PF_PTRACED)) + return -ESRCH; + if (child->state != TASK_STOPPED) { + if (request != PTRACE_KILL) + return -ESRCH; + } + if (child->p_pptr != current) + return -ESRCH; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: { + unsigned long tmp; + int res; + + res = read_long(child, addr, &tmp); + if (res < 0) + return res; + res = verify_area(VERIFY_WRITE, (void *) data, sizeof(long)); + if (!res) + put_fs_long(tmp,(unsigned long *) data); + return res; + } + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + int res; + + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + return -EIO; + + res = verify_area(VERIFY_WRITE, (void *) data, sizeof(long)); + if (res) + return res; + tmp = 0; /* Default return condition */ + if(addr < 17*sizeof(long)) { + addr = addr >> 2; /* temporary hack. 
*/ + + tmp = get_stack_long(child, sizeof(long)*addr - MAGICNUMBER); + if (addr == DS || addr == ES || + addr == FS || addr == GS || + addr == CS || addr == SS) + tmp &= 0xffff; + }; + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + addr -= (long) &dummy->u_debugreg[0]; + addr = addr >> 2; + tmp = child->debugreg[addr]; + }; + put_fs_long(tmp,(unsigned long *) data); + return 0; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + return write_long(child,addr,data); + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + return -EIO; + + addr = addr >> 2; /* temporary hack. */ + + if (addr == ORIG_EAX) + return -EIO; + if (addr == DS || addr == ES || + addr == FS || addr == GS || + addr == CS || addr == SS) { + data &= 0xffff; + if (data && (data & 3) != 3) + return -EIO; + } + if (addr == EFL) { /* flags. */ + data &= FLAG_MASK; + data |= get_stack_long(child, EFL*sizeof(long)-MAGICNUMBER) & ~FLAG_MASK; + } + /* Do not allow the user to set the debug register for kernel + address space */ + if(addr < 17){ + if (put_stack_long(child, sizeof(long)*addr-MAGICNUMBER, data)) + return -EIO; + return 0; + }; + + /* We need to be very careful here. We implicitly + want to modify a portion of the task_struct, and we + have to be selective about what portions we allow someone + to modify. */ + + addr = addr << 2; /* Convert back again */ + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + + if(addr == (long) &dummy->u_debugreg[4]) return -EIO; + if(addr == (long) &dummy->u_debugreg[5]) return -EIO; + if(addr < (long) &dummy->u_debugreg[4] && + ((unsigned long) data) >= 0xbffffffd) return -EIO; + + if(addr == (long) &dummy->u_debugreg[7]) { + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + }; + + addr -= (long) &dummy->u_debugreg; + addr = addr >> 2; + child->debugreg[addr] = data; + return 0; + }; + return -EIO; + + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: { /* restart after signal. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + if (request == PTRACE_SYSCALL) + child->flags |= PF_TRACESYS; + else + child->flags &= ~PF_TRACESYS; + child->exit_code = data; + child->state = TASK_RUNNING; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: { + long tmp; + + child->state = TASK_RUNNING; + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + + case PTRACE_SINGLESTEP: { /* set the trap flag. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + child->flags &= ~PF_TRACESYS; + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) | TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + child->state = TASK_RUNNING; + child->exit_code = data; + /* give it a chance to run. 
*/ + return 0; + } + + case PTRACE_DETACH: { /* detach a process that was attached. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + child->flags &= ~(PF_PTRACED|PF_TRACESYS); + child->state = TASK_RUNNING; + child->exit_code = data; + REMOVE_LINKS(child); + child->p_pptr = child->p_opptr; + SET_LINKS(child); + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + + default: + return -EIO; + } +} + +asmlinkage void syscall_trace(void) +{ + if ((current->flags & (PF_PTRACED|PF_TRACESYS)) + != (PF_PTRACED|PF_TRACESYS)) + return; + current->exit_code = SIGTRAP; + current->state = TASK_STOPPED; + notify_parent(current); + schedule(); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) + current->signal |= (1 << (current->exit_code - 1)); + current->exit_code = 0; +} diff --git a/arch/i386/sched.c b/arch/i386/sched.c new file mode 100644 index 000000000..6eed6e8f5 --- /dev/null +++ b/arch/i386/sched.c @@ -0,0 +1,861 @@ +/* + * linux/kernel/sched.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'sched.c' is the main kernel file. It contains scheduling primitives + * (sleep_on, wakeup, schedule etc) as well as a number of simple system + * call functions (type getpid(), which just extracts a field from + * current-task + */ + +#include <linux/config.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/fdreg.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/ptrace.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/tqueue.h> +#include <linux/resource.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/segment.h> + +#define TIMER_IRQ 0 + +#include <linux/timex.h> + +/* + * kernel variables + */ +long tick = 1000000 / HZ; /* timer interrupt period */ +volatile struct timeval xtime; /* The current time */ +int tickadj = 500/HZ; /* microsecs */ + +DECLARE_TASK_QUEUE(tq_timer); +DECLARE_TASK_QUEUE(tq_immediate); + +/* + * phase-lock loop variables + */ +int time_status = TIME_BAD; /* clock synchronization status */ +long time_offset = 0; /* time adjustment (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = 0x70000000;/* maximum error */ +long time_esterror = 0x70000000;/* estimated error */ +long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ +long time_reftime = 0; /* time at last adjustment (s) */ + +long time_adjust = 0; +long time_adjust_step = 0; + +int need_resched = 0; +unsigned long event = 0; + +/* + * Tell us the machine setup.. + */ +int hard_math = 0; /* set by boot/head.S */ +int x86 = 0; /* set by boot/head.S to 3 or 4 */ +int ignore_irq13 = 0; /* set if exception 16 works */ +int wp_works_ok = 0; /* set if paging hardware honours WP */ +int hlt_works_ok = 1; /* set if the "hlt" instruction works */ + +/* + * Bus types .. 
+ */ +int EISA_bus = 0; + +extern int _setitimer(int, struct itimerval *, struct itimerval *); +unsigned long * prof_buffer = NULL; +unsigned long prof_len = 0; + +#define _S(nr) (1<<((nr)-1)) + +extern void mem_use(void); + +extern int timer_interrupt(void); +asmlinkage int system_call(void); + +static unsigned long init_kernel_stack[1024] = { STACK_MAGIC, }; +static struct vm_area_struct init_mmap = INIT_MMAP; +struct task_struct init_task = INIT_TASK; + +unsigned long volatile jiffies=0; + +struct task_struct *current = &init_task; +struct task_struct *last_task_used_math = NULL; + +struct task_struct * task[NR_TASKS] = {&init_task, }; + +long user_stack [ PAGE_SIZE>>2 ] = { STACK_MAGIC, }; + +struct { + long * a; + short b; + } stack_start = { & user_stack [PAGE_SIZE>>2] , KERNEL_DS }; + +struct kernel_stat kstat = { 0 }; + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + */ +asmlinkage void math_state_restore(void) +{ + __asm__ __volatile__("clts"); + if (last_task_used_math == current) + return; + timer_table[COPRO_TIMER].expires = jiffies+50; + timer_active |= 1<<COPRO_TIMER; + if (last_task_used_math) + __asm__("fnsave %0":"=m" (last_task_used_math->tss.i387)); + else + __asm__("fnclex"); + last_task_used_math = current; + if (current->used_math) { + __asm__("frstor %0": :"m" (current->tss.i387)); + } else { + __asm__("fninit"); + current->used_math=1; + } + timer_active &= ~(1<<COPRO_TIMER); +} + +#ifndef CONFIG_MATH_EMULATION + +asmlinkage void math_emulate(long arg) +{ + printk("math-emulation not enabled and no coprocessor found.\n"); + printk("killing %s.\n",current->comm); + send_sig(SIGFPE,current,1); + schedule(); +} + +#endif /* CONFIG_MATH_EMULATION */ + +unsigned long itimer_ticks = 0; +unsigned long itimer_next = ~0; + +/* + * 'schedule()' is the scheduler function. It's a very simple and nice + * scheduler: it's not perfect, but certainly works for most things. + * The one thing you might take a look at is the signal-handler code here. + * + * NOTE!! Task 0 is the 'idle' task, which gets called when no other + * tasks can run. It can not be killed, and it cannot sleep. The 'state' + * information in task[0] is never used. + * + * The "confuse_gcc" goto is used only to get better assembly code.. + * Dijkstra probably hates me. 
+ */ +asmlinkage void schedule(void) +{ + int c; + struct task_struct * p; + struct task_struct * next; + unsigned long ticks; + +/* check alarm, wake up any interruptible tasks that have got a signal */ + + if (intr_count) { + printk("Aiee: scheduling in interrupt\n"); + intr_count = 0; + } + cli(); + ticks = itimer_ticks; + itimer_ticks = 0; + itimer_next = ~0; + sti(); + need_resched = 0; + p = &init_task; + for (;;) { + if ((p = p->next_task) == &init_task) + goto confuse_gcc1; + if (ticks && p->it_real_value) { + if (p->it_real_value <= ticks) { + send_sig(SIGALRM, p, 1); + if (!p->it_real_incr) { + p->it_real_value = 0; + goto end_itimer; + } + do { + p->it_real_value += p->it_real_incr; + } while (p->it_real_value <= ticks); + } + p->it_real_value -= ticks; + if (p->it_real_value < itimer_next) + itimer_next = p->it_real_value; + } +end_itimer: + if (p->state != TASK_INTERRUPTIBLE) + continue; + if (p->signal & ~p->blocked) { + p->state = TASK_RUNNING; + continue; + } + if (p->timeout && p->timeout <= jiffies) { + p->timeout = 0; + p->state = TASK_RUNNING; + } + } +confuse_gcc1: + +/* this is the scheduler proper: */ +#if 0 + /* give processes that go to sleep a bit higher priority.. */ + /* This depends on the values for TASK_XXX */ + /* This gives smoother scheduling for some things, but */ + /* can be very unfair under some circumstances, so.. */ + if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state && + current->counter < current->priority*2) { + ++current->counter; + } +#endif + c = -1000; + next = p = &init_task; + for (;;) { + if ((p = p->next_task) == &init_task) + goto confuse_gcc2; + if (p->state == TASK_RUNNING && p->counter > c) + c = p->counter, next = p; + } +confuse_gcc2: + if (!c) { + for_each_task(p) + p->counter = (p->counter >> 1) + p->priority; + } + if (current == next) + return; + kstat.context_swtch++; + switch_to(next); + /* Now maybe reload the debug registers */ + if(current->debugreg[7]){ + loaddebug(0); + loaddebug(1); + loaddebug(2); + loaddebug(3); + loaddebug(6); + }; +} + +asmlinkage int sys_pause(void) +{ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return -ERESTARTNOHAND; +} + +/* + * wake_up doesn't wake up stopped processes - they have to be awakened + * with signals or similar. + * + * Note that this doesn't need cli-sti pairs: interrupts may not change + * the wait-queue structures directly, but only call wake_up() to wake + * a process. The process itself must remove the queue once it has woken. 
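The selection logic in schedule() above is compact enough to model in user space: scan for the runnable task with the largest counter, and when every runnable task has a zero counter, recharge all of them with counter = counter/2 + priority (so a task that slept keeps half its unused slice as a bonus). A minimal sketch with two hypothetical tasks:

#include <stdio.h>

struct task { int counter, priority, runnable; };

/* Same two-pass structure as schedule() above: pick the runnable task
 * with the biggest counter; if the best counter is 0, recharge every
 * counter and scan again. Assumes at least one task is runnable (the
 * kernel always has the idle task as a fallback). */
static int pick_next(struct task *t, int n)
{
    for (;;) {
        int c = -1000, next = 0;

        for (int i = 0; i < n; i++)
            if (t[i].runnable && t[i].counter > c)
                c = t[i].counter, next = i;
        if (c)
            return next;
        for (int i = 0; i < n; i++)
            t[i].counter = (t[i].counter >> 1) + t[i].priority;
    }
}

int main(void)
{
    struct task t[2] = { { 0, 15, 1 }, { 0, 10, 1 } };

    for (int k = 0; k < 4; k++) {
        int r = pick_next(t, 2);

        printf("run task %d\n", r);   /* prints 0 1 0 1 */
        t[r].counter = 0;             /* pretend its slice ran out */
    }
    return 0;
}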
+ */ +void wake_up(struct wait_queue **q) +{ + struct wait_queue *tmp; + struct task_struct * p; + + if (!q || !(tmp = *q)) + return; + do { + if ((p = tmp->task) != NULL) { + if ((p->state == TASK_UNINTERRUPTIBLE) || + (p->state == TASK_INTERRUPTIBLE)) { + p->state = TASK_RUNNING; + if (p->counter > current->counter + 3) + need_resched = 1; + } + } + if (!tmp->next) { + printk("wait_queue is bad (eip = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); + printk(" tmp = %p\n",tmp); + break; + } + tmp = tmp->next; + } while (tmp != *q); +} + +void wake_up_interruptible(struct wait_queue **q) +{ + struct wait_queue *tmp; + struct task_struct * p; + + if (!q || !(tmp = *q)) + return; + do { + if ((p = tmp->task) != NULL) { + if (p->state == TASK_INTERRUPTIBLE) { + p->state = TASK_RUNNING; + if (p->counter > current->counter + 3) + need_resched = 1; + } + } + if (!tmp->next) { + printk("wait_queue is bad (eip = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); + printk(" tmp = %p\n",tmp); + break; + } + tmp = tmp->next; + } while (tmp != *q); +} + +void __down(struct semaphore * sem) +{ + struct wait_queue wait = { current, NULL }; + add_wait_queue(&sem->wait, &wait); + current->state = TASK_UNINTERRUPTIBLE; + while (sem->count <= 0) { + schedule(); + current->state = TASK_UNINTERRUPTIBLE; + } + current->state = TASK_RUNNING; + remove_wait_queue(&sem->wait, &wait); +} + +static inline void __sleep_on(struct wait_queue **p, int state) +{ + unsigned long flags; + struct wait_queue wait = { current, NULL }; + + if (!p) + return; + if (current == task[0]) + panic("task[0] trying to sleep"); + current->state = state; + add_wait_queue(p, &wait); + save_flags(flags); + sti(); + schedule(); + remove_wait_queue(p, &wait); + restore_flags(flags); +} + +void interruptible_sleep_on(struct wait_queue **p) +{ + __sleep_on(p,TASK_INTERRUPTIBLE); +} + +void sleep_on(struct wait_queue **p) +{ + __sleep_on(p,TASK_UNINTERRUPTIBLE); +} + +/* + * The head for the timer-list has a "expires" field of MAX_UINT, + * and the sorting routine counts on this.. 
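A wait queue above is a circular, singly linked ring, and wake_up() simply walks it once, marking sleepers runnable without unlinking them; the woken process removes its own entry afterwards, as the comment says. A stand-alone model of that traversal (leaving out the counter-based need_resched hint):

#include <stdio.h>

enum { RUNNING, INTERRUPTIBLE };

struct task { int state; const char *name; };
struct wait_queue { struct task *task; struct wait_queue *next; };

/* One pass around the ring, as wake_up() above does. Entries stay
 * queued; each woken task is expected to dequeue itself later. */
static void wake_all(struct wait_queue **q)
{
    struct wait_queue *tmp;

    if (!q || !(tmp = *q))
        return;
    do {
        if (tmp->task && tmp->task->state == INTERRUPTIBLE) {
            tmp->task->state = RUNNING;
            printf("woke %s\n", tmp->task->name);
        }
        tmp = tmp->next;
    } while (tmp && tmp != *q);
}

int main(void)
{
    struct task a = { INTERRUPTIBLE, "a" }, b = { INTERRUPTIBLE, "b" };
    struct wait_queue qa = { &a, 0 }, qb = { &b, 0 }, *head = &qa;

    qa.next = &qb;
    qb.next = &qa;   /* circular, as add_wait_queue keeps it */
    wake_all(&head);
    return 0;
}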
+ */ +static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL }; +#define SLOW_BUT_DEBUGGING_TIMERS 1 + +void add_timer(struct timer_list * timer) +{ + unsigned long flags; + struct timer_list *p; + +#if SLOW_BUT_DEBUGGING_TIMERS + if (timer->next || timer->prev) { + printk("add_timer() called with non-zero list from %p\n", + __builtin_return_address(0)); + return; + } +#endif + p = &timer_head; + timer->expires += jiffies; + save_flags(flags); + cli(); + do { + p = p->next; + } while (timer->expires > p->expires); + timer->next = p; + timer->prev = p->prev; + p->prev = timer; + timer->prev->next = timer; + restore_flags(flags); +} + +int del_timer(struct timer_list * timer) +{ + unsigned long flags; +#if SLOW_BUT_DEBUGGING_TIMERS + struct timer_list * p; + + p = &timer_head; + save_flags(flags); + cli(); + while ((p = p->next) != &timer_head) { + if (p == timer) { + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + restore_flags(flags); + timer->expires -= jiffies; + return 1; + } + } + if (timer->next || timer->prev) + printk("del_timer() called from %p with timer not initialized\n", + __builtin_return_address(0)); + restore_flags(flags); + return 0; +#else + save_flags(flags); + cli(); + if (timer->next) { + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + restore_flags(flags); + timer->expires -= jiffies; + return 1; + } + restore_flags(flags); + return 0; +#endif +} + +unsigned long timer_active = 0; +struct timer_struct timer_table[32]; + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + */ +unsigned long avenrun[3] = { 0,0,0 }; + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + struct task_struct **p; + unsigned long nr = 0; + + for(p = &LAST_TASK; p > &FIRST_TASK; --p) + if (*p && ((*p)->state == TASK_RUNNING || + (*p)->state == TASK_UNINTERRUPTIBLE || + (*p)->state == TASK_SWAPPING)) + nr += FIXED_1; + return nr; +} + +static inline void calc_load(void) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + if (count-- > 0) + return; + count = LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * These were ported to Linux by Philip Gladstone. + */ +static void second_overflow(void) +{ + long ltemp; + /* last time the cmos clock got updated */ + static long last_rtc_update=0; + extern int set_rtc_mmss(unsigned long); + + /* Bump the maxerror field */ + time_maxerror = (0x70000000-time_maxerror < time_tolerance) ? 
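Note the sentinel trick in add_timer() above: the list head carries expires == ~0, the largest possible value, so the insertion scan needs no explicit end-of-list test. A self-contained sketch with arbitrary expiry values:

#include <stdio.h>

struct timer_list {
    struct timer_list *next, *prev;
    unsigned long expires;
};

/* Sentinel head: expires == ~0UL guarantees the scan below stops. */
static struct timer_list head = { &head, &head, ~0UL };

static void add_timer(struct timer_list *t)
{
    struct timer_list *p = &head;

    do {
        p = p->next;
    } while (t->expires > p->expires);   /* terminates at the sentinel */
    t->next = p;
    t->prev = p->prev;
    p->prev = t;
    t->prev->next = t;
}

int main(void)
{
    struct timer_list a = { 0, 0, 30 }, b = { 0, 0, 10 }, c = { 0, 0, 20 };

    add_timer(&a); add_timer(&b); add_timer(&c);
    for (struct timer_list *p = head.next; p != &head; p = p->next)
        printf("%lu ", p->expires);      /* prints: 10 20 30 */
    printf("\n");
    return 0;
}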
+ 0x70000000 : (time_maxerror + time_tolerance); + + /* Run the PLL */ + if (time_offset < 0) { + ltemp = (-(time_offset+1) >> (SHIFT_KG + time_constant)) + 1; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + time_offset += (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); + time_adj = - time_adj; + } else if (time_offset > 0) { + ltemp = ((time_offset-1) >> (SHIFT_KG + time_constant)) + 1; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + time_offset -= (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); + } else { + time_adj = 0; + } + + time_adj += (time_freq >> (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE)) + + FINETUNE; + + /* Handle the leap second stuff */ + switch (time_status) { + case TIME_INS: + /* ugly divide should be replaced */ + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; /* !! */ + time_status = TIME_OOP; + printk("Clock: inserting leap second 23:59:60 GMT\n"); + } + break; + + case TIME_DEL: + /* ugly divide should be replaced */ + if (xtime.tv_sec % 86400 == 86399) { + xtime.tv_sec++; + time_status = TIME_OK; + printk("Clock: deleting leap second 23:59:59 GMT\n"); + } + break; + + case TIME_OOP: + time_status = TIME_OK; + break; + } + if (xtime.tv_sec > last_rtc_update + 660) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + last_rtc_update = xtime.tv_sec - 600; /* do it again in one min */ +} + +/* + * disregard lost ticks for now.. We don't care enough. + */ +static void timer_bh(void * unused) +{ + unsigned long mask; + struct timer_struct *tp; + struct timer_list * timer; + + cli(); + while ((timer = timer_head.next) != &timer_head && timer->expires < jiffies) { + void (*fn)(unsigned long) = timer->function; + unsigned long data = timer->data; + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + sti(); + fn(data); + cli(); + } + sti(); + + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (tp->expires > jiffies) + continue; + timer_active &= ~mask; + tp->fn(); + sti(); + } +} + +void tqueue_bh(void * unused) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void * unused) +{ + run_task_queue(&tq_immediate); +} + +/* + * The int argument is really a (struct pt_regs *), in case the + * interrupt wants to know from where it was called. The timer + * irq uses this to decide if it should update the user or system + * times. + */ +static void do_timer(struct pt_regs * regs) +{ + unsigned long mask; + struct timer_struct *tp; + + long ltemp, psecs; + + /* Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if (time_phase < -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + xtime.tv_usec += tick + time_adjust_step - ltemp; + } + else if (time_phase > FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + xtime.tv_usec += tick + time_adjust_step + ltemp; + } else + xtime.tv_usec += tick + time_adjust_step; + + if (time_adjust) + { + /* We are doing an adjtime thing. + * + * Modify the value of the tick for next time. + * Note that a positive delta means we want the clock + * to run fast. This means that the tick should be bigger + * + * Limit the amount of the step for *next* tick to be + * in the range -tickadj .. 
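timer_bh() above follows a pattern worth calling out: each expired timer is unlinked from the list with interrupts disabled, but its handler runs with interrupts enabled again, so a handler may itself call add_timer() or del_timer(). A user-space model, with the cli()/sti() pairs reduced to comments:

#include <stdio.h>

struct timer {
    struct timer *next, *prev;
    unsigned long expires;
    void (*fn)(unsigned long);
    unsigned long data;
};

static struct timer head = { &head, &head, ~0UL, 0, 0 };
static unsigned long now = 100;   /* stand-in for jiffies */

static void hello(unsigned long d) { printf("timer %lu fired\n", d); }

static void run_expired(void)
{
    struct timer *t;

    /* cli() */
    while ((t = head.next) != &head && t->expires < now) {
        void (*fn)(unsigned long) = t->fn;
        unsigned long data = t->data;

        t->next->prev = t->prev;   /* unlink while "locked" */
        t->prev->next = t->next;
        t->next = t->prev = 0;
        /* sti() */
        fn(data);                  /* run the handler unlocked */
        /* cli() */
    }
    /* sti() */
}

int main(void)
{
    struct timer a = { &head, &head, 50, hello, 1 };

    head.next = head.prev = &a;
    run_expired();
    return 0;
}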
+tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + else + time_adjust_step = time_adjust; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + else + time_adjust_step = 0; + + if (xtime.tv_usec >= 1000000) { + xtime.tv_usec -= 1000000; + xtime.tv_sec++; + second_overflow(); + } + + jiffies++; + calc_load(); + if ((VM_MASK & regs->eflags) || (3 & regs->cs)) { + current->utime++; + if (current != task[0]) { + if (current->priority < 15) + kstat.cpu_nice++; + else + kstat.cpu_user++; + } + /* Update ITIMER_VIRT for current task if not in a system call */ + if (current->it_virt_value && !(--current->it_virt_value)) { + current->it_virt_value = current->it_virt_incr; + send_sig(SIGVTALRM,current,1); + } + } else { + current->stime++; + if(current != task[0]) + kstat.cpu_system++; +#ifdef CONFIG_PROFILE + if (prof_buffer && current != task[0]) { + unsigned long eip = regs->eip; + eip >>= 2; + if (eip < prof_len) + prof_buffer[eip]++; + } +#endif + } + /* + * check the cpu time limit on the process. + */ + if ((current->rlim[RLIMIT_CPU].rlim_max != RLIM_INFINITY) && + (((current->stime + current->utime) / HZ) >= current->rlim[RLIMIT_CPU].rlim_max)) + send_sig(SIGKILL, current, 1); + if ((current->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) && + (((current->stime + current->utime) % HZ) == 0)) { + psecs = (current->stime + current->utime) / HZ; + /* send when equal */ + if (psecs == current->rlim[RLIMIT_CPU].rlim_cur) + send_sig(SIGXCPU, current, 1); + /* and every five seconds thereafter. */ + else if ((psecs > current->rlim[RLIMIT_CPU].rlim_cur) && + ((psecs - current->rlim[RLIMIT_CPU].rlim_cur) % 5) == 0) + send_sig(SIGXCPU, current, 1); + } + + if (current != task[0] && 0 > --current->counter) { + current->counter = 0; + need_resched = 1; + } + /* Update ITIMER_PROF for the current task */ + if (current->it_prof_value && !(--current->it_prof_value)) { + current->it_prof_value = current->it_prof_incr; + send_sig(SIGPROF,current,1); + } + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (tp->expires > jiffies) + continue; + mark_bh(TIMER_BH); + } + cli(); + itimer_ticks++; + if (itimer_ticks > itimer_next) + need_resched = 1; + if (timer_head.next->expires < jiffies) + mark_bh(TIMER_BH); + if (tq_timer != &tq_last) + mark_bh(TQUEUE_BH); + sti(); +} + +asmlinkage int sys_alarm(long seconds) +{ + struct itimerval it_new, it_old; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + _setitimer(ITIMER_REAL, &it_new, &it_old); + return(it_old.it_value.tv_sec + (it_old.it_value.tv_usec / 1000000)); +} + +asmlinkage int sys_getpid(void) +{ + return current->pid; +} + +asmlinkage int sys_getppid(void) +{ + return current->p_opptr->pid; +} + +asmlinkage int sys_getuid(void) +{ + return current->uid; +} + +asmlinkage int sys_geteuid(void) +{ + return current->euid; +} + +asmlinkage int sys_getgid(void) +{ + return current->gid; +} + +asmlinkage int sys_getegid(void) +{ + return current->egid; +} + +asmlinkage int sys_nice(long increment) +{ + int newprio; + + if (increment < 0 && !suser()) + return -EPERM; + newprio = current->priority - increment; + if (newprio < 1) + newprio = 1; + if (newprio > 35) + newprio = 35; + current->priority = newprio; + return 0; +} + +static void show_task(int nr,struct 
task_struct * p) +{ + unsigned long free; + static char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; + + printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr); + if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *)) + printk(stat_nam[p->state]); + else + printk(" "); + if (p == current) + printk(" current "); + else + printk(" %08lX ", ((unsigned long *)p->tss.esp)[3]); + for (free = 1; free < 1024 ; free++) { + if (((unsigned long *)p->kernel_stack_page)[free]) + break; + } + printk("%5lu %5d %6d ", free << 2, p->pid, p->p_pptr->pid); + if (p->p_cptr) + printk("%5d ", p->p_cptr->pid); + else + printk(" "); + if (p->p_ysptr) + printk("%7d", p->p_ysptr->pid); + else + printk(" "); + if (p->p_osptr) + printk(" %5d\n", p->p_osptr->pid); + else + printk("\n"); +} + +void show_state(void) +{ + int i; + + printk(" free sibling\n"); + printk(" task PC stack pid father child younger older\n"); + for (i=0 ; i<NR_TASKS ; i++) + if (task[i]) + show_task(i,task[i]); +} + +void sched_init(void) +{ + int i; + struct desc_struct * p; + + bh_base[TIMER_BH].routine = timer_bh; + bh_base[TQUEUE_BH].routine = tqueue_bh; + bh_base[IMMEDIATE_BH].routine = immediate_bh; + if (sizeof(struct sigaction) != 16) + panic("Struct sigaction MUST be 16 bytes"); + set_tss_desc(gdt+FIRST_TSS_ENTRY,&init_task.tss); + set_ldt_desc(gdt+FIRST_LDT_ENTRY,&default_ldt,1); + set_system_gate(0x80,&system_call); + p = gdt+2+FIRST_TSS_ENTRY; + for(i=1 ; i<NR_TASKS ; i++) { + task[i] = NULL; + p->a=p->b=0; + p++; + p->a=p->b=0; + p++; + } +/* Clear NT, so that we won't have troubles with that later on */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + load_TR(0); + load_ldt(0); + outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(LATCH & 0xff , 0x40); /* LSB */ + outb(LATCH >> 8 , 0x40); /* MSB */ + if (request_irq(TIMER_IRQ,(void (*)(int)) do_timer, 0, "timer") != 0) + panic("Could not allocate timer IRQ!"); +} diff --git a/arch/i386/signal.c b/arch/i386/signal.c new file mode 100644 index 000000000..df7324294 --- /dev/null +++ b/arch/i386/signal.c @@ -0,0 +1,407 @@ +/* + * linux/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/ptrace.h> +#include <linux/unistd.h> + +#include <asm/segment.h> + +#define _S(nr) (1<<((nr)-1)) + +#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) + +asmlinkage int do_signal(unsigned long oldmask, struct pt_regs * regs); + +asmlinkage int sys_sigprocmask(int how, sigset_t *set, sigset_t *oset) +{ + sigset_t new_set, old_set = current->blocked; + int error; + + if (set) { + error = verify_area(VERIFY_READ, set, sizeof(sigset_t)); + if (error) + return error; + new_set = get_fs_long((unsigned long *) set) & _BLOCKABLE; + switch (how) { + case SIG_BLOCK: + current->blocked |= new_set; + break; + case SIG_UNBLOCK: + current->blocked &= ~new_set; + break; + case SIG_SETMASK: + current->blocked = new_set; + break; + default: + return -EINVAL; + } + } + if (oset) { + error = verify_area(VERIFY_WRITE, oset, sizeof(sigset_t)); + if (error) + return error; + put_fs_long(old_set, (unsigned long *) oset); + } + return 0; +} + +asmlinkage int sys_sgetmask(void) +{ + return current->blocked; +} + +asmlinkage int sys_ssetmask(int newmask) +{ + int old=current->blocked; + + current->blocked = newmask & _BLOCKABLE; + return old; +} + +asmlinkage int sys_sigpending(sigset_t *set) +{ + int error; + /* fill in "set" with 
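The three cases of sys_sigprocmask() above, and the pending-signal discard rule implemented by check_pending() a little further down, are still the user-visible POSIX behaviour; a small demonstration, assuming a POSIX libc:

#include <signal.h>
#include <stdio.h>

int main(void)
{
    sigset_t set, old, pend;

    sigemptyset(&set);
    sigaddset(&set, SIGINT);

    sigprocmask(SIG_BLOCK, &set, &old);   /* blocked |= new_set */
    raise(SIGINT);                        /* pends instead of killing us */

    sigpending(&pend);
    printf("SIGINT pending: %d\n", sigismember(&pend, SIGINT));

    signal(SIGINT, SIG_IGN);              /* discards the pending signal */
    sigprocmask(SIG_SETMASK, &old, 0);    /* blocked = old mask; safe now */
    return 0;
}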
signals pending but blocked. */ + error = verify_area(VERIFY_WRITE, set, 4); + if (!error) + put_fs_long(current->blocked & current->signal, (unsigned long *)set); + return error; +} + +/* + * atomically swap in the new signal mask, and wait for a signal. + */ +asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, unsigned long set) +{ + unsigned long mask; + struct pt_regs * regs = (struct pt_regs *) &restart; + + mask = current->blocked; + current->blocked = set & _BLOCKABLE; + regs->eax = -EINTR; + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (do_signal(mask,regs)) + return -EINTR; + } +} + +/* + * POSIX 3.3.1.3: + * "Setting a signal action to SIG_IGN for a signal that is pending + * shall cause the pending signal to be discarded, whether or not + * it is blocked" (but SIGCHLD is unspecified: linux leaves it alone). + * + * "Setting a signal action to SIG_DFL for a signal that is pending + * and whose default action is to ignore the signal (for example, + * SIGCHLD), shall cause the pending signal to be discarded, whether + * or not it is blocked" + * + * Note the silly behaviour of SIGCHLD: SIG_IGN means that the signal + * isn't actually ignored, but does automatic child reaping, while + * SIG_DFL is explicitly said by POSIX to force the signal to be ignored.. + */ +static void check_pending(int signum) +{ + struct sigaction *p; + + p = signum - 1 + current->sigaction; + if (p->sa_handler == SIG_IGN) { + if (signum == SIGCHLD) + return; + current->signal &= ~_S(signum); + return; + } + if (p->sa_handler == SIG_DFL) { + if (signum != SIGCONT && signum != SIGCHLD && signum != SIGWINCH) + return; + current->signal &= ~_S(signum); + return; + } +} + +asmlinkage int sys_signal(int signum, unsigned long handler) +{ + struct sigaction tmp; + + if (signum<1 || signum>32) + return -EINVAL; + if (signum==SIGKILL || signum==SIGSTOP) + return -EINVAL; + if (handler >= TASK_SIZE) + return -EFAULT; + tmp.sa_handler = (void (*)(int)) handler; + tmp.sa_mask = 0; + tmp.sa_flags = SA_ONESHOT | SA_NOMASK; + tmp.sa_restorer = NULL; + handler = (long) current->sigaction[signum-1].sa_handler; + current->sigaction[signum-1] = tmp; + check_pending(signum); + return handler; +} + +asmlinkage int sys_sigaction(int signum, const struct sigaction * action, + struct sigaction * oldaction) +{ + struct sigaction new_sa, *p; + + if (signum<1 || signum>32) + return -EINVAL; + if (signum==SIGKILL || signum==SIGSTOP) + return -EINVAL; + p = signum - 1 + current->sigaction; + if (action) { + int err = verify_area(VERIFY_READ, action, sizeof(*action)); + if (err) + return err; + memcpy_fromfs(&new_sa, action, sizeof(struct sigaction)); + if (new_sa.sa_flags & SA_NOMASK) + new_sa.sa_mask = 0; + else { + new_sa.sa_mask |= _S(signum); + new_sa.sa_mask &= _BLOCKABLE; + } + if (TASK_SIZE <= (unsigned long) new_sa.sa_handler) + return -EFAULT; + } + if (oldaction) { + int err = verify_area(VERIFY_WRITE, oldaction, sizeof(*oldaction)); + if (err) + return err; + memcpy_tofs(oldaction, p, sizeof(struct sigaction)); + } + if (action) { + *p = new_sa; + check_pending(signum); + } + return 0; +} + +asmlinkage int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options); + +/* + * This sets regs->esp even though we don't actually use sigstacks yet.. 
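sys_sigsuspend() above exists to close a classic race: testing a flag and then sleeping is unsafe when the signal can arrive in between, so the mask swap and the sleep must happen atomically. The canonical user-space usage, with a hypothetical SIGUSR1 handler:

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig) { (void)sig; got_usr1 = 1; }

int main(void)
{
    struct sigaction sa = { 0 };
    sigset_t block, orig;

    sa.sa_handler = on_usr1;
    sigaction(SIGUSR1, &sa, 0);

    sigemptyset(&block);
    sigaddset(&block, SIGUSR1);
    sigprocmask(SIG_BLOCK, &block, &orig);   /* close the race window */

    raise(SIGUSR1);   /* may arrive "early": it just stays pending */

    while (!got_usr1)
        sigsuspend(&orig);   /* atomically unblock and sleep */

    printf("got it\n");
    return 0;
}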
+ */ +asmlinkage int sys_sigreturn(unsigned long __unused) +{ +#define COPY(x) regs->x = context.x +#define COPY_SEG(x) \ +if ((context.x & 0xfffc) && (context.x & 3) != 3) goto badframe; COPY(x); +#define COPY_SEG_STRICT(x) \ +if (!(context.x & 0xfffc) || (context.x & 3) != 3) goto badframe; COPY(x); + struct sigcontext_struct context; + struct pt_regs * regs; + + regs = (struct pt_regs *) &__unused; + if (verify_area(VERIFY_READ, (void *) regs->esp, sizeof(context))) + goto badframe; + memcpy_fromfs(&context,(void *) regs->esp, sizeof(context)); + current->blocked = context.oldmask & _BLOCKABLE; + COPY_SEG(ds); + COPY_SEG(es); + COPY_SEG(fs); + COPY_SEG(gs); + COPY_SEG_STRICT(ss); + COPY_SEG_STRICT(cs); + COPY(eip); + COPY(ecx); COPY(edx); + COPY(ebx); + COPY(esp); COPY(ebp); + COPY(edi); COPY(esi); + regs->eflags &= ~0x40DD5; + regs->eflags |= context.eflags & 0x40DD5; + regs->orig_eax = -1; /* disable syscall checks */ + return context.eax; +badframe: + do_exit(SIGSEGV); +} + +/* + * Set up a signal frame... Make the stack look the way iBCS2 expects + * it to look. + */ +static void setup_frame(struct sigaction * sa, unsigned long ** fp, unsigned long eip, + struct pt_regs * regs, int signr, unsigned long oldmask) +{ + unsigned long * frame; + +#define __CODE ((unsigned long)(frame+24)) +#define CODE(x) ((unsigned long *) ((x)+__CODE)) + frame = *fp; + if (regs->ss != USER_DS) + frame = (unsigned long *) sa->sa_restorer; + frame -= 32; + if (verify_area(VERIFY_WRITE,frame,32*4)) + do_exit(SIGSEGV); +/* set up the "normal" stack seen by the signal handler (iBCS2) */ + put_fs_long(__CODE,frame); + if (current->exec_domain && current->exec_domain->signal_invmap) + put_fs_long(current->exec_domain->signal_invmap[signr], frame+1); + else + put_fs_long(signr, frame+1); + put_fs_long(regs->gs, frame+2); + put_fs_long(regs->fs, frame+3); + put_fs_long(regs->es, frame+4); + put_fs_long(regs->ds, frame+5); + put_fs_long(regs->edi, frame+6); + put_fs_long(regs->esi, frame+7); + put_fs_long(regs->ebp, frame+8); + put_fs_long((long)*fp, frame+9); + put_fs_long(regs->ebx, frame+10); + put_fs_long(regs->edx, frame+11); + put_fs_long(regs->ecx, frame+12); + put_fs_long(regs->eax, frame+13); + put_fs_long(current->tss.trap_no, frame+14); + put_fs_long(current->tss.error_code, frame+15); + put_fs_long(eip, frame+16); + put_fs_long(regs->cs, frame+17); + put_fs_long(regs->eflags, frame+18); + put_fs_long(regs->esp, frame+19); + put_fs_long(regs->ss, frame+20); + put_fs_long(0,frame+21); /* 387 state pointer - not implemented*/ +/* non-iBCS2 extensions.. */ + put_fs_long(oldmask, frame+22); + put_fs_long(current->tss.cr2, frame+23); +/* set up the return code... */ + put_fs_long(0x0000b858, CODE(0)); /* popl %eax ; movl $,%eax */ + put_fs_long(0x80cd0000, CODE(4)); /* int $0x80 */ + put_fs_long(__NR_sigreturn, CODE(2)); + *fp = frame; +#undef __CODE +#undef CODE +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + * + * Note that we go through the signals twice: once to check the signals that + * the kernel can handle, and then we build all the user-level signal handling + * stack-frames in one go after that. 
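The three put_fs_long() calls at the end of setup_frame() assemble an 8-byte return trampoline on the user stack, and the middle store deliberately overlaps the first two so the syscall number lands inside the movl immediate. A little-endian sketch of the byte layout (__NR_sigreturn taken to be 119, the i386 value, as an assumption):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define __NR_sigreturn 119   /* i386 value; assumption for this demo */

int main(void)
{
    unsigned char code[8];
    uint32_t w;

    w = 0x0000b858;     memcpy(code + 0, &w, 4);   /* 58 b8 00 00 */
    w = 0x80cd0000;     memcpy(code + 4, &w, 4);   /* 00 00 cd 80 */
    w = __NR_sigreturn; memcpy(code + 2, &w, 4);   /* patch bytes 2..5 */

    for (int i = 0; i < 8; i++)
        printf("%02x ", code[i]);
    printf("\n");   /* 58 b8 77 00 00 00 cd 80 =
                       popl %eax; movl $119,%eax; int $0x80 */
    return 0;
}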
+ */ +asmlinkage int do_signal(unsigned long oldmask, struct pt_regs * regs) +{ + unsigned long mask = ~current->blocked; + unsigned long handler_signal = 0; + unsigned long *frame = NULL; + unsigned long eip = 0; + unsigned long signr; + struct sigaction * sa; + + while ((signr = current->signal & mask)) { + __asm__("bsf %2,%1\n\t" + "btrl %1,%0" + :"=m" (current->signal),"=r" (signr) + :"1" (signr)); + sa = current->sigaction + signr; + signr++; + if ((current->flags & PF_PTRACED) && signr != SIGKILL) { + current->exit_code = signr; + current->state = TASK_STOPPED; + notify_parent(current); + schedule(); + if (!(signr = current->exit_code)) + continue; + current->exit_code = 0; + if (signr == SIGSTOP) + continue; + if (_S(signr) & current->blocked) { + current->signal |= _S(signr); + continue; + } + sa = current->sigaction + signr - 1; + } + if (sa->sa_handler == SIG_IGN) { + if (signr != SIGCHLD) + continue; + /* check for SIGCHLD: it's special */ + while (sys_waitpid(-1,NULL,WNOHANG) > 0) + /* nothing */; + continue; + } + if (sa->sa_handler == SIG_DFL) { + if (current->pid == 1) + continue; + switch (signr) { + case SIGCONT: case SIGCHLD: case SIGWINCH: + continue; + + case SIGSTOP: case SIGTSTP: case SIGTTIN: case SIGTTOU: + if (current->flags & PF_PTRACED) + continue; + current->state = TASK_STOPPED; + current->exit_code = signr; + if (!(current->p_pptr->sigaction[SIGCHLD-1].sa_flags & + SA_NOCLDSTOP)) + notify_parent(current); + schedule(); + continue; + + case SIGQUIT: case SIGILL: case SIGTRAP: + case SIGIOT: case SIGFPE: case SIGSEGV: + if (current->binfmt && current->binfmt->core_dump) { + if (current->binfmt->core_dump(signr, regs)) + signr |= 0x80; + } + /* fall through */ + default: + current->signal |= _S(signr & 0x7f); + do_exit(signr); + } + } + /* + * OK, we're invoking a handler + */ + if (regs->orig_eax >= 0) { + if (regs->eax == -ERESTARTNOHAND || + (regs->eax == -ERESTARTSYS && !(sa->sa_flags & SA_RESTART))) + regs->eax = -EINTR; + } + handler_signal |= 1 << (signr-1); + mask &= ~sa->sa_mask; + } + if (regs->orig_eax >= 0 && + (regs->eax == -ERESTARTNOHAND || + regs->eax == -ERESTARTSYS || + regs->eax == -ERESTARTNOINTR)) { + regs->eax = regs->orig_eax; + regs->eip -= 2; + } + if (!handler_signal) /* no handler will be called - return 0 */ + return 0; + eip = regs->eip; + frame = (unsigned long *) regs->esp; + signr = 1; + sa = current->sigaction; + for (mask = 1 ; mask ; sa++,signr++,mask += mask) { + if (mask > handler_signal) + break; + if (!(mask & handler_signal)) + continue; + setup_frame(sa,&frame,eip,regs,signr,oldmask); + eip = (unsigned long) sa->sa_handler; + if (sa->sa_flags & SA_ONESHOT) + sa->sa_handler = NULL; +/* force a supervisor-mode page-in of the signal handler to reduce races */ + __asm__("testb $0,%%fs:%0": :"m" (*(char *) eip)); + regs->cs = USER_CS; regs->ss = USER_DS; + regs->ds = USER_DS; regs->es = USER_DS; + regs->gs = USER_DS; regs->fs = USER_DS; + current->blocked |= sa->sa_mask; + oldmask |= sa->sa_mask; + } + regs->esp = (unsigned long) frame; + regs->eip = eip; /* "return" to the first handler */ + current->tss.trap_no = current->tss.error_code = 0; + return 1; +} diff --git a/arch/i386/traps.c b/arch/i386/traps.c new file mode 100644 index 000000000..150b702b3 --- /dev/null +++ b/arch/i386/traps.c @@ -0,0 +1,245 @@ +/* + * linux/kernel/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. 
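The bsf/btrl asm pair at the top of do_signal() picks the lowest-numbered pending signal and clears its bit in one step. The same selection in portable C, with gcc's __builtin_ctzl standing in for bsf:

#include <stdio.h>

/* Find the lowest set bit (bsf), clear it (btrl), and translate the
 * bit index to a 1-based signal number, as do_signal() above does. */
static int take_lowest_signal(unsigned long *pending)
{
    if (!*pending)
        return 0;
    int bit = __builtin_ctzl(*pending);
    *pending &= ~(1UL << bit);
    return bit + 1;
}

int main(void)
{
    unsigned long sig = (1UL << (2 - 1)) | (1UL << (15 - 1));
    int s;

    while ((s = take_lowest_signal(&sig)))
        printf("deliver signal %d\n", s);   /* 2, then 15 */
    return 0;
}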
Currently mostly a debugging-aid, will be extended + * to mainly kill the offending process (probably by giving it a signal, + * but possibly by killing it outright if necessary). + */ +#include <linux/head.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/ptrace.h> + +#include <asm/system.h> +#include <asm/segment.h> +#include <asm/io.h> + +static inline void console_verbose(void) +{ + extern int console_loglevel; + console_loglevel = 15; +} + +#define DO_ERROR(trapnr, signr, str, name, tsk) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + tsk->tss.error_code = error_code; \ + tsk->tss.trap_no = trapnr; \ + if (signr == SIGTRAP && current->flags & PF_PTRACED) \ + current->blocked &= ~(1 << (SIGTRAP-1)); \ + send_sig(signr, tsk, 1); \ + die_if_kernel(str,regs,error_code); \ +} + +#define get_seg_byte(seg,addr) ({ \ +register unsigned char __res; \ +__asm__("push %%fs;mov %%ax,%%fs;movb %%fs:%2,%%al;pop %%fs" \ + :"=a" (__res):"0" (seg),"m" (*(addr))); \ +__res;}) + +#define get_seg_long(seg,addr) ({ \ +register unsigned long __res; \ +__asm__("push %%fs;mov %%ax,%%fs;movl %%fs:%2,%%eax;pop %%fs" \ + :"=a" (__res):"0" (seg),"m" (*(addr))); \ +__res;}) + +#define _fs() ({ \ +register unsigned short __res; \ +__asm__("mov %%fs,%%ax":"=a" (__res):); \ +__res;}) + +void page_exception(void); + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void double_fault(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void reserved(void); +asmlinkage void alignment_check(void); + +/*static*/ void die_if_kernel(char * str, struct pt_regs * regs, long err) +{ + int i; + unsigned long esp; + unsigned short ss; + + esp = (unsigned long) &regs->esp; + ss = KERNEL_DS; + if ((regs->eflags & VM_MASK) || (3 & regs->cs) == 3) + return; + if (regs->cs & 3) { + esp = regs->esp; + ss = regs->ss; + } + console_verbose(); + printk("%s: %04lx\n", str, err & 0xffff); + printk("EIP: %04x:%08lx\nEFLAGS: %08lx\n", 0xffff & regs->cs,regs->eip,regs->eflags); + printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", + regs->ds, regs->es, regs->fs, regs->gs, ss); + store_TR(i); + if (STACK_MAGIC != *(unsigned long *)current->kernel_stack_page) + printk("Corrupted stack page\n"); + printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)\nStack: ", + current->comm, current->pid, 0xffff & i, current->kernel_stack_page); + for(i=0;i<5;i++) + printk("%08lx ", get_seg_long(ss,(i+(unsigned long *)esp))); + printk("\nCode: "); + for(i=0;i<20;i++) + printk("%02x ",0xff & get_seg_byte(regs->cs,(i+(char *)regs->eip))); + printk("\n"); + do_exit(SIGSEGV); +} + +DO_ERROR( 0, SIGFPE, "divide error", divide_error, current) +DO_ERROR( 3, SIGTRAP, "int3", int3, current) +DO_ERROR( 4, SIGSEGV, "overflow", overflow, current) +DO_ERROR( 5, SIGSEGV, "bounds",
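Because the DO_ERROR() invocations above read like declarations, here is what the first one expands to, mechanically substituted from the macro (kernel code, shown for reference only, not runnable on its own):

asmlinkage void do_divide_error(struct pt_regs * regs, long error_code)
{
    current->tss.error_code = error_code;
    current->tss.trap_no = 0;
    if (SIGFPE == SIGTRAP && current->flags & PF_PTRACED)   /* always false here */
        current->blocked &= ~(1 << (SIGTRAP-1));
    send_sig(SIGFPE, current, 1);
    die_if_kernel("divide error", regs, error_code);
}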
bounds, current) +DO_ERROR( 6, SIGILL, "invalid operand", invalid_op, current) +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available, current) +DO_ERROR( 8, SIGSEGV, "double fault", double_fault, current) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun, last_task_used_math) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS, current) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present, current) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment, current) +DO_ERROR(15, SIGSEGV, "reserved", reserved, current) +DO_ERROR(17, SIGSEGV, "alignment check", alignment_check, current) + +asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) +{ + int signr = SIGSEGV; + + if (regs->eflags & VM_MASK) { + handle_vm86_fault((struct vm86_regs *) regs, error_code); + return; + } + die_if_kernel("general protection",regs,error_code); + switch (get_seg_byte(regs->cs, (char *)regs->eip)) { + case 0xCD: /* INT */ + case 0xF4: /* HLT */ + case 0xFA: /* CLI */ + case 0xFB: /* STI */ + signr = SIGILL; + } + current->tss.error_code = error_code; + current->tss.trap_no = 13; + send_sig(signr, current, 1); +} + +asmlinkage void do_nmi(struct pt_regs * regs, long error_code) +{ + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); + printk("You probably have a hardware problem with your RAM chips\n"); +} + +asmlinkage void do_debug(struct pt_regs * regs, long error_code) +{ + if (regs->eflags & VM_MASK) { + handle_vm86_debug((struct vm86_regs *) regs, error_code); + return; + } + if (current->flags & PF_PTRACED) + current->blocked &= ~(1 << (SIGTRAP-1)); + send_sig(SIGTRAP, current, 1); + current->tss.trap_no = 1; + current->tss.error_code = error_code; + if ((regs->cs & 3) == 0) { + /* If this is a kernel mode trap, then reset db7 and allow us to continue */ + __asm__("movl %0,%%db7" + : /* no output */ + : "r" (0)); + return; + } + die_if_kernel("debug",regs,error_code); +} + +/* + * Allow the process which triggered the interrupt to recover the error + * condition. + * - the status word is saved in the cs selector. + * - the tag word is saved in the operand selector. + * - the status word is then cleared and the tags all set to Empty. + * + * This will give sufficient information for complete recovery provided that + * the affected process knows or can deduce the code and data segments + * which were in force when the exception condition arose. 
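The opcode switch in do_general_protection() above upgrades a few sensitive opcodes (software interrupt, halt, and interrupt-flag manipulation) from the default SIGSEGV to SIGILL. The mapping in isolation:

#include <signal.h>
#include <stdio.h>

/* Same opcode-to-signal mapping as do_general_protection() above. */
static int gp_signal(unsigned char opcode)
{
    switch (opcode) {
    case 0xCD:   /* INT imm8 */
    case 0xF4:   /* HLT */
    case 0xFA:   /* CLI */
    case 0xFB:   /* STI */
        return SIGILL;
    default:
        return SIGSEGV;
    }
}

int main(void)
{
    /* 0x8B (a mov) is just an arbitrary non-listed opcode. */
    printf("hlt: %d, mov: %d\n", gp_signal(0xF4), gp_signal(0x8B));
    return 0;
}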
+ * + * Note that we play around with the 'TS' bit to hopefully get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +void math_error(void) +{ + struct i387_hard_struct * env; + + clts(); + if (!last_task_used_math) { + __asm__("fnclex"); + return; + } + env = &last_task_used_math->tss.i387.hard; + send_sig(SIGFPE, last_task_used_math, 1); + last_task_used_math->tss.trap_no = 16; + last_task_used_math->tss.error_code = 0; + __asm__ __volatile__("fnsave %0":"=m" (*env)); + last_task_used_math = NULL; + stts(); + env->fcs = (env->swd & 0x0000ffff) | (env->fcs & 0xffff0000); + env->fos = env->twd; + env->swd &= 0xffff3800; + env->twd = 0xffffffff; +} + +asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code) +{ + ignore_irq13 = 1; + math_error(); +} + +void trap_init(void) +{ + int i; + + set_trap_gate(0,&divide_error); + set_trap_gate(1,&debug); + set_trap_gate(2,&nmi); + set_system_gate(3,&int3); /* int3-5 can be called from all */ + set_system_gate(4,&overflow); + set_system_gate(5,&bounds); + set_trap_gate(6,&invalid_op); + set_trap_gate(7,&device_not_available); + set_trap_gate(8,&double_fault); + set_trap_gate(9,&coprocessor_segment_overrun); + set_trap_gate(10,&invalid_TSS); + set_trap_gate(11,&segment_not_present); + set_trap_gate(12,&stack_segment); + set_trap_gate(13,&general_protection); + set_trap_gate(14,&page_fault); + set_trap_gate(15,&reserved); + set_trap_gate(16,&coprocessor_error); + set_trap_gate(17,&alignment_check); + for (i=18;i<48;i++) + set_trap_gate(i,&reserved); +} diff --git a/arch/i386/vm86.c b/arch/i386/vm86.c new file mode 100644 index 000000000..144d93a02 --- /dev/null +++ b/arch/i386/vm86.c @@ -0,0 +1,404 @@ +/* + * linux/kernel/vm86.c + * + * Copyright (C) 1994 Linus Torvalds + */ +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/string.h> +#include <linux/ptrace.h> + +#include <asm/segment.h> +#include <asm/io.h> + +/* + * Known problems: + * + * Interrupt handling is not guaranteed: + * - a real x86 will disable all interrupts for one instruction + * after a "mov ss,xx" to make stack handling atomic even without + * the 'lss' instruction. We can't guarantee this in v86 mode, + * as the next instruction might result in a page fault or similar. + * - a real x86 will have interrupts disabled for one instruction + * past the 'sti' that enables them. We don't bother with all the + * details yet.. + * + * Hopefully these problems do not actually matter for anything. + */ + +/* + * 8- and 16-bit register defines..
+ */ +#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->eip)) +#define SP(regs) (*(unsigned short *)&((regs)->esp)) + +/* + * virtual flags (16 and 32-bit versions) + */ +#define VFLAGS (*(unsigned short *)&(current->v86flags)) +#define VEFLAGS (current->v86flags) + +#define set_flags(X,new,mask) \ +((X) = ((X) & ~(mask)) | ((new) & (mask))) + +#define SAFE_MASK (0xDD5) +#define RETURN_MASK (0xDFF) + +asmlinkage struct pt_regs * save_v86_state(struct vm86_regs * regs) +{ + unsigned long tmp; + + if (!current->vm86_info) { + printk("no vm86_info: BAD\n"); + do_exit(SIGSEGV); + } + set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->v86mask); + memcpy_tofs(&current->vm86_info->regs,regs,sizeof(*regs)); + put_fs_long(current->screen_bitmap,&current->vm86_info->screen_bitmap); + tmp = current->tss.esp0; + current->tss.esp0 = current->saved_kernel_stack; + current->saved_kernel_stack = 0; + return (struct pt_regs *) tmp; +} + +static void mark_screen_rdonly(struct task_struct * tsk) +{ + unsigned long tmp; + unsigned long *pg_table; + + if ((tmp = tsk->tss.cr3) != 0) { + tmp = *(unsigned long *) tmp; + if (tmp & PAGE_PRESENT) { + tmp &= PAGE_MASK; + pg_table = (0xA0000 >> PAGE_SHIFT) + (unsigned long *) tmp; + tmp = 32; + while (tmp--) { + if (PAGE_PRESENT & *pg_table) + *pg_table &= ~PAGE_RW; + pg_table++; + } + } + } +} + +asmlinkage int sys_vm86(struct vm86_struct * v86) +{ + struct vm86_struct info; + struct pt_regs * pt_regs = (struct pt_regs *) &v86; + int error; + + if (current->saved_kernel_stack) + return -EPERM; + /* v86 must be readable (now) and writable (for save_v86_state) */ + error = verify_area(VERIFY_WRITE,v86,sizeof(*v86)); + if (error) + return error; + memcpy_fromfs(&info,v86,sizeof(info)); +/* + * make sure the vm86() system call doesn't try to do anything silly + */ + info.regs.__null_ds = 0; + info.regs.__null_es = 0; + info.regs.__null_fs = 0; + info.regs.__null_gs = 0; +/* + * The eflags register is also special: we cannot trust that the user + * has set it up safely, so this makes sure interrupt etc flags are + * inherited from protected mode.
+ */ + VEFLAGS = info.regs.eflags; + info.regs.eflags &= SAFE_MASK; + info.regs.eflags |= pt_regs->eflags & ~SAFE_MASK; + info.regs.eflags |= VM_MASK; + + switch (info.cpu_type) { + case CPU_286: + current->v86mask = 0; + break; + case CPU_386: + current->v86mask = NT_MASK | IOPL_MASK; + break; + case CPU_486: + current->v86mask = AC_MASK | NT_MASK | IOPL_MASK; + break; + default: + current->v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; + break; + } + +/* + * Save old state, set default return value (%eax) to 0 + */ + pt_regs->eax = 0; + current->saved_kernel_stack = current->tss.esp0; + current->tss.esp0 = (unsigned long) pt_regs; + current->vm86_info = v86; + + current->screen_bitmap = info.screen_bitmap; + if (info.flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(current); + __asm__ __volatile__("movl %0,%%esp\n\t" + "jmp ret_from_sys_call" + : /* no outputs */ + :"r" (&info.regs)); + return 0; +} + +static inline void return_to_32bit(struct vm86_regs * regs16, int retval) +{ + struct pt_regs * regs32; + + regs32 = save_v86_state(regs16); + regs32->eax = retval; + __asm__ __volatile__("movl %0,%%esp\n\t" + "jmp ret_from_sys_call" + : : "r" (regs32)); +} + +static inline void set_IF(struct vm86_regs * regs) +{ + VEFLAGS |= VIF_MASK; + if (VEFLAGS & VIP_MASK) + return_to_32bit(regs, VM86_STI); +} + +static inline void clear_IF(struct vm86_regs * regs) +{ + VEFLAGS &= ~VIF_MASK; +} + +static inline void clear_TF(struct vm86_regs * regs) +{ + regs->eflags &= ~TF_MASK; +} + +static inline void set_vflags_long(unsigned long eflags, struct vm86_regs * regs) +{ + set_flags(VEFLAGS, eflags, current->v86mask); + set_flags(regs->eflags, eflags, SAFE_MASK); + if (eflags & IF_MASK) + set_IF(regs); +} + +static inline void set_vflags_short(unsigned short flags, struct vm86_regs * regs) +{ + set_flags(VFLAGS, flags, current->v86mask); + set_flags(regs->eflags, flags, SAFE_MASK); + if (flags & IF_MASK) + set_IF(regs); +} + +static inline unsigned long get_vflags(struct vm86_regs * regs) +{ + unsigned long flags = regs->eflags & RETURN_MASK; + + if (VEFLAGS & VIF_MASK) + flags |= IF_MASK; + return flags | (VEFLAGS & current->v86mask); +} + +static inline int is_revectored(int nr, struct revectored_struct * bitmap) +{ + __asm__ __volatile__("btl %2,%%fs:%1\n\tsbbl %0,%0" + :"=r" (nr) + :"m" (*bitmap),"r" (nr)); + return nr; +} + +/* + * Boy are these ugly, but we need to do the correct 16-bit arithmetic. + * Gcc makes a mess of it, so we do it inline and use non-obvious calling + * conventions.. 
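set_flags() above is a plain masked merge, and together with the per-CPU v86mask chosen in sys_vm86() it decides which guest flag bits are honoured; IF in particular never reaches the real EFLAGS but is virtualized through VIF by set_IF()/clear_IF(). An illustration using the architectural bit positions (IF=0x200, IOPL=0x3000, NT=0x4000; believed correct, but check the headers):

#include <stdio.h>

#define IF_MASK   0x00000200
#define IOPL_MASK 0x00003000
#define NT_MASK   0x00004000

#define set_flags(X, new, mask) \
    ((X) = ((X) & ~(mask)) | ((new) & (mask)))

int main(void)
{
    unsigned long veflags = 0;
    unsigned long v86mask = NT_MASK | IOPL_MASK;   /* the CPU_386 case */
    unsigned long guest   = IF_MASK | NT_MASK;     /* what v86 code set */

    /* IF is filtered out here: only bits in v86mask come from the guest. */
    set_flags(veflags, guest, v86mask);
    printf("veflags = %#lx\n", veflags);   /* 0x4000: NT kept, IF dropped */
    return 0;
}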
+ */ +#define pushb(base, ptr, val) \ +__asm__ __volatile__( \ + "decw %w0\n\t" \ + "movb %2,%%fs:0(%1,%0)" \ + : "=r" (ptr) \ + : "r" (base), "q" (val), "0" (ptr)) + +#define pushw(base, ptr, val) \ +__asm__ __volatile__( \ + "decw %w0\n\t" \ + "movb %h2,%%fs:0(%1,%0)\n\t" \ + "decw %w0\n\t" \ + "movb %b2,%%fs:0(%1,%0)" \ + : "=r" (ptr) \ + : "r" (base), "q" (val), "0" (ptr)) + +#define pushl(base, ptr, val) \ +__asm__ __volatile__( \ + "decw %w0\n\t" \ + "rorl $16,%2\n\t" \ + "movb %h2,%%fs:0(%1,%0)\n\t" \ + "decw %w0\n\t" \ + "movb %b2,%%fs:0(%1,%0)\n\t" \ + "decw %w0\n\t" \ + "rorl $16,%2\n\t" \ + "movb %h2,%%fs:0(%1,%0)\n\t" \ + "decw %w0\n\t" \ + "movb %b2,%%fs:0(%1,%0)" \ + : "=r" (ptr) \ + : "r" (base), "q" (val), "0" (ptr)) + +#define popb(base, ptr) \ +({ unsigned long __res; \ +__asm__ __volatile__( \ + "movb %%fs:0(%1,%0),%b2\n\t" \ + "incw %w0" \ + : "=r" (ptr), "=r" (base), "=q" (__res) \ + : "0" (ptr), "1" (base), "2" (0)); \ +__res; }) + +#define popw(base, ptr) \ +({ unsigned long __res; \ +__asm__ __volatile__( \ + "movb %%fs:0(%1,%0),%b2\n\t" \ + "incw %w0\n\t" \ + "movb %%fs:0(%1,%0),%h2\n\t" \ + "incw %w0" \ + : "=r" (ptr), "=r" (base), "=q" (__res) \ + : "0" (ptr), "1" (base), "2" (0)); \ +__res; }) + +#define popl(base, ptr) \ +({ unsigned long __res; \ +__asm__ __volatile__( \ + "movb %%fs:0(%1,%0),%b2\n\t" \ + "incw %w0\n\t" \ + "movb %%fs:0(%1,%0),%h2\n\t" \ + "incw %w0\n\t" \ + "rorl $16,%2\n\t" \ + "movb %%fs:0(%1,%0),%b2\n\t" \ + "incw %w0\n\t" \ + "movb %%fs:0(%1,%0),%h2\n\t" \ + "incw %w0\n\t" \ + "rorl $16,%2" \ + : "=r" (ptr), "=r" (base), "=q" (__res) \ + : "0" (ptr), "1" (base)); \ +__res; }) + +static void do_int(struct vm86_regs *regs, int i, unsigned char * ssp, unsigned long sp) +{ + unsigned short seg = get_fs_word((void *) ((i<<2)+2)); + + if (seg == BIOSSEG || regs->cs == BIOSSEG || + is_revectored(i, &current->vm86_info->int_revectored)) + return_to_32bit(regs, VM86_INTx + (i << 8)); + if (i==0x21 && is_revectored(AH(regs),&current->vm86_info->int21_revectored)) + return_to_32bit(regs, VM86_INTx + (i << 8)); + pushw(ssp, sp, get_vflags(regs)); + pushw(ssp, sp, regs->cs); + pushw(ssp, sp, IP(regs)); + regs->cs = seg; + SP(regs) -= 6; + IP(regs) = get_fs_word((void *) (i<<2)); + clear_TF(regs); + clear_IF(regs); + return; +} + +void handle_vm86_debug(struct vm86_regs * regs, long error_code) +{ +#if 0 + do_int(regs, 1, (unsigned char *) (regs->ss << 4), SP(regs)); +#else + if (current->flags & PF_PTRACED) + current->blocked &= ~(1 << (SIGTRAP-1)); + send_sig(SIGTRAP, current, 1); + current->tss.trap_no = 1; + current->tss.error_code = error_code; +#endif +} + +void handle_vm86_fault(struct vm86_regs * regs, long error_code) +{ + unsigned char *csp, *ssp; + unsigned long ip, sp; + + csp = (unsigned char *) (regs->cs << 4); + ssp = (unsigned char *) (regs->ss << 4); + sp = SP(regs); + ip = IP(regs); + + switch (popb(csp, ip)) { + + /* operand size override */ + case 0x66: + switch (popb(csp, ip)) { + + /* pushfd */ + case 0x9c: + SP(regs) -= 4; + IP(regs) += 2; + pushl(ssp, sp, get_vflags(regs)); + return; + + /* popfd */ + case 0x9d: + SP(regs) += 4; + IP(regs) += 2; + set_vflags_long(popl(ssp, sp), regs); + return; + } + + /* pushf */ + case 0x9c: + SP(regs) -= 2; + IP(regs)++; + pushw(ssp, sp, get_vflags(regs)); + return; + + /* popf */ + case 0x9d: + SP(regs) += 2; + IP(regs)++; + set_vflags_short(popw(ssp, sp), regs); + return; + + /* int 3 */ + case 0xcc: + IP(regs)++; + do_int(regs, 3, ssp, sp); + return; + + /* int xx */ + case 0xcd: + IP(regs) += 2; +
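do_int() above performs textbook real-mode dispatch: vector i is a 4-byte offset:segment pair stored at linear address i*4, and seg:off addresses linear seg*16+off. A sketch of those lookups against a hypothetical flat memory array:

#include <stdio.h>
#include <stdint.h>

static uint8_t mem[1 << 20];   /* flat model of the first megabyte */

static uint32_t linear(uint16_t seg, uint16_t off)
{
    return seg * 16u + off;
}

int main(void)
{
    /* Plant a hypothetical handler 0xc000:0x1234 in IVT slot 0x10. */
    uint16_t off = 0x1234, seg = 0xc000;
    unsigned i = 0x10;

    mem[i*4 + 0] = off & 0xff;  mem[i*4 + 1] = off >> 8;
    mem[i*4 + 2] = seg & 0xff;  mem[i*4 + 3] = seg >> 8;

    /* The two get_fs_word() reads in do_int(): (i<<2) and (i<<2)+2. */
    uint16_t ip = mem[i*4 + 0] | (mem[i*4 + 1] << 8);
    uint16_t cs = mem[i*4 + 2] | (mem[i*4 + 3] << 8);

    printf("int 0x%02x -> %04x:%04x (linear %#x)\n",
           i, cs, ip, (unsigned) linear(cs, ip));
    return 0;
}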
do_int(regs, popb(csp, ip), ssp, sp); + return; + + /* iret */ + case 0xcf: + SP(regs) += 6; + IP(regs) = popw(ssp, sp); + regs->cs = popw(ssp, sp); + set_vflags_short(popw(ssp, sp), regs); + return; + + /* cli */ + case 0xfa: + IP(regs)++; + clear_IF(regs); + return; + + /* sti */ + /* + * Damn. This is incorrect: the 'sti' instruction should actually + * enable interrupts after the /next/ instruction. Not good. + * + * Probably needs some horsing around with the TF flag. Aiee.. + */ + case 0xfb: + IP(regs)++; + set_IF(regs); + return; + + default: + return_to_32bit(regs, VM86_UNKNOWN); + } +} diff --git a/arch/mips/Makefile b/arch/mips/Makefile new file mode 100644 index 000000000..0dc133749 --- /dev/null +++ b/arch/mips/Makefile @@ -0,0 +1,71 @@ +# +# Makefile.mips +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1994 by Waldorf GMBH, +# written by Ralf Baechle +# + +AS = mips-linux-as +ASFLAGS = -mips3 -mcpu=r4000 +LD = mips-linux-ld +HOSTCC = gcc +CC = mips-linux-gcc -V 2.5.8 -Wa,-mips3 -mcpu=r4000 -D__KERNEL__ -I$(TOPDIR)/include +#CC = mips-linux-gcc -V 2.6.2 -Wa,-mips3 -mcpu=r4600 -D__KERNEL__ -I$(TOPDIR)/include +MAKE = make +CPP = $(CC) -E +AR = mips-linux-ar +RANLIB = mips-linux-ranlib +STRIP = strip +KERNELHDRS = /home/ralf/src/linux + +zBoot/zSystem: zBoot/*.c zBoot/*.S tools/zSystem + $(MAKE) -C zBoot + +zImage: $(CONFIGURE) tools/zSystem + cp tools/System zImage + sync + +#zImage: $(CONFIGURE) zBoot/zSystem tools/build +# tools/build zBoot/zSystem $(ROOT_DEV) > zImage +# sync + +zdisk: zImage + mcopy -n zImage a:vmlinux + +tools/zSystem: boot/head.o init/main.o init/init.o tools/version.o linuxsubdirs + $(LD) $(LOWLDFLAGS) boot/head.o init/main.o init/init.o \ + tools/version.o \ + $(ARCHIVES) \ + $(FILESYSTEMS) \ + $(DRIVERS) \ + $(LIBS) \ + -N -Ttext 0x80000000 \ + -o tools/System + nm tools/System | grep -v '\(compiled\)\|\(\.o$$\)\|\( a \)' | \ + sort > System.map + +#tools/system: boot/head.o init/main.o init/init.o tools/version.o linuxsubdirs +# $(LD) $(LOWLDFLAGS) boot/head.o init/main.o tools/version.o \ +# $(ARCHIVES) \ +# $(FILESYSTEMS) \ +# $(DRIVERS) \ +# $(LIBS) \ +# -N -Ttext 0x80000000 \ +# -o tools/system +# nm tools/zSystem | grep -v '\(compiled\)\|\(\.o$$\)\|\( a \)' | \ +# sort > System.map + +#tools/zSystem: boot/head.o init/main.o tools/version.o linuxsubdirs +# $(LD) $(HIGHLDFLAGS) boot/head.o init/main.o tools/version.o \ +# $(ARCHIVES) \ +# $(FILESYSTEMS) \ +# $(DRIVERS) \ +# $(LIBS) \ +# -N -Ttext 0x80600000 \ +# -o tools/zSystem +# nm tools/zSystem | grep -v '\(compiled\)\|\(\.o$$\)\|\( a \)' | \ +# sort > zSystem.map diff --git a/arch/mips/bios32.c b/arch/mips/bios32.c new file mode 100644 index 000000000..bb011a852 --- /dev/null +++ b/arch/mips/bios32.c @@ -0,0 +1,8 @@ +/* + * bios32.c - BIOS32, PCI BIOS functions. + * + * Copyright (C) 1994 by Waldorf GMBH, + * written by Ralf Baechle + * + * Just nothing for a MIPS board... + */ diff --git a/arch/mips/boot/head.S b/arch/mips/boot/head.S new file mode 100644 index 000000000..73e9d2cf9 --- /dev/null +++ b/arch/mips/boot/head.S @@ -0,0 +1,387 @@ +/* + * mips/head.S + * + * Copyright (C) 1994 Ralf Baechle + * + * Head.S contains the MIPS 32-bit startup code. 
+ */ + +/* + * prevent prototypes from being imported + */ +#define __ASSEMBLY__ + +#include <asm/segment.h> +#include <asm/cachectl.h> +#include <asm/mipsregs.h> +#include <asm/mipsconfig.h> +#include <asm/stackframe.h> +#include <asm/regdef.h> +#include <linux/tasks.h> + + + .globl _empty_bad_page + .globl _empty_bad_page_table + .globl _invalid_pg_table + .globl _empty_zero_page + .globl _tmp_floppy_area + .globl _floppy_track_buffer + .globl _swapper_pg_dir + + .set noreorder + + .text +/* + * This is space for the interrupt handlers. + * They are located at virtual address 0x80000000 (physical 0x0) + */ + /* + * TLB refill, EXL == 0 + */ +except_vec0: + .set noreorder + .set noat + /* + * This TLB-refill handler is supposed never to cause + * another tlb-refill exception. Unmapped pages will + * cause another type of exception. + */ + dmfc0 k0,CP0_CONTEXT + dsrl k0,k0,1 + lwu k0,(k1) + lwu k1,4(k1) + dmtc0 k0,CP0_ENTRYLO0 + dmtc0 k0,CP0_ENTRYLO1 + tlbwr + eret + + /* + * XTLB refill, EXL == 0 (X == 64-bit TLB) + */ + .org except_vec0+0x80 +except_vec1: + /* + * Not used yet... + */ + eret + + /* + * Cache Error + */ + .org except_vec1+0x80 +except_vec2: + /* + * Not used yet... + */ + eret + + /* + * General exception vector. + */ + .org except_vec2+0x80 +except_vec3: + SAVE_ALL + mfc0 t0,CP0_STATUS + ori t0,t0,0x1f + xori t0,t0,0x1f + mtc0 t0,CP0_STATUS + .set at + la k0,_exception_handlers + mfc0 k1,CP0_CAUSE + andi k1,k1,0x7c + addu k0,k0,k1 + lw k0,(k0) + FILL_LDS + jr k0 + nop + + +/******************************************************************************/ + + /* + * The following data is expected to be at certain absolute + * addresses, which are hardwired in + * include/asm-mips/mipsconfig.h + * If the following offset is to short, the assembler will + * break with an assertion failure. You then will have to + * increase it and to fix the address in + * include/asm-mips/mipsconfig.h + */ + + .org except_vec3+0x100 + .globl _kernelsp +_kernelsp: .word 0 + +kernel_entry: + +/* + * Flush the TLB + */ + dmtc0 zero,CP0_ENTRYHI + dmtc0 zero,CP0_ENTRYLO0 + dmtc0 zero,CP0_ENTRYLO1 + li t0,NUMBER_OF_TLB_ENTRIES-1 +1: mtc0 t0,CP0_INDEX + tlbwi + bne zero,t0,1b + subu t0,t0,1 + +/* + * Initialize memory management. + * Wire mapping for port i/o space 0xe0000000 -> 0x9000000900000000 + */ + li t0,3 + mtc0 t0,CP0_WIRED + li t0,PM_64K + mtc0 t0,CP0_PAGEMASK + la t3,map0 + ld t1,0(t3) + ld t2,8(t3) + mtc0 zero,CP0_INDEX + dmtc0 t1,CP0_ENTRYHI + dmtc0 t2,CP0_ENTRYLO0 + dmtc0 zero,CP0_ENTRYLO1 /* Invalid page */ + tlbwi + li t0,PM_1M + mtc0 t0,CP0_PAGEMASK + ld t1,16(t3) + ld t2,24(t3) + li t0,1 + mtc0 t0,CP0_INDEX + dmtc0 t1,CP0_ENTRYHI + dmtc0 t2,CP0_ENTRYLO0 + tlbwi + ld t1,32(t3) + ld t2,40(t3) + li t0,2 + mtc0 t0,CP0_INDEX + dmtc0 t1,CP0_ENTRYHI + dmtc0 t2,CP0_ENTRYLO0 + tlbwi + +/* + * We always use 4k pages. Therefore the PageMask register + * is expected to be setup for 4k pages. + */ + li t0,PM_4K + mtc0 t0,CP0_PAGEMASK + +/* + * Clear BSS first so that there are no surprises... + */ + la t0,__edata + la t1,__end + sw zero,(t0) +1: addiu t0,t0,4 + bnel t0,t1,1b + sw zero,(t0) + +/* + * Copy bootup parameters out of the way. First 2kB of + * _empty_zero_page is for boot parameters, second 2kB + * is for the command line. 
+ */ +#if 0 + movl $0x90000,%esi + movl $_empty_zero_page,%edi + movl $512,%ecx + cld + rep + movsl + xorl %eax,%eax + movl $512,%ecx + rep + stosl + cmpw $(CL_MAGIC),CL_MAGIC_ADDR + jne 1f + movl $_empty_zero_page+2048,%edi + movzwl CL_OFFSET,%esi + addl $(CL_BASE_ADDR),%esi + movl $2048,%ecx + rep + movsb +#endif + + /* + * Preliminary stack... + */ + la sp,0x80700000 + sw sp,_kernelsp +6: + jal _start_kernel + nop + j 6b # main should never return here, but + # just in case, we know what happens. + +#if 0 +/* This is the default interrupt "handler" :-) */ +int_msg: + .asciz "Unknown interrupt\n" +.align 2 +ignore_int: + cld + pushl %eax + pushl %ecx + pushl %edx + push %ds + push %es + push %fs + movl $(KERNEL_DS),%eax + mov %ax,%ds + mov %ax,%es + mov %ax,%fs + pushl $int_msg + call _printk + popl %eax + pop %fs + pop %es + pop %ds + popl %edx + popl %ecx + popl %eax + iret +#endif + +#define CACHELINES 512 /* number of cachelines */ + +/* + * Flush instruction/data caches + * + * Parameters: a0 - starting address to flush + * a1 - size of area to be flushed + * a2 - which caches to be flushed + * + * FIXME: - ignores parameters + * - doesn't know about second level caches + */ + + .set noreorder + .globl _cacheflush + .text +_cacheflush: + /* + * Flush the instruction cache + */ + lui t0,0x8000 + li t1,CACHELINES-1 +1: cache 0,0(t0) + cache 0,32(t0) + cache 0,64(t0) + cache 0,96(t0) + cache 0,128(t0) + cache 0,160(t0) + cache 0,192(t0) + cache 0,224(t0) + cache 0,256(t0) + cache 0,288(t0) + cache 0,320(t0) + cache 0,352(t0) + cache 0,384(t0) + cache 0,416(t0) + cache 0,448(t0) + cache 0,480(t0) + addiu t0,t0,512 + bne zero,t1,1b + subu t1,t1,1 + /* + * Flush the data cache + */ + lui t0,0x8000 + li t1,CACHELINES-1 +1: cache 1,0(t0) + cache 1,32(t0) + cache 1,64(t0) + cache 1,96(t0) + cache 1,128(t0) + cache 1,160(t0) + cache 1,192(t0) + cache 1,224(t0) + cache 1,256(t0) + cache 1,288(t0) + cache 1,320(t0) + cache 1,352(t0) + cache 1,384(t0) + cache 1,416(t0) + cache 1,448(t0) + cache 1,480(t0) + addiu t0,t0,512 + bne zero,t1,1b + subu t1,t1,1 + + j ra + nop + + .globl _beep +_beep: lbu t0,0xe0000061 + xori t0,t0,3 + sb t0,0xe0000061 + jr ra + nop + +/* + * Instead of Intel's strage and unportable segment descriptor magic + * we difference user and kernel space by their address. + * Kernel space (== physical memory) is mapped at 0x80000000, + * User space is mapped at 0x0. + */ + + .data + + .globl _segment_fs + /* + * Inital wired mappings + */ +map0: .quad 0xc00000ffe0000000,0x24000017 + .quad 0xc00000ffe1000000,0x04000017 + .quad 0xc00000ffe2000000,0x04020017 + +/* + * page 0 is made non-existent, so that kernel NULL pointer references get + * caught. Thus the swapper page directory has been moved to 0x1000 + * + * XXX Actually, the swapper page directory is at 0x1000 plus 1 megabyte, + * with the introduction of the compressed boot code. Theoretically, + * the original design of overlaying the startup code with the swapper + * page directory is still possible --- it would reduce the size of the kernel + * by 2-3k. This would be a good thing to do at some point..... + */ + .text + + .org 0x1000 +_swapper_pg_dir: +/* + * The page tables are initialized to only 4MB here - the final page + * tables are set up later depending on memory size. 
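For orientation, the unrolled _cacheflush loop above can be modelled in C: CACHELINES passes, each issuing sixteen index-type cache ops across a 512-byte stride, 8192 line operations in all; as its own FIXME notes, the routine ignores its arguments. A sketch of the iteration structure only:

#include <stdio.h>

#define CACHELINES 512   /* as in head.S above */
#define STRIDE     512   /* sixteen 32-byte lines per unrolled pass */

int main(void)
{
    unsigned long addr = 0x80000000UL;   /* KSEG0, as the asm uses */
    unsigned ops = 0;

    for (int i = CACHELINES - 1; i >= 0; i--) {   /* bne/subu countdown */
        for (int off = 0; off < STRIDE; off += 32)
            ops++;   /* stands in for one "cache 0/1, off(t0)" op */
        addr += STRIDE;
    }
    printf("%u line ops, %lu bytes covered\n", ops, addr - 0x80000000UL);
    return 0;
}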
+ */ + .org 0x2000 +_pg0: + + .org 0x3000 +_empty_bad_page: + + .org 0x4000 +_empty_bad_page_table: + + .org 0x5000 +_invalid_pg_table: + + .org 0x6000 +_empty_zero_page: + + .org 0x7000 + +/* + * tmp_floppy_area is used by the floppy-driver when DMA cannot + * reach to a buffer-block. It needs to be aligned, so that it isn't + * on a 64kB border. + */ +_tmp_floppy_area: .fill 1024,1,0 +/* + * floppy_track_buffer is used to buffer one track of floppy data: it + * has to be separate from the tmp_floppy area, as otherwise a single- + * sector read/write can mess it up. It can contain one full cylinder (sic) of + * data (36*2*512 bytes). + */ +_floppy_track_buffer: .fill 512*2*36,1,0 + +_segment_fs: .word KERNEL_DS diff --git a/arch/mips/config.in b/arch/mips/config.in new file mode 100644 index 000000000..24f0c13f9 --- /dev/null +++ b/arch/mips/config.in @@ -0,0 +1,214 @@ +# +# For a description of the syntax of this configuration file, +# see the Configure script. +# + +comment 'General setup' + +bool 'Normal floppy disk support' CONFIG_BLK_DEV_FD n +bool 'Normal harddisk support' CONFIG_BLK_DEV_HD n +bool 'XT harddisk support' CONFIG_BLK_DEV_XD n +bool 'Networking support' CONFIG_NET n +bool 'System V IPC' CONFIG_SYSVIPC n +bool 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF n + +if [ "$CONFIG_NET" = "y" ]; then +comment 'Networking options' +bool 'TCP/IP networking' CONFIG_INET n +if [ "$CONFIG_INET" "=" "y" ]; then +bool 'IP forwarding/gatewaying' CONFIG_IP_FORWARD y +comment '(it is safe to leave these untouched)' +bool 'PC/TCP compatibility mode' CONFIG_INET_PCTCP n +bool 'Reverse ARP' CONFIG_INET_RARP n +bool 'Assume subnets are local' CONFIG_INET_SNARL y +bool 'Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF n +fi +bool 'The IPX protocol' CONFIG_IPX n +#bool 'Amateur Radio AX.25 Level 2' CONFIG_AX25 n +fi + +comment 'SCSI support' + +bool 'SCSI support?' CONFIG_SCSI n + +if [ "$CONFIG_SCSI" = "n" ]; then + +comment 'Skipping SCSI configuration options...' + +else + +comment 'SCSI support type (disk, tape, CDrom)' + +bool 'Scsi disk support' CONFIG_BLK_DEV_SD y +bool 'Scsi tape support' CONFIG_CHR_DEV_ST y +bool 'Scsi CDROM support' CONFIG_BLK_DEV_SR y +bool 'Scsi generic support' CONFIG_CHR_DEV_SG y + +comment 'SCSI low-level drivers' + +bool 'Adaptec AHA152X support' CONFIG_SCSI_AHA152X n +bool 'Adaptec AHA1542 support' CONFIG_SCSI_AHA1542 y +bool 'Adaptec AHA1740 support' CONFIG_SCSI_AHA1740 n +bool 'BusLogic SCSI support' CONFIG_SCSI_BUSLOGIC n +bool 'Future Domain 16xx SCSI support' CONFIG_SCSI_FUTURE_DOMAIN n +bool 'Generic NCR5380 SCSI support' CONFIG_SCSI_GENERIC_NCR5380 n +bool 'NCR53c7,8xx SCSI support' CONFIG_SCSI_NCR53C7xx n +bool 'Always IN2000 SCSI support (test release)' CONFIG_SCSI_IN2000 n +bool 'PAS16 SCSI support' CONFIG_SCSI_PAS16 n +bool 'QLOGIC SCSI support' CONFIG_SCSI_QLOGIC n +bool 'Seagate ST-02 and Future Domain TMC-8xx SCSI support' CONFIG_SCSI_SEAGATE n +bool 'Trantor T128/T128F/T228 SCSI support' CONFIG_SCSI_T128 n +bool 'UltraStor SCSI support' CONFIG_SCSI_ULTRASTOR n +bool '7000FASST SCSI support' CONFIG_SCSI_7000FASST n +bool 'EISA EATA support' CONFIG_SCSI_EATA n +#bool 'SCSI debugging host adapter' CONFIG_SCSI_DEBUG n +fi + + +if [ "$CONFIG_NET" = "y" ]; then + +comment 'Network device support' + +bool 'Network device support?' CONFIG_NETDEVICES y +if [ "$CONFIG_NETDEVICES" = "n" ]; then + +comment 'Skipping network driver configuration options...' 
+ +else +bool 'Dummy net driver support' CONFIG_DUMMY n +bool 'SLIP (serial line) support' CONFIG_SLIP n +if [ "$CONFIG_SLIP" = "y" ]; then + bool ' CSLIP compressed headers' SL_COMPRESSED y +# bool ' SLIP debugging on' SL_DUMP y +fi +bool 'PPP (point-to-point) support' CONFIG_PPP n +bool 'PLIP (parallel port) support' CONFIG_PLIP n +bool 'Load balancing support (experimental)' CONFIG_SLAVE_BALANCING n +bool 'Do you want to be offered ALPHA test drivers' CONFIG_NET_ALPHA n +bool 'Western Digital/SMC cards' CONFIG_NET_VENDOR_SMC y +if [ "$CONFIG_NET_VENDOR_SMC" = "y" ]; then + bool 'WD80*3 support' CONFIG_WD80x3 y + bool 'SMC Ultra support' CONFIG_ULTRA n +fi +bool '3COM cards' CONFIG_NET_VENDOR_3COM y +if [ "$CONFIG_NET_VENDOR_3COM" = "y" ]; then + bool '3c501 support' CONFIG_EL1 n + bool '3c503 support' CONFIG_EL2 n + if [ "$CONFIG_NET_ALPHA" = "y" ]; then + bool '3c505 support' CONFIG_ELPLUS n + bool '3c507 support' CONFIG_EL16 n + fi + bool '3c509/3c579 support' CONFIG_EL3 n +fi +bool 'Other ISA cards' CONFIG_NET_ISA n +if [ "$CONFIG_NET_ISA" = "y" ]; then + bool 'AT1500 and NE2100 (LANCE and PCnet-ISA) support' CONFIG_LANCE n + bool 'Cabletron E21xx support (not recommended)' CONFIG_E2100 n + bool 'DEPCA support' CONFIG_DEPCA n + bool 'EtherWorks 3 support' CONFIG_EWRK3 n + if [ "$CONFIG_NET_ALPHA" = "y" ]; then + bool 'EtherExpress support' CONFIG_EEXPRESS n + bool 'AT1700 support' CONFIG_AT1700 n + fi + bool 'HP PCLAN support' CONFIG_HPLAN n + bool 'HP PCLAN PLUS support' CONFIG_HPLAN_PLUS n + bool 'NE2000/NE1000 support' CONFIG_NE2000 y + bool 'SK_G16 support' CONFIG_SK_G16 n +fi +bool 'EISA and on board controllers' CONFIG_NET_EISA n + if [ "$CONFIG_NET_ALPHA" = "y" ]; then + bool 'Ansel Communications EISA 3200 support' CONFIG_AC3200 n + fi + bool 'Apricot Xen-II on board ethernet' CONFIG_APRICOT n +#bool 'NI52EE support' CONFIG_NI52 n +#bool 'NI65EE support' CONFIG_NI65 n +bool 'Pocket and portable adaptors' CONFIG_NET_POCKET n +if [ "$CONFIG_NET_POCKET" = "y" ]; then + bool 'D-Link DE600 pocket adaptor support' CONFIG_DE600 n + bool 'D-Link DE620 pocket adaptor support' CONFIG_DE620 n + bool 'AT-LAN-TEC/RealTek pocket adaptor support' CONFIG_ATP n + bool 'Zenith Z-Note support' CONFIG_ZNET n +fi +fi +fi + +comment 'CD-ROM drivers' + +bool 'Sony CDU31A/CDU33A CDROM driver support' CONFIG_CDU31A n +bool 'Mitsumi CDROM driver support' CONFIG_MCD n +bool 'Matsushita/Panasonic CDROM driver support' CONFIG_SBPCD n +if [ "$CONFIG_SBPCD" = "y" ]; then + bool 'Matsushita/Panasonic second CDROM controller support' CONFIG_SBPCD2 n + if [ "$CONFIG_SBPCD2" = "y" ]; then + bool 'Matsushita/Panasonic third CDROM controller support' CONFIG_SBPCD3 n + if [ "$CONFIG_SBPCD3" = "y" ]; then + bool 'Matsushita/Panasonic fourth CDROM controller support' CONFIG_SBPCD4 n + fi + fi +fi + +comment 'Filesystems' + +bool 'Standard (minix) fs support' CONFIG_MINIX_FS y +bool 'Extended fs support' CONFIG_EXT_FS n +bool 'Second extended fs support' CONFIG_EXT2_FS n +bool 'xiafs filesystem support' CONFIG_XIA_FS n +bool 'msdos fs support' CONFIG_MSDOS_FS n +if [ "$CONFIG_MSDOS_FS" = "y" ]; then +bool 'umsdos: Unix like fs on top of std MSDOS FAT fs' CONFIG_UMSDOS_FS n +fi +bool '/proc filesystem support' CONFIG_PROC_FS n +if [ "$CONFIG_INET" = "y" ]; then +bool 'NFS filesystem support' CONFIG_NFS_FS y +fi +if [ "$CONFIG_BLK_DEV_SR" = "y" -o "$CONFIG_CDU31A" = "y" -o "$CONFIG_MCD" = "y" -o "$CONFIG_SBPCD" = "y" -o "$CONFIG_BLK_DEV_IDECD" = "y" ]; then + bool 'ISO9660 cdrom filesystem support' CONFIG_ISO9660_FS y 
+else
+  bool 'ISO9660 cdrom filesystem support' CONFIG_ISO9660_FS n
+fi
+bool 'OS/2 HPFS filesystem support (read only)' CONFIG_HPFS_FS n
+bool 'System V and Coherent filesystem support' CONFIG_SYSV_FS n
+
+comment 'character devices'
+
+bool 'Parallel printer support' CONFIG_PRINTER n
+bool 'Logitech busmouse support' CONFIG_BUSMOUSE n
+bool 'PS/2 mouse (aka "auxiliary device") support' CONFIG_PSMOUSE n
+if [ "$CONFIG_PSMOUSE" = "y" ]; then
+bool 'C&T 82C710 mouse port support (as on TI Travelmate)' CONFIG_82C710_MOUSE n
+fi
+bool 'Microsoft busmouse support' CONFIG_MS_BUSMOUSE n
+bool 'ATIXL busmouse support' CONFIG_ATIXL_BUSMOUSE n
+bool 'Selection (cut and paste for virtual consoles)' CONFIG_SELECTION n
+
+bool 'QIC-02 tape support' CONFIG_QIC02_TAPE n
+if [ "$CONFIG_QIC02_TAPE" = "y" ]; then
+bool 'Do you want runtime configuration for QIC-02' CONFIG_QIC02_DYNCONF n
+if [ "$CONFIG_QIC02_DYNCONF" != "y" ]; then
+
+comment '>>> Edit configuration parameters in ./include/linux/tpqic02.h!'
+
+else
+
+comment '>>> Setting runtime QIC-02 configuration is done with qic02conf'
+comment '>>> Which is available from ftp://ftp.funet.fi/pub/OS/Linux/BETA/QIC-02/'
+
+fi
+fi
+
+bool 'QIC-117 tape support' CONFIG_FTAPE n
+if [ "$CONFIG_FTAPE" = "y" ]; then
+int ' number of ftape buffers' NR_FTAPE_BUFFERS 3
+fi
+
+comment 'Sound'
+
+bool 'Sound card support' CONFIG_SOUND n
+
+comment 'Kernel hacking'
+
+#bool 'Debug kmalloc/kfree' CONFIG_DEBUG_MALLOC n
+bool 'Kernel profiling support' CONFIG_PROFILE n
+if [ "$CONFIG_SCSI" = "y" ]; then
+bool 'Verbose scsi error reporting (kernel size +=12K)' CONFIG_SCSI_CONSTANTS y
+fi
diff --git a/arch/mips/dummy.c b/arch/mips/dummy.c
new file mode 100644
index 000000000..b85a1d71e
--- /dev/null
+++ b/arch/mips/dummy.c
@@ -0,0 +1,17 @@
+/*
+ * This file handles system calls not available for all CPUs.
+ *
+ * Written by Ralf Baechle,
+ * Copyright (C) 1994 by Waldorf GMBH
+ */
+
+unsigned long paging_init(unsigned long start_mem, unsigned long end_mem)
+{
+	printk("paging_init\n");
+	return start_mem;
+}
+
+void fake_keyboard_interrupt(void)
+{
+/*	printk("fake_keyboard_interrupt\n"); */
+}
diff --git a/arch/mips/entry.S b/arch/mips/entry.S
new file mode 100644
index 000000000..ebf2c1d9c
--- /dev/null
+++ b/arch/mips/entry.S
@@ -0,0 +1,665 @@
+/*
+ * linux/kernel/mips/sys_call.S
+ *
+ * Copyright (C) 1994 Waldorf GMBH
+ * written by Ralf Baechle
+ */
+
+/*
+ * sys_call.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ */
+
+#define __ASSEMBLY__
+
+#include <linux/sys.h>
+#include <asm/segment.h>
+#include <asm/mipsregs.h>
+#include <asm/mipsconfig.h>
+#include <asm/stackframe.h>
+#include <asm/regdef.h>
+
+/*
+ * These are offsets into the task-struct.
+ */
+state		= 0
+counter		= 4
+priority	= 8
+signal		= 12
+blocked		= 16
+flags		= 20
+errno		= 24	#/* MIPS OK */
+exec_domain	= 60	#/* ???
*/ + +ENOSYS = 38 + + .globl _system_call + .globl _lcall7 + .globl _device_not_available + .globl _coprocessor_error + .globl _divide_error + .globl _debug + .globl _nmi + .globl _int3 + .globl _overflow + .globl _bounds + .globl _invalid_op + .globl _double_fault + .globl _coprocessor_segment_overrun + .globl _invalid_TSS + .globl _segment_not_present + .globl _stack_segment + .globl _general_protection + .globl _reserved + .globl _alignment_check + .globl _page_fault + .globl ret_from_sys_call + .globl _sys_call_table + + .text + .set noreorder + .align 4 +handle_bottom_half: + lw s0,_intr_count + addiu s1,s0,1 + sw s1,_intr_count + mfc0 t0,CP0_STATUS # Enable IRQs + ori t0,t0,7 + xori t0,t0,6 + jal _do_bottom_half + mtc0 t0,CP0_STATUS + j 9f + sw s1,_intr_count + + .set reorder + .align 4 +reschedule: + la ra,ret_from_sys_call + j _schedule + + .set noreorder + .align 4 +_system_call: + li t1,NR_syscalls + bge t0,t1,ret_from_sys_call + .set nomacro + li t2,-ENOSYS # must be single instruction! + .set macro + lui t1,_sys_call_table + sll t0,t0,2 + addu t1,t0,t1 + lw t0,_sys_call_table(t1) + lw s0,_current + + beq zero,t0,ret_from_sys_call + lw t0,flags(s0) + sll t0,t0,2 # PF_TRACESYS + bltz t0,1f + sw zero,errno(s0) # delay slot + + jal t0 # do the real work + nop # fillme: delay slot + + sw v0,FR_REG2(sp) # save the return value + lw v0,errno(s0) + beq zero,v0,ret_from_sys_call + subu v0,zero,v0 # v0 = -v0 + # fixme: indicate error + j ret_from_sys_call + sw v0,FR_REG2(sp) + + .align 4 +1: jal _syscall_trace + nop +#if 0 + movl ORIG_EAX(%esp),%eax + call _sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # save the return value + movl _current,%eax + movl errno(%eax),%edx + negl %edx + je 1f + movl %edx,EAX(%esp) + orl $(CF_MASK),EFLAGS(%esp) # set carry to indicate error +#endif +1: jal _syscall_trace + nop + + .align 4 +ret_from_sys_call: + lw t0,_intr_count # bottom half + bne zero,t0,2f + + lw t0,_bh_mask + lw t1,_bh_active + and t0,t0,t1 + bne zero,t0,handle_bottom_half +9: + mfc0 t0,CP0_STATUS # returning to supervisor ? + andi t0,t0,30 + subu t0,t0,6 + bltz t0,2f + +1: +#if 0 +/* + * Try whether this is needed or not... + */ + mfc0 t0,CP0_STATUS # enable irqs + ori t0,t0,0x7 + xori t0,t0,0x6 + mtc0 t0,CP0_STATUS +#endif + + lw t0,_need_resched + bne zero,t0,reschedule + + lw t0,_current + la t1,_task # task[0] cannot have signals + lw t2,state(s0) # state + beq t0,t1,2f + lw t0,counter(s0) # counter + beq zero,t2,reschedule # state == 0 ? + lw a0,blocked(s0) + # save blocked in a0 for + # signal handling + beq zero,t0,reschedule # counter == 0 ? 
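+	# blocked was saved in a0 above because _do_signal takes the old
+	# signal mask as its first argument; a1 is set to sp in the delay
+	# slot below, so the handler also receives the saved register frame.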
+ lw t0,signal(s0) + nor t1,zero,t0 + and t1,a0,t1 + beq zero,t1,skip_signal_return + nop +2: + jal _do_signal + move a1,sp + +skip_signal_return: + .set noreorder + .set noat +return: RESTORE_ALL + .set at + +/* + * Assumptions for _handle_int: + * - only bank a or b are possible interrupt sources + */ + .globl _handle_int +_handle_int: + .set noreorder + .text + la s0,PORT_BASE + li t1,0x0f + sb t1,0x20(s0) # poll command + lb t1,0x20(s0) # read result + FILL_LDS + bgtz t1,poll_second + andi t1,t1,7 + /* + * Acknowledge first pic + */ + lb t2,0x21(s0) + li s1,1 + sllv s1,s1,t1 + lb t4,_cache_21 + or t4,t4,s1 + sb t4,_cache_21 + sb t4,0x21(s0) + li t4,0x20 + sb t4,0x20(s0) + lw t0,_intr_count + addiu t0,t0,1 + sw t0,_intr_count + /* + * Now call the real handler + */ + la t0,_IRQ_vectors + sll t2,t1,2 + addu t0,t0,t2 + lw t0,(t0) + FILL_LDS + jalr t0 + nop + lw t0,_intr_count + subu t0,t0,1 + sw t0,_intr_count + /* + * Unblock first pic + */ +test1: lbu t1,0x21(s0) # tlbl exception?!? + lb t1,_cache_21 + nor s1,zero,s1 + and t1,t1,s1 + sb t1,_cache_21 + jr v0 + sb t1,0x21(s0) # delay slot + + .set at +poll_second: + li t1,0x0f + sb t1,0xa0(s0) # poll command + lb t1,0xa0(s0) # read result + FILL_LDS + bgtz t1,spurious_interrupt + andi t1,t1,7 + /* + * Acknowledge second pic + */ + lbu t2,0xa1(s0) + lbu t3,_cache_A1 + li s1,1 + sllv s1,s1,t1 + or t3,t3,s1 + sb t3,_cache_A1 + sb t3,0xa1(s0) + li t3,0x20 + sb t3,0xa0(s0) + lw t0,_intr_count + sb t3,0x20(s0) + addiu t0,t0,1 + sw t0,_intr_count + /* + * Now call the real handler + */ + la t0,_IRQ_vectors + sll t2,t1,2 + addu t0,t0,t2 + lw t0,32(t0) + FILL_LDS + jalr t0 + nop + lw t0,_intr_count + subu t0,t0,1 + sw t0,_intr_count + /* + * Unblock second pic + */ + lbu t1,0xa1(s0) + lb t1,_cache_A1 + nor s1,zero,s1 + and t1,t1,s1 + sb t1,_cache_A1 + jr v0 + sb t1,0xa1(s0) # delay slot + + .set at +spurious_interrupt: + /* + * Nothing happend... 
(whistle) + */ + lw t0,_spurious_count + la v0,return + addiu t0,t0,1 + sw t0,_spurious_count + jr ra + nop + + .globl _IRQ +_IRQ: move s2,ra + mfc0 t0,CP0_STATUS + ori t0,t0,0x1f + xori t0,t0,0x1e + mtc0 t0,CP0_STATUS + move a1,sp + jal _do_IRQ + move a0,t1 # Delay slot + mfc0 t0,CP0_STATUS + ori t0,t0,1 + xori t0,t0,1 + la v0,ret_from_sys_call + jr s2 + mtc0 t0,CP0_STATUS # Delay slot + + .globl _fast_IRQ +_fast_IRQ: move s2,ra + move a1,sp + jal _do_fast_IRQ + move a0,t1 # Delay slot + la v0,return + jr s2 + nop + + .globl _bad_IRQ +_bad_IRQ: + /* + * Don't return & unblock the pic + */ + j return + nop + + .bss + .globl _IRQ_vectors + +_IRQ_vectors: + .fill 16,4,0 + +/* + * Dummy handlers + */ + .text + .set noreorder + .set at + + .globl _handle_mod +_handle_mod: + la a0,mod_text + j _panic + nop + + .globl _handle_tlbl +_handle_tlbl: + la a0,badvaddr + mfc0 a1,CP0_BADVADDR + jal _printk + nop + la a0,status + lw a1,FR_STATUS(sp) + jal _printk + nop + la a0,eszero + move a1,s0 + jal _printk + nop + la a0,espe + move a1,sp + jal _printk + nop + la a0,jifftext + lw a1,_jiffies + jal _printk + nop + la a0,inttext + lw a1,_intr_count + jal _printk + nop + la a0,tlbl_msg + mfc0 a1,CP0_EPC + jal _printk + nop + la a0,tlbl_text + j _panic + nop + + .data +tlbl_msg: .asciz "tlbl exception at %x\n" +badvaddr: .asciz "accessing %x\n" +status: .asciz "cp0_status %x\n" +eszero: .asciz "s0 %x\n" +espe: .asciz "sp %x\n" +jifftext: .asciz "jiffies %d\n" +inttext: .asciz "IntNest: %d\n" + + .text + .globl _handle_tlbs +_handle_tlbs: + la a0,tlbs_text + j _panic + nop + + .globl _handle_adel +_handle_adel: + la v0,adel_text + jal _printk + nop + j _handle_tlbl + la a0,adel_text + j _panic + nop + + .globl _handle_ades +_handle_ades: + la a0,ades_text + j _panic + nop + + .globl _handle_ibe +_handle_ibe: + la a0,ibe_text + j _panic + nop + + .globl _handle_dbe +_handle_dbe: + la a0,dbe_text + j _panic + nop + + .globl _handle_sys +_handle_sys: + la a0,sys_text + j _panic + nop + + .globl _handle_bp +_handle_bp: + la a0,bp_text + j _panic + nop + + .globl _handle_ri +_handle_ri: + la a0,ri_text + j _panic + nop + + .globl _handle_cpu +_handle_cpu: + la a0,cpu_text + j _panic + nop + + .globl _handle_ov +_handle_ov: + la a0,ov_text + j _panic + nop + + .globl _handle_tr +_handle_tr: + la a0,tr_text + j _panic + nop + + .globl _handle_reserved +_handle_reserved: + la a0,reserved_text + j _panic + nop + + .globl _handle_fpe +_handle_fpe: + la a0,fpe_text + j _panic + nop + + .data +spurious_text: .asciz "Spurious interrupt" +fpe_text: .asciz "fpe exception" +reserved_text: .asciz "reserved exception" +tr_text: .asciz "tr exception" +ov_text: .asciz "ov exception" +cpu_text: .asciz "cpu exception" +ri_text: .asciz "ri exception" +bp_text: .asciz "bp exception" +sys_text: .asciz "sys exception" +dbe_text: .asciz "dbe exception" +ibe_text: .asciz "ibe exception" +ades_text: .asciz "ades exception" +adel_text: .asciz "adel exception" +tlbs_text: .asciz "tlbs exception" +mod_text: .asciz "mod exception" +tlbl_text: .asciz "tlbl exception" + +/* + * Exception handler table, 256 entries. 
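+ * The table is indexed by the ExcCode field of the CP0 Cause register;
+ * masking Cause with 0x3fc (as the dispatch code does) yields ExcCode
+ * already scaled to a word offset, so no extra shift is needed.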
+ */ + .data + .globl _exception_handlers +_exception_handlers: + .word _handle_int /* 0 */ + .word _handle_mod + .word _handle_tlbl + .word _handle_tlbs + .word _handle_adel + .word _handle_ades + .word _handle_ibe + .word _handle_dbe + .word _handle_sys + .word _handle_bp + .word _handle_ri + .word _handle_cpu + .word _handle_ov + .word _handle_tr + .word _handle_reserved + .word _handle_fpe /* 15 */ +#if 0 + .fill 240,4,_handle_reserved +#endif + +/* + * Table of syscalls + */ + .data +_sys_call_table: + .word _sys_setup /* 0 */ + .word _sys_exit + .word _sys_fork + .word _sys_read + .word _sys_write + .word _sys_open /* 5 */ + .word _sys_close + .word _sys_waitpid + .word _sys_creat + .word _sys_link + .word _sys_unlink /* 10 */ + .word _sys_execve + .word _sys_chdir + .word _sys_time + .word _sys_mknod + .word _sys_chmod /* 15 */ + .word _sys_chown + .word _sys_break + .word _sys_stat + .word _sys_lseek + .word _sys_getpid /* 20 */ + .word _sys_mount + .word _sys_umount + .word _sys_setuid + .word _sys_getuid + .word _sys_stime /* 25 */ + .word _sys_ptrace + .word _sys_alarm + .word _sys_fstat + .word _sys_pause + .word _sys_utime /* 30 */ + .word _sys_stty + .word _sys_gtty + .word _sys_access + .word _sys_nice + .word _sys_ftime /* 35 */ + .word _sys_sync + .word _sys_kill + .word _sys_rename + .word _sys_mkdir + .word _sys_rmdir /* 40 */ + .word _sys_dup + .word _sys_pipe + .word _sys_times + .word _sys_prof + .word _sys_brk /* 45 */ + .word _sys_setgid + .word _sys_getgid + .word _sys_signal + .word _sys_geteuid + .word _sys_getegid /* 50 */ + .word _sys_acct + .word _sys_phys + .word _sys_lock + .word _sys_ioctl + .word _sys_fcntl /* 55 */ + .word _sys_mpx + .word _sys_setpgid + .word _sys_ulimit + .word _sys_olduname + .word _sys_umask /* 60 */ + .word _sys_chroot + .word _sys_ustat + .word _sys_dup2 + .word _sys_getppid + .word _sys_getpgrp /* 65 */ + .word _sys_setsid + .word _sys_sigaction + .word _sys_sgetmask + .word _sys_ssetmask + .word _sys_setreuid /* 70 */ + .word _sys_setregid + .word _sys_sigsuspend + .word _sys_sigpending + .word _sys_sethostname + .word _sys_setrlimit /* 75 */ + .word _sys_getrlimit + .word _sys_getrusage + .word _sys_gettimeofday + .word _sys_settimeofday + .word _sys_getgroups /* 80 */ + .word _sys_setgroups + .word _sys_select + .word _sys_symlink + .word _sys_lstat + .word _sys_readlink /* 85 */ + .word _sys_uselib + .word _sys_swapon + .word _sys_reboot + .word _sys_readdir + .word _sys_mmap /* 90 */ + .word _sys_munmap + .word _sys_truncate + .word _sys_ftruncate + .word _sys_fchmod + .word _sys_fchown /* 95 */ + .word _sys_getpriority + .word _sys_setpriority + .word _sys_profil + .word _sys_statfs + .word _sys_fstatfs /* 100 */ + .word _sys_ioperm + .word _sys_socketcall + .word _sys_syslog + .word _sys_setitimer + .word _sys_getitimer /* 105 */ + .word _sys_newstat + .word _sys_newlstat + .word _sys_newfstat + .word _sys_uname + .word _sys_iopl /* 110 */ + .word _sys_vhangup + .word _sys_idle + .word _sys_vm86 + .word _sys_wait4 + .word _sys_swapoff /* 115 */ + .word _sys_sysinfo + .word _sys_ipc + .word _sys_fsync + .word _sys_sigreturn + .word _sys_clone /* 120 */ + .word _sys_setdomainname + .word _sys_newuname + .word _sys_modify_ldt + .word _sys_adjtimex + .word _sys_mprotect /* 125 */ + .word _sys_sigprocmask + .word _sys_create_module + .word _sys_init_module + .word _sys_delete_module + .word _sys_get_kernel_syms /* 130 */ + .word _sys_quotactl + .word _sys_getpgid + .word _sys_fchdir + .word _sys_bdflush + .word _sys_sysfs /* 135 
*/ + .word _sys_personality + .word 0 /* for afs_syscall */ + .word _sys_setfsuid + .word _sys_setfsgid + .word _sys_llseek /* 140 */ + .space (NR_syscalls-140)*4 diff --git a/arch/mips/ioport.c b/arch/mips/ioport.c new file mode 100644 index 000000000..ee3352410 --- /dev/null +++ b/arch/mips/ioport.c @@ -0,0 +1,20 @@ +/* + * linux/arch/mips/ioport.c + * + * Functions not implemented for Linux/MIPS + */ +#include <linux/linkage.h> +#include <linux/errno.h> + +asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} + +asmlinkage int sys_iopl(long ebx,long ecx,long edx, + long esi, long edi, long ebp, long eax, long ds, + long es, long fs, long gs, long orig_eax, + long eip,long cs,long eflags,long esp,long ss) +{ + return -ENOSYS; +} diff --git a/arch/mips/irq.S b/arch/mips/irq.S new file mode 100644 index 000000000..129c2843f --- /dev/null +++ b/arch/mips/irq.S @@ -0,0 +1,642 @@ +/* + * linux/kernel/mips/sys_call.S + * + * Copyright (C) 1994 Waldorf GMBH + * written by Ralf Baechle + */ + +/* + * All code below must be relocatable! + */ + +/* + * sys_call.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + */ + +#include <linux/segment.h> +#include <linux/sys.h> + +/* + * Offsets into the Interrupt stackframe. + */ +FR_REG1 = 0 +FR_REG2 = 4 +FR_REG3 = 8 +FR_REG4 = 12 +FR_REG5 = 16 +FR_REG6 = 20 +FR_REG7 = 24 +FR_REG8 = 28 +FR_REG9 = 32 +FR_REG10 = 36 +FR_REG11 = 40 +FR_REG12 = 44 +FR_REG13 = 48 +FR_REG14 = 52 +FR_REG15 = 56 +FR_REG16 = 60 +FR_REG17 = 64 +FR_REG18 = 68 +FR_REG19 = 72 +FR_REG20 = 76 +FR_REG21 = 80 +FR_REG22 = 84 +FR_REG23 = 88 +FR_REG24 = 92 +FR_REG25 = 96 +/* $26 and $27 not saved */ +FR_REG28 = 100 +FR_REG29 = 104 +FR_REG30 = 108 +FR_REG31 = 112 +/* + * Saved cp0 registers follow + */ +FR_STATUS = 116 +FR_EPC = 120 +FR_ERROREPC = 124 +FR_SIZE = 120 /* Size of stack frame */ + +/* + * These are offsets into the task-struct. 
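+ * They must be kept in sync with the layout of struct task_struct in
+ * <linux/sched.h>; with s0 holding the _current pointer, `lw t0,state(s0)'
+ * is the assembly equivalent of current->state.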
+ */ +state = 0 +counter = 4 +priority = 8 +signal = 12 +blocked = 16 +flags = 20 +errno = 24 +dbgreg6 = 52 +dbgreg7 = 56 +exec_domain = 60 + +ENOSYS = 38 + + .globl _system_call,_lcall7 + .globl _device_not_available, _coprocessor_error + .globl _divide_error,_debug,_nmi,_int3,_overflow,_bounds, + .globl _invalid_op,_double_fault,_coprocessor_segment_overrun + .globl _invalid_TSS,_segment_not_present,_stack_segment + .globl _general_protection,_reserved + .globl _alignment_check,_page_fault + .globl ret_from_sys_call, _sys_call_table + +#define SAVE_ALL(which_pc) \ + .set noreorder \ + .set noat \ + lui k0,0x8000 \ + move k1,$sp \ + lw sp,_kernelsp-except_vec0(k0) \ + subu sp,$sp,FR_SIZE \ + sw sp,_kernelsp-except_vec0(k0) \ /* Kernel SP */ + mfc0 v0,CP0_STATUS \ + sw v0,FR_STATUS(sp) \ + mfc0 v0,CP0_EPC \ + sw v0,FR_EPC \ + mfc0 v0,CP0_ERROREPC \ + sw v0,FR_ERROREPC \ + sw k1,FR_R27(sp) \ + sw $2,FR_R1(sp) \ + sw $2,FR_R2(sp) \ + sw $3,FR_R3(sp) \ + sw $4,FR_R4(sp) \ + sw $5,FR_R5(sp) \ + sw $6,FR_R6(sp) \ + sw $7,FR_R7(sp) \ + sw $8,FR_R8(sp) \ + sw $9,FR_R9(sp) \ + sw $10,FR_R10(sp) \ + sw $11,FR_R11(sp) \ + sw $12,FR_R12(sp) \ + sw $13,FR_R13(sp) \ + sw $14,FR_R14(sp) \ + sw $15,FR_R15(sp) \ + sw $16,FR_R16(sp) \ + sw $17,FR_R17(sp) \ + sw $18,FR_R18(sp) \ + sw $19,FR_R19(sp) \ + sw $20,FR_R20(sp) \ + sw $21,FR_R21(sp) \ + sw $22,FR_R22(sp) \ + sw $23,FR_R23(sp) \ + sw $24,FR_R24(sp) \ + sw $25,FR_R25(sp) \ + sw $28,FR_R28(sp) \ + sw $30,FR_R30(sp) \ + sw $31,FR_R31(sp) + + +#define RESTORE_ALL \ + lui k1,0x8000 \ + move k0,$sp \ + lw v0,FR_ERROREPC(k0) \ + lw v1,FR_EPC(k0) \ + mtc0 v0,CP0_ERROREPC(k0) \ + mtc0 v1,CP0_EPC(k0) \ + lw $31,FR_R31(k0) \ + lw $30,FR_R30(k0) \ + lw $28,FR_R28(k0) \ + lw $25,FR_R25(k0) \ + lw $24,FR_R24(k0) \ + lw $23,FR_R23(k0) \ + lw $22,FR_R22(k0) \ + lw $21,FR_R21(k0) \ + lw $20,FR_R20(k0) \ + lw $19,FR_R19(k0) \ + lw $18,FR_R18(k0) \ + lw $17,FR_R17(k0) \ + lw $16,FR_R16(k0) \ + lw $15,FR_R15(k0) \ + lw $14,FR_R14(k0) \ + lw $13,FR_R13(k0) \ + lw $12,FR_R12(k0) \ + lw $11,FR_R11(k0) \ + lw $10,FR_R10(k0) \ + lw $9,FR_R9(k0) \ + lw $8,FR_R8(k0) \ + lw $7,FR_R7(k0) \ + lw $6,FR_R6(k0) \ + lw $5,FR_R5(k0) \ + lw $4,FR_R4(k0) \ + lw $3,FR_R3(k0) \ + lw $2,FR_R2(k0) \ + lw $1,FR_R1(k0) \ + addiu k0,k0,FR_SIZE \ + sw k0,_kernelsp-except_vec0(k1) \ /* Kernel SP */ + eret + + .align 4 +handle_bottom_half: + pushfl + incl _intr_count + mtc0 zero,CP0_STATUS + call _do_bottom_half + popfl + decl _intr_count + j 9f + nop + + .align 4 +reschedule: + pushl $ret_from_sys_call + j _schedule + nop + + .align 4 +_system_call: + pushl %eax # save orig_eax + SAVE_ALL + movl $-ENOSYS,EAX(%esp) + cmpl $(NR_syscalls),%eax + jae ret_from_sys_call + movl _sys_call_table(,%eax,4),%eax + testl %eax,%eax + je ret_from_sys_call + movl _current,%ebx + andl $~CF_MASK,EFLAGS(%esp) # clear carry - assume no errors + movl $0,errno(%ebx) + movl %db6,%edx + movl %edx,dbgreg6(%ebx) # save current hardware debugging status + testb $0x20,flags(%ebx) # PF_TRACESYS + jne 1f + call *%eax + movl %eax,EAX(%esp) # save the return value + movl errno(%ebx),%edx + negl %edx + je ret_from_sys_call + movl %edx,EAX(%esp) + orl $(CF_MASK),EFLAGS(%esp) # set carry to indicate error + j ret_from_sys_call + nop + + .align 4 +1: call _syscall_trace + movl ORIG_EAX(%esp),%eax + call _sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # save the return value + movl _current,%eax + movl errno(%eax),%edx + negl %edx + je 1f + movl %edx,EAX(%esp) + orl $(CF_MASK),EFLAGS(%esp) # set carry to indicate error +1: call 
_syscall_trace + + .align 4,0x90 +ret_from_sys_call: + cmpl $0,_intr_count + jne 2f + movl _bh_mask,%eax + andl _bh_active,%eax + jne handle_bottom_half +9: movl EFLAGS(%esp),%eax # check VM86 flag: CS/SS are + testl $(VM_MASK),%eax # different then + jne 1f + cmpw $(KERNEL_CS),CS(%esp) # was old code segment supervisor ? + je 2f +1: sti + orl $(IF_MASK),%eax # these just try to make sure + andl $~NT_MASK,%eax # the program doesn't do anything + movl %eax,EFLAGS(%esp) # stupid + cmpl $0,_need_resched + jne reschedule + movl _current,%eax + cmpl _task,%eax # task[0] cannot have signals + je 2f + cmpl $0,state(%eax) # state + jne reschedule + cmpl $0,counter(%eax) # counter + je reschedule + movl blocked(%eax),%ecx + movl %ecx,%ebx # save blocked in %ebx for + # signal handling + notl %ecx + andl signal(%eax),%ecx + jne signal_return +2: RESTORE_ALL + + .align 4 +signal_return: + movl %esp,%ecx + pushl %ecx + testl $(VM_MASK),EFLAGS(%ecx) + jne v86_signal_return + pushl %ebx + call _do_signal + popl %ebx + popl %ebx + RESTORE_ALL + + .align 4 +v86_signal_return: + call _save_v86_state + movl %eax,%esp + pushl %eax + pushl %ebx + call _do_signal + popl %ebx + popl %ebx + RESTORE_ALL + + .align 4 +_divide_error: + move $a1,zero # no error code + la $t0,$_do_divide_error + .align 4,0x90 +error_code: + push %fs + push %es + push %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $-1, %eax + xchgl %eax, ORIG_EAX(%esp) # orig_eax (get the error code. ) + xorl %ebx,%ebx # zero ebx + mov %gs,%bx # get the lower order bits of gs + xchgl %ebx, GS(%esp) # get the address and save gs. + pushl %eax # push the error code + lea 4(%esp),%edx + pushl %edx + movl $(KERNEL_DS),%edx + mov %dx,%ds + mov %dx,%es + movl $(USER_DS),%edx + mov %dx,%fs + jal t0 # call handler + addl $8,%esp + j ret_from_sys_call + + .align 4 +_coprocessor_error: + move a1,zero + la t0,_do_coprocessor_error + j error_code + + .align 4 +_device_not_available: + pushl $-1 # mark this as an int + SAVE_ALL + pushl $ret_from_sys_call + movl %cr0,%eax + testl $0x4,%eax # EM (math emulation bit) + je _math_state_restore + pushl $0 # temporary storage for ORIG_EIP + call _math_emulate + addl $4,%esp + ret + + .set reorder + + .align 4 +_debug: + move a1,zero + la t0,_do_debug + j error_code + + .align 4 +_nmi: + move a1,zero + la t0,_do_nmi + j error_code + + .align 4 +_int3: + move a1,zero + la t0,_do_int3 + j error_code + + .align 4 +_overflow: + move a1,zero + la t0,_do_overflow + j error_code + + .align 4 +_bounds: + move a1,zero + la t0,_do_bounds + j error_code + + .align 4 +_invalid_op: + move a1,zero + la t0,_do_invalid_op + j error_code + + .align 4 +_segment_not_present: + la t0,_do_segment_not_present + j error_code + + .align 4 +_stack_segment: + la t0,_do_stack_segment + j error_code + + .align 4 +_general_protection: + la t0,_do_general_protection + j error_code + + .align 4 +_page_fault: + la t0,_do_page_fault + j error_code +/* + * TLB Refill exception entry point + * + * The mm data is stored in the context register and + */ + .text + .set noreorder + .set noat + dmfc0 k0,CP0_CONTEXT + dsrl k0,k0,2 + lw k0,(k0) # Level 1 descriptor + dmfc0 k1,CP0_BADVADDR + srl k1,k1,10 + andi k1,k1,0xffc + addu k1,k1,k1 + lwu k0,(k1) # 2 Level 2 entries + lwu k1,4(k1) + dmtc0 k0,CP0_ENTRYLO0 + dmtc0 k0,CP0_ENTRYLO1 + tlbwr + /* + * Now compute the return address. 
Since this is extremly + * timecritical the code is inlined + */ + mfc0 k0,CP0_CAUSE + bgtz k0,1f + + /* + * Damn - a branch delay slot. Compute new PC + */ + + /* + * That's it boys - back to work! + */ +1: eret + + + + + lui t0,>_exception_handlers + mfc0 t1,CP0_CAUSE + andi t1,t1,0x3fc + addu t0,t0,t1 + lw t0,<_exception_handlers(t0) + sw /* fill delay slot */ + jalr t0 + sw /* fill delay slot */ + + +/* + * Exception handler table, 256 entries. + */ + .data + .align 4 +_exception_handlers: + .word _handle_int /* 0 */ + .word _handle_mod + .word _handle_tlbl + .word _handle_tlbs + .word _handle_adel + .word _handle_ades + .word _handle_ibe + .word _handle_dbe + .word _handle_sys + .word _handle_bp + .word _handle_ri + .word _handle_cpu + .word _handle_ov + .word _handle_tr + .word _handle_reserved + .word _handle_fpe + .fill 240,4,_handle_reserved /* 16 */ + +/* + * Table of syscalls + */ + .data + .align 4 +_sys_call_table: + .word _sys_setup /* 0 */ + .word _sys_exit + .word _sys_fork + .word _sys_read + .word _sys_write + .word _sys_open /* 5 */ + .word _sys_close + .word _sys_wordpid + .word _sys_creat + .word _sys_link + .word _sys_unlink /* 10 */ + .word _sys_execve + .word _sys_chdir + .word _sys_time + .word _sys_mknod + .word _sys_chmod /* 15 */ + .word _sys_chown + .word _sys_break + .word _sys_stat + .word _sys_lseek + .word _sys_getpid /* 20 */ + .word _sys_mount + .word _sys_umount + .word _sys_setuid + .word _sys_getuid + .word _sys_stime /* 25 */ + .word _sys_ptrace + .word _sys_alarm + .word _sys_fstat + .word _sys_pause + .word _sys_utime /* 30 */ + .word _sys_stty + .word _sys_gtty + .word _sys_access + .word _sys_nice + .word _sys_ftime /* 35 */ + .word _sys_sync + .word _sys_kill + .word _sys_rename + .word _sys_mkdir + .word _sys_rmdir /* 40 */ + .word _sys_dup + .word _sys_pipe + .word _sys_times + .word _sys_prof + .word _sys_brk /* 45 */ + .word _sys_setgid + .word _sys_getgid + .word _sys_signal + .word _sys_geteuid + .word _sys_getegid /* 50 */ + .word _sys_acct + .word _sys_phys + .word _sys_lock + .word _sys_ioctl + .word _sys_fcntl /* 55 */ + .word _sys_mpx + .word _sys_setpgid + .word _sys_ulimit + .word _sys_olduname + .word _sys_umask /* 60 */ + .word _sys_chroot + .word _sys_ustat + .word _sys_dup2 + .word _sys_getppid + .word _sys_getpgrp /* 65 */ + .word _sys_setsid + .word _sys_sigaction + .word _sys_sgetmask + .word _sys_ssetmask + .word _sys_setreuid /* 70 */ + .word _sys_setregid + .word _sys_sigsuspend + .word _sys_sigpending + .word _sys_sethostname + .word _sys_setrlimit /* 75 */ + .word _sys_getrlimit + .word _sys_getrusage + .word _sys_gettimeofday + .word _sys_settimeofday + .word _sys_getgroups /* 80 */ + .word _sys_setgroups + .word _sys_select + .word _sys_symlink + .word _sys_lstat + .word _sys_readlink /* 85 */ + .word _sys_uselib + .word _sys_swapon + .word _sys_reboot + .word _sys_readdir + .word _sys_mmap /* 90 */ + .word _sys_munmap + .word _sys_truncate + .word _sys_ftruncate + .word _sys_fchmod + .word _sys_fchown /* 95 */ + .word _sys_getpriority + .word _sys_setpriority + .word _sys_profil + .word _sys_statfs + .word _sys_fstatfs /* 100 */ + .word _sys_ioperm + .word _sys_socketcall + .word _sys_syslog + .word _sys_setitimer + .word _sys_getitimer /* 105 */ + .word _sys_newstat + .word _sys_newlstat + .word _sys_newfstat + .word _sys_uname + .word _sys_iopl /* 110 */ + .word _sys_vhangup + .word _sys_idle + .word _sys_vm86 + .word _sys_word4 + .word _sys_swapoff /* 115 */ + .word _sys_sysinfo + .word _sys_ipc + .word _sys_fsync + 
.word _sys_sigreturn + .word _sys_clone /* 120 */ + .word _sys_setdomainname + .word _sys_newuname + .word _sys_modify_ldt + .word _sys_adjtimex + .word _sys_mprotect /* 125 */ + .word _sys_sigprocmask + .word _sys_create_module + .word _sys_init_module + .word _sys_delete_module + .word _sys_get_kernel_syms /* 130 */ + .word _sys_quotactl + .word _sys_getpgid + .word _sys_fchdir + .word _sys_bdflush + .word _sys_sysfs /* 135 */ + .word _sys_personality + .word 0 /* for afs_syscall */ + + .space (NR_syscalls-137)*4 diff --git a/arch/mips/irq.c b/arch/mips/irq.c new file mode 100644 index 000000000..1bce3d07a --- /dev/null +++ b/arch/mips/irq.c @@ -0,0 +1,292 @@ +/* + * linux/kernel/irq.c + * + * Copyright (C) 1992 Linus Torvalds + * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQ's should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + */ + +/* + * IRQ's are in fact implemented a bit like signal handlers for the kernel. + * The same sigaction struct is used, and with similar semantics (ie there + * is a SA_INTERRUPT flag etc). Naturally it's not a 1:1 relation, but there + * are similarities. + * + * sa_handler(int irq_NR) is the default function called (0 if no). + * sa_mask is horribly ugly (I won't even mention it) + * sa_flags contains various info: SA_INTERRUPT etc + * sa_restorer is the unused + */ + +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/interrupt.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/irq.h> + +#define CR0_NE 32 + +unsigned char cache_21 = 0xff; +unsigned char cache_A1 = 0xff; + +unsigned long intr_count = 0; +unsigned long spurious_count = 0; +unsigned long bh_active = 0; +unsigned long bh_mask = 0xFFFFFFFF; +struct bh_struct bh_base[32]; + +void disable_irq(unsigned int irq_nr) +{ + unsigned long flags; + unsigned char mask; + + mask = 1 << (irq_nr & 7); + save_flags(flags); + if (irq_nr < 8) { + cli(); + cache_21 |= mask; + outb(cache_21,0x21); + restore_flags(flags); + return; + } + cli(); + cache_A1 |= mask; + outb(cache_A1,0xA1); + restore_flags(flags); +} + +void enable_irq(unsigned int irq_nr) +{ + unsigned long flags; + unsigned char mask; + + mask = ~(1 << (irq_nr & 7)); + save_flags(flags); + if (irq_nr < 8) { + cli(); + cache_21 &= mask; + outb(cache_21,0x21); + restore_flags(flags); + return; + } + cli(); + cache_A1 &= mask; + outb(cache_A1,0xA1); + restore_flags(flags); +} + +/* + * do_bottom_half() runs at normal kernel priority: all interrupts + * enabled. do_bottom_half() is atomic with respect to itself: a + * bottom_half handler need not be re-entrant. + */ +asmlinkage void do_bottom_half(void) +{ + unsigned long active; + unsigned long mask, left; + struct bh_struct *bh; + + bh = bh_base; + active = bh_active & bh_mask; + for (mask = 1, left = ~0 ; left & active ; bh++,mask += mask,left += left) { + if (mask & active) { + void (*fn)(void *); + bh_active &= ~mask; + fn = bh->routine; + if (!fn) + goto bad_bh; + fn(bh->data); + } + } + return; +bad_bh: + printk ("irq.c:bad bottom half entry\n"); +} + +/* + * Pointers to the low-level handlers: first the general ones, then the + * fast ones, then the bad ones. 
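+ *
+ * A driver attaches to one of these paths through request_irq() below.
+ * A minimal sketch (handler and device names are made up):
+ *
+ *	static void my_handler(int irq) { ... }
+ *	...
+ *	if (request_irq(5, my_handler, SA_INTERRUPT, "mydev"))
+ *		printk("mydev: could not get IRQ5\n");
+ *
+ * SA_INTERRUPT selects the fast path, a zero flags word the full one;
+ * free_irq() gives the line back.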
+ */ +extern void IRQ(void); +extern void fast_IRQ(void); +extern void bad_IRQ(void); + +/* + * Initial irq handlers. + */ +static struct sigaction irq_sigaction[16] = { + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL }, + { NULL, 0, 0, NULL }, { NULL, 0, 0, NULL } +}; + +int get_irq_list(char *buf) +{ + int i, len = 0; + struct sigaction * sa = irq_sigaction; + + for (i = 0 ; i < 16 ; i++, sa++) { + if (!sa->sa_handler) + continue; + len += sprintf(buf+len, "%2d: %8d %c %s\n", + i, kstat.interrupts[i], + (sa->sa_flags & SA_INTERRUPT) ? '+' : ' ', + (char *) sa->sa_mask); + } + return len; +} + +/* + * do_IRQ handles IRQ's that have been installed without the + * SA_INTERRUPT flag: it uses the full signal-handling return + * and runs with other interrupts enabled. All relatively slow + * IRQ's should use this format: notably the keyboard/timer + * routines. + */ +asmlinkage void do_IRQ(int irq, struct pt_regs * regs) +{ + struct sigaction * sa = irq + irq_sigaction; + + kstat.interrupts[irq]++; + sa->sa_handler((int) regs); +} + +/* + * do_fast_IRQ handles IRQ's that don't need the fancy interrupt return + * stuff - the handler is also running with interrupts disabled unless + * it explicitly enables them later. + */ +asmlinkage void do_fast_IRQ(int irq) +{ + struct sigaction * sa = irq + irq_sigaction; + + kstat.interrupts[irq]++; + sa->sa_handler(irq); +} + +/* + * Using "struct sigaction" is slightly silly, but there + * are historical reasons and it works well, so.. + */ +static int irqaction(unsigned int irq, struct sigaction * new_sa) +{ + struct sigaction * sa; + unsigned long flags; + + if (irq > 15) + return -EINVAL; + sa = irq + irq_sigaction; + if (sa->sa_handler) + return -EBUSY; + if (!new_sa->sa_handler) + return -EINVAL; + save_flags(flags); + cli(); + *sa = *new_sa; + /* + * FIXME: Does the SA_INTERRUPT flag make any sense on the MIPS??? 
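+	 * On the i386 it selects the light-weight entry path that skips
+	 * the bottom-half and signal work on return; the handlers in
+	 * irq.S keep the same split (_fast_IRQ returns through `return',
+	 * _IRQ through ret_from_sys_call), so the flag still picks
+	 * between the two paths here as well.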
+ */ + if (sa->sa_flags & SA_INTERRUPT) + set_intr_gate(irq,fast_IRQ); + else + set_intr_gate(irq,IRQ); + if (irq < 8) { + cache_21 &= ~(1<<irq); + outb(cache_21,0x21); + } else { + cache_21 &= ~(1<<2); + cache_A1 &= ~(1<<(irq-8)); + outb(cache_21,0x21); + outb(cache_A1,0xA1); + } + restore_flags(flags); + return 0; +} + +int request_irq(unsigned int irq, void (*handler)(int), + unsigned long flags, const char * devname) +{ + struct sigaction sa; + + sa.sa_handler = handler; + sa.sa_flags = flags; + sa.sa_mask = (unsigned long) devname; + sa.sa_restorer = NULL; + return irqaction(irq,&sa); +} + +void free_irq(unsigned int irq) +{ + struct sigaction * sa = irq + irq_sigaction; + unsigned long flags; + + if (irq > 15) { + printk("Trying to free IRQ%d\n",irq); + return; + } + if (!sa->sa_handler) { + printk("Trying to free free IRQ%d\n",irq); + return; + } + save_flags(flags); + cli(); + if (irq < 8) { + cache_21 |= 1 << irq; + outb(cache_21,0x21); + } else { + cache_A1 |= 1 << (irq-8); + outb(cache_A1,0xA1); + } + set_intr_gate(irq,bad_IRQ); + sa->sa_handler = NULL; + sa->sa_flags = 0; + sa->sa_mask = 0; + sa->sa_restorer = NULL; + restore_flags(flags); +} + +#if 0 +/* + * handle fpa errors + */ +static void math_error_irq(int cpl) +{ + if (!hard_math) + return; + handle_fpe(); +} +#endif + +static void no_action(int cpl) { } + +void init_IRQ(void) +{ + int i; + + for (i = 0; i < 16 ; i++) + set_intr_gate(i, bad_IRQ[i]); + if (request_irq(2, no_action, SA_INTERRUPT, "cascade")) + printk("Unable to get IRQ2 for cascade\n"); + + /* initialize the bottom half routines. */ + for (i = 0; i < 32; i++) { + bh_base[i].routine = NULL; + bh_base[i].data = NULL; + } + bh_active = 0; + intr_count = 0; +} diff --git a/arch/mips/ldt.c b/arch/mips/ldt.c new file mode 100644 index 000000000..089605cee --- /dev/null +++ b/arch/mips/ldt.c @@ -0,0 +1,13 @@ +/* + * arch/mips/ldt.c + * + * Copyright (C) 1994 by Waldorf GMBH, + * written by Ralf Baechle + */ +#include <linux/linkage.h> +#include <linux/errno.h> + +asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) +{ + return -ENOSYS; +} diff --git a/arch/mips/main.c b/arch/mips/main.c new file mode 100644 index 000000000..8cb92d5f2 --- /dev/null +++ b/arch/mips/main.c @@ -0,0 +1,333 @@ +/* + * arch/mips/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * MIPSified by Ralf Baechle + */ + +#include <stdarg.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/bootinfo.h> + +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/tty.h> +#include <linux/head.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/utsname.h> +#include <linux/ioport.h> + +extern unsigned long * prof_buffer; +extern unsigned long prof_len; +extern char edata, end; +extern char *linux_banner; + +/* + * we need this inline - forking from kernel space will result + * in NO COPY ON WRITE (!!!), until an execve is executed. This + * is no problem, but for the stack. This is handled by not letting + * main() use the stack at all after fork(). Thus, no function + * calls - which means inline code for fork too, as otherwise we + * would use the stack upon exit from 'fork()'. + * + * Actually only pause and fork are needed inline, so that there + * won't be any messing with the stack from main(), but we define + * some others too. 
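+ *
+ * _syscall0(type,name) expands to an inline `type name(void)' that traps
+ * straight into the kernel; the exact trap sequence is supplied by the
+ * architecture's unistd.h and is only sketched here:
+ *
+ *	static inline int fork(void)
+ *	{
+ *		... trap with syscall number __NR_fork ...
+ *		... return the result, or set errno and return -1 ...
+ *	}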
+ */ +#define __NR__exit __NR_exit +static inline _syscall0(int,idle) +static inline _syscall0(int,fork) + +extern int console_loglevel; + +extern char empty_zero_page[PAGE_SIZE]; +extern void init(void); +extern void init_IRQ(void); +extern void init_modules(void); +extern long console_init(long, long); +extern long kmalloc_init(long,long); +extern long blk_dev_init(long,long); +extern long chr_dev_init(long,long); +extern void floppy_init(void); +extern void sock_init(void); +extern long rd_init(long mem_start, int length); +unsigned long net_dev_init(unsigned long, unsigned long); +#if 0 +extern long bios32_init(long, long); +#endif + +extern void hd_setup(char *str, int *ints); +extern void bmouse_setup(char *str, int *ints); +extern void eth_setup(char *str, int *ints); +extern void xd_setup(char *str, int *ints); +extern void mcd_setup(char *str, int *ints); +extern void st_setup(char *str, int *ints); +extern void st0x_setup(char *str, int *ints); +extern void tmc8xx_setup(char *str, int *ints); +extern void t128_setup(char *str, int *ints); +extern void pas16_setup(char *str, int *ints); +extern void generic_NCR5380_setup(char *str, int *intr); +extern void aha152x_setup(char *str, int *ints); +extern void aha1542_setup(char *str, int *ints); +extern void aha274x_setup(char *str, int *ints); +extern void scsi_luns_setup(char *str, int *ints); +extern void sound_setup(char *str, int *ints); +#ifdef CONFIG_SBPCD +extern void sbpcd_setup(char *str, int *ints); +#endif CONFIG_SBPCD +#ifdef CONFIG_CDU31A +extern void cdu31a_setup(char *str, int *ints); +#endif CONFIG_CDU31A +void ramdisk_setup(char *str, int *ints); + +#ifdef CONFIG_SYSVIPC +extern void ipc_init(void); +#endif +#ifdef CONFIG_SCSI +extern unsigned long scsi_dev_init(unsigned long, unsigned long); +#endif + +/* + * This is set up by the setup-routine at boot-time + */ +#define PARAM empty_zero_page +#define EXT_MEM_K (*(unsigned short *) (PARAM+2)) +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80)) +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) +#define RAMDISK_SIZE (*(unsigned short *) (PARAM+0x1F8)) +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC)) +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF)) + +/* + * Defaults, may be overwritten by milo + */ +#define SCREEN_INFO {0,0,{0,0},52,3,80,4626,3,9,50} + +/* + * Information passed by milo + */ +struct bootinfo boot_info; +struct screen_info screen_info = SCREEN_INFO; + +/* + * Boot command-line arguments + */ +extern void copy_options(char * to, char * from); +void parse_options(char *line); +#define MAX_INIT_ARGS 8 +#define MAX_INIT_ENVS 8 +#define COMMAND_LINE ((char *) (PARAM+2048)) +#define COMMAND_LINE_SIZE 256 + +extern void time_init(void); + +static unsigned long memory_start = 0; /* After mem_init, stores the */ + /* amount of free user memory */ +/* static */ unsigned long memory_end = 0; +/* static unsigned long low_memory_start = 0; */ + +int rows, cols; + +struct drive_info_struct { char dummy[32]; } drive_info; + +unsigned char aux_device_present; +int ramdisk_size; +int root_mountflags; + +static char command_line[COMMAND_LINE_SIZE] = { 0, }; + +struct { + char *str; + void (*setup_func)(char *, int *); +} bootsetups[] = { + { "ramdisk=", ramdisk_setup }, +#ifdef CONFIG_INET + { "ether=", eth_setup }, +#endif +#ifdef CONFIG_SCSI + { "max_scsi_luns=", scsi_luns_setup }, +#endif +#ifdef CONFIG_BLK_DEV_HD + { "hd=", hd_setup }, +#endif +#ifdef CONFIG_CHR_DEV_ST + { "st=", st_setup }, +#endif +#ifdef 
CONFIG_BUSMOUSE + { "bmouse=", bmouse_setup }, +#endif +#ifdef CONFIG_SCSI_SEAGATE + { "st0x=", st0x_setup }, + { "tmc8xx=", tmc8xx_setup }, +#endif +#ifdef CONFIG_SCSI_T128 + { "t128=", t128_setup }, +#endif +#ifdef CONFIG_SCSI_PAS16 + { "pas16=", pas16_setup }, +#endif +#ifdef CONFIG_SCSI_GENERIC_NCR5380 + { "ncr5380=", generic_NCR5380_setup }, +#endif +#ifdef CONFIG_SCSI_AHA152X + { "aha152x=", aha152x_setup}, +#endif +#ifdef CONFIG_SCSI_AHA1542 + { "aha1542=", aha1542_setup}, +#endif +#ifdef CONFIG_SCSI_AHA274X + { "aha274x=", aha274x_setup}, +#endif +#ifdef CONFIG_BLK_DEV_XD + { "xd=", xd_setup }, +#endif +#ifdef CONFIG_MCD + { "mcd=", mcd_setup }, +#endif +#ifdef CONFIG_SOUND + { "sound=", sound_setup }, +#endif +#ifdef CONFIG_SBPCD + { "sbpcd=", sbpcd_setup }, +#endif CONFIG_SBPCD +#ifdef CONFIG_CDU31A + { "cdu31a=", cdu31a_setup }, +#endif CONFIG_CDU31A + { 0, 0 } +}; + +void ramdisk_setup(char *str, int *ints) +{ + if (ints[0] > 0 && ints[1] >= 0) + ramdisk_size = ints[1]; +} + +unsigned long loops_per_sec = 1; + +static void calibrate_delay(void) +{ + int ticks; + + printk("Calibrating delay loop.. "); + while (loops_per_sec <<= 1) { + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + /* Go .. */ + ticks = jiffies; + __delay(loops_per_sec); + ticks = jiffies - ticks; + if (ticks >= HZ) { + /* + * No assembler - should be ok + */ + loops_per_sec = (loops_per_sec * HZ) / ticks; + printk("ok - %lu.%02lu BogoMips\n", + loops_per_sec/500000, + (loops_per_sec/5000) % 100); + return; + } + } + printk("failed\n"); +} + +int parse_machine_options(char *line) +{ + /* + * No special MIPS options yet + */ + return 0; +} + +asmlinkage void start_kernel(void) +{ + /* + * Interrupts are still disabled. 
Do necessary setups, then + * enable them + */ + ROOT_DEV = ORIG_ROOT_DEV; + drive_info = DRIVE_INFO; + aux_device_present = AUX_DEVICE_INFO; +#if 0 + memory_end = (1<<20) + (EXT_MEM_K<<10); +#else + memory_end = 0x80800000; +#endif + memory_end &= PAGE_MASK; + ramdisk_size = RAMDISK_SIZE; + copy_options(command_line,COMMAND_LINE); + + if (MOUNT_ROOT_RDONLY) + root_mountflags |= MS_RDONLY; + memory_start = 0x7fffffff & (unsigned long) &end; + + memory_start = paging_init(memory_start,memory_end); + trap_init(); + init_IRQ(); + sched_init(); + parse_options(command_line); + init_modules(); +#ifdef CONFIG_PROFILE + prof_buffer = (unsigned long *) memory_start; + prof_len = (unsigned long) &end; + prof_len >>= 2; + memory_start += prof_len * sizeof(unsigned long); +#endif + memory_start = console_init(memory_start,memory_end); + memory_start = kmalloc_init(memory_start,memory_end); + memory_start = chr_dev_init(memory_start,memory_end); + memory_start = blk_dev_init(memory_start,memory_end); + sti(); + calibrate_delay(); +#ifdef CONFIG_SCSI + memory_start = scsi_dev_init(memory_start,memory_end); +#endif +#ifdef CONFIG_INET + memory_start = net_dev_init(memory_start,memory_end); +#endif +while(1); + memory_start = inode_init(memory_start,memory_end); + memory_start = file_table_init(memory_start,memory_end); + memory_start = name_cache_init(memory_start,memory_end); + mem_init(memory_start,memory_end); + buffer_init(); + time_init(); + floppy_init(); + sock_init(); +#ifdef CONFIG_SYSVIPC + ipc_init(); +#endif + sti(); + + /* + * Get CPU type + * FIXME: Not implemented yet + */ + + printk(linux_banner); + + move_to_user_mode(); + if (!fork()) /* we count on this going ok */ + init(); +/* + * task[0] is meant to be used as an "idle" task: it may not sleep, but + * it might do some general things like count free pages or it could be + * used to implement a reasonable LRU algorithm for the paging routines: + * anything that can be useful, but shouldn't take time from the real + * processes. + * + * Right now task[0] just does a infinite idle loop. + */ + for(;;) + idle(); +} diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile new file mode 100644 index 000000000..5063d60c2 --- /dev/null +++ b/arch/mips/mm/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the linux memory manager. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +.c.o: + $(CC) $(CFLAGS) -c $< +.s.o: + $(AS) -o $*.o $< +.c.s: + $(CC) $(CFLAGS) -S $< + +OBJS = memory.o swap.o mmap.o mprotect.o kmalloc.o vmalloc.o + +mm.o: $(OBJS) + $(LD) -r -o mm.o $(OBJS) + +dep: + $(CPP) -M *.c > .depend + +# +# include a dependency file if one exists +# +ifeq (.depend,$(wildcard .depend)) +include .depend +endif diff --git a/arch/mips/mm/kmalloc.c b/arch/mips/mm/kmalloc.c new file mode 100644 index 000000000..018f8db8f --- /dev/null +++ b/arch/mips/mm/kmalloc.c @@ -0,0 +1,362 @@ +/* + * linux/mm/kmalloc.c + * + * Copyright (C) 1991, 1992 Linus Torvalds & Roger Wolff. + * + * Written by R.E. Wolff Sept/Oct '93. + * + */ + +/* + * Modified by Alex Bligh (alex@cconcepts.co.uk) 4 Apr 1994 to use multiple + * pages. So for 'page' throughout, read 'area'. + */ + +#include <linux/mm.h> +#include <asm/system.h> +#include <linux/delay.h> + +#define GFP_LEVEL_MASK 0xf + +/* I want this low enough for a while to catch errors. 
+ I want this number to be increased in the near future: + loadable device drivers should use this function to get memory */ + +#define MAX_KMALLOC_K ((PAGE_SIZE<<(NUM_AREA_ORDERS-1))>>10) + + +/* This defines how many times we should try to allocate a free page before + giving up. Normally this shouldn't happen at all. */ +#define MAX_GET_FREE_PAGE_TRIES 4 + + +/* Private flags. */ + +#define MF_USED 0xffaa0055 +#define MF_FREE 0x0055ffaa + + +/* + * Much care has gone into making these routines in this file reentrant. + * + * The fancy bookkeeping of nbytesmalloced and the like are only used to + * report them to the user (oooohhhhh, aaaaahhhhh....) are not + * protected by cli(). (If that goes wrong. So what?) + * + * These routines restore the interrupt status to allow calling with ints + * off. + */ + +/* + * A block header. This is in front of every malloc-block, whether free or not. + */ +struct block_header { + unsigned long bh_flags; + union { + unsigned long ubh_length; + struct block_header *fbh_next; + } vp; +}; + + +#define bh_length vp.ubh_length +#define bh_next vp.fbh_next +#define BH(p) ((struct block_header *)(p)) + + +/* + * The page descriptor is at the front of every page that malloc has in use. + */ +struct page_descriptor { + struct page_descriptor *next; + struct block_header *firstfree; + int order; + int nfree; +}; + + +#define PAGE_DESC(p) ((struct page_descriptor *)(((unsigned long)(p)) & PAGE_MASK)) + + +/* + * A size descriptor describes a specific class of malloc sizes. + * Each class of sizes has its own freelist. + */ +struct size_descriptor { + struct page_descriptor *firstfree; + int size; + int nblocks; + + int nmallocs; + int nfrees; + int nbytesmalloced; + int npages; + unsigned long gfporder; /* number of pages in the area required */ +}; + +/* + * For now it is unsafe to allocate bucket sizes between n & n=16 where n is + * 4096 * any power of two + */ + +struct size_descriptor sizes[] = { + { NULL, 32,127, 0,0,0,0, 0}, + { NULL, 64, 63, 0,0,0,0, 0 }, + { NULL, 128, 31, 0,0,0,0, 0 }, + { NULL, 252, 16, 0,0,0,0, 0 }, + { NULL, 508, 8, 0,0,0,0, 0 }, + { NULL,1020, 4, 0,0,0,0, 0 }, + { NULL,2040, 2, 0,0,0,0, 0 }, + { NULL,4096-16, 1, 0,0,0,0, 0 }, + { NULL,8192-16, 1, 0,0,0,0, 1 }, + { NULL,16384-16, 1, 0,0,0,0, 2 }, + { NULL,32768-16, 1, 0,0,0,0, 3 }, + { NULL,65536-16, 1, 0,0,0,0, 4 }, + { NULL,131072-16, 1, 0,0,0,0, 5 }, + { NULL, 0, 0, 0,0,0,0, 0 } +}; + + +#define NBLOCKS(order) (sizes[order].nblocks) +#define BLOCKSIZE(order) (sizes[order].size) +#define AREASIZE(order) (PAGE_SIZE<<(sizes[order].gfporder)) + + +long kmalloc_init (long start_mem,long end_mem) +{ + int order; + +/* + * Check the static info array. Things will blow up terribly if it's + * incorrect. This is a late "compile time" check..... 
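+ * (Worked through for the smallest bucket on a 32-bit target: 127
+ * blocks * 32 bytes + a 16-byte struct page_descriptor = 4080 bytes,
+ * which fits the 4096-byte area; that inequality is what the loop
+ * below verifies for every order.)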
+ */ +for (order = 0;BLOCKSIZE(order);order++) + { + if ((NBLOCKS (order)*BLOCKSIZE(order) + sizeof (struct page_descriptor)) > + AREASIZE(order)) + { + printk ("Cannot use %d bytes out of %d in order = %d block mallocs\n", + NBLOCKS (order) * BLOCKSIZE(order) + + sizeof (struct page_descriptor), + (int) AREASIZE(order), + BLOCKSIZE (order)); + panic ("This only happens if someone messes with kmalloc"); + } + } +return start_mem; +} + + + +int get_order (int size) +{ + int order; + + /* Add the size of the header */ + size += sizeof (struct block_header); + for (order = 0;BLOCKSIZE(order);order++) + if (size <= BLOCKSIZE (order)) + return order; + return -1; +} + +void * kmalloc (size_t size, int priority) +{ + unsigned long flags; + int order,tries,i,sz; + struct block_header *p; + struct page_descriptor *page; + +/* Sanity check... */ + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("kmalloc called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + +order = get_order (size); +if (order < 0) + { + printk ("kmalloc of too large a block (%d bytes).\n",size); + return (NULL); + } + +save_flags(flags); + +/* It seems VERY unlikely to me that it would be possible that this + loop will get executed more than once. */ +tries = MAX_GET_FREE_PAGE_TRIES; +while (tries --) + { + /* Try to allocate a "recently" freed memory block */ + cli (); + if ((page = sizes[order].firstfree) && + (p = page->firstfree)) + { + if (p->bh_flags == MF_FREE) + { + page->firstfree = p->bh_next; + page->nfree--; + if (!page->nfree) + { + sizes[order].firstfree = page->next; + page->next = NULL; + } + restore_flags(flags); + + sizes [order].nmallocs++; + sizes [order].nbytesmalloced += size; + p->bh_flags = MF_USED; /* As of now this block is officially in use */ + p->bh_length = size; + return p+1; /* Pointer arithmetic: increments past header */ + } + printk ("Problem: block on freelist at %08lx isn't free.\n",(long)p); + return (NULL); + } + restore_flags(flags); + + + /* Now we're in trouble: We need to get a new free page..... */ + + sz = BLOCKSIZE(order); /* sz is the size of the blocks we're dealing with */ + + /* This can be done with ints on: This is private to this invocation */ + page = (struct page_descriptor *) __get_free_pages (priority & GFP_LEVEL_MASK, sizes[order].gfporder); + if (!page) { + static unsigned long last = 0; + if (last + 10*HZ < jiffies) { + last = jiffies; + printk ("Couldn't get a free page.....\n"); + } + return NULL; + } +#if 0 + printk ("Got page %08x to use for %d byte mallocs....",(long)page,sz); +#endif + sizes[order].npages++; + + /* Loop for all but last block: */ + for (i=NBLOCKS(order),p=BH (page+1);i > 1;i--,p=p->bh_next) + { + p->bh_flags = MF_FREE; + p->bh_next = BH ( ((long)p)+sz); + } + /* Last block: */ + p->bh_flags = MF_FREE; + p->bh_next = NULL; + + page->order = order; + page->nfree = NBLOCKS(order); + page->firstfree = BH(page+1); +#if 0 + printk ("%d blocks per page\n",page->nfree); +#endif + /* Now we're going to muck with the "global" freelist for this size: + this should be uninterruptible */ + cli (); + /* + * sizes[order].firstfree used to be NULL, otherwise we wouldn't be + * here, but you never know.... + */ + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + restore_flags(flags); + } + +/* Pray that printk won't cause this to happen again :-) */ + +printk ("Hey. This is very funny. 
I tried %d times to allocate a whole\n" + "new page for an object only %d bytes long, but some other process\n" + "beat me to actually allocating it. Also note that this 'error'\n" + "message is soooo very long to catch your attention. I'd appreciate\n" + "it if you'd be so kind as to report what conditions caused this to\n" + "the author of this kmalloc: wolff@dutecai.et.tudelft.nl.\n" + "(Executive summary: This can't happen)\n", + MAX_GET_FREE_PAGE_TRIES, + size); +return NULL; +} + + +void kfree_s (void *ptr,int size) +{ +unsigned long flags; +int order; +register struct block_header *p=((struct block_header *)ptr) -1; +struct page_descriptor *page,*pg2; + +page = PAGE_DESC (p); +order = page->order; +if ((order < 0) || + (order > sizeof (sizes)/sizeof (sizes[0])) || + (((long)(page->next)) & ~PAGE_MASK) || + (p->bh_flags != MF_USED)) + { + printk ("kfree of non-kmalloced memory: %p, next= %p, order=%d\n", + p, page->next, page->order); + return; + } +if (size && + size != p->bh_length) + { + printk ("Trying to free pointer at %p with wrong size: %d instead of %lu.\n", + p,size,p->bh_length); + return; + } +size = p->bh_length; +p->bh_flags = MF_FREE; /* As of now this block is officially free */ +save_flags(flags); +cli (); +p->bh_next = page->firstfree; +page->firstfree = p; +page->nfree ++; + +if (page->nfree == 1) + { /* Page went from full to one free block: put it on the freelist */ + if (page->next) + { + printk ("Page %p already on freelist dazed and confused....\n", page); + } + else + { + page->next = sizes[order].firstfree; + sizes[order].firstfree = page; + } + } + +/* If page is completely free, free it */ +if (page->nfree == NBLOCKS (page->order)) + { +#if 0 + printk ("Freeing page %08x.\n", (long)page); +#endif + if (sizes[order].firstfree == page) + { + sizes[order].firstfree = page->next; + } + else + { + for (pg2=sizes[order].firstfree; + (pg2 != NULL) && (pg2->next != page); + pg2=pg2->next) + /* Nothing */; + if (pg2 != NULL) + pg2->next = page->next; + else + printk ("Ooops. page %p doesn't show on freelist.\n", page); + } +/* FIXME: I'm sure we should do something with npages here (like npages--) */ + free_pages ((long)page, sizes[order].gfporder); + } +restore_flags(flags); + +/* FIXME: ?? Are these increment & decrement operations guaranteed to be + * atomic? Could an IRQ not occur between the read & the write? + * Maybe yes on a x86 with GCC...?? + */ +sizes[order].nfrees++; /* Noncritical (monitoring) admin stuff */ +sizes[order].nbytesmalloced -= size; +} diff --git a/arch/mips/mm/memory.c b/arch/mips/mm/memory.c new file mode 100644 index 000000000..5872f8bd5 --- /dev/null +++ b/arch/mips/mm/memory.c @@ -0,0 +1,1295 @@ +/* + * arch/mips/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Ported to MIPS by Ralf Baechle + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. 
+ * Idea by Alex Bligh (alex@cconcepts.co.uk) + */ + +#include <linux/config.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> + +#include <asm/system.h> +#include <asm/segment.h> + +unsigned long high_memory = 0; + +extern unsigned long pg0[1024]; /* page table for 0-4MB for everybody */ +extern unsigned long invalid_pg_table[1024]; + +extern void sound_mem_init(void); +extern void die_if_kernel(char *,struct pt_regs *,long); +extern void show_net_buffers(void); + +/* + * The free_area_list arrays point to the queue heads of the free areas + * of different sizes + */ +int nr_swap_pages = 0; +int nr_free_pages = 0; +struct mem_list free_area_list[NR_MEM_LISTS]; +unsigned char * free_area_map[NR_MEM_LISTS]; + +unsigned short * mem_map = NULL; + +/* + * oom() prints a message (so that the user knows why the process died), + * and gives the process an untrappable SIGKILL. + */ +void oom(struct task_struct * task) +{ + printk("\nOut of memory.\n"); + task->sigaction[SIGKILL-1].sa_handler = NULL; + task->blocked &= ~(1<<(SIGKILL-1)); + send_sig(SIGKILL,task,1); +} + +static void free_one_table(unsigned long * page_dir) +{ + int j; + unsigned long pg_table = *page_dir; + unsigned long * page_table; + + if ((long) pg_table & PAGE_MASK != (long) invalid_pg_table & PAGE_MASK ) + return; + *page_dir = PAGE_VALID | (unsigned long) invalid_pg_table; + if (pg_table >= high_memory || !(pg_table & PAGE_VALID)) { + printk("Bad page table: [%p]=%08lx\n",page_dir,pg_table); + return; + } + if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED) + return; + page_table = (unsigned long *) (pg_table & PAGE_MASK); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,page_table++) { + unsigned long pg = *page_table; + + if (!pg) + continue; + *page_table = 0; + if (pg & PAGE_VALID) + free_page(PAGE_MASK & pg); + else + swap_free(pg); + } + free_page(PAGE_MASK & pg_table); +} + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. Note that + * unlike 'free_page_tables()', this function still leaves a valid + * page-table-tree in memory: it just removes the user pages. The two + * functions are similar, but there is a fundamental difference. + */ +void clear_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) + panic("task[0] (swapper) doesn't support exec()\n"); + pg_dir = tsk->tss.pg_dir; + page_dir = (unsigned long *) pg_dir; + if (!page_dir || page_dir == swapper_pg_dir) { + printk("Trying to clear kernel page-directory: not good\n"); + return; + } + if (mem_map[MAP_NR(pg_dir)] > 1) { + unsigned long * new_pg; + + if (!(new_pg = (unsigned long*) get_free_page(GFP_KERNEL))) { + oom(tsk); + return; + } + for (i = 768 ; i < 1024 ; i++) + new_pg[i] = page_dir[i]; + free_page(pg_dir); + tsk->tss.pg_dir = (unsigned long) new_pg; + return; + } + for (i = 0 ; i < 768 ; i++,page_dir++) + free_one_table(page_dir); + invalidate(); + return; +} + +/* + * This function frees up all page tables of a process when it exits. 
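+ *
+ * A minimal sketch (illustrative only, not part of this file; the
+ * example_* name is made up) of the mem_map[] reference counting that
+ * clear_page_tables() above and free_page_tables() below rely on: a
+ * page directory shared between clone()d tasks is only really torn
+ * down by its last user, earlier exits just drop their reference.
+ */
+
+#if 0	/* illustrative sketch, not built */
+static void example_drop_pg_dir(unsigned long pg_dir)
+{
+	if (mem_map[MAP_NR(pg_dir)] > 1) {
+		/* still shared: free_page() just decrements the use count */
+		free_page(pg_dir);
+		return;
+	}
+	/* last user: each page table must be walked and freed as below */
+}
+#endif
+
+/*
+ * The real routine first points tss.pg_dir back at swapper_pg_dir, so
+ * the task keeps a valid directory while its old one is dismantled.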
+ */ +void free_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long pg_dir; + unsigned long * page_dir; + + if (!tsk) + return; + if (tsk == task[0]) { + printk("task[0] (swapper) killed: unable to recover\n"); + panic("Trying to free up swapper memory space"); + } + pg_dir = tsk->tss.pg_dir; + if (!pg_dir || pg_dir == (unsigned long) swapper_pg_dir) { + printk("Trying to free kernel page-directory: not good\n"); + return; + } + tsk->tss.pg_dir = (unsigned long) swapper_pg_dir; + if (mem_map[MAP_NR(pg_dir)] > 1) { + free_page(pg_dir); + return; + } + page_dir = (unsigned long *) pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,page_dir++) + free_one_table(page_dir); + free_page(pg_dir); + invalidate(); +} + +/* + * clone_page_tables() clones the page table for a process - both + * processes will have the exact same pages in memory. There are + * probably races in the memory management with cloning, but we'll + * see.. + */ +int clone_page_tables(struct task_struct * tsk) +{ + unsigned long pg_dir; + + pg_dir = current->tss.pg_dir; + mem_map[MAP_NR(pg_dir)]++; + tsk->tss.pg_dir = pg_dir; + return 0; +} + +/* + * copy_page_tables() just copies the whole process memory range: + * note the special handling of RESERVED (ie kernel) pages, which + * means that they are always shared by all processes. + */ +int copy_page_tables(struct task_struct * tsk) +{ + int i; + unsigned long old_pg_dir, *old_page_dir; + unsigned long new_pg_dir, *new_page_dir; + + if (!(new_pg_dir = get_free_page(GFP_KERNEL))) + return -ENOMEM; + old_pg_dir = current->tss.pg_dir; + tsk->tss.pg_dir = new_pg_dir; + old_page_dir = (unsigned long *) old_pg_dir; + new_page_dir = (unsigned long *) new_pg_dir; + for (i = 0 ; i < PTRS_PER_PAGE ; i++,old_page_dir++,new_page_dir++) { + int j; + unsigned long old_pg_table, *old_page_table; + unsigned long new_pg_table, *new_page_table; + + old_pg_table = *old_page_dir; + if (old_pg_table == (unsigned long) invalid_pg_table) + continue; + if (old_pg_table >= high_memory || !(old_pg_table & PAGE_VALID)) { + printk("copy_page_tables: bad page table: " + "probable memory corruption\n"); + *old_page_dir = PAGE_TABLE | (unsigned long)invalid_pg_table; + continue; + } + if (mem_map[MAP_NR(old_pg_table)] & MAP_PAGE_RESERVED) { + *new_page_dir = old_pg_table; + continue; + } + if (!(new_pg_table = get_free_page(GFP_KERNEL))) { + free_page_tables(tsk); + return -ENOMEM; + } + old_page_table = (unsigned long *) (PAGE_MASK & old_pg_table); + new_page_table = (unsigned long *) (PAGE_MASK & new_pg_table); + for (j = 0 ; j < PTRS_PER_PAGE ; j++,old_page_table++,new_page_table++) { + unsigned long pg; + pg = *old_page_table; + if (!pg) + continue; + if (!(pg & PAGE_VALID)) { + *new_page_table = swap_duplicate(pg); + continue; + } + if (pg > high_memory || (mem_map[MAP_NR(pg)] & MAP_PAGE_RESERVED)) { + *new_page_table = pg; + continue; + } + if (pg & PAGE_COW) + pg &= ~PAGE_RW; + if (delete_from_swap_cache(pg)) + pg |= PAGE_DIRTY; + *new_page_table = pg; + *old_page_table = pg; + mem_map[MAP_NR(pg)]++; + } + *new_page_dir = new_pg_table | PAGE_TABLE; + } + invalidate(); + return 0; +} + +/* + * a more complete version of free_page_tables which performs with page + * granularity. 
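+ *
+ * As a hedged sketch (not part of this file; example_split_range is a
+ * made-up name), this is the split of a byte range into per-table runs
+ * that the loop below performs, assuming the usual 4kB pages with
+ * PTRS_PER_PAGE entries per table:
+ */
+
+#if 0	/* illustrative only */
+static void example_split_range(unsigned long from, unsigned long size)
+{
+	unsigned long npages = (size + ~PAGE_MASK) >> PAGE_SHIFT;
+	unsigned long poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1);
+	unsigned long pcnt = PTRS_PER_PAGE - poff; /* room in first table */
+
+	if (pcnt > npages)
+		pcnt = npages;
+	/* the first table handles pcnt pages, the following ones up to
+	   PTRS_PER_PAGE each, exactly as the for-loop below iterates */
+}
+#endif
+
+/*
+ * Note that once a full table's worth of entries has been cleared the
+ * loop below also frees the page table page itself.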
+ */ +int unmap_page_range(unsigned long from, unsigned long size) +{ + unsigned long page, page_dir; + unsigned long *page_table, *dir; + unsigned long poff, pcnt, pc; + + if (from & ~PAGE_MASK) { + printk("unmap_page_range called with wrong alignment\n"); + return -EINVAL; + } + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + dir = PAGE_DIR_OFFSET(current->tss.pg_dir,from); + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + for ( ; size > 0; ++dir, size -= pcnt, + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size)) { + if (!(page_dir = *dir)) { + poff = 0; + continue; + } + if (!(page_dir & PAGE_VALID)) { + printk("unmap_page_range: bad page directory."); + continue; + } + page_table = (unsigned long *)(PAGE_MASK & page_dir); + if (poff) { + page_table += poff; + poff = 0; + } + for (pc = pcnt; pc--; page_table++) { + if ((page = *page_table) != (unsigned long) invalid_pg_table) { + *page_table = (unsigned long) invalid_pg_table; + if (PAGE_VALID & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + } + if (pcnt == PTRS_PER_PAGE) { + *dir = 0; + free_page(PAGE_MASK & page_dir); + } + } + invalidate(); + return 0; +} + +int zeromap_page_range(unsigned long from, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_VALID)) != PAGE_VALID) { + printk("zeromap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + mask |= ZERO_PAGE; + } + if (from & ~PAGE_MASK) { + printk("zeromap_page_range: from = %08lx\n",from); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.pg_dir,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (PAGE_MASK & *dir == (unsigned long) invalid_pg_table) { + /* clear page needed here? SRB. */ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -ENOMEM; + } + if (PAGE_MASK & *dir == (unsigned long) invalid_pg_table) { + free_page((unsigned long) page_table); + page_table = (unsigned long *)(PAGE_MASK & *dir++); + } else + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + page_table += poff; + poff = 0; + for (size -= pcnt; pcnt-- ;) { + if (PAGE_MASK & (page = *page_table) != (unsigned long) invalid_pg_table) { + *page_table = PAGE_TABLE | (unsigned long) invalid_pg_table; + if (page & PAGE_VALID) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + *page_table++ = mask; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * Maps a range of physical memory into the requested pages. the old + * mappings are removed. 
Any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +int remap_page_range(unsigned long from, unsigned long to, unsigned long size, int mask) +{ + unsigned long *page_table, *dir; + unsigned long poff, pcnt; + unsigned long page; + + if (mask) { + if ((mask & (PAGE_MASK|PAGE_VALID)) != PAGE_VALID) { + printk("remap_page_range: mask = %08x\n",mask); + return -EINVAL; + } + } + if ((from & ~PAGE_MASK) || (to & ~PAGE_MASK)) { + printk("remap_page_range: from = %08lx, to=%08lx\n",from,to); + return -EINVAL; + } + dir = PAGE_DIR_OFFSET(current->tss.pg_dir,from); + size = (size + ~PAGE_MASK) >> PAGE_SHIFT; + poff = (from >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if ((pcnt = PTRS_PER_PAGE - poff) > size) + pcnt = size; + + while (size > 0) { + if (PAGE_MASK & *dir != (unsigned long) invalid_pg_table) { + /* clearing page here, needed? SRB. */ + if (!(page_table = (unsigned long*) get_free_page(GFP_KERNEL))) { + invalidate(); + return -1; + } + *dir++ = ((unsigned long) page_table) | PAGE_TABLE; + } + else + page_table = (unsigned long *)(PAGE_MASK & *dir++); + if (poff) { + page_table += poff; + poff = 0; + } + + for (size -= pcnt; pcnt-- ;) { + if (PAGE_MASK & (page = *page_table) != (unsigned long) invalid_pg_table) { + *page_table = PAGE_TABLE | (unsigned long) invalid_pg_table; + if (PAGE_VALID & page) { + if (!(mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)) + if (current->mm->rss > 0) + --current->mm->rss; + free_page(PAGE_MASK & page); + } else + swap_free(page); + } + + /* + * the first condition should return an invalid access + * when the page is referenced. current assumptions + * cause it to be treated as demand allocation in some + * cases. + */ + if (!mask) + *page_table++ = 0; /* not present */ + else if (to >= high_memory) + *page_table++ = (to | mask); + else if (!mem_map[MAP_NR(to)]) + *page_table++ = 0; /* not present */ + else { + *page_table++ = (to | mask); + if (!(mem_map[MAP_NR(to)] & MAP_PAGE_RESERVED)) { + ++current->mm->rss; + mem_map[MAP_NR(to)]++; + } + } + to += PAGE_SIZE; + } + pcnt = (size > PTRS_PER_PAGE ? PTRS_PER_PAGE : size); + } + invalidate(); + return 0; +} + +/* + * This function puts a page in memory at the wanted address. + * It returns the physical address of the page gotten, 0 if + * out of memory (either when trying to access page-table or + * page.) + */ +unsigned long put_page(struct task_struct * tsk,unsigned long page, + unsigned long address,int prot) +{ + unsigned long *page_table; + + if ((prot & (PAGE_MASK|PAGE_VALID)) != PAGE_VALID) + printk("put_page: prot = %08x\n",prot); + if (page >= high_memory) { + printk("put_page: trying to put page %08lx at %08lx\n",page,address); + return 0; + } + page_table = PAGE_DIR_OFFSET(tsk->tss.pg_dir,address); + if ((*page_table) & PAGE_VALID) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + printk("put_page: bad page directory entry\n"); + oom(tsk); + *page_table = BAD_PAGETABLE | PAGE_TABLE; + return 0; + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_page: page already exists\n"); + *page_table = PAGE_TABLE | (unsigned long) invalid_pg_table; + invalidate(); + } + *page_table = page | prot; +/* no need for invalidate */ + return page; +} + +/* + * The previous function doesn't work very well if you also want to mark + * the page dirty: exec.c wants this, as it has earlier changed the page, + * and we want the dirty-status to be correct (for VM). 
Thus the same + * routine, but this time we mark it dirty too. + */ +unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page, unsigned long address) +{ + unsigned long tmp, *page_table; + + if (page >= high_memory) + printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address); + if (mem_map[MAP_NR(page)] != 1) + printk("mem_map disagrees with %08lx at %08lx\n",page,address); + page_table = PAGE_DIR_OFFSET(tsk->tss.pg_dir,address); + if (PAGE_MASK & *page_table == (unsigned long) invalid_pg_table) + page_table = (unsigned long *) (PAGE_MASK & *page_table); + else { + if (!(tmp = get_free_page(GFP_KERNEL))) + return 0; + if (PAGE_MASK & *page_table == (unsigned long) invalid_pg_table) { + free_page(tmp); + page_table = (unsigned long *) (PAGE_MASK & *page_table); + } else { + *page_table = tmp | PAGE_TABLE; + page_table = (unsigned long *) tmp; + } + } + page_table += (address >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + if (*page_table) { + printk("put_dirty_page: page already exists\n"); + *page_table = PAGE_TABLE | (unsigned long) invalid_pg_table; + invalidate(); + } + *page_table = page | (PAGE_DIRTY | PAGE_PRIVATE); +/* no need for invalidate */ + return page; +} + +/* + * Note that processing of the dirty bit has already been done + * before in the assembler part. That way it's not only faster - + * we also can use the i386 code with very little changes. + * + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + */ +void do_wp_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long *pde, pte, old_page, prot; + unsigned long new_page; + + new_page = __get_free_page(GFP_KERNEL); + pde = PAGE_DIR_OFFSET(vma->vm_task->tss.pg_dir,address); + pte = *pde; + if (!(pte & PAGE_VALID)) + goto end_wp_page; + if ((pte & PAGE_TABLE) != PAGE_TABLE || pte >= high_memory) + goto bad_wp_pagetable; + pte &= PAGE_MASK; + pte += PAGE_PTR(address); + old_page = *(unsigned long *) pte; + if (!(old_page & PAGE_VALID)) + goto end_wp_page; + if (old_page >= high_memory) + goto bad_wp_page; + if (old_page & PAGE_RW) + goto end_wp_page; + vma->vm_task->mm->min_flt++; + prot = (old_page & ~PAGE_MASK) | PAGE_RW | PAGE_DIRTY; + old_page &= PAGE_MASK; + if (mem_map[MAP_NR(old_page)] != 1) { + if (new_page) { + if (mem_map[MAP_NR(old_page)] & MAP_PAGE_RESERVED) + ++vma->vm_task->mm->rss; + copy_page(old_page,new_page); + *(unsigned long *) pte = new_page | prot; + free_page(old_page); + invalidate(); + return; + } + free_page(old_page); + oom(vma->vm_task); + *(unsigned long *) pte = BAD_PAGE | prot; + invalidate(); + return; + } + *(unsigned long *) pte |= PAGE_RW | PAGE_DIRTY; + invalidate(); + if (new_page) + free_page(new_page); + return; +bad_wp_page: + printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page); + *(unsigned long *) pte = BAD_PAGE | PAGE_SHARED; + send_sig(SIGKILL, vma->vm_task, 1); + goto end_wp_page; +bad_wp_pagetable: + printk("do_wp_page: bogus page-table at address %08lx (%08lx)\n",address,pte); + *pde = BAD_PAGETABLE | PAGE_TABLE; + send_sig(SIGKILL, vma->vm_task, 1); +end_wp_page: + if (new_page) + free_page(new_page); + return; +} + +/* + * Ugly, ugly, but the goto's result in better assembly.. 
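+ *
+ * Typical use (a hedged sketch; example_fetch is a made-up name): check
+ * the user buffer once with verify_area(), then read it with the
+ * unchecked get_fs_*() accessors, just as sys_mmap() does in mmap.c:
+ */
+
+#if 0	/* illustrative only */
+static int example_fetch(unsigned long *ubuf, unsigned long *out)
+{
+	int error;
+
+	error = verify_area(VERIFY_READ, ubuf, sizeof(*ubuf));
+	if (error)
+		return error;
+	*out = get_fs_long(ubuf);
+	return 0;
+}
+#endif
+
+/*
+ * On CPUs whose WP bit is not honoured in supervisor mode the write
+ * check cannot be left to the hardware; the check_wp_fault_by_hand
+ * path below simulates the fault for each page instead.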
+ */ +int verify_area(int type, const void * addr, unsigned long size) +{ + struct vm_area_struct * vma; + unsigned long start = (unsigned long) addr; + + /* If the current user space is mapped to kernel space (for the + * case where we use a fake user buffer with get_fs/set_fs()) we + * don't expect to find the address in the user vm map. + */ + if (get_fs() == get_ds()) + return 0; + + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > start) + break; + } + if (vma->vm_start <= start) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - start > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + +good_area: + if (!wp_works_ok && type == VERIFY_WRITE) + goto check_wp_fault_by_hand; + for (;;) { + struct vm_area_struct * next; + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (type != VERIFY_READ && !(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + if (vma->vm_end - start >= size) + return 0; + next = vma->vm_next; + if (!next || vma->vm_end != next->vm_start) + goto bad_area; + vma = next; + } + +check_wp_fault_by_hand: + size--; + size += start & ~PAGE_MASK; + size >>= PAGE_SHIFT; + start &= PAGE_MASK; + + for (;;) { + if (!(vma->vm_page_prot & (PAGE_COW | PAGE_RW))) + goto bad_area; + do_wp_page(vma, start, PAGE_VALID); + if (!size) + return 0; + size--; + start += PAGE_SIZE; + if (start < vma->vm_end) + continue; + vma = vma->vm_next; + if (!vma || vma->vm_start != start) + break; + } + +bad_area: + return -EFAULT; +} + +static inline void get_empty_page(struct task_struct * tsk, unsigned long address) +{ + unsigned long tmp; + + if (!(tmp = get_free_page(GFP_KERNEL))) { + oom(tsk); + tmp = BAD_PAGE; + } + if (!put_page(tsk,tmp,address,PAGE_PRIVATE)) + free_page(tmp); +} + +/* + * try_to_share() checks the page at address "address" in the task "p", + * to see if it exists, and if it is clean. If so, share it with the current + * task. + * + * NOTE! This assumes we have checked that p != current, and that they + * share the same inode and can generally otherwise be shared. + */ +static int try_to_share(unsigned long to_address, struct vm_area_struct * to_area, + unsigned long from_address, struct vm_area_struct * from_area, + unsigned long newpage) +{ + unsigned long from; + unsigned long to; + unsigned long from_page; + unsigned long to_page; + + from_page = (unsigned long)PAGE_DIR_OFFSET(from_area->vm_task->tss.pg_dir,from_address); + to_page = (unsigned long)PAGE_DIR_OFFSET(to_area->vm_task->tss.pg_dir,to_address); +/* is there a page-directory at from? */ + from = *(unsigned long *) from_page; + if (from & PAGE_MASK == (unsigned long) invalid_pg_table) + return 0; + from &= PAGE_MASK; + from_page = from + PAGE_PTR(from_address); + from = *(unsigned long *) from_page; +/* is the page present? */ + if (!(from & PAGE_VALID)) + return 0; +/* if it is private, it must be clean to be shared */ + if (from & PAGE_DIRTY) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } +/* is the page reasonable at all? */ + if (from >= high_memory) + return 0; + if (mem_map[MAP_NR(from)] & MAP_PAGE_RESERVED) + return 0; +/* is the destination ok? */ + to = *(unsigned long *) to_page; + if (to & PAGE_MASK == (unsigned long) invalid_pg_table) + return 0; + to &= PAGE_MASK; + to_page = to + PAGE_PTR(to_address); + if (*(unsigned long *) to_page) + return 0; +/* do we copy? 
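+   (Two outcomes, depending on what the caller passed in: given a fresh
+   page we copy into it and map the copy; given none we bump mem_map[]
+   on the source page and, for COW areas, strip write permission from
+   both mappings so that the next store faults and copies.)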
*/ + if (newpage) { + if (in_swap_cache(from)) { /* implies PAGE_DIRTY */ + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + } + copy_page((from & PAGE_MASK), newpage); + *(unsigned long *) to_page = newpage | to_area->vm_page_prot; + return 1; + } +/* do a final swap-cache test before sharing them.. */ + if (in_swap_cache(from)) { + if (from_area->vm_page_prot & PAGE_COW) + return 0; + if (!(from_area->vm_page_prot & PAGE_RW)) + return 0; + from |= PAGE_DIRTY; + *(unsigned long *) from_page = from; + delete_from_swap_cache(from); + invalidate(); + } + mem_map[MAP_NR(from)]++; +/* fill in the 'to' field, checking for COW-stuff */ + to = (from & (PAGE_MASK | PAGE_DIRTY)) | to_area->vm_page_prot; + if (to & PAGE_COW) + to &= ~PAGE_RW; + *(unsigned long *) to_page = to; +/* Check if we need to do anything at all to the 'from' field */ + if (!(from & PAGE_RW)) + return 1; + if (!(from_area->vm_page_prot & PAGE_COW)) + return 1; +/* ok, need to mark it read-only, so invalidate any possible old TB entry */ + from &= ~PAGE_RW; + *(unsigned long *) from_page = from; + invalidate(); + return 1; +} + +/* + * share_page() tries to find a process that could share a page with + * the current one. + * + * We first check if it is at all feasible by checking inode->i_count. + * It should be >1 if there are other tasks sharing this inode. + */ +static int share_page(struct vm_area_struct * area, unsigned long address, + unsigned long error_code, unsigned long newpage) +{ + struct inode * inode; + struct task_struct ** p; + unsigned long offset; + unsigned long from_address; + unsigned long give_page; + + if (!area || !(inode = area->vm_inode) || inode->i_count < 2) + return 0; + /* do we need to copy or can we just share? */ + give_page = 0; + if ((area->vm_page_prot & PAGE_COW) && (error_code & PAGE_RW)) { + if (!newpage) + return 0; + give_page = newpage; + } + offset = address - area->vm_start + area->vm_offset; + for (p = &LAST_TASK ; p > &FIRST_TASK ; --p) { + struct vm_area_struct * mpnt; + if (!*p) + continue; + if (area->vm_task == *p) + continue; + /* Now see if there is something in the VMM that + we can share pages with */ + for (mpnt = (*p)->mm->mmap; mpnt; mpnt = mpnt->vm_next) { + /* must be same inode */ + if (mpnt->vm_inode != inode) + continue; + /* offsets must be mutually page-aligned */ + if ((mpnt->vm_offset ^ area->vm_offset) & ~PAGE_MASK) + continue; + /* the other area must actually cover the wanted page.. */ + from_address = offset + mpnt->vm_start - mpnt->vm_offset; + if (from_address < mpnt->vm_start || from_address >= mpnt->vm_end) + continue; + /* .. NOW we can actually try to use the same physical page */ + if (!try_to_share(address, area, from_address, mpnt, give_page)) + continue; + /* free newpage if we never used it.. */ + if (give_page || !newpage) + return 1; + free_page(newpage); + return 1; + } + } + return 0; +} + +/* + * fill in an empty page-table if none exists. 
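+ *
+ * Note the check-allocate-recheck pattern below: get_free_page() with
+ * GFP_KERNEL may sleep, and another fault can install a table for the
+ * same address in the meantime, so the directory entry is examined
+ * again after the allocation and the fresh page is discarded if this
+ * invocation lost the race.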
+ */ +static inline unsigned long get_empty_pgtable(struct task_struct * tsk,unsigned long address) +{ + unsigned long page; + unsigned long *p; + + p = PAGE_DIR_OFFSET(tsk->tss.pg_dir,address); + if (*p & PAGE_MASK == (unsigned long) invalid_pg_table) + return *p; + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + page = get_free_page(GFP_KERNEL); + p = PAGE_DIR_OFFSET(tsk->tss.pg_dir,address); + if (*p & PAGE_MASK == (unsigned long) invalid_pg_table) { + free_page(page); + return *p; + } + if (*p) { + printk("get_empty_pgtable: bad page-directory entry \n"); + *p = 0; + } + if (page) { + *p = page | PAGE_TABLE; + return *p; + } + oom(current); + *p = BAD_PAGETABLE | PAGE_TABLE; + return 0; +} + +static inline void do_swap_page(struct vm_area_struct * vma, + unsigned long address, unsigned long * pge, unsigned long entry) +{ + unsigned long page; + + if (vma->vm_ops && vma->vm_ops->swapin) + page = vma->vm_ops->swapin(vma, entry); + else + page = swap_in(entry); + if (*pge != entry) { + free_page(page); + return; + } + page = page | vma->vm_page_prot; + if (mem_map[MAP_NR(page)] > 1 && (page & PAGE_COW)) + page &= ~PAGE_RW; + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->maj_flt; + *pge = page; + return; +} + +void do_no_page(struct vm_area_struct * vma, unsigned long address, + unsigned long error_code) +{ + unsigned long page, entry, prot; + + page = get_empty_pgtable(vma->vm_task,address); + if (!page) + return; + page &= PAGE_MASK; + page += PAGE_PTR(address); + entry = *(unsigned long *) page; + if (entry & PAGE_VALID) + return; + if (entry) { + do_swap_page(vma, address, (unsigned long *) page, entry); + return; + } + address &= PAGE_MASK; + + if (!vma->vm_ops || !vma->vm_ops->nopage) { + ++vma->vm_task->mm->rss; + ++vma->vm_task->mm->min_flt; + get_empty_page(vma->vm_task,address); + return; + } + page = get_free_page(GFP_KERNEL); + if (share_page(vma, address, error_code, page)) { + ++vma->vm_task->mm->min_flt; + ++vma->vm_task->mm->rss; + return; + } + if (!page) { + oom(current); + put_page(vma->vm_task, BAD_PAGE, address, PAGE_PRIVATE); + return; + } + ++vma->vm_task->mm->maj_flt; + ++vma->vm_task->mm->rss; + prot = vma->vm_page_prot; + /* + * The fourth argument is "no_share", which tells the low-level code + * to copy, not share the page even if sharing is possible. It's + * essentially an early COW detection ("moo at 5 AM"). + */ + page = vma->vm_ops->nopage(vma, address, page, (error_code & PAGE_RW) && (prot & PAGE_COW)); + if (share_page(vma, address, error_code, 0)) { + free_page(page); + return; + } + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. + */ + if (error_code & PAGE_RW) { + prot |= PAGE_DIRTY; /* can't be COW-shared: see "no_share" above */ + } else if ((prot & PAGE_COW) && mem_map[MAP_NR(page)] > 1) + prot &= ~PAGE_RW; + if (put_page(vma->vm_task, page, address, prot)) + return; + free_page(page); + oom(current); +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. 
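+ *
+ * A hedged sketch of the dispatch (illustrative only; example_dispatch
+ * is a made-up name): a fault on a page that is *present* can only be
+ * a write-protection (COW) fault, anything else means the page is
+ * swapped out or was never there:
+ */
+
+#if 0	/* illustrative only */
+static void example_dispatch(struct vm_area_struct *vma,
+	unsigned long address, unsigned long error_code)
+{
+	if (error_code & PAGE_VALID)
+		do_wp_page(vma, address, error_code);	/* present: COW */
+	else
+		do_no_page(vma, address, error_code);	/* missing/swapped */
+}
+#endif
+
+/*
+ * On MIPS the faulting address is not passed in by the caller; it is
+ * read below straight from CP0 BadVAddr (register $8) with dmfc0.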
+ */ +asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct vm_area_struct * vma; + unsigned long address; + unsigned long page; + + /* get the address */ + __asm__("dmfc0\t%0,$8":"=r" (address)); + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + goto bad_area; + if (vma->vm_end > address) + break; + } + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur) + goto bad_area; + vma->vm_offset -= vma->vm_start - (address & PAGE_MASK); + vma->vm_start = (address & PAGE_MASK); +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: +#if 0 + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + current->screen_bitmap |= 1 << bit; + } +#endif + if (!(vma->vm_page_prot & PAGE_USER)) + goto bad_area; + if (error_code & PAGE_VALID) { + if (!(vma->vm_page_prot & (PAGE_RW | PAGE_COW))) + goto bad_area; + do_wp_page(vma, address, error_code); + return; + } + do_no_page(vma, address, error_code); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + if (error_code & PAGE_USER) { + current->tss.cp0_badvaddr = address; + current->tss.error_code = error_code; + current->tss.trap_no = 14; + send_sig(SIGSEGV, current, 1); + return; + } +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (wp_works_ok < 0 && address == TASK_SIZE && (error_code & PAGE_VALID)) { + wp_works_ok = 1; + pg0[0] = PAGE_SHARED; + printk("This processor honours the WP bit even when in supervisor mode. Good.\n"); + return; + } + if ((unsigned long) (address-TASK_SIZE) < PAGE_SIZE) { + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + pg0[0] = PAGE_SHARED; + } else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT "current->tss.pg_dir = %08lx\n", current->tss.pg_dir); + page = ((unsigned long *) page)[address >> 22]; + printk(KERN_ALERT "*pde = %08lx\n", page); + if (page & PAGE_VALID) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = ((unsigned long *) page)[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = %08lx\n", page); + } + die_if_kernel("Oops", regs, error_code); + do_exit(SIGKILL); +} + +/* + * BAD_PAGE is the page that is used for page faults when linux + * is out-of-memory. Older versions of linux just did a + * do_exit(), but using this instead means there is less risk + * for a process dying in kernel mode, possibly leaving a inode + * unused etc.. + * + * BAD_PAGETABLE is the accompanying page-table: it is initialized + * to point to BAD_PAGE entries. + * + * ZERO_PAGE is a special page that is used for zero-initialized + * data and COW. 
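+ *
+ * The three helpers below fill a page with a constant in assembler; a
+ * plain C rendering of the intent (illustrative only; example_fill is
+ * a made-up name) is simply:
+ */
+
+#if 0	/* illustrative only */
+static unsigned long example_fill(unsigned long *page, unsigned long val)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PAGE; i++)
+		page[i] = val;
+	return (unsigned long) page;
+}
+#endif
+
+/*
+ * The assembler versions differ only in the buffer filled and the value
+ * stored: BAD_PAGE + PAGE_TABLE for the bad page table, zero for the
+ * bad page and the zero page.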
+ */ +unsigned long __bad_pagetable(void) +{ + extern char empty_bad_page_table[PAGE_SIZE]; + unsigned long dummy; + + __asm__ __volatile__( + ".set\tnoreorder\n\t" + "1:\tsw\t%2,(%0)\n\t" + "subu\t%1,%1,1\n\t" + "bne\t$0,%1,1b\n\t" + "addiu\t%0,%0,1\n\t" + ".set\treorder" + :"=r" (dummy), + "=r" (dummy) + :"r" (BAD_PAGE + PAGE_TABLE), + "0" ((long) empty_bad_page_table), + "1" (PTRS_PER_PAGE)); + + return (unsigned long) empty_bad_page_table; +} + +unsigned long __bad_page(void) +{ + extern char empty_bad_page[PAGE_SIZE]; + unsigned long dummy; + + __asm__ __volatile__( + ".set\tnoreorder\n\t" + "1:\tsw\t$0,(%0)\n\t" + "subu\t%1,%1,1\n\t" + "bne\t$0,%1,1b\n\t" + "addiu\t%0,%0,1\n\t" + ".set\treorder" + :"=r" (dummy), + "=r" (dummy) + :"0" ((long) empty_bad_page), + "1" (PTRS_PER_PAGE)); + + return (unsigned long) empty_bad_page; +} + +unsigned long __zero_page(void) +{ + extern char empty_zero_page[PAGE_SIZE]; + unsigned long dummy; + + __asm__ __volatile__( + ".set\tnoreorder\n\t" + "1:\tsw\t$0,(%0)\n\t" + "subu\t%1,%1,1\n\t" + "bne\t$0,%1,1b\n\t" + "addiu\t%0,%0,1\n\t" + ".set\treorder" + :"=r" (dummy), + "=r" (dummy) + :"0" ((long) empty_zero_page), + "1" (PTRS_PER_PAGE)); + + return (unsigned long) empty_zero_page; +} + +void show_mem(void) +{ + int i,free = 0,total = 0,reserved = 0; + int shared = 0; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); + i = high_memory >> PAGE_SHIFT; + while (i-- > 0) { + total++; + if (mem_map[i] & MAP_PAGE_RESERVED) + reserved++; + else if (!mem_map[i]) + free++; + else + shared += mem_map[i]-1; + } + printk("%d pages of RAM\n",total); + printk("%d free pages\n",free); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + show_buffers(); +#ifdef CONFIG_NET + show_net_buffers(); +#endif +} + +#if 0 +extern unsigned long free_area_init(unsigned long, unsigned long); + +/* + * paging_init() sets up the page tables - note that the first 4MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. 
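+ *
+ * (Worked example of the directory index used below: a directory slot
+ * covers 4MB with 4kB pages and 1024-entry tables, so the slot for a
+ * virtual address is address >> 22, and 0xC0000000 >> 22 = 768, which
+ * is why the kernel mapping is entered at pg_dir + 768.)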
+ */ +unsigned long paging_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned long * pg_dir; + unsigned long * pg_table; + unsigned long tmp; + unsigned long address; + + start_mem = PAGE_ALIGN(start_mem); + address = 0; + pg_dir = swapper_pg_dir; + while (address < end_mem) { + /* + * at virtual addr 0xC0000000 + */ + tmp = *(pg_dir + 768); + tmp &= PAGE_MASK; + if (!tmp) { + tmp = start_mem | PAGE_TABLE; + *(pg_dir + 768) = tmp; + start_mem += PAGE_SIZE; + } + /* + * also map it in at 0x0000000 for init + */ + *pg_dir = tmp; + pg_dir++; + pg_table = (unsigned long *) (tmp & PAGE_MASK); + for (tmp = 0 ; tmp < PTRS_PER_PAGE ; tmp++,pg_table++) { + if (address < end_mem) + *pg_table = address | PAGE_SHARED; + else + *pg_table = 0; + address += PAGE_SIZE; + } + } + invalidate(); + return free_area_init(start_mem, end_mem); +} +#endif + +void mem_init(unsigned long start_mem, unsigned long end_mem) +{ + int codepages = 0; + int reservedpages = 0; + int datapages = 0; + unsigned long tmp; + extern int etext; + + cli(); + end_mem &= PAGE_MASK; + high_memory = end_mem; + + /* mark usable pages in the mem_map[] */ + start_mem = PAGE_ALIGN(start_mem); + + while (start_mem < high_memory) { + mem_map[MAP_NR(start_mem)] = 0; + start_mem += PAGE_SIZE; + } +#ifdef CONFIG_SOUND + sound_mem_init(); +#endif + for (tmp = 0 ; tmp < high_memory ; tmp += PAGE_SIZE) { + if (mem_map[MAP_NR(tmp)]) { + /* + * Don't have any reserved pages + */ + if (0) + reservedpages++; + else if (tmp < (0x7fffffff & (unsigned long) &etext)) + codepages++; + else + datapages++; + continue; + } + mem_map[MAP_NR(tmp)] = 1; + free_page(tmp); + } + tmp = nr_free_pages << PAGE_SHIFT; + printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data)\n", + tmp >> 10, + high_memory >> 10, + codepages << (PAGE_SHIFT-10), + reservedpages << (PAGE_SHIFT-10), + datapages << (PAGE_SHIFT-10)); + +#if 0 + /* + * No need to cope with Intel bugs + */ + wp_works_ok = 1; +#endif + invalidate(); + return; +} + +void si_meminfo(struct sysinfo *val) +{ + int i; + + i = high_memory >> PAGE_SHIFT; + val->totalram = 0; + val->sharedram = 0; + val->freeram = nr_free_pages << PAGE_SHIFT; + val->bufferram = buffermem; + while (i-- > 0) { + if (mem_map[i] & MAP_PAGE_RESERVED) + continue; + val->totalram++; + if (!mem_map[i]) + continue; + val->sharedram += mem_map[i]-1; + } + val->totalram <<= PAGE_SHIFT; + val->sharedram <<= PAGE_SHIFT; + return; +} + + +/* + * This handles a generic mmap of a disk file. + */ +static unsigned long file_mmap_nopage(struct vm_area_struct * area, unsigned long address, + unsigned long page, int no_share) +{ + struct inode * inode = area->vm_inode; + unsigned int block; + int nr[8]; + int i, *p; + + address &= PAGE_MASK; + block = address - area->vm_start + area->vm_offset; + block >>= inode->i_sb->s_blocksize_bits; + i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; + p = nr; + do { + *p = bmap(inode,block); + i--; + block++; + p++; + } while (i > 0); + return bread_page(page, inode->i_dev, nr, inode->i_sb->s_blocksize, no_share); +} + +struct vm_operations_struct file_mmap = { + NULL, /* open */ + NULL, /* close */ + file_mmap_nopage, /* nopage */ + NULL, /* wppage */ + NULL, /* share */ + NULL, /* unmap */ +}; diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c new file mode 100644 index 000000000..dbe63b55e --- /dev/null +++ b/arch/mips/mm/mmap.c @@ -0,0 +1,470 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. 
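+ *
+ * A hedged usage sketch (not part of this file; example_anon_map is a
+ * made-up name): an anonymous, private, read-write mapping placed
+ * wherever get_unmapped_area() finds room would be set up with:
+ */
+
+#if 0	/* illustrative only */
+static int example_anon_map(unsigned long len)
+{
+	return do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE, 0);
+}
+#endif
+
+/*
+ * The table in the comment further down spells out how each PROT_* and
+ * MAP_* combination is actually honoured by the page protection bits.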
+ */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +static int anon_map(struct inode *, struct file *, struct vm_area_struct *); + +/* + * description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ + +int do_mmap(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long off) +{ + int mask, error; + struct vm_area_struct * vma; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (addr > TASK_SIZE || len > TASK_SIZE || addr > TASK_SIZE-len) + return -EINVAL; + + /* offset overflow? */ + if (off + len < off) + return -EINVAL; + + /* + * do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + + if (file != NULL) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & 2)) + return -EACCES; + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & 1)) + return -EACCES; + break; + + default: + return -EINVAL; + } + if ((flags & MAP_DENYWRITE) && (file->f_inode->i_wcount > 0)) + return -ETXTBSY; + } else if ((flags & MAP_TYPE) == MAP_SHARED) + return -EINVAL; + + /* + * obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + + if (flags & MAP_FIXED) { + if (addr & ~PAGE_MASK) + return -EINVAL; + if (len > TASK_SIZE || addr > TASK_SIZE - len) + return -EINVAL; + } else { + addr = get_unmapped_area(len); + if (!addr) + return -ENOMEM; + } + + /* + * determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. 
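+ *
+ * (Worked example of the mask computed below: a MAP_PRIVATE mapping
+ * with PROT_READ|PROT_WRITE gets PAGE_VALID|PAGE_READONLY|PAGE_COPY:
+ * readable at once, while the first store takes a copy-on-write fault.)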
+ */ + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + mask = PAGE_VALID; + if (prot & (PROT_READ | PROT_EXEC)) + mask |= PAGE_READONLY; + if (prot & PROT_WRITE) + if ((flags & MAP_TYPE) == MAP_PRIVATE) + mask |= PAGE_COPY; + else + mask |= PAGE_SHARED; + + vma = (struct vm_area_struct *)kmalloc(sizeof(struct vm_area_struct), + GFP_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_task = current; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_page_prot = mask; + vma->vm_flags = prot & (VM_READ | VM_WRITE | VM_EXEC); + vma->vm_flags |= flags & (VM_GROWSDOWN | VM_DENYWRITE | VM_EXECUTABLE); + + if (file) { + if (file->f_mode & 1) + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_SHARED) { + vma->vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & 2)) + vma->vm_flags &= ~VM_MAYWRITE; + } + } else + vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_ops = NULL; + vma->vm_offset = off; + vma->vm_inode = NULL; + vma->vm_pte = 0; + + do_munmap(addr, len); /* Clear old maps */ + + if (file) + error = file->f_op->mmap(file->f_inode, file, vma); + else + error = anon_map(NULL, NULL, vma); + + if (error) { + kfree(vma); + return error; + } + insert_vm_struct(current, vma); + merge_segments(current->mm->mmap); + return addr; +} + +/* + * Get an address range which is currently unmapped. + * For mmap() without MAP_FIXED and shmat() with addr=0. + * Return value 0 means ENOMEM. + */ +unsigned long get_unmapped_area(unsigned long len) +{ + struct vm_area_struct * vmm; + unsigned long gap_start = 0, gap_end; + + for (vmm = current->mm->mmap; ; vmm = vmm->vm_next) { + if (gap_start < SHM_RANGE_START) + gap_start = SHM_RANGE_START; + if (!vmm || ((gap_end = vmm->vm_start) > SHM_RANGE_END)) + gap_end = SHM_RANGE_END; + gap_start = PAGE_ALIGN(gap_start); + gap_end &= PAGE_MASK; + if ((gap_start <= gap_end) && (gap_end - gap_start >= len)) + return gap_start; + if (!vmm) + return 0; + gap_start = vmm->vm_end; + } +} + +asmlinkage int sys_mmap(unsigned long *buffer) +{ + int error; + unsigned long flags; + struct file * file = NULL; + + error = verify_area(VERIFY_READ, buffer, 6*sizeof(long)); + if (error) + return error; + flags = get_fs_long(buffer+3); + if (!(flags & MAP_ANONYMOUS)) { + unsigned long fd = get_fs_long(buffer+4); + if (fd >= NR_OPEN || !(file = current->files->fd[fd])) + return -EBADF; + } + return do_mmap(file, get_fs_long(buffer), get_fs_long(buffer+1), + get_fs_long(buffer+2), flags, get_fs_long(buffer+5)); +} + +/* + * Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. 
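+ *
+ * A hedged sketch of that classification (illustrative only;
+ * example_classify is a made-up name), assuming the range has already
+ * been clipped to the area as do_munmap() below guarantees:
+ */
+
+#if 0	/* illustrative only */
+static int example_classify(struct vm_area_struct *area,
+	unsigned long addr, unsigned long end)
+{
+	if (addr == area->vm_start && end == area->vm_end)
+		return 1;	/* whole area unmapped */
+	if (addr == area->vm_start)
+		return 2;	/* cut from the start */
+	if (end == area->vm_end)
+		return 3;	/* cut from the end */
+	return 4;		/* hole: two areas survive */
+}
+#endif
+
+/*
+ * The fixup below adjusts vm_offset as well as the boundaries, so file
+ * mappings keep pointing at the right part of the file.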
+ */ +void unmap_fixup(struct vm_area_struct *area, + unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + if (addr < area->vm_start || addr >= area->vm_end || + end <= area->vm_start || end > area->vm_end || + end < addr) + { + printk("unmap_fixup: area=%lx-%lx, unmap %lx-%lx!!\n", + area->vm_start, area->vm_end, addr, end); + return; + } + + /* Unmapping the whole area */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_inode) + iput(area->vm_inode); + return; + } + + /* Work out to one of the ends */ + if (addr >= area->vm_start && end == area->vm_end) + area->vm_end = addr; + if (addr == area->vm_start && end <= area->vm_end) { + area->vm_offset += (end - area->vm_start); + area->vm_start = end; + } + + /* Unmapping a hole */ + if (addr > area->vm_start && end < area->vm_end) + { + /* Add end mapping -- leave beginning for below */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + + if (!mpnt) + return; + *mpnt = *area; + mpnt->vm_offset += (end - area->vm_start); + mpnt->vm_start = end; + if (mpnt->vm_inode) + mpnt->vm_inode->i_count++; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + insert_vm_struct(current, mpnt); + } + + /* construct whatever mapping is needed */ + mpnt = (struct vm_area_struct *)kmalloc(sizeof(*mpnt), GFP_KERNEL); + if (!mpnt) + return; + *mpnt = *area; + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + if (area->vm_ops && area->vm_ops->close) { + area->vm_end = area->vm_start; + area->vm_ops->close(area); + } + insert_vm_struct(current, mpnt); +} + +asmlinkage int sys_munmap(unsigned long addr, size_t len) +{ + return do_munmap(addr, len); +} + +/* + * Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine <jeremy@sw.oz.au> + */ +int do_munmap(unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, **npp, *free; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return 0; + + /* + * Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + npp = ¤t->mm->mmap; + free = NULL; + for (mpnt = *npp; mpnt != NULL; mpnt = *npp) { + unsigned long end = addr+len; + + if ((addr < mpnt->vm_start && end <= mpnt->vm_start) || + (addr >= mpnt->vm_end && end > mpnt->vm_end)) + { + npp = &mpnt->vm_next; + continue; + } + + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + } + + if (free == NULL) + return 0; + + /* + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. + * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + */ + while (free) { + unsigned long st, end; + + mpnt = free; + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? 
mpnt->vm_end : end; + + if (mpnt->vm_ops && mpnt->vm_ops->unmap) + mpnt->vm_ops->unmap(mpnt, st, end-st); + else + unmap_fixup(mpnt, st, end-st); + + kfree(mpnt); + } + + unmap_page_range(addr, len); + return 0; +} + +/* This is used for a general mmap of a disk file */ +int generic_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma) +{ + extern struct vm_operations_struct file_mmap; + + if (vma->vm_page_prot & PAGE_RW) /* only PAGE_COW or read-only supported right now */ + return -EINVAL; + if (vma->vm_offset & (inode->i_sb->s_blocksize - 1)) + return -EINVAL; + if (!inode->i_sb || !S_ISREG(inode->i_mode)) + return -EACCES; + if (!inode->i_op || !inode->i_op->bmap) + return -ENOEXEC; + if (!IS_RDONLY(inode)) { + inode->i_atime = CURRENT_TIME; + inode->i_dirt = 1; + } + vma->vm_inode = inode; + inode->i_count++; + vma->vm_ops = &file_mmap; + return 0; +} + +/* + * Insert vm structure into process list sorted by address. + */ +void insert_vm_struct(struct task_struct *t, struct vm_area_struct *vmp) +{ + struct vm_area_struct **p, *mpnt; + + p = &t->mm->mmap; + while ((mpnt = *p) != NULL) { + if (mpnt->vm_start > vmp->vm_start) + break; + if (mpnt->vm_end > vmp->vm_start) + printk("insert_vm_struct: overlapping memory areas\n"); + p = &mpnt->vm_next; + } + vmp->vm_next = mpnt; + *p = vmp; +} + +/* + * Merge a list of memory segments if possible. + * Redundant vm_area_structs are freed. + * This assumes that the list is ordered by address. + */ +void merge_segments(struct vm_area_struct *mpnt) +{ + struct vm_area_struct *prev, *next; + + if (mpnt == NULL) + return; + + for(prev = mpnt, mpnt = mpnt->vm_next; + mpnt != NULL; + prev = mpnt, mpnt = next) + { + next = mpnt->vm_next; + + /* + * To share, we must have the same inode, operations.. + */ + if (mpnt->vm_inode != prev->vm_inode) + continue; + if (mpnt->vm_pte != prev->vm_pte) + continue; + if (mpnt->vm_ops != prev->vm_ops) + continue; + if (mpnt->vm_page_prot != prev->vm_page_prot || + mpnt->vm_flags != prev->vm_flags) + continue; + if (prev->vm_end != mpnt->vm_start) + continue; + /* + * and if we have an inode, the offsets must be contiguous.. + */ + if ((mpnt->vm_inode != NULL) || (mpnt->vm_flags & VM_SHM)) { + if (prev->vm_offset + prev->vm_end - prev->vm_start != mpnt->vm_offset) + continue; + } + + /* + * merge prev with mpnt and set up pointers so the new + * big segment can possibly merge with the next one. + * The old unused mpnt is freed. + */ + prev->vm_end = mpnt->vm_end; + prev->vm_next = mpnt->vm_next; + if (mpnt->vm_ops && mpnt->vm_ops->close) { + mpnt->vm_offset += mpnt->vm_end - mpnt->vm_start; + mpnt->vm_start = mpnt->vm_end; + mpnt->vm_ops->close(mpnt); + } + if (mpnt->vm_inode) + mpnt->vm_inode->i_count--; + kfree_s(mpnt, sizeof(*mpnt)); + mpnt = prev; + } +} + +/* + * Map memory not associated with any file into a process + * address space. Adjacent memory is merged. 
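+ *
+ * (No physical memory is allocated up front: for accessible mappings
+ * zeromap_page_range() points every entry in the range at the shared
+ * zero page, so a real page is only spent once a first write faults
+ * and do_wp_page() copies it.)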
+ */ +static int anon_map(struct inode *ino, struct file * file, struct vm_area_struct * vma) +{ + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -ENOMEM; + return 0; +} diff --git a/arch/mips/mm/mprotect.c b/arch/mips/mm/mprotect.c new file mode 100644 index 000000000..7bb4148f4 --- /dev/null +++ b/arch/mips/mm/mprotect.c @@ -0,0 +1,230 @@ +/* + * linux/mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + */ +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/shm.h> +#include <linux/errno.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <linux/malloc.h> + +#include <asm/segment.h> +#include <asm/system.h> + +#define CHG_MASK (PAGE_MASK | PAGE_ACCESSED | PAGE_DIRTY | CACHE_UNCACHED) + +static void change_protection(unsigned long start, unsigned long end, int prot) +{ + unsigned long *page_table, *dir; + unsigned long page, offset; + int nr; + + dir = PAGE_DIR_OFFSET(current->tss.pg_dir, start); + offset = (start >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + nr = (end - start) >> PAGE_SHIFT; + while (nr > 0) { + page = *dir; + dir++; + if (!(page & PAGE_VALID)) { + nr = nr - PTRS_PER_PAGE + offset; + offset = 0; + continue; + } + page_table = offset + (unsigned long *) (page & PAGE_MASK); + offset = PTRS_PER_PAGE - offset; + if (offset > nr) + offset = nr; + nr = nr - offset; + do { + page = *page_table; + if (page & PAGE_VALID) + *page_table = (page & CHG_MASK) | prot; + ++page_table; + } while (--offset); + } + return; +} + +static inline int mprotect_fixup_all(struct vm_area_struct * vma, + int newflags, int prot) +{ + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + return 0; +} + +static inline int mprotect_fixup_start(struct vm_area_struct * vma, + unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_start = end; + n->vm_end = end; + vma->vm_offset += vma->vm_start - n->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_end(struct vm_area_struct * vma, + unsigned long start, + int newflags, int prot) +{ + struct vm_area_struct * n; + + n = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!n) + return -ENOMEM; + *n = *vma; + vma->vm_end = start; + n->vm_start = start; + n->vm_offset += n->vm_start - vma->vm_start; + n->vm_flags = newflags; + n->vm_page_prot = prot; + if (n->vm_inode) + n->vm_inode->i_count++; + if (n->vm_ops && n->vm_ops->open) + n->vm_ops->open(n); + insert_vm_struct(current, n); + return 0; +} + +static inline int mprotect_fixup_middle(struct vm_area_struct * vma, + unsigned long start, unsigned long end, + int newflags, int prot) +{ + struct vm_area_struct * left, * right; + + left = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!left) + return -ENOMEM; + right = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!right) { + kfree(left); + return -ENOMEM; + } + *left = *vma; + *right = *vma; + left->vm_end = start; + vma->vm_start = start; + vma->vm_end = end; + right->vm_start = end; + vma->vm_offset += vma->vm_start - left->vm_start; + right->vm_offset += right->vm_start - 
left->vm_start; + vma->vm_flags = newflags; + vma->vm_page_prot = prot; + if (vma->vm_inode) + vma->vm_inode->i_count += 2; + if (vma->vm_ops && vma->vm_ops->open) { + vma->vm_ops->open(left); + vma->vm_ops->open(right); + } + insert_vm_struct(current, left); + insert_vm_struct(current, right); + return 0; +} + +static int mprotect_fixup(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned int newflags) +{ + int prot, error; + + if (newflags == vma->vm_flags) + return 0; + prot = PAGE_VALID; + if (newflags & (VM_READ | VM_EXEC)) + prot |= PAGE_READONLY; + if (newflags & VM_WRITE) + if (newflags & VM_SHARED) + prot |= PAGE_SHARED; + else + prot |= PAGE_COPY; + + if (start == vma->vm_start) + if (end == vma->vm_end) + error = mprotect_fixup_all(vma, newflags, prot); + else + error = mprotect_fixup_start(vma, end, newflags, prot); + else if (end == vma->vm_end) + error = mprotect_fixup_end(vma, start, newflags, prot); + else + error = mprotect_fixup_middle(vma, start, end, newflags, prot); + + if (error) + return error; + + change_protection(start, end, prot); + return 0; +} + +asmlinkage int sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long end, tmp; + struct vm_area_struct * vma, * next; + int error; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) + return -EINVAL; + if (end == start) + return 0; + for (vma = current->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + return -EFAULT; + if (vma->vm_end > start) + break; + } + if (vma->vm_start > start) + return -EFAULT; + + for ( ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= start < vma->vm_end. */ + + newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + break; + } + + if (vma->vm_end >= end) { + error = mprotect_fixup(vma, start, end, newflags); + break; + } + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, start, tmp, newflags); + if (error) + break; + start = tmp; + vma = next; + if (!vma || vma->vm_start != start) { + error = -EFAULT; + break; + } + } + merge_segments(current->mm->mmap); + return error; +} diff --git a/arch/mips/mm/swap.c b/arch/mips/mm/swap.c new file mode 100644 index 000000000..084208c04 --- /dev/null +++ b/arch/mips/mm/swap.c @@ -0,0 +1,986 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file should contain most things doing the swapping from/to disk. 
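+ *
+ * A swap entry packs a swap-file index and a page offset into one
+ * word. Worked example with the SWP_* macros defined below (assuming
+ * 4kB pages, PAGE_SHIFT = 12): SWP_ENTRY(1,2) = (1 << 1) | (2 << 12)
+ * = 0x2002, and SWP_TYPE()/SWP_OFFSET() recover the 1 and the 2.
+ *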
+ * Started 18.12.91 + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/fs.h> + +#include <asm/system.h> /* for cli()/sti() */ +#include <asm/bitops.h> + +#define MAX_SWAPFILES 8 + +#define SWP_USED 1 +#define SWP_WRITEOK 3 + +#define SWP_TYPE(entry) (((entry) & 0xfe) >> 1) +#define SWP_OFFSET(entry) ((entry) >> PAGE_SHIFT) +#define SWP_ENTRY(type,offset) (((type) << 1) | ((offset) << PAGE_SHIFT)) + +int min_free_pages = 20; + +static int nr_swapfiles = 0; +static struct wait_queue * lock_queue = NULL; + +static struct swap_info_struct { + unsigned long flags; + struct inode * swap_file; + unsigned int swap_device; + unsigned char * swap_map; + unsigned char * swap_lockmap; + int pages; + int lowest_bit; + int highest_bit; + unsigned long max; +} swap_info[MAX_SWAPFILES]; + +extern int shm_swap (int); + +unsigned long *swap_cache; + +#ifdef SWAP_CACHE_INFO +unsigned long swap_cache_add_total = 0; +unsigned long swap_cache_add_success = 0; +unsigned long swap_cache_del_total = 0; +unsigned long swap_cache_del_success = 0; +unsigned long swap_cache_find_total = 0; +unsigned long swap_cache_find_success = 0; + +extern inline void show_swap_cache_info(void) +{ + printk("Swap cache: add %ld/%ld, delete %ld/%ld, find %ld/%ld\n", + swap_cache_add_total, swap_cache_add_success, + swap_cache_del_total, swap_cache_del_success, + swap_cache_find_total, swap_cache_find_success); +} +#endif + +extern inline int add_to_swap_cache(unsigned long addr, unsigned long entry) +{ + struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)]; + +#ifdef SWAP_CACHE_INFO + swap_cache_add_total++; +#endif + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + atomic_exchange(swap_cache[addr >> PAGE_SHIFT],entry); + if (entry) { + printk("swap_cache: replacing non-NULL entry\n"); + } +#ifdef SWAP_CACHE_INFO + swap_cache_add_success++; +#endif + return 1; + } + return 0; +} + +static unsigned long init_swap_cache(unsigned long mem_start, + unsigned long mem_end) +{ + unsigned long swap_cache_size; + + mem_start = (mem_start + 15) & ~15; + swap_cache = (unsigned long *) mem_start; + swap_cache_size = mem_end >> PAGE_SHIFT; + memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long)); + return (unsigned long) (swap_cache + swap_cache_size); +} + +void rw_swap_page(int rw, unsigned long entry, char * buf) +{ + unsigned long type, offset; + struct swap_info_struct * p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk("Internal error: bad swap-device\n"); + return; + } + p = &swap_info[type]; + offset = SWP_OFFSET(entry); + if (offset >= p->max) { + printk("rw_swap_page: weirdness\n"); + return; + } + if (!(p->flags & SWP_USED)) { + printk("Trying to swap to unused swap-device\n"); + return; + } + while (set_bit(offset,p->swap_lockmap)) + sleep_on(&lock_queue); + if (rw == READ) + kstat.pswpin++; + else + kstat.pswpout++; + if (p->swap_device) { + ll_rw_page(rw,p->swap_device,offset,buf); + } else if (p->swap_file) { + struct inode *swapf = p->swap_file; + unsigned int zones[8]; + int i; + if (swapf->i_op->bmap == NULL + && swapf->i_op->smap != NULL){ + /* + With MsDOS, we use msdos_smap which return + a sector number (not a cluster or block number). + It is a patch to enable the UMSDOS project. + Other people are working on better solution. 
+
+			   It sounds like ll_rw_swap_file defines
+			   its operation size (sector size) based on
+			   PAGE_SIZE and the number of blocks to read,
+			   so using bmap or smap should work even though
+			   smap will require more blocks.
+			*/
+			int j;
+			unsigned int block = offset << 3;
+
+			for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
+				if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
+					printk("rw_swap_page: bad swap file\n");
+					return;
+				}
+			}
+		}else{
+			int j;
+			unsigned int block = offset
+				<< (12 - swapf->i_sb->s_blocksize_bits);
+
+			for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
+				if (!(zones[i] = bmap(swapf,block++))) {
+					printk("rw_swap_page: bad swap file\n");
+					return;
+				}
+		}
+		ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
+	} else
+		printk("rw_swap_page: no swap file or device\n");
+	if (offset && !clear_bit(offset,p->swap_lockmap))
+		printk("rw_swap_page: lock already cleared\n");
+	wake_up(&lock_queue);
+}
+
+unsigned int get_swap_page(void)
+{
+	struct swap_info_struct * p;
+	unsigned int offset, type;
+
+	p = swap_info;
+	for (type = 0 ; type < nr_swapfiles ; type++,p++) {
+		if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK)
+			continue;
+		for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
+			if (p->swap_map[offset])
+				continue;
+			p->swap_map[offset] = 1;
+			nr_swap_pages--;
+			if (offset == p->highest_bit)
+				p->highest_bit--;
+			p->lowest_bit = offset;
+			return SWP_ENTRY(type,offset);
+		}
+	}
+	return 0;
+}
+
+unsigned long swap_duplicate(unsigned long entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+
+	if (!entry)
+		return 0;
+	offset = SWP_OFFSET(entry);
+	type = SWP_TYPE(entry);
+	if (type == SHM_SWP_TYPE)
+		return entry;
+	if (type >= nr_swapfiles) {
+		printk("Trying to duplicate nonexistent swap-page\n");
+		return 0;
+	}
+	p = type + swap_info;
+	if (offset >= p->max) {
+		printk("swap_duplicate: weirdness\n");
+		return 0;
+	}
+	if (!p->swap_map[offset]) {
+		printk("swap_duplicate: trying to duplicate unused page\n");
+		return 0;
+	}
+	p->swap_map[offset]++;
+	return entry;
+}
+
+void swap_free(unsigned long entry)
+{
+	struct swap_info_struct * p;
+	unsigned long offset, type;
+
+	if (!entry)
+		return;
+	type = SWP_TYPE(entry);
+	if (type == SHM_SWP_TYPE)
+		return;
+	if (type >= nr_swapfiles) {
+		printk("Trying to free nonexistent swap-page\n");
+		return;
+	}
+	p = & swap_info[type];
+	offset = SWP_OFFSET(entry);
+	if (offset >= p->max) {
+		printk("swap_free: weirdness\n");
+		return;
+	}
+	if (!(p->flags & SWP_USED)) {
+		printk("Trying to free swap from unused swap-device\n");
+		return;
+	}
+	while (set_bit(offset,p->swap_lockmap))
+		sleep_on(&lock_queue);
+	if (offset < p->lowest_bit)
+		p->lowest_bit = offset;
+	if (offset > p->highest_bit)
+		p->highest_bit = offset;
+	if (!p->swap_map[offset])
+		printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
+	else
+		if (!--p->swap_map[offset])
+			nr_swap_pages++;
+	if (!clear_bit(offset,p->swap_lockmap))
+		printk("swap_free: lock already cleared\n");
+	wake_up(&lock_queue);
+}
+
+unsigned long swap_in(unsigned long entry)
+{
+	unsigned long page;
+
+	if (!(page = get_free_page(GFP_KERNEL))) {
+		oom(current);
+		return BAD_PAGE;
+	}
+	read_swap_page(entry, (char *) page);
+	if (add_to_swap_cache(page, entry))
+		return page | PAGE_VALID;
+	swap_free(entry);
+	return page | PAGE_DIRTY | PAGE_VALID;
+}
+
+static inline int try_to_swap_out(unsigned long * table_ptr)
+{
+	unsigned long page, entry;
+
+	page = *table_ptr;
+	if (!(PAGE_VALID & page))
+		return 0;
+	if (page >= high_memory)
+		return 0;
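+	/*
+	 * Summary of the checks below: a present page is handled in
+	 * this order --
+	 *	dirty and still in the swap cache: the cached copy is
+	 *		stale, so drop the cache entry and keep the page;
+	 *	recently accessed: just clear PAGE_ACCESSED (ageing);
+	 *	dirty and mapped only once: grab a swap entry, point
+	 *		the pte at it and write the page out;
+	 *	clean and found in the swap cache: rewire the pte to
+	 *		the cached entry and free the page;
+	 *	otherwise: the page is clean and backed elsewhere, so
+	 *		simply drop it.
+	 */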
+	if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED)
+		return 0;
+
+	if ((PAGE_DIRTY & page) && delete_from_swap_cache(page)) {
+		*table_ptr &= ~PAGE_ACCESSED;
+		return 0;
+	}
+	if (PAGE_ACCESSED & page) {
+		*table_ptr &= ~PAGE_ACCESSED;
+		return 0;
+	}
+	if (PAGE_DIRTY & page) {
+		page &= PAGE_MASK;
+		if (mem_map[MAP_NR(page)] != 1)
+			return 0;
+		if (!(entry = get_swap_page()))
+			return 0;
+		*table_ptr = entry;
+		invalidate();
+		write_swap_page(entry, (char *) page);
+		free_page(page);
+		return 1;
+	}
+	if ((entry = find_in_swap_cache(page))) {
+		if (mem_map[MAP_NR(page)] != 1) {
+			*table_ptr |= PAGE_DIRTY;
+			printk("Aiee.. duplicated cached swap-cache entry\n");
+			return 0;
+		}
+		*table_ptr = entry;
+		invalidate();
+		free_page(page & PAGE_MASK);
+		return 1;
+	}
+	page &= PAGE_MASK;
+	*table_ptr = 0;
+	invalidate();
+	free_page(page);
+	return 1 + mem_map[MAP_NR(page)];
+}
+
+/*
+ * A new implementation of swap_out().  We do not swap complete processes,
+ * but only a small number of blocks, before we continue with the next
+ * process.  The number of blocks actually swapped is determined by the
+ * number of page faults the process has taken recently, so we won't
+ * swap heavily used processes all the time ...
+ *
+ * Note: the priority argument is a hint on how much CPU to spend on the
+ * swap-block search, not on how many blocks to swap from each process.
+ *
+ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
+ */
+
+/*
+ * These are the minimum and maximum number of pages to swap from one process,
+ * before proceeding to the next:
+ */
+#define SWAP_MIN	4
+#define SWAP_MAX	32
+
+/*
+ * The actual number of pages to swap is determined as:
+ * SWAP_RATIO / (number of recent major page faults)
+ */
+#define SWAP_RATIO	128
+
+static int swap_out_process(struct task_struct * p)
+{
+	unsigned long address;
+	unsigned long offset;
+	unsigned long *pgdir;
+	unsigned long pg_table;
+
+	/*
+	 * Go through process' page directory.
+	 */
+	address = p->mm->swap_address;
+	pgdir = (address >> PGDIR_SHIFT) + (unsigned long *) p->tss.pg_dir;
+	offset = address & ~PGDIR_MASK;
+	address &= PGDIR_MASK;
+	for ( ; address < TASK_SIZE ;
+	       pgdir++, address = address + PGDIR_SIZE, offset = 0) {
+		pg_table = *pgdir;
+		if (pg_table >= high_memory)
+			continue;
+		if (mem_map[MAP_NR(pg_table)] & MAP_PAGE_RESERVED)
+			continue;
+		if (!(PAGE_VALID & pg_table)) {
+			printk("swap_out_process (%s): bad page-table at vm %08lx: %08lx\n",
+					p->comm, address + offset, pg_table);
+			*pgdir = 0;
+			continue;
+		}
+		pg_table &= 0xfffff000;
+
+		/*
+		 * Go through this page table.
+		 */
+		for( ; offset < ~PGDIR_MASK ; offset += PAGE_SIZE) {
+			switch(try_to_swap_out((unsigned long *) (pg_table + (offset >> 10)))) {
+				case 0:
+					break;
+
+				case 1:
+					p->mm->rss--;
+					/* continue with the following page the next time */
+					p->mm->swap_address = address + offset + PAGE_SIZE;
+					return 1;
+
+				default:
+					p->mm->rss--;
+					break;
+			}
+		}
+	}
+	/*
+	 * Finish work with this process, if we reached the end of the page
+	 * directory.  Mark restart from the beginning the next time.
+	 */
+	p->mm->swap_address = 0;
+	return 0;
+}
+
+static int swap_out(unsigned int priority)
+{
+	static int swap_task;
+	int loop;
+	int counter;
+	struct task_struct *p;
+
+	counter = NR_TASKS * 2 >> priority;
+	for(; counter >= 0; counter--, swap_task++) {
+		/*
+		 * Check that swap_task is suitable for swapping.  If not, look for
+		 * the next suitable process.
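+		 * The scan is round-robin: swap_task remembers where the
+		 * last call stopped, wraps from NR_TASKS back to task 1,
+		 * and gives up only after one full loop in which every
+		 * task was either unswappable or had no resident pages.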
+ */ + loop = 0; + while(1) { + if (swap_task >= NR_TASKS) { + swap_task = 1; + if (loop) + /* all processes are unswappable or already swapped out */ + return 0; + loop = 1; + } + + p = task[swap_task]; + if (p && p->mm->swappable && p->mm->rss) + break; + + swap_task++; + } + + /* + * Determine the number of pages to swap from this process. + */ + if (!p->mm->swap_cnt) { + p->mm->dec_flt = (p->mm->dec_flt * 3) / 4 + p->mm->maj_flt - p->mm->old_maj_flt; + p->mm->old_maj_flt = p->mm->maj_flt; + + if (p->mm->dec_flt >= SWAP_RATIO / SWAP_MIN) { + p->mm->dec_flt = SWAP_RATIO / SWAP_MIN; + p->mm->swap_cnt = SWAP_MIN; + } else if (p->mm->dec_flt <= SWAP_RATIO / SWAP_MAX) + p->mm->swap_cnt = SWAP_MAX; + else + p->mm->swap_cnt = SWAP_RATIO / p->mm->dec_flt; + } + if (swap_out_process(p)) { + if ((--p->mm->swap_cnt) == 0) + swap_task++; + return 1; + } + } + return 0; +} + +static int try_to_free_page(int priority) +{ + int i=6; + + while (i--) { + if (priority != GFP_NOBUFFER && shrink_buffers(i)) + return 1; + if (shm_swap(i)) + return 1; + if (swap_out(i)) + return 1; + } + return 0; +} + +static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->prev = head; + entry->next = head->next; + entry->next->prev = entry; + head->next = entry; +} + +static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry) +{ + entry->next->prev = entry->prev; + entry->prev->next = entry->next; +} + +/* + * Free_page() adds the page to the free lists. This is optimized for + * fast normal cases (no error jumps taken normally). + * + * The way to optimize jumps for gcc-2.2.2 is to: + * - select the "normal" case and put it inside the if () { XXX } + * - no else-statements if you can avoid them + * + * With the above two rules, you get a straight-line execution path + * for the normal case, giving better asm-code. + */ + +/* + * Buddy system. Hairy. You really aren't expected to understand this + */ +static inline void free_pages_ok(unsigned long addr, unsigned long order) +{ + unsigned long index = addr >> (PAGE_SHIFT + 1 + order); + unsigned long mask = PAGE_MASK << order; + + addr &= mask; + nr_free_pages += 1 << order; + while (order < NR_MEM_LISTS-1) { + if (!change_bit(index, free_area_map[order])) + break; + remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask))); + order++; + index >>= 1; + mask <<= 1; + addr &= mask; + } + add_mem_queue(free_area_list+order, (struct mem_list *) addr); +} + +static inline void check_free_buffers(unsigned long addr) +{ + struct buffer_head * bh; + + bh = buffer_pages[MAP_NR(addr)]; + if (bh) { + struct buffer_head *tmp = bh; + do { + if (tmp->b_list == BUF_SHARED && tmp->b_dev != 0xffff) + refile_buffer(tmp); + tmp = tmp->b_this_page; + } while (tmp != bh); + } +} + +void free_pages(unsigned long addr, unsigned long order) +{ + if (addr < high_memory) { + unsigned long flag; + unsigned short * map = mem_map + MAP_NR(addr); + if (*map) { + if (!(*map & MAP_PAGE_RESERVED)) { + save_flags(flag); + cli(); + if (!--*map) { + free_pages_ok(addr, order); + delete_from_swap_cache(addr); + } + restore_flags(flag); + if (*map == 1) + check_free_buffers(addr); + } + return; + } + printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr); + printk("PC = %08lx\n",*(((unsigned long *)&addr)-1)); + return; + } +} + +/* + * Some ugly macros to speed up __get_free_pages().. 
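+ *
+ * How they fit together (sizes assume 4kB pages): a request of order 1
+ * (8kB) first tries free_area_list[1]; if that list is empty, RMQUEUE
+ * walks upward until it finds, say, a free order-3 block (32kB).
+ * EXPAND then splits that block back down, queueing the first 16kB
+ * half on list 2 and the next 8kB on list 1, and hands the caller the
+ * last 8kB piece:
+ *
+ *	order 3:  |...............32kB...............|  removed from list 3
+ *	order 2:  |......16kB......|                     queued on list 2
+ *	order 1:                   |..8kB..|             queued on list 1
+ *	returned:                           |..8kB..|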
+ */ +#define RMQUEUE(order) \ +do { struct mem_list * queue = free_area_list+order; \ + unsigned long new_order = order; \ + do { struct mem_list *next = queue->next; \ + if (queue != next) { \ + (queue->next = next->next)->prev = queue; \ + mark_used((unsigned long) next, new_order); \ + nr_free_pages -= 1 << order; \ + restore_flags(flags); \ + EXPAND(next, order, new_order); \ + return (unsigned long) next; \ + } new_order++; queue++; \ + } while (new_order < NR_MEM_LISTS); \ +} while (0) + +static inline int mark_used(unsigned long addr, unsigned long order) +{ + return change_bit(addr >> (PAGE_SHIFT+1+order), free_area_map[order]); +} + +#define EXPAND(addr,low,high) \ +do { unsigned long size = PAGE_SIZE << high; \ + while (high > low) { \ + high--; size >>= 1; cli(); \ + add_mem_queue(free_area_list+high, addr); \ + mark_used((unsigned long) addr, high); \ + restore_flags(flags); \ + addr = (struct mem_list *) (size + (unsigned long) addr); \ + } mem_map[MAP_NR((unsigned long) addr)] = 1; \ +} while (0) + +unsigned long __get_free_pages(int priority, unsigned long order) +{ + unsigned long flags; + int reserved_pages; + + if (intr_count && priority != GFP_ATOMIC) { + static int count = 0; + if (++count < 5) { + printk("gfp called nonatomically from interrupt %p\n", + __builtin_return_address(0)); + priority = GFP_ATOMIC; + } + } + reserved_pages = 5; + if (priority != GFP_NFS) + reserved_pages = min_free_pages; + save_flags(flags); +repeat: + cli(); + if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) { + RMQUEUE(order); + restore_flags(flags); + return 0; + } + restore_flags(flags); + if (priority != GFP_BUFFER && try_to_free_page(priority)) + goto repeat; + return 0; +} + +/* + * Yes, I know this is ugly. Don't tell me. + */ +unsigned long __get_dma_pages(int priority, unsigned long order) +{ + unsigned long list = 0; + unsigned long result; + unsigned long limit = 16*1024*1024; + + /* if (EISA_bus) limit = ~0UL; */ + if (priority != GFP_ATOMIC) + priority = GFP_BUFFER; + for (;;) { + result = __get_free_pages(priority, order); + if (result < limit) /* covers failure as well */ + break; + *(unsigned long *) result = list; + list = result; + } + while (list) { + unsigned long tmp = list; + list = *(unsigned long *) list; + free_pages(tmp, order); + } + return result; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + unsigned long order, flags; + unsigned long total = 0; + + printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10)); + save_flags(flags); + cli(); + for (order=0 ; order < NR_MEM_LISTS; order++) { + struct mem_list * tmp; + unsigned long nr = 0; + for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) { + nr ++; + } + total += nr * (4 << order); + printk("%lu*%ukB ", nr, 4 << order); + } + restore_flags(flags); + printk("= %lukB)\n", total); +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +/* + * Trying to stop swapping from a file is fraught with races, so + * we repeat quite a bit here when we have to pause. swapoff() + * isn't exactly timing-critical, so who cares? 
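+ *
+ * Concretely: whenever try_to_unuse() below has to sleep -- to get a
+ * free page, or to read one back in from swap -- the page tables may
+ * have changed underneath it, so it just jumps back to "repeat" and
+ * rescans from the task it was working on.  Wasteful, but safe.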
+ */ +static int try_to_unuse(unsigned int type) +{ + int nr, pgt, pg; + unsigned long page, *ppage; + unsigned long tmp = 0; + struct task_struct *p; + + nr = 0; + +/* + * When we have to sleep, we restart the whole algorithm from the same + * task we stopped in. That at least rids us of all races. + */ +repeat: + for (; nr < NR_TASKS ; nr++) { + p = task[nr]; + if (!p) + continue; + for (pgt = 0 ; pgt < PTRS_PER_PAGE ; pgt++) { + ppage = pgt + ((unsigned long *) p->tss.pg_dir); + page = *ppage; + if (!page) + continue; + if (!(page & PAGE_VALID) || (page >= high_memory)) + continue; + if (mem_map[MAP_NR(page)] & MAP_PAGE_RESERVED) + continue; + ppage = (unsigned long *) (page & PAGE_MASK); + for (pg = 0 ; pg < PTRS_PER_PAGE ; pg++,ppage++) { + page = *ppage; + if (!page) + continue; + if (page & PAGE_VALID) { + if (!(page = in_swap_cache(page))) + continue; + if (SWP_TYPE(page) != type) + continue; + *ppage |= PAGE_DIRTY; + delete_from_swap_cache(*ppage); + continue; + } + if (SWP_TYPE(page) != type) + continue; + if (!tmp) { + if (!(tmp = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + goto repeat; + } + read_swap_page(page, (char *) tmp); + if (*ppage == page) { + *ppage = tmp | (PAGE_DIRTY | PAGE_PRIVATE); + ++p->mm->rss; + swap_free(page); + tmp = 0; + } + goto repeat; + } + } + } + free_page(tmp); + return 0; +} + +asmlinkage int sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * inode; + unsigned int type; + int i; + + if (!suser()) + return -EPERM; + i = namei(specialfile,&inode); + if (i) + return i; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) { + if ((p->flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + if (p->swap_file) { + if (p->swap_file == inode) + break; + } else { + if (!S_ISBLK(inode->i_mode)) + continue; + if (p->swap_device == inode->i_rdev) + break; + } + } + iput(inode); + if (type >= nr_swapfiles) + return -EINVAL; + p->flags = SWP_USED; + i = try_to_unuse(type); + if (i) { + p->flags = SWP_WRITEOK; + return i; + } + nr_swap_pages -= p->pages; + iput(p->swap_file); + p->swap_file = NULL; + p->swap_device = 0; + vfree(p->swap_map); + p->swap_map = NULL; + free_page((long) p->swap_lockmap); + p->swap_lockmap = NULL; + p->flags = 0; + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
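+ *
+ * The signature page that swapon expects (assuming a 4096-byte page):
+ * the bulk of the page is a bitmap with one bit per page of the swap
+ * area (bit set = page usable), and the last ten bytes hold the
+ * literal string "SWAP-SPACE".  Page 0 of the area is never used for
+ * swapping, since it holds this header.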
+ * + * The swapon system call + */ +asmlinkage int sys_swapon(const char * specialfile) +{ + struct swap_info_struct * p; + struct inode * swap_inode; + unsigned int type; + int i,j; + int error; + + if (!suser()) + return -EPERM; + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + if (type >= MAX_SWAPFILES) + return -EPERM; + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->max = 1; + error = namei(specialfile,&swap_inode); + if (error) + goto bad_swap; + p->swap_file = swap_inode; + error = -EBUSY; + if (swap_inode->i_count != 1) + goto bad_swap; + error = -EINVAL; + if (S_ISBLK(swap_inode->i_mode)) { + p->swap_device = swap_inode->i_rdev; + p->swap_file = NULL; + iput(swap_inode); + error = -ENODEV; + if (!p->swap_device) + goto bad_swap; + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + if (i == type) + continue; + if (p->swap_device == swap_info[i].swap_device) + goto bad_swap; + } + } else if (!S_ISREG(swap_inode->i_mode)) + goto bad_swap; + p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER); + if (!p->swap_lockmap) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap); + if (memcmp("SWAP-SPACE",p->swap_lockmap+4086,10)) { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + memset(p->swap_lockmap+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,p->swap_lockmap)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + p->max = i+1; + j++; + } + } + if (!j) { + printk("Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map = (unsigned char *) vmalloc(p->max); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < p->max ; i++) { + if (test_bit(i,p->swap_lockmap)) + p->swap_map[i] = 0; + else + p->swap_map[i] = 0x80; + } + p->swap_map[0] = 0x80; + memset(p->swap_lockmap,0,PAGE_SIZE); + p->flags = SWP_WRITEOK; + p->pages = j; + nr_swap_pages += j; + printk("Adding Swap: %dk swap-space\n",j<<2); + return 0; +bad_swap: + free_page((long) p->swap_lockmap); + vfree(p->swap_map); + iput(p->swap_file); + p->swap_device = 0; + p->swap_file = NULL; + p->swap_map = NULL; + p->swap_lockmap = NULL; + p->flags = 0; + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i, j; + + val->freeswap = val->totalswap = 0; + for (i = 0; i < nr_swapfiles; i++) { + if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK) + continue; + for (j = 0; j < swap_info[i].max; ++j) + switch (swap_info[i].swap_map[j]) { + case 128: + continue; + case 0: + ++val->freeswap; + default: + ++val->totalswap; + } + } + val->freeswap <<= PAGE_SHIFT; + val->totalswap <<= PAGE_SHIFT; + return; +} + +/* + * set up the free-area data structures: + * - mark all pages MAP_PAGE_RESERVED + * - mark all memory queues empty + * - clear the memory bitmaps + */ +unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem) +{ + unsigned short * p; + unsigned long mask = PAGE_MASK; + int i; + + /* + * select nr of pages we try to keep free for important stuff + * with a minimum of 16 pages. 
This is totally arbitrary + */ + i = end_mem >> (PAGE_SHIFT+6); + if (i < 16) + i = 16; + min_free_pages = i; + start_mem = init_swap_cache(start_mem, end_mem); + mem_map = (unsigned short *) start_mem; + p = mem_map + MAP_NR(end_mem); + start_mem = (unsigned long) p; + while (p > mem_map) + *--p = MAP_PAGE_RESERVED; + + for (i = 0 ; i < NR_MEM_LISTS ; i++, mask <<= 1) { + unsigned long bitmap_size; + free_area_list[i].prev = free_area_list[i].next = &free_area_list[i]; + end_mem = (end_mem + ~mask) & mask; + bitmap_size = end_mem >> (PAGE_SHIFT + i); + bitmap_size = (bitmap_size + 7) >> 3; + free_area_map[i] = (unsigned char *) start_mem; + memset((void *) start_mem, 0, bitmap_size); + start_mem += bitmap_size; + } + return start_mem; +} diff --git a/arch/mips/mm/vmalloc.c b/arch/mips/mm/vmalloc.c new file mode 100644 index 000000000..9b3ab7f59 --- /dev/null +++ b/arch/mips/mm/vmalloc.c @@ -0,0 +1,202 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + */ + +#include <asm/system.h> +#include <linux/config.h> + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/malloc.h> +#include <asm/segment.h> + +struct vm_struct { + unsigned long flags; + void * addr; + unsigned long size; + struct vm_struct * next; +}; + +static struct vm_struct * vmlist = NULL; + +/* Just any arbitrary offset to the start of the vmalloc VM area: the + * current 8MB value just means that there will be a 8MB "hole" after the + * physical memory until the kernel virtual memory starts. That means that + * any out-of-bounds memory accesses will hopefully be caught. + * The vmalloc() routines leaves a hole of 4kB between each vmalloced + * area for the same reason. 
;) + */ +#define VMALLOC_OFFSET (8*1024*1024) + +static inline void set_pgdir(unsigned long dindex, unsigned long value) +{ + struct task_struct * p; + + p = &init_task; + do { + ((unsigned long *) p->tss.pg_dir)[dindex] = value; + p = p->next_task; + } while (p != &init_task); +} + +static int free_area_pages(unsigned long dindex, unsigned long index, unsigned long nr) +{ + unsigned long page, *pte; + + if (!(PAGE_VALID & (page = swapper_pg_dir[dindex]))) + return 0; + page &= PAGE_MASK; + pte = index + (unsigned long *) page; + do { + unsigned long pg = *pte; + *pte = 0; + if (pg & PAGE_VALID) + free_page(pg); + pte++; + } while (--nr); + pte = (unsigned long *) page; + for (nr = 0 ; nr < 1024 ; nr++, pte++) + if (*pte) + return 0; + set_pgdir(dindex,0); + mem_map[MAP_NR(page)] = 1; + free_page(page); + invalidate(); + return 0; +} + +static int alloc_area_pages(unsigned long dindex, unsigned long index, unsigned long nr) +{ + unsigned long page, *pte; + + page = swapper_pg_dir[dindex]; + if (!page) { + page = get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + if (swapper_pg_dir[dindex]) { + free_page(page); + page = swapper_pg_dir[dindex]; + } else { + mem_map[MAP_NR(page)] = MAP_PAGE_RESERVED; + set_pgdir(dindex, page | PAGE_SHARED); + } + } + page &= PAGE_MASK; + pte = index + (unsigned long *) page; + *pte = PAGE_SHARED; /* remove a race with vfree() */ + do { + unsigned long pg = get_free_page(GFP_KERNEL); + + if (!pg) + return -ENOMEM; + *pte = pg | PAGE_SHARED; + pte++; + } while (--nr); + invalidate(); + return 0; +} + +static int do_area(void * addr, unsigned long size, + int (*area_fn)(unsigned long,unsigned long,unsigned long)) +{ + unsigned long nr, dindex, index; + + nr = size >> PAGE_SHIFT; + dindex = (TASK_SIZE + (unsigned long) addr) >> 22; + index = (((unsigned long) addr) >> PAGE_SHIFT) & (PTRS_PER_PAGE-1); + while (nr > 0) { + unsigned long i = PTRS_PER_PAGE - index; + + if (i > nr) + i = nr; + nr -= i; + if (area_fn(dindex, index, i)) + return -1; + index = 0; + dindex++; + } + return 0; +} + +void vfree(void * addr) +{ + struct vm_struct **p, *tmp; + + if (!addr) + return; + if ((PAGE_SIZE-1) & (unsigned long) addr) { + printk("Trying to vfree() bad address (%p)\n", addr); + return; + } + for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + do_area(tmp->addr, tmp->size, free_area_pages); + kfree(tmp); + return; + } + } + printk("Trying to vfree() nonexistent vm area (%p)\n", addr); +} + +void * vmalloc(unsigned long size) +{ + void * addr; + struct vm_struct **p, *tmp, *area; + + size = PAGE_ALIGN(size); + if (!size || size > high_memory) + return NULL; + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + addr = (void *) ((high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)); + area->size = size + PAGE_SIZE; + area->next = NULL; + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + if (size + (unsigned long) addr < (unsigned long) tmp->addr) + break; + addr = (void *) (tmp->size + (unsigned long) tmp->addr); + } + area->addr = addr; + area->next = *p; + *p = area; + if (do_area(addr, size, alloc_area_pages)) { + vfree(addr); + return NULL; + } + return addr; +} + +int vread(char *buf, char *addr, int count) +{ + struct vm_struct **p, *tmp; + char *vaddr, *buf_start = buf; + int n; + + for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { + vaddr = (char *) tmp->addr; + while (addr < vaddr) { + if (count == 0) + goto finished; + put_fs_byte('\0', buf++), addr++, count--; + } + n 
= tmp->size - PAGE_SIZE; + if (addr > vaddr) + n -= addr - vaddr; + while (--n >= 0) { + if (count == 0) + goto finished; + put_fs_byte(*addr++, buf++), count--; + } + } +finished: + return buf - buf_start; +} diff --git a/arch/mips/ptrace.c b/arch/mips/ptrace.c new file mode 100644 index 000000000..0a42b9c38 --- /dev/null +++ b/arch/mips/ptrace.c @@ -0,0 +1,523 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* edited by Linus Torvalds */ + +#include <linux/head.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/user.h> + +#include <asm/segment.h> +#include <asm/system.h> + +#if 0 +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x00044dd5 + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100 + +/* + * this is the number to subtract from the top of the stack. To find + * the local frame. + */ +#define MAGICNUMBER 68 + +/* change a pid into a task struct. */ +static inline struct task_struct * get_task(int pid) +{ + int i; + + for (i = 1; i < NR_TASKS; i++) { + if (task[i] != NULL && (task[i]->pid == pid)) + return task[i]; + } + return NULL; +} + +/* + * this routine will get a word off of the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->tss.esp0; + stack += offset; + return (*((int *)stack)); +} + +/* + * this routine will put a word on the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *) task->tss.esp0; + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +/* + * This routine gets a long from any process space by following the page + * tables. NOTE! You should check that the long isn't on a page boundary, + * and that it is in the task area before calling this: this routine does + * no checking. + */ +static unsigned long get_long(struct vm_area_struct * vma, unsigned long addr) +{ + unsigned long page; + +repeat: + page = *PAGE_DIR_OFFSET(vma->vm_task->tss.cr3, addr); + if (page & PAGE_PRESENT) { + page &= PAGE_MASK; + page += PAGE_PTR(addr); + page = *((unsigned long *) page); + } + if (!(page & PAGE_PRESENT)) { + do_no_page(vma, addr, 0); + goto repeat; + } +/* this is a hack for non-kernel-mapped video buffers and similar */ + if (page >= high_memory) + return 0; + page &= PAGE_MASK; + page += addr & ~PAGE_MASK; + return *(unsigned long *) page; +} + +/* + * This routine puts a long into any process space by following the page + * tables. NOTE! You should check that the long isn't on a page boundary, + * and that it is in the task area before calling this: this routine does + * no checking. + * + * Now keeps R/W state of page so that a text page stays readonly + * even if a debugger scribbles breakpoints into it. 
-M.U- + */ +static void put_long(struct vm_area_struct * vma, unsigned long addr, + unsigned long data) +{ + unsigned long page, pte = 0; + int readonly = 0; + +repeat: + page = *PAGE_DIR_OFFSET(vma->vm_task->tss.cr3, addr); + if (page & PAGE_PRESENT) { + page &= PAGE_MASK; + page += PAGE_PTR(addr); + pte = page; + page = *((unsigned long *) page); + } + if (!(page & PAGE_PRESENT)) { + do_no_page(vma, addr, 0 /* PAGE_RW */); + goto repeat; + } + if (!(page & PAGE_RW)) { + if (!(page & PAGE_COW)) + readonly = 1; + do_wp_page(vma, addr, PAGE_RW | PAGE_PRESENT); + goto repeat; + } +/* this is a hack for non-kernel-mapped video buffers and similar */ + if (page >= high_memory) + return; +/* we're bypassing pagetables, so we have to set the dirty bit ourselves */ + *(unsigned long *) pte |= (PAGE_DIRTY|PAGE_COW); + page &= PAGE_MASK; + page += addr & ~PAGE_MASK; + *(unsigned long *) page = data; + if (readonly) { + *(unsigned long *) pte &=~ (PAGE_RW|PAGE_COW); + invalidate(); + } +} + +static struct vm_area_struct * find_vma(struct task_struct * tsk, unsigned long addr) +{ + struct vm_area_struct * vma; + + addr &= PAGE_MASK; + for (vma = tsk->mm->mmap ; ; vma = vma->vm_next) { + if (!vma) + return NULL; + if (vma->vm_end > addr) + break; + } + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + if (vma->vm_end - addr > tsk->rlim[RLIMIT_STACK].rlim_cur) + return NULL; + vma->vm_offset -= vma->vm_start - addr; + vma->vm_start = addr; + return vma; +} + +/* + * This routine checks the page boundaries, and that the offset is + * within the task area. It then calls get_long() to read a long. + */ +static int read_long(struct task_struct * tsk, unsigned long addr, + unsigned long * result) +{ + struct vm_area_struct * vma = find_vma(tsk, addr); + + if (!vma) + return -EIO; + if ((addr & ~PAGE_MASK) > PAGE_SIZE-sizeof(long)) { + unsigned long low,high; + struct vm_area_struct * vma_high = vma; + + if (addr + sizeof(long) >= vma->vm_end) { + vma_high = vma->vm_next; + if (!vma_high || vma_high->vm_start != vma->vm_end) + return -EIO; + } + low = get_long(vma, addr & ~(sizeof(long)-1)); + high = get_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1)); + switch (addr & (sizeof(long)-1)) { + case 1: + low >>= 8; + low |= high << 24; + break; + case 2: + low >>= 16; + low |= high << 16; + break; + case 3: + low >>= 24; + low |= high << 8; + break; + } + *result = low; + } else + *result = get_long(vma, addr); + return 0; +} + +/* + * This routine checks the page boundaries, and that the offset is + * within the task area. It then calls put_long() to write a long. 
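+ *
+ * A cross-boundary write splits into two aligned accesses; for a long
+ * at offset 2 (mod 4) this is exactly the "case 2" arm below
+ * (little-endian layout assumed):
+ *
+ *	low  = (low  & 0x0000ffff) | (data << 16);
+ *	high = (high & ~0xffff)    | (data >> 16);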
+ */ +static int write_long(struct task_struct * tsk, unsigned long addr, + unsigned long data) +{ + struct vm_area_struct * vma = find_vma(tsk, addr); + + if (!vma) + return -EIO; + if ((addr & ~PAGE_MASK) > PAGE_SIZE-sizeof(long)) { + unsigned long low,high; + struct vm_area_struct * vma_high = vma; + + if (addr + sizeof(long) >= vma->vm_end) { + vma_high = vma->vm_next; + if (!vma_high || vma_high->vm_start != vma->vm_end) + return -EIO; + } + low = get_long(vma, addr & ~(sizeof(long)-1)); + high = get_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1)); + switch (addr & (sizeof(long)-1)) { + case 0: /* shouldn't happen, but safety first */ + low = data; + break; + case 1: + low &= 0x000000ff; + low |= data << 8; + high &= ~0xff; + high |= data >> 24; + break; + case 2: + low &= 0x0000ffff; + low |= data << 16; + high &= ~0xffff; + high |= data >> 16; + break; + case 3: + low &= 0x00ffffff; + low |= data << 24; + high &= ~0xffffff; + high |= data >> 8; + break; + } + put_long(vma, addr & ~(sizeof(long)-1),low); + put_long(vma_high, (addr+sizeof(long)) & ~(sizeof(long)-1),high); + } else + put_long(vma, addr, data); + return 0; +} +#endif + +asmlinkage int sys_ptrace(long request, long pid, long addr, long data) +{ +#if 1 + return -ENOSYS; +#else + struct task_struct *child; + struct user * dummy; + int i; + + + dummy = NULL; + + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->flags & PF_PTRACED) + return -EPERM; + /* set the ptrace bit in the process flags. */ + current->flags |= PF_PTRACED; + return 0; + } + if (pid == 1) /* you may not mess with init */ + return -EPERM; + if (!(child = get_task(pid))) + return -ESRCH; + if (request == PTRACE_ATTACH) { + if (child == current) + return -EPERM; + if ((!child->dumpable || + (current->uid != child->euid) || + (current->uid != child->uid) || + (current->gid != child->egid) || + (current->gid != child->gid)) && !suser()) + return -EPERM; + /* the same process cannot be attached many times */ + if (child->flags & PF_PTRACED) + return -EPERM; + child->flags |= PF_PTRACED; + if (child->p_pptr != current) { + REMOVE_LINKS(child); + child->p_pptr = current; + SET_LINKS(child); + } + send_sig(SIGSTOP, child, 1); + return 0; + } + if (!(child->flags & PF_PTRACED)) + return -ESRCH; + if (child->state != TASK_STOPPED) { + if (request != PTRACE_KILL) + return -ESRCH; + } + if (child->p_pptr != current) + return -ESRCH; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: { + unsigned long tmp; + int res; + + res = read_long(child, addr, &tmp); + if (res < 0) + return res; + res = verify_area(VERIFY_WRITE, (void *) data, sizeof(long)); + if (!res) + put_fs_long(tmp,(unsigned long *) data); + return res; + } + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + int res; + + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + return -EIO; + + res = verify_area(VERIFY_WRITE, (void *) data, sizeof(long)); + if (res) + return res; + tmp = 0; /* Default return condition */ + if(addr < 17*sizeof(long)) { + addr = addr >> 2; /* temporary hack. 
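+			 * (the byte offset into struct user becomes a
+			 * longword index; the first 17 longs are the
+			 * registers saved on the child's kernel stack,
+			 * fetched at sizeof(long)*addr - MAGICNUMBER
+			 * below)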
*/ + + tmp = get_stack_long(child, sizeof(long)*addr - MAGICNUMBER); + if (addr == DS || addr == ES || + addr == FS || addr == GS || + addr == CS || addr == SS) + tmp &= 0xffff; + }; + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + addr -= (long) &dummy->u_debugreg[0]; + addr = addr >> 2; + tmp = child->debugreg[addr]; + }; + put_fs_long(tmp,(unsigned long *) data); + return 0; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + return write_long(child,addr,data); + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + return -EIO; + + addr = addr >> 2; /* temporary hack. */ + + if (addr == ORIG_EAX) + return -EIO; + if (addr == DS || addr == ES || + addr == FS || addr == GS || + addr == CS || addr == SS) { + data &= 0xffff; + if (data && (data & 3) != 3) + return -EIO; + } + if (addr == EFL) { /* flags. */ + data &= FLAG_MASK; + data |= get_stack_long(child, EFL*sizeof(long)-MAGICNUMBER) & ~FLAG_MASK; + } + /* Do not allow the user to set the debug register for kernel + address space */ + if(addr < 17){ + if (put_stack_long(child, sizeof(long)*addr-MAGICNUMBER, data)) + return -EIO; + return 0; + }; + + /* We need to be very careful here. We implicitly + want to modify a portion of the task_struct, and we + have to be selective about what portions we allow someone + to modify. */ + + addr = addr << 2; /* Convert back again */ + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + + if(addr == (long) &dummy->u_debugreg[4]) return -EIO; + if(addr == (long) &dummy->u_debugreg[5]) return -EIO; + if(addr < (long) &dummy->u_debugreg[4] && + ((unsigned long) data) >= 0xbffffffd) return -EIO; + + if(addr == (long) &dummy->u_debugreg[7]) { + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + }; + + addr -= (long) &dummy->u_debugreg; + addr = addr >> 2; + child->debugreg[addr] = data; + return 0; + }; + return -EIO; + + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: { /* restart after signal. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + if (request == PTRACE_SYSCALL) + child->flags |= PF_TRACESYS; + else + child->flags &= ~PF_TRACESYS; + child->exit_code = data; + child->state = TASK_RUNNING; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: { + long tmp; + + child->state = TASK_RUNNING; + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + + case PTRACE_SINGLESTEP: { /* set the trap flag. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + child->flags &= ~PF_TRACESYS; + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) | TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + child->state = TASK_RUNNING; + child->exit_code = data; + /* give it a chance to run. 
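+	 * -- the trap flag set just above makes the child execute one
+	 * instruction and immediately trap back to the tracer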
*/ + return 0; + } + + case PTRACE_DETACH: { /* detach a process that was attached. */ + long tmp; + + if ((unsigned long) data > NSIG) + return -EIO; + child->flags &= ~(PF_PTRACED|PF_TRACESYS); + child->state = TASK_RUNNING; + child->exit_code = data; + REMOVE_LINKS(child); + child->p_pptr = child->p_opptr; + SET_LINKS(child); + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, sizeof(long)*EFL-MAGICNUMBER) & ~TRAP_FLAG; + put_stack_long(child, sizeof(long)*EFL-MAGICNUMBER,tmp); + return 0; + } + + default: + return -EIO; + } +#endif +} + +asmlinkage void syscall_trace(void) +{ + if ((current->flags & (PF_PTRACED|PF_TRACESYS)) + != (PF_PTRACED|PF_TRACESYS)) + return; + current->exit_code = SIGTRAP; + current->state = TASK_STOPPED; + notify_parent(current); + schedule(); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) + current->signal |= (1 << (current->exit_code - 1)); + current->exit_code = 0; +} diff --git a/arch/mips/sched.c b/arch/mips/sched.c new file mode 100644 index 000000000..5c60764a8 --- /dev/null +++ b/arch/mips/sched.c @@ -0,0 +1,804 @@ +/* + * linux/kernel/sched.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'sched.c' is the main kernel file. It contains scheduling primitives + * (sleep_on, wakeup, schedule etc) as well as a number of simple system + * call functions (type getpid(), which just extracts a field from + * current-task + */ + +#include <linux/config.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/fdreg.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/ptrace.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/tqueue.h> +#include <linux/resource.h> + +#include <asm/system.h> +#include <asm/io.h> +#include <asm/segment.h> + +#define TIMER_IRQ 0 + +#include <linux/timex.h> + +/* + * kernel variables + */ +long tick = 1000000 / HZ; /* timer interrupt period */ +volatile struct timeval xtime; /* The current time */ +int tickadj = 500/HZ; /* microsecs */ + +DECLARE_TASK_QUEUE(tq_timer); +DECLARE_TASK_QUEUE(tq_immediate); + +/* + * phase-lock loop variables + */ +int time_status = TIME_BAD; /* clock synchronization status */ +long time_offset = 0; /* time adjustment (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = 0x70000000;/* maximum error */ +long time_esterror = 0x70000000;/* estimated error */ +long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ +long time_reftime = 0; /* time at last adjustment (s) */ + +long time_adjust = 0; +long time_adjust_step = 0; + +int need_resched = 0; +unsigned long event = 0; + +/* + * Tell us the machine setup.. + */ +int hard_math = 0; /* set by boot/head.S */ +int wp_works_ok = 0; /* set if paging hardware honours WP */ + +/* + * Bus types .. 
+ */ +int EISA_bus = 0; + +extern int _setitimer(int, struct itimerval *, struct itimerval *); +unsigned long * prof_buffer = NULL; +unsigned long prof_len = 0; + +#define _S(nr) (1<<((nr)-1)) + +extern void mem_use(void); + +extern int timer_interrupt(void); +asmlinkage int system_call(void); + +static unsigned long init_kernel_stack[1024] = { STACK_MAGIC, }; +static struct vm_area_struct init_mmap = INIT_MMAP; +struct task_struct init_task = INIT_TASK; + +unsigned long volatile jiffies=0; + +struct task_struct *current = &init_task; +struct task_struct *last_task_used_math = NULL; + +struct task_struct * task[NR_TASKS] = {&init_task, }; + +long user_stack [ PAGE_SIZE>>2 ] = { STACK_MAGIC, }; + +struct kernel_stat kstat = { 0 }; + +#ifndef CONFIG_MATH_EMULATION + +/* + * FIXME: There is no fpa emulator yet + */ +asmlinkage void math_emulate(long arg) +{ + printk("math-emulation not enabled and no coprocessor found.\n"); + printk("killing %s.\n",current->comm); + send_sig(SIGFPE,current,1); + schedule(); +} + +#endif /* CONFIG_MATH_EMULATION */ + +unsigned long itimer_ticks = 0; +unsigned long itimer_next = ~0; + +/* + * 'schedule()' is the scheduler function. It's a very simple and nice + * scheduler: it's not perfect, but certainly works for most things. + * The one thing you might take a look at is the signal-handler code here. + * + * NOTE!! Task 0 is the 'idle' task, which gets called when no other + * tasks can run. It can not be killed, and it cannot sleep. The 'state' + * information in task[0] is never used. + * + * The "confuse_gcc" goto is used only to get better assembly code.. + * Dijkstra probably hates me. + */ +asmlinkage void schedule(void) +{ + int c; + struct task_struct * p; + struct task_struct * next; + unsigned long ticks; + + /* + * check alarm, wake up any interruptible tasks that have got a signal + */ + if (intr_count) { + printk("Aiee: scheduling in interrupt\n"); + intr_count = 0; + } + cli(); + ticks = itimer_ticks; + itimer_ticks = 0; + itimer_next = ~0; + sti(); + need_resched = 0; + p = &init_task; + for (;;) { + if ((p = p->next_task) == &init_task) + goto confuse_gcc1; + if (ticks && p->it_real_value) { + if (p->it_real_value <= ticks) { + send_sig(SIGALRM, p, 1); + if (!p->it_real_incr) { + p->it_real_value = 0; + goto end_itimer; + } + do { + p->it_real_value += p->it_real_incr; + } while (p->it_real_value <= ticks); + } + p->it_real_value -= ticks; + if (p->it_real_value < itimer_next) + itimer_next = p->it_real_value; + } +end_itimer: + if (p->state != TASK_INTERRUPTIBLE) + continue; + if (p->signal & ~p->blocked) { + p->state = TASK_RUNNING; + continue; + } + if (p->timeout && p->timeout <= jiffies) { + p->timeout = 0; + p->state = TASK_RUNNING; + } + } +confuse_gcc1: + +/* this is the scheduler proper: */ +#if 0 + /* give processes that go to sleep a bit higher priority.. */ + /* This depends on the values for TASK_XXX */ + /* This gives smoother scheduling for some things, but */ + /* can be very unfair under some circumstances, so.. 
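+	 * it stays disabled.  (For reference, the live code below picks
+	 * the runnable task with the largest counter; once every runnable
+	 * counter has dropped to zero, each task gets
+	 * counter = counter/2 + priority, so a task that keeps sleeping
+	 * sees its counter climb toward twice its priority.)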
*/ + if (TASK_UNINTERRUPTIBLE >= (unsigned) current->state && + current->counter < current->priority*2) { + ++current->counter; + } +#endif + c = -1000; + next = p = &init_task; + for (;;) { + if ((p = p->next_task) == &init_task) + goto confuse_gcc2; + if (p->state == TASK_RUNNING && p->counter > c) + c = p->counter, next = p; + } +confuse_gcc2: + if (!c) { + for_each_task(p) + p->counter = (p->counter >> 1) + p->priority; + } + if (current == next) + return; + kstat.context_swtch++; + switch_to(next); +} + +asmlinkage int sys_pause(void) +{ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return -ERESTARTNOHAND; +} + +/* + * wake_up doesn't wake up stopped processes - they have to be awakened + * with signals or similar. + * + * Note that this doesn't need cli-sti pairs: interrupts may not change + * the wait-queue structures directly, but only call wake_up() to wake + * a process. The process itself must remove the queue once it has woken. + */ +void wake_up(struct wait_queue **q) +{ + struct wait_queue *tmp; + struct task_struct * p; + + if (!q || !(tmp = *q)) + return; + do { + if ((p = tmp->task) != NULL) { + if ((p->state == TASK_UNINTERRUPTIBLE) || + (p->state == TASK_INTERRUPTIBLE)) { + p->state = TASK_RUNNING; + if (p->counter > current->counter + 3) + need_resched = 1; + } + } + if (!tmp->next) { + printk("wait_queue is bad (pc = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); + printk(" tmp = %p\n",tmp); + break; + } + tmp = tmp->next; + } while (tmp != *q); +} + +void wake_up_interruptible(struct wait_queue **q) +{ + struct wait_queue *tmp; + struct task_struct * p; + + if (!q || !(tmp = *q)) + return; + do { + if ((p = tmp->task) != NULL) { + if (p->state == TASK_INTERRUPTIBLE) { + p->state = TASK_RUNNING; + if (p->counter > current->counter + 3) + need_resched = 1; + } + } + if (!tmp->next) { + printk("wait_queue is bad (eip = %p)\n", + __builtin_return_address(0)); + printk(" q = %p\n",q); + printk(" *q = %p\n",*q); + printk(" tmp = %p\n",tmp); + break; + } + tmp = tmp->next; + } while (tmp != *q); +} + +void __down(struct semaphore * sem) +{ + struct wait_queue wait = { current, NULL }; + add_wait_queue(&sem->wait, &wait); + current->state = TASK_UNINTERRUPTIBLE; + while (sem->count <= 0) { + schedule(); + current->state = TASK_UNINTERRUPTIBLE; + } + current->state = TASK_RUNNING; + remove_wait_queue(&sem->wait, &wait); +} + +static inline void __sleep_on(struct wait_queue **p, int state) +{ + unsigned long flags; + struct wait_queue wait = { current, NULL }; + + if (!p) + return; + if (current == task[0]) + panic("task[0] trying to sleep"); + current->state = state; + add_wait_queue(p, &wait); + save_flags(flags); + sti(); + schedule(); + remove_wait_queue(p, &wait); + restore_flags(flags); +} + +void interruptible_sleep_on(struct wait_queue **p) +{ + __sleep_on(p,TASK_INTERRUPTIBLE); +} + +void sleep_on(struct wait_queue **p) +{ + __sleep_on(p,TASK_UNINTERRUPTIBLE); +} + +/* + * The head for the timer-list has a "expires" field of MAX_UINT, + * and the sorting routine counts on this.. 
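+ * Insertion in add_timer() below is therefore a bare
+ *
+ *	do { p = p->next; } while (timer->expires > p->expires);
+ *
+ * the walk needs no NULL check, because it always terminates at the
+ * head's ~0 expiry, and the list stays sorted by expiry time.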
+ */ +static struct timer_list timer_head = { &timer_head, &timer_head, ~0, 0, NULL }; +#define SLOW_BUT_DEBUGGING_TIMERS 1 + +void add_timer(struct timer_list * timer) +{ + unsigned long flags; + struct timer_list *p; + +#if SLOW_BUT_DEBUGGING_TIMERS + if (timer->next || timer->prev) { + printk("add_timer() called with non-zero list from %p\n", + __builtin_return_address(0)); + return; + } +#endif + p = &timer_head; + timer->expires += jiffies; + save_flags(flags); + cli(); + do { + p = p->next; + } while (timer->expires > p->expires); + timer->next = p; + timer->prev = p->prev; + p->prev = timer; + timer->prev->next = timer; + restore_flags(flags); +} + +int del_timer(struct timer_list * timer) +{ + unsigned long flags; +#if SLOW_BUT_DEBUGGING_TIMERS + struct timer_list * p; + + p = &timer_head; + save_flags(flags); + cli(); + while ((p = p->next) != &timer_head) { + if (p == timer) { + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + restore_flags(flags); + timer->expires -= jiffies; + return 1; + } + } + if (timer->next || timer->prev) + printk("del_timer() called from %p with timer not initialized\n", + __builtin_return_address(0)); + restore_flags(flags); + return 0; +#else + save_flags(flags); + cli(); + if (timer->next) { + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + restore_flags(flags); + timer->expires -= jiffies; + return 1; + } + restore_flags(flags); + return 0; +#endif +} + +unsigned long timer_active = 0; +struct timer_struct timer_table[32]; + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + */ +unsigned long avenrun[3] = { 0,0,0 }; + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + struct task_struct **p; + unsigned long nr = 0; + + for(p = &LAST_TASK; p > &FIRST_TASK; --p) + if (*p && ((*p)->state == TASK_RUNNING || + (*p)->state == TASK_UNINTERRUPTIBLE || + (*p)->state == TASK_SWAPPING)) + nr += FIXED_1; + return nr; +} + +static inline void calc_load(void) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + if (count-- > 0) + return; + count = LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * These were ported to Linux by Philip Gladstone. + */ +static void second_overflow(void) +{ + long ltemp; + /* last time the cmos clock got updated */ + static long last_rtc_update=0; + extern int set_rtc_mmss(unsigned long); + + /* Bump the maxerror field */ + time_maxerror = (0x70000000-time_maxerror < time_tolerance) ? 
+ 0x70000000 : (time_maxerror + time_tolerance); + + /* Run the PLL */ + if (time_offset < 0) { + ltemp = (-(time_offset+1) >> (SHIFT_KG + time_constant)) + 1; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + time_offset += (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); + time_adj = - time_adj; + } else if (time_offset > 0) { + ltemp = ((time_offset-1) >> (SHIFT_KG + time_constant)) + 1; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + time_offset -= (time_adj * HZ) >> (SHIFT_SCALE - SHIFT_UPDATE); + } else { + time_adj = 0; + } + + time_adj += (time_freq >> (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE)) + + FINETUNE; + + /* Handle the leap second stuff */ + switch (time_status) { + case TIME_INS: + /* ugly divide should be replaced */ + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; /* !! */ + time_status = TIME_OOP; + printk("Clock: inserting leap second 23:59:60 GMT\n"); + } + break; + + case TIME_DEL: + /* ugly divide should be replaced */ + if (xtime.tv_sec % 86400 == 86399) { + xtime.tv_sec++; + time_status = TIME_OK; + printk("Clock: deleting leap second 23:59:59 GMT\n"); + } + break; + + case TIME_OOP: + time_status = TIME_OK; + break; + } + if (xtime.tv_sec > last_rtc_update + 660) + if (set_rtc_mmss(xtime.tv_sec) == 0) + last_rtc_update = xtime.tv_sec; + else + last_rtc_update = xtime.tv_sec - 600; /* do it again in one min */ +} + +/* + * disregard lost ticks for now.. We don't care enough. + */ +static void timer_bh(void * unused) +{ + unsigned long mask; + struct timer_struct *tp; + struct timer_list * timer; + + cli(); + while ((timer = timer_head.next) != &timer_head && timer->expires < jiffies) { + void (*fn)(unsigned long) = timer->function; + unsigned long data = timer->data; + timer->next->prev = timer->prev; + timer->prev->next = timer->next; + timer->next = timer->prev = NULL; + sti(); + fn(data); + cli(); + } + sti(); + + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (tp->expires > jiffies) + continue; + timer_active &= ~mask; + tp->fn(); + sti(); + } +} + +void tqueue_bh(void * unused) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void * unused) +{ + run_task_queue(&tq_immediate); +} + +/* + * The int argument is really a (struct pt_regs *), in case the + * interrupt wants to know from where it was called. The timer + * irq uses this to decide if it should update the user or system + * times. + */ +static void do_timer(struct pt_regs * regs) +{ + unsigned long mask; + struct timer_struct *tp; + + long ltemp, psecs; + + /* Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if (time_phase < -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + xtime.tv_usec += tick + time_adjust_step - ltemp; + } + else if (time_phase > FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + xtime.tv_usec += tick + time_adjust_step + ltemp; + } else + xtime.tv_usec += tick + time_adjust_step; + + if (time_adjust) + { + /* We are doing an adjtime thing. + * + * Modify the value of the tick for next time. + * Note that a positive delta means we want the clock + * to run fast. This means that the tick should be bigger + * + * Limit the amount of the step for *next* tick to be + * in the range -tickadj .. 
+tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + else + time_adjust_step = time_adjust; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + else + time_adjust_step = 0; + + if (xtime.tv_usec >= 1000000) { + xtime.tv_usec -= 1000000; + xtime.tv_sec++; + second_overflow(); + } + + jiffies++; + calc_load(); + if (USES_USER_TIME(regs)) { + current->utime++; + if (current != task[0]) { + if (current->priority < 15) + kstat.cpu_nice++; + else + kstat.cpu_user++; + } + /* Update ITIMER_VIRT for current task if not in a system call */ + if (current->it_virt_value && !(--current->it_virt_value)) { + current->it_virt_value = current->it_virt_incr; + send_sig(SIGVTALRM,current,1); + } + } else { + current->stime++; + if(current != task[0]) + kstat.cpu_system++; +#if defined (CONFIG_PROFILE) & !defined (__mips__) + if (prof_buffer && current != task[0]) { + unsigned long eip = regs->eip; + eip >>= 2; + if (eip < prof_len) + prof_buffer[eip]++; + } +#endif + } + /* + * check the cpu time limit on the process. + */ + if ((current->rlim[RLIMIT_CPU].rlim_max != RLIM_INFINITY) && + (((current->stime + current->utime) / HZ) >= current->rlim[RLIMIT_CPU].rlim_max)) + send_sig(SIGKILL, current, 1); + if ((current->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) && + (((current->stime + current->utime) % HZ) == 0)) { + psecs = (current->stime + current->utime) / HZ; + /* send when equal */ + if (psecs == current->rlim[RLIMIT_CPU].rlim_cur) + send_sig(SIGXCPU, current, 1); + /* and every five seconds thereafter. */ + else if ((psecs > current->rlim[RLIMIT_CPU].rlim_cur) && + ((psecs - current->rlim[RLIMIT_CPU].rlim_cur) % 5) == 0) + send_sig(SIGXCPU, current, 1); + } + + if (current != task[0] && 0 > --current->counter) { + current->counter = 0; + need_resched = 1; + } + /* Update ITIMER_PROF for the current task */ + if (current->it_prof_value && !(--current->it_prof_value)) { + current->it_prof_value = current->it_prof_incr; + send_sig(SIGPROF,current,1); + } + for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { + if (mask > timer_active) + break; + if (!(mask & timer_active)) + continue; + if (tp->expires > jiffies) + continue; + mark_bh(TIMER_BH); + } + cli(); + itimer_ticks++; + if (itimer_ticks > itimer_next) + need_resched = 1; + if (timer_head.next->expires < jiffies) + mark_bh(TIMER_BH); + if (tq_timer != &tq_last) + mark_bh(TQUEUE_BH); + sti(); +} + +asmlinkage int sys_alarm(long seconds) +{ + struct itimerval it_new, it_old; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + _setitimer(ITIMER_REAL, &it_new, &it_old); + return(it_old.it_value.tv_sec + (it_old.it_value.tv_usec / 1000000)); +} + +asmlinkage int sys_getpid(void) +{ + return current->pid; +} + +asmlinkage int sys_getppid(void) +{ + return current->p_opptr->pid; +} + +asmlinkage int sys_getuid(void) +{ + return current->uid; +} + +asmlinkage int sys_geteuid(void) +{ + return current->euid; +} + +asmlinkage int sys_getgid(void) +{ + return current->gid; +} + +asmlinkage int sys_getegid(void) +{ + return current->egid; +} + +asmlinkage int sys_nice(long increment) +{ + int newprio; + + if (increment < 0 && !suser()) + return -EPERM; + newprio = current->priority - increment; + if (newprio < 1) + newprio = 1; + if (newprio > 35) + newprio = 35; + current->priority = newprio; + return 0; +} + +static void show_task(int 
+
+asmlinkage int sys_getpid(void)
+{
+	return current->pid;
+}
+
+asmlinkage int sys_getppid(void)
+{
+	return current->p_opptr->pid;
+}
+
+asmlinkage int sys_getuid(void)
+{
+	return current->uid;
+}
+
+asmlinkage int sys_geteuid(void)
+{
+	return current->euid;
+}
+
+asmlinkage int sys_getgid(void)
+{
+	return current->gid;
+}
+
+asmlinkage int sys_getegid(void)
+{
+	return current->egid;
+}
+
+asmlinkage int sys_nice(long increment)
+{
+	int newprio;
+
+	if (increment < 0 && !suser())
+		return -EPERM;
+	newprio = current->priority - increment;
+	if (newprio < 1)
+		newprio = 1;
+	if (newprio > 35)
+		newprio = 35;
+	current->priority = newprio;
+	return 0;
+}
+
+static void show_task(int nr,struct task_struct * p)
+{
+	unsigned long free;
+	static char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
+
+	printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
+	if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *))
+		printk(stat_nam[p->state]);
+	else
+		printk(" ");
+	if (p == current)
+		printk(" current  ");
+	else
+		printk(" %08lX ", (unsigned long) p->tss.cp0_epc);
+	for (free = 1; free < 1024 ; free++) {
+		if (((unsigned long *)p->kernel_stack_page)[free])
+			break;
+	}
+	printk("%5lu %5d %6d ", free << 2, p->pid, p->p_pptr->pid);
+	if (p->p_cptr)
+		printk("%5d ", p->p_cptr->pid);
+	else
+		printk("      ");
+	if (p->p_ysptr)
+		printk("%7d", p->p_ysptr->pid);
+	else
+		printk("       ");
+	if (p->p_osptr)
+		printk(" %5d\n", p->p_osptr->pid);
+	else
+		printk("\n");
+}
+
+void show_state(void)
+{
+	int i;
+
+	printk("                         free                        sibling\n");
+	printk("  task             PC    stack   pid father child younger older\n");
+	for (i=0 ; i<NR_TASKS ; i++)
+		if (task[i])
+			show_task(i,task[i]);
+}
+
+void sched_init(void)
+{
+	bh_base[TIMER_BH].routine = timer_bh;
+	bh_base[TQUEUE_BH].routine = tqueue_bh;
+	bh_base[IMMEDIATE_BH].routine = immediate_bh;
+	if (sizeof(struct sigaction) != 16)
+		panic("Struct sigaction MUST be 16 bytes");
+
+	outb_p(0x34,0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(LATCH & 0xff , 0x40);	/* LSB */
+	outb(LATCH >> 8 , 0x40);	/* MSB */
+	if (request_irq(TIMER_IRQ,(void (*)(int)) do_timer, 0, "timer") != 0)
+		panic("Could not allocate timer IRQ!");
+}
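+
+/*
+ * Illustration, not part of the port: the two outb()s in sched_init()
+ * program channel 0 of an i8254-style timer as a rate generator
+ * (binary, mode 2, LSB then MSB).  Assuming the usual 1193180 Hz input
+ * clock, LATCH works out roughly as below, so the counter wraps -- and
+ * the timer interrupt fires -- HZ times per second.
+ */
+#if 0
+#define EXAMPLE_PIT_HZ	1193180				/* assumed input clock */
+#define EXAMPLE_LATCH	((EXAMPLE_PIT_HZ + HZ/2) / HZ)	/* rounded divisor */
+#endif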
diff --git a/arch/mips/signal.c b/arch/mips/signal.c
new file mode 100644
index 000000000..ef3246ee0
--- /dev/null
+++ b/arch/mips/signal.c
@@ -0,0 +1,440 @@
+/*
+ * linux/kernel/signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+
+#include <asm/segment.h>
+#include <asm/cachectl.h>
+
+#define _S(nr) (1<<((nr)-1))
+
+#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
+
+asmlinkage int do_signal(unsigned long oldmask, struct pt_regs * regs);
+
+asmlinkage int sys_sigprocmask(int how, sigset_t *set, sigset_t *oset)
+{
+	sigset_t new_set, old_set = current->blocked;
+	int error;
+
+	if (set) {
+		error = verify_area(VERIFY_READ, set, sizeof(sigset_t));
+		if (error)
+			return error;
+		new_set = get_fs_long((unsigned long *) set) & _BLOCKABLE;
+		switch (how) {
+		case SIG_BLOCK:
+			current->blocked |= new_set;
+			break;
+		case SIG_UNBLOCK:
+			current->blocked &= ~new_set;
+			break;
+		case SIG_SETMASK:
+			current->blocked = new_set;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	if (oset) {
+		error = verify_area(VERIFY_WRITE, oset, sizeof(sigset_t));
+		if (error)
+			return error;
+		put_fs_long(old_set, (unsigned long *) oset);
+	}
+	return 0;
+}
+
+asmlinkage int sys_sgetmask(void)
+{
+	return current->blocked;
+}
+
+asmlinkage int sys_ssetmask(int newmask)
+{
+	int old=current->blocked;
+
+	current->blocked = newmask & _BLOCKABLE;
+	return old;
+}
+
+asmlinkage int sys_sigpending(sigset_t *set)
+{
+	int error;
+	/* fill in "set" with signals pending but blocked. */
+	error = verify_area(VERIFY_WRITE, set, 4);
+	if (!error)
+		put_fs_long(current->blocked & current->signal, (unsigned long *)set);
+	return error;
+}
+
+/*
+ * atomically swap in the new signal mask, and wait for a signal.
+ */
+asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, unsigned long set)
+{
+	unsigned long mask;
+	struct pt_regs * regs = (struct pt_regs *) &restart;
+
+	mask = current->blocked;
+	current->blocked = set & _BLOCKABLE;
+#if defined (__i386__)
+	regs->eax = -EINTR;
+#elif defined (__mips__)
+	regs->reg2 = -EINTR;
+#endif
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal(mask,regs))
+			return -EINTR;
+	}
+}
+
+/*
+ * POSIX 3.3.1.3:
+ * "Setting a signal action to SIG_IGN for a signal that is pending
+ * shall cause the pending signal to be discarded, whether or not
+ * it is blocked" (but SIGCHLD is unspecified: linux leaves it alone).
+ *
+ * "Setting a signal action to SIG_DFL for a signal that is pending
+ * and whose default action is to ignore the signal (for example,
+ * SIGCHLD), shall cause the pending signal to be discarded, whether
+ * or not it is blocked"
+ *
+ * Note the silly behaviour of SIGCHLD: SIG_IGN means that the signal
+ * isn't actually ignored, but does automatic child reaping, while
+ * SIG_DFL is explicitly said by POSIX to force the signal to be ignored..
+ */
+static void check_pending(int signum)
+{
+	struct sigaction *p;
+
+	p = signum - 1 + current->sigaction;
+	if (p->sa_handler == SIG_IGN) {
+		if (signum == SIGCHLD)
+			return;
+		current->signal &= ~_S(signum);
+		return;
+	}
+	if (p->sa_handler == SIG_DFL) {
+		if (signum != SIGCONT && signum != SIGCHLD && signum != SIGWINCH)
+			return;
+		current->signal &= ~_S(signum);
+		return;
+	}
+}
+
+asmlinkage int sys_signal(int signum, unsigned long handler)
+{
+	struct sigaction tmp;
+
+	if (signum<1 || signum>32)
+		return -EINVAL;
+	if (signum==SIGKILL || signum==SIGSTOP)
+		return -EINVAL;
+	if (handler >= TASK_SIZE)
+		return -EFAULT;
+	tmp.sa_handler = (void (*)(int)) handler;
+	tmp.sa_mask = 0;
+	tmp.sa_flags = SA_ONESHOT | SA_NOMASK;
+	tmp.sa_restorer = NULL;
+	handler = (long) current->sigaction[signum-1].sa_handler;
+	current->sigaction[signum-1] = tmp;
+	check_pending(signum);
+	return handler;
+}
+
+asmlinkage int sys_sigaction(int signum, const struct sigaction * action,
+	struct sigaction * oldaction)
+{
+	struct sigaction new_sa, *p;
+
+	if (signum<1 || signum>32)
+		return -EINVAL;
+	if (signum==SIGKILL || signum==SIGSTOP)
+		return -EINVAL;
+	p = signum - 1 + current->sigaction;
+	if (action) {
+		int err = verify_area(VERIFY_READ, action, sizeof(*action));
+		if (err)
+			return err;
+		memcpy_fromfs(&new_sa, action, sizeof(struct sigaction));
+		if (new_sa.sa_flags & SA_NOMASK)
+			new_sa.sa_mask = 0;
+		else {
+			new_sa.sa_mask |= _S(signum);
+			new_sa.sa_mask &= _BLOCKABLE;
+		}
+		if (TASK_SIZE <= (unsigned long) new_sa.sa_handler)
+			return -EFAULT;
+	}
+	if (oldaction) {
+		int err = verify_area(VERIFY_WRITE, oldaction, sizeof(*oldaction));
+		if (err)
+			return err;
+		memcpy_tofs(oldaction, p, sizeof(struct sigaction));
+	}
+	if (action) {
+		*p = new_sa;
+		check_pending(signum);
+	}
+	return 0;
+}
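+
+/*
+ * Illustration, not part of the port: signal masks here are plain
+ * 32-bit words with bit (nr-1) standing for signal nr.  Assuming the
+ * generic <linux/signal.h> numbering this file includes (SIGKILL == 9,
+ * SIGSTOP == 19), the masks used above work out as in this sketch.
+ */
+#if 0
+	unsigned long k = _S(SIGKILL);	/* 1 <<  8 == 0x00000100 */
+	unsigned long s = _S(SIGSTOP);	/* 1 << 18 == 0x00040000 */
+	unsigned long b = _BLOCKABLE;	/* ~(k | s) == 0xfffbfeff */
+#endif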
+
+asmlinkage int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options);
+
+/*
+ * This sets regs->reg29 even though we don't actually use sigstacks yet..
+ */
+asmlinkage int sys_sigreturn(unsigned long __unused)
+{
+	struct sigcontext_struct context;
+	struct pt_regs * regs;
+
+	regs = (struct pt_regs *) &__unused;
+	if (verify_area(VERIFY_READ, (void *) regs->reg29, sizeof(context)))
+		goto badframe;
+	memcpy_fromfs(&context,(void *) regs->reg29, sizeof(context));
+	current->blocked = context.oldmask & _BLOCKABLE;
+	regs->reg1  = context.sc_at;
+	regs->reg2  = context.sc_v0;
+	regs->reg3  = context.sc_v1;
+	regs->reg4  = context.sc_a0;
+	regs->reg5  = context.sc_a1;
+	regs->reg6  = context.sc_a2;
+	regs->reg7  = context.sc_a3;
+	regs->reg8  = context.sc_t0;
+	regs->reg9  = context.sc_t1;
+	regs->reg10 = context.sc_t2;
+	regs->reg11 = context.sc_t3;
+	regs->reg12 = context.sc_t4;
+	regs->reg13 = context.sc_t5;
+	regs->reg14 = context.sc_t6;
+	regs->reg15 = context.sc_t7;
+	regs->reg24 = context.sc_t8;
+	regs->reg25 = context.sc_t9;
+	regs->reg29 = context.sc_sp;
+
+	/*
+	 * disable syscall checks
+	 */
+	regs->orig_reg2 = -1;
+	return regs->orig_reg2;
+badframe:
+	do_exit(SIGSEGV);
+}
+
+/*
+ * Set up a signal frame...
+ */
+static void setup_frame(struct sigaction * sa, unsigned long ** fp,
+	unsigned long pc, struct pt_regs *regs,
+	int signr, unsigned long oldmask)
+{
+	unsigned long * frame;
+
+	frame = *fp;
+	frame -= 23;
+	if (verify_area(VERIFY_WRITE,frame,23*4))
+		do_exit(SIGSEGV);
+	/*
+	 * set up the "normal" stack seen by the signal handler (iBCS2)
+	 */
+	put_fs_long(regs->reg1 , frame   );
+	put_fs_long(regs->reg2 , frame+ 1);
+	put_fs_long(regs->reg3 , frame+ 2);
+	put_fs_long(regs->reg4 , frame+ 3);
+	put_fs_long(regs->reg5 , frame+ 4);
+	put_fs_long(regs->reg6 , frame+ 5);
+	put_fs_long(regs->reg7 , frame+ 6);
+	put_fs_long(regs->reg8 , frame+ 7);
+	put_fs_long(regs->reg10, frame+ 8);
+	put_fs_long(regs->reg11, frame+ 9);
+	put_fs_long(regs->reg12, frame+10);
+	put_fs_long(regs->reg13, frame+11);
+	put_fs_long(regs->reg14, frame+12);
+	put_fs_long(regs->reg15, frame+13);
+	put_fs_long(regs->reg16, frame+14);
+	put_fs_long(regs->reg17, frame+15);
+	put_fs_long(regs->reg24, frame+16);
+	put_fs_long(regs->reg25, frame+17);
+	put_fs_long(regs->reg29, frame+18);
+	put_fs_long(pc         , frame+19);
+	put_fs_long(oldmask    , frame+20);
+	/*
+	 * set up the return code...
+	 *
+	 *	.set	noat
+	 *	li	$1,__NR_sigreturn
+	 *	syscall
+	 *	.set	at
+	 */
+	put_fs_long(0x24010077, frame+21);	/* li $1,119 */
+	put_fs_long(0x000000c0, frame+22);	/* syscall */
+	*fp = frame;
+	/*
+	 * Flush caches so the instructions will be correctly executed.
+	 */
+	cacheflush(frame, 23*4, BCACHE);
+}
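+
+/*
+ * Illustration, not part of the port: the frame laid out above, from
+ * low to high addresses, one 32-bit word per slot (note that reg9/t1
+ * is not saved here):
+ *
+ *	frame+ 0 .. frame+ 7	reg1-reg8    (at, v0, v1, a0-a3, t0)
+ *	frame+ 8 .. frame+15	reg10-reg17  (t2-t7, s0, s1)
+ *	frame+16 .. frame+17	reg24, reg25 (t8, t9)
+ *	frame+18		reg29 (sp)
+ *	frame+19		pc at the time of the signal
+ *	frame+20		oldmask
+ *	frame+21 .. frame+22	return code (li $1,__NR_sigreturn; syscall)
+ */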
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ *
+ * Note that we go through the signals twice: once to check the signals that
+ * the kernel can handle, and then we build all the user-level signal handling
+ * stack-frames in one go after that.
+ */
+asmlinkage int do_signal(unsigned long oldmask, struct pt_regs * regs)
+{
+	unsigned long mask = ~current->blocked;
+	unsigned long handler_signal = 0;
+	unsigned long *frame = NULL;
+	unsigned long pc = 0;
+	unsigned long signr;
+	struct sigaction * sa;
+
+	while ((signr = current->signal & mask)) {
+#if defined (__i386__)
+		__asm__("bsf %2,%1\n\t"
+			"btrl %1,%0"
+			:"=m" (current->signal),"=r" (signr)
+			:"1" (signr));
+#elif defined (__mips__)
+		/*
+		 * The MIPS has no bit scan instruction, so find the
+		 * lowest set bit and clear it by hand.
+		 */
+		{
+			unsigned long w = signr;
+
+			signr = 0;
+			while (!(w & 1)) {
+				w >>= 1;
+				signr++;
+			}
+			current->signal &= ~(1 << signr);
+		}
+#endif
+		sa = current->sigaction + signr;
+		signr++;
+		if ((current->flags & PF_PTRACED) && signr != SIGKILL) {
+			current->exit_code = signr;
+			current->state = TASK_STOPPED;
+			notify_parent(current);
+			schedule();
+			if (!(signr = current->exit_code))
+				continue;
+			current->exit_code = 0;
+			if (signr == SIGSTOP)
+				continue;
+			if (_S(signr) & current->blocked) {
+				current->signal |= _S(signr);
+				continue;
+			}
+			sa = current->sigaction + signr - 1;
+		}
+		if (sa->sa_handler == SIG_IGN) {
+			if (signr != SIGCHLD)
+				continue;
+			/* check for SIGCHLD: it's special */
+			while (sys_waitpid(-1,NULL,WNOHANG) > 0)
+				/* nothing */;
+			continue;
+		}
+		if (sa->sa_handler == SIG_DFL) {
+			if (current->pid == 1)
+				continue;
+			switch (signr) {
+			case SIGCONT: case SIGCHLD: case SIGWINCH:
+				continue;
+
+			case SIGSTOP: case SIGTSTP: case SIGTTIN: case SIGTTOU:
+				if (current->flags & PF_PTRACED)
+					continue;
+				current->state = TASK_STOPPED;
+				current->exit_code = signr;
+				if (!(current->p_pptr->sigaction[SIGCHLD-1].sa_flags &
+						SA_NOCLDSTOP))
+					notify_parent(current);
+				schedule();
+				continue;
+
+			case SIGQUIT: case SIGILL: case SIGTRAP:
+			case SIGIOT: case SIGFPE: case SIGSEGV:
+				if (current->binfmt && current->binfmt->core_dump) {
+					if (current->binfmt->core_dump(signr, regs))
+						signr |= 0x80;
+				}
+				/* fall through */
+			default:
+				current->signal |= _S(signr & 0x7f);
+				do_exit(signr);
+			}
+		}
+		/*
+		 * OK, we're invoking a handler
+		 */
+		if (regs->orig_reg2 >= 0) {
+			if (regs->reg2 == -ERESTARTNOHAND ||
+			    (regs->reg2 == -ERESTARTSYS &&
+			     !(sa->sa_flags & SA_RESTART)))
+				regs->reg2 = -EINTR;
+		}
+		handler_signal |= 1 << (signr-1);
+		mask &= ~sa->sa_mask;
+	}
+	if (regs->orig_reg2 >= 0 &&
+	    (regs->reg2 == -ERESTARTNOHAND ||
+	     regs->reg2 == -ERESTARTSYS ||
+	     regs->reg2 == -ERESTARTNOINTR)) {
+		regs->reg2 = regs->orig_reg2;
+		regs->cp0_epc -= 4;
+	}
+	if (!handler_signal)	/* no handler will be called - return 0 */
+		return 0;
+	pc = regs->cp0_epc;
+	frame = (unsigned long *) regs->reg29;
+	signr = 1;
+	sa = current->sigaction;
+	for (mask = 1 ; mask ; sa++,signr++,mask += mask) {
+		if (mask > handler_signal)
+			break;
+		if (!(mask & handler_signal))
+			continue;
+		setup_frame(sa,&frame,pc,regs,signr,oldmask);
+		pc = (unsigned long) sa->sa_handler;
+		if (sa->sa_flags & SA_ONESHOT)
+			sa->sa_handler = NULL;
+		/*
+		 * force a kernel-mode page-in of the signal
+		 * handler to reduce races
+		 */
+		__asm__(".set\tnoat\n\t"
+			"lwu\t$1,(%0)\n\t"
+			".set\tat\n\t"
+			:
+			:"r" ((char *) pc)
+			:"$1");
+		current->blocked |= sa->sa_mask;
+		oldmask |= sa->sa_mask;
+	}
+	regs->reg29 = (unsigned long) frame;
+	regs->cp0_epc = pc;	/* "return" to the first handler */
+	return 1;
+}
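+
+/*
+ * Illustration, not part of the port: do_signal() above reduced to its
+ * skeleton.  Pass one decides the fate of each pending signal; pass two
+ * stacks one frame per surviving handler, so the last frame set up is
+ * the first handler to run.  Sketch only; the real loops are above.
+ */
+#if 0
+	while ((signr = current->signal & mask)) {
+		/* pass 1: ptrace stops, SIG_IGN, SIG_DFL, core dumps */
+		handler_signal |= 1 << (signr-1);
+		mask &= ~sa->sa_mask;
+	}
+	for (mask = 1, signr = 1, sa = current->sigaction; mask;
+	     sa++, signr++, mask += mask)
+		if (mask & handler_signal)	/* pass 2 */
+			setup_frame(sa,&frame,pc,regs,signr,oldmask);
+#endif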
diff --git a/arch/mips/splx.c b/arch/mips/splx.c
new file mode 100644
index 000000000..5ff5e8284
--- /dev/null
+++ b/arch/mips/splx.c
@@ -0,0 +1,30 @@
+/*
+ * splx.c - SYSV DDI/DKI ipl manipulation functions
+ *
+ * Internally, many unices use a range of different interrupt
+ * privilege levels, i.e. from "allow all interrupts" (7) down to
+ * "allow no interrupts" (0) under SYSV.
+ *
+ * This simple splx() function behaves like the SYSV DDI/DKI function,
+ * although since Linux only implements the equivalent of level 0 (cli)
+ * and level 7 (sti), this implementation only provides those two levels.
+ *
+ * Unlike the current Linux routines, splx() also returns the
+ * old privilege level so that it can be restored.
+ */
+
+#include <asm/system.h>
+#include <asm/mipsregs.h>
+
+int splx (int new_level)
+{
+	register int old_level, tmp;
+
+	save_flags(tmp);
+	old_level = ((tmp & (ST0_IE|ST0_ERL|ST0_EXL)) == ST0_IE) ? 7 : 0;
+	if (new_level)
+		sti();
+	else
+		cli();
+	return old_level;
+}
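+
+/*
+ * Illustration, not part of the port: the intended SYSV-style
+ * save/restore pattern.  Keeping the returned level in "s" makes
+ * nested critical sections restore correctly.
+ */
+#if 0
+	int s;
+
+	s = splx(0);		/* level 0: allow no interrupts */
+	/* ... touch data shared with an interrupt handler ... */
+	(void) splx(s);		/* put the old level back */
+#endif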
printk("sp : %08lx\n", regs->reg29); + printk("fp/s8: %08lx\n", regs->reg30); + printk("ra : %08lx\n", regs->reg31); + + /* + * Saved cp0 registers + */ + printk("EPC : %08lx\nErrorEPC: %08lx\nStatus: %08lx\n", + regs->cp0_epc, regs->cp0_errorepc, regs->cp0_status); + /* + * Some goodies... + */ + printk("Int : %ld\n", regs->interrupt); + + /* + * Dump the stack + */ + if (STACK_MAGIC != *(unsigned long *)current->kernel_stack_page) + printk("Corrupted stack page\n"); + printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)\nStack: ", + current->comm, current->pid, 0xffff & i, + current->kernel_stack_page); + for(i=0;i<5;i++) + printk("%08lx ", *sp++); + + printk("\nCode: "); + for(i=0;i<5;i++) + printk("%08lx ", *pc++); + printk("\n"); + do_exit(SIGSEGV); +} + +void trap_init(void) +{ + int i; + +#if 0 + set_except_vector(0, handle_int); + set_except_vector(1, handle_mod); + set_except_vector(2, handle_tlbl); + set_except_vector(3, handle_tlbs); + set_except_vector(4, handle_adel); + set_except_vector(5, handle_ades); + set_except_vector(6, handle_ibe); + set_except_vector(7, handle_dbe); + set_except_vector(8, handle_sys); + set_except_vector(9, handle_bp); + set_except_vector(10, handle_ri); + set_except_vector(11, handle_cpu); + set_except_vector(12, handle_ov); + set_except_vector(13, handle_tr); + set_except_vector(14, handle_reserved); + set_except_vector(15, handle_fpe); + + for (i=16;i<256;i++) + set_except_vector(i, handle_reserved); +#endif +} diff --git a/arch/mips/vm86.c b/arch/mips/vm86.c new file mode 100644 index 000000000..454b35fe0 --- /dev/null +++ b/arch/mips/vm86.c @@ -0,0 +1,14 @@ +/* + * arch/mips/vm86.c + * + * Copyright (C) 1994 Waldorf GMBH, + * written by Ralf Baechle + */ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <linux/vm86.h> + +asmlinkage int sys_vm86(struct vm86_struct * v86) +{ + return -ENOSYS; +} |