#define NULL 0

/*
 * mm/simp.c -- simple allocator for cached objects
 *
 * (C) 1997 Thomas Schoebel-Theuer
 */

#include <linux/simp.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <asm/spinlock.h>

/* The next two defines can be independently enabled for debugging */
/*#define DEBUG*/
/*#define DEAD_BEEF*/

#ifdef DEAD_BEEF
#define DEBUG_BEEF 1
#else
#define DEBUG_BEEF 0
#endif

#ifdef __SMP__
#define NR_PROCESSORS NR_CPUS
#define GLOBAL_ORDER ORDER
#define GLOBAL_SIZE CHUNK_SIZE
#else
#define NR_PROCESSORS 1
#define GLOBAL_ORDER 0
#define GLOBAL_SIZE PAGE_SIZE
#endif

/* size of the per-processor post buffers, in pointers */
#define POSTBUFFER_SIZE 63

/* each chunk is 1<<ORDER naturally aligned pages */
#define ORDER 2
#define CHUNK_SIZE (PAGE_SIZE*(1<<ORDER))
#define CHUNK_BASE(ptr) ((struct header*)(((unsigned long)(ptr)) & ~(CHUNK_SIZE-1)))
#define CHUNK_END(hdr) ((void**)(((char*)(hdr)) + CHUNK_SIZE))

#define COLOR_INCREMENT L1_CACHE_BYTES
#define ALIGN_CACHE(x) ((((x) - 1) | (L1_CACHE_BYTES - 1)) + 1)
#define ELEM_SIZE sizeof(void*)
#define HEADER_SIZE ALIGN_CACHE(sizeof(struct header))

struct per_processor {
	void ** buffer_pos;
	void * postbuffer[POSTBUFFER_SIZE];
};

struct simp {
	/* changed often, one instance per processor */
	struct per_processor private[NR_PROCESSORS];
	/* changed rarely */
	spinlock_t lock;
	struct header * usable_list;
	long real_size;		/* object size after cacheline rounding */
	long max_elems;		/* objects per chunk */
	structor first_ctor;	/* called when handing out a fresh object */
	structor again_ctor;	/* called when handing out a recycled object */
	structor dtor;		/* called before a chunk is given back */
	long size;		/* requested object size */
	long color;		/* current cache-coloring offset */
	long max_color;
	char name[16];
};

struct header {
	/* modified under the simp lock: */
	void ** index;		/* top of the stack of free elements */
	void ** fresh;		/* boundary between recycled and virgin elements */
	struct header * next;	/* link in simp->usable_list */
	/* fixed after chunk setup: */
	void ** emptypos;	/* index value when the chunk is completely unused */
	struct simp * father;
	structor first_ctor;
	structor again_ctor;
#ifdef DEBUG
	char magic[32];
#endif
};

struct global_data {
	spinlock_t lock;
	long nr_simps;
	long changed_flag;	/* set whenever chunks may have become empty */
	struct simp simps[1];	/* really extends to the end of the
				 * GLOBAL_SIZE area, see MAX_SIMPS */
};

#define MAX_SIMPS ((GLOBAL_SIZE - sizeof(struct global_data)) / sizeof(struct simp) + 1)

static struct global_data * global = NULL;

#ifdef DEBUG
static char global_magic[32] = "SIMP chunk header magic";
#endif

struct simp * simp_create(char * name, long size,
			  structor first_ctor,
			  structor again_ctor,
			  structor dtor)
{
	struct simp * simp;
	long real_size;
	long fraction;
	int cpu;

	if(!global) {
		global = (struct global_data*)
			__get_free_pages(GFP_KERNEL, GLOBAL_ORDER, 0);
		if(!global)
			return NULL;
		memset(global, 0, GLOBAL_SIZE);
		spin_lock_init(&global->lock);
	}
	spin_lock(&global->lock);
	if(global->nr_simps >= MAX_SIMPS) {
		spin_unlock(&global->lock);
		printk("SIMP: too many simps allocated\n");
		return NULL;
	}
	simp = &global->simps[global->nr_simps++];
	spin_unlock(&global->lock);

	memset(simp, 0, sizeof(struct simp));
	spin_lock_init(&simp->lock);
	strncpy(simp->name, name, 15);
	simp->size = size;
	simp->real_size = real_size = ALIGN_CACHE(size);
	/* allow aggregation of very small objects in 2-power fractions of
	 * cachelines */
	fraction = COLOR_INCREMENT / 2;
	while(size <= fraction && fraction >= sizeof(void*)) {
		simp->real_size = fraction;
		fraction >>= 1;
	}
	real_size = simp->real_size;
	simp->first_ctor = first_ctor;
	simp->again_ctor = again_ctor;
	simp->dtor = dtor;

	/* each element also occupies one index slot at the chunk end */
	real_size += sizeof(void*);
	simp->max_elems = (CHUNK_SIZE - HEADER_SIZE) / real_size;
	simp->max_color = (CHUNK_SIZE - HEADER_SIZE) % real_size;

	for(cpu = 0; cpu < NR_PROCESSORS; cpu++) {
		struct per_processor * private = &simp->private[cpu];
		private->buffer_pos = private->postbuffer;
	}
	return simp;
}

/* Do *not* inline this, it clobbers too many registers... */
static void alloc_header(struct simp * simp)
{
	struct header * hdr;
	char * ptr;
	void ** index;
	long count;

	spin_unlock(&simp->lock);
	for(;;) {
		hdr = (struct header*)__get_free_pages(GFP_KERNEL, ORDER, 0);
		if(hdr)
			break;
		if(!simp_garbage()) {
			/* give up, but leave the lock held as the caller
			 * expects it */
			spin_lock(&simp->lock);
			return;
		}
	}
#ifdef DEBUG
	if(CHUNK_BASE(hdr) != hdr)
		panic("simp: bad kernel page alignment");
#endif
	memset(hdr, 0, HEADER_SIZE);
#ifdef DEBUG
	memcpy(hdr->magic, global_magic, sizeof(global_magic));
#endif
	hdr->father = simp;
	hdr->again_ctor = simp->again_ctor;
	hdr->first_ctor = simp->first_ctor;

	/* note: races on simp->color don't produce any error :-) */
	ptr = ((char*)hdr) + HEADER_SIZE + simp->color;
	index = CHUNK_END(hdr);
	for(count = 0; count < simp->max_elems; count++) {
		*--index = ptr;
		ptr += simp->real_size;
		/* note: constructors are not called here in a bunch but
		 * instead at each single simp_alloc(), in order
		 * to maximize the chance that the cache will be
		 * polluted after a simp_alloc() anyway,
		 * and not here. */
	}
	hdr->index = hdr->fresh = hdr->emptypos = index;

	spin_lock(&simp->lock);
	simp->color += COLOR_INCREMENT;
	if(simp->color >= simp->max_color)
		simp->color = 0;
	hdr->next = simp->usable_list;
	simp->usable_list = hdr;
}
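/*
 * A worked example of the chunk geometry (a sketch only: it assumes
 * PAGE_SIZE is 4096 and L1_CACHE_BYTES is 32, both of which vary by
 * architecture).  CHUNK_SIZE is then 4096*(1<<2) = 16384 bytes.  For a
 * simp of size 100, real_size is ALIGN_CACHE(100) = 128 bytes plus one
 * index slot of sizeof(void*) bytes, so max_elems is
 * (16384 - HEADER_SIZE) / 132.  CHUNK_BASE() recovers the header from
 * any object pointer by masking off the low 14 address bits, which is
 * what lets simp_free() work without a per-object back-pointer.  The
 * function below is illustrative only and not part of this file.
 */
#if 0
static void example_geometry(void)
{
	struct simp * pool = simp_create("example", 100, NULL, NULL, NULL);
	void * obj;

	if(!pool)
		return;
	obj = simp_alloc(pool);
	/* the owning chunk header is found by pure address arithmetic: */
	if(obj && CHUNK_BASE(obj)->father != pool)
		printk("SIMP: example: bad chunk base\n");
	if(obj)
		simp_free(obj);
}
#endif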
/* current x86 memcpy() is horribly moving around registers for nothing,
 * does unnecessary work if the size is divisible by a power of two,
 * and it clobbers way too many registers.
 * This results in nearly every other register being transferred to the stack.
 * Fixing this would be a major win for the whole kernel!
 */
static void ** bunch_alloc(struct simp * simp, void ** buffer)
{
	struct header * hdr;
	void ** index;
	void ** to;
	void ** end;
	structor todo;
	long length;

	spin_lock(&simp->lock);
	hdr = simp->usable_list;
	if(!hdr) {
		alloc_header(simp);
		hdr = simp->usable_list;
		if(!hdr) {
			spin_unlock(&simp->lock);
			*buffer = NULL;
			return buffer+1;
		}
	}
	index = hdr->index;
	end = hdr->fresh;
	todo = hdr->again_ctor;
	if(index == end) {
		end = CHUNK_END(hdr);
		todo = hdr->first_ctor;
	}
	to = index + POSTBUFFER_SIZE/2;
	if(to >= end) {
		to = end;
		if(to == CHUNK_END(hdr)) {
			/* chunk exhausted, take it off the usable list */
			simp->usable_list = hdr->next;
			hdr->next = NULL;
		}
	}
	if(to > hdr->fresh)
		hdr->fresh = to;
	hdr->index = to;

	length = ((unsigned long)to) - (unsigned long)index;
	to = buffer + (length/sizeof(void**));
	memcpy(buffer, index, length);
	spin_unlock(&simp->lock);

	if(todo) {
		do {
			todo(*buffer++);
		} while(buffer < to);
	}
	return to;
}

void * simp_alloc(struct simp * simp)
{
#ifdef __SMP__
	const long cpu = smp_processor_id();
	struct per_processor * priv = &simp->private[cpu];
#else
#define priv (&simp->private[0]) /*fool gcc to use no extra register*/
#endif
	void ** buffer_pos = priv->buffer_pos;
	void * res;

	if(buffer_pos == priv->postbuffer) {
		buffer_pos = bunch_alloc(simp, buffer_pos);
	}
	buffer_pos--;
	res = *buffer_pos;
	priv->buffer_pos = buffer_pos;
	return res;
}

#ifdef DEBUG
static long check_header(struct header * hdr, void * ptr)
{
	void ** test;

	if(!hdr) {
		printk("SIMP: simp_free() with NULL pointer\n");
		return 1;
	}
	if(strncmp(hdr->magic, global_magic, 32)) {
		printk("SIMP: simp_free() with bad ptr %p, or header corruption\n",
		       ptr);
		return 1;
	}
	/* This is brute force, but I don't want to pay for any
	 * overhead if debugging is not enabled, in particular
	 * no space overhead for keeping hashtables etc.
	 */
	test = hdr->index;
	while(test < CHUNK_END(hdr)) {
		if(*test++ == ptr) {
			printk("SIMP: trying to simp_free(%p) again\n", ptr);
			return 1;
		}
	}
	return 0;
}
#endif

static void ** bunch_free(struct simp * simp, void ** buffer)
{
	void ** stop;

	stop = buffer - POSTBUFFER_SIZE/3;
	spin_lock(&simp->lock);
	/* the loop is unrolled twice: two elements are given back per pass */
	while(buffer > stop) {
		void * elem = buffer[-1];
		struct header * hdr = CHUNK_BASE(elem);
		void ** index = hdr->index;

		/* an exhausted chunk (index at the chunk end) was taken off
		 * the usable list in bunch_alloc(); link it back in now that
		 * it has a free element again.  Testing hdr->next for NULL
		 * would misdetect the list tail as being off the list. */
		if(index == CHUNK_END(hdr)) {
			hdr->next = simp->usable_list;
			simp->usable_list = hdr;
		}
		index--;
		hdr->index = index;
		*index = elem;

		buffer -= 2;
		elem = *buffer;
		hdr = CHUNK_BASE(elem);
		index = hdr->index;
		if(index == CHUNK_END(hdr)) {
			hdr->next = simp->usable_list;
			simp->usable_list = hdr;
		}
		index--;
		hdr->index = index;
		*index = elem;
	}
	spin_unlock(&simp->lock);
	global->changed_flag = 1;
	return buffer;
}

void simp_free(void * objp)
{
	struct header * hdr;
	void ** buffer_pos;
	struct per_processor * private;
#ifdef __SMP__
	const long cpu = smp_processor_id();
#else
	const long cpu = 0;
#endif

	hdr = CHUNK_BASE(objp);
#ifdef DEBUG
	if(check_header(hdr, objp))
		return;
#endif

	private = &hdr->father->private[cpu];
	buffer_pos = private->buffer_pos;
	if(buffer_pos >= private->postbuffer+POSTBUFFER_SIZE) {
		buffer_pos = bunch_free(hdr->father, buffer_pos);
	}
	*buffer_pos++ = objp;
	private->buffer_pos = buffer_pos;

#ifdef DEAD_BEEF
	{
		unsigned int * ptr = (unsigned int*)objp;
		int count = (hdr->father->real_size - ELEM_SIZE) /
			sizeof(unsigned int);
		while(count--)
			*ptr++ = 0xdeadbeef;
	}
#endif
}
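/*
 * Sketch of typical client usage.  The "foo" pool and all names below are
 * hypothetical and only illustrate the constructor contract: first_ctor
 * runs the first time an element of a fresh chunk is handed out,
 * again_ctor when a recycled element is handed out again, so state that
 * survives simp_free() (e.g. an initialized lock) only needs to be set up
 * in first_ctor.  Illustrative only, not part of this file.
 */
#if 0
struct foo {
	int refcount;
	spinlock_t lock;
};

static void foo_first_ctor(void * ptr)
{
	struct foo * foo = ptr;

	spin_lock_init(&foo->lock);	/* done once per element lifetime */
	foo->refcount = 0;
}

static void foo_again_ctor(void * ptr)
{
	/* only reset what a fresh allocation really needs */
	((struct foo *)ptr)->refcount = 0;
}

static struct simp * foo_pool;

static int foo_init(void)
{
	foo_pool = simp_create("foo", sizeof(struct foo),
			       foo_first_ctor, foo_again_ctor, NULL);
	return foo_pool ? 0 : -1;
}
#endif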
long simp_garbage(void)
{
	int i;
	int res;

	if(!global || !global->changed_flag)
		return 0; /* shortcut */

	/* Note: costs do not matter here.  Any heavy thrashing of
	 * simp chunks that could be caused by pools stealing each
	 * other's memory has to be considered a BUG :-)
	 * Simply avoid memory shortages by conservative allocation
	 * policies.
	 */
	global->changed_flag = 0;
	res = 0;
	for(i = 0; i < global->nr_simps; i++) {
		struct simp * simp = &global->simps[i];
		struct header ** base = &simp->usable_list;
		struct header * del;

		spin_lock(&simp->lock);
		del = *base;
		while(del) {
			if(del->index == del->emptypos) {
				/* completely unused chunk: destruct its
				 * elements and give the pages back */
				if(simp->dtor) {
					void ** ptr = del->index;
					while(ptr < CHUNK_END(del)) {
						simp->dtor(*ptr++);
					}
				}
				*base = del->next;
#ifdef DEBUG
				memset(del, 0, CHUNK_SIZE);
#endif
				free_pages((unsigned long)del, ORDER);
				res++;
			} else
				base = &del->next;
			del = *base;
		}
		spin_unlock(&simp->lock);
	}
	return res;
}
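/*
 * Sketch of how a memory-shortage path might drive simp_garbage(); the
 * caller below is hypothetical.  The return value is the number of chunks
 * given back, so looping until it returns 0 reclaims everything currently
 * reclaimable.  Note that alloc_header() above already retries through
 * simp_garbage() on its own when __get_free_pages() fails.  Illustrative
 * only, not part of this file.
 */
#if 0
static void example_shrink_simp_memory(void)
{
	while(simp_garbage())
		/* each pass frees completely empty chunks of all simps */;
}
#endif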