Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 180
1 file changed, 95 insertions, 85 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f29b7dc02c39..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,10 +321,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
-		if (! ptep)
-			/* This can happen on truncate, or if an
-			 * mmap() is aborted due to an error before
-			 * the prefault */
+		if (!ptep)
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+			unsigned long idx)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	hugetlb_prefault_arch_hook(mm);
-
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	struct page *page;
+	int err;
+	struct inode *inode = mapping->host;
+	unsigned long size;
+
+retry:
+	page = find_lock_page(mapping, idx);
+	if (page)
+		goto out;
+
+	/* Check to make sure the mapping hasn't been truncated */
+	size = i_size_read(inode) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto out;
+
+	if (hugetlb_get_quota(mapping))
+		goto out;
+	page = alloc_huge_page();
+	if (!page) {
+		hugetlb_put_quota(mapping);
+		goto out;
+	}
 
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		spin_lock(&mm->page_table_lock);
-		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-		spin_unlock(&mm->page_table_lock);
+	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	if (err) {
+		put_page(page);
+		hugetlb_put_quota(mapping);
+		if (err == -EEXIST)
+			goto retry;
+		page = NULL;
 	}
 out:
-	return ret;
+	return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully.  Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
+	unsigned long idx;
+	unsigned long size;
 	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+
+	pte = huge_pte_alloc(mm, address);
+	if (!pte)
+		goto out;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	/*
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
+	 */
+	page = find_lock_huge_page(mapping, idx);
+	if (!page)
+		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	pte = huge_pte_offset(mm, address);
-	if (pte && !pte_none(*pte))
-		ret = VM_FAULT_MINOR;
+	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto backout;
+
+	ret = VM_FAULT_MINOR;
+	if (!pte_none(*pte))
+		goto backout;
+
+	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
+	unlock_page(page);
+out:
 	return ret;
+
+backout:
+	spin_unlock(&mm->page_table_lock);
+	hugetlb_put_quota(mapping);
+	unlock_page(page);
+	put_page(page);
+	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
-	BUG_ON(!is_vm_hugetlb_page(vma));
-
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
 
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			/* Some archs (sparc64, sh*) have multiple
-			 * pte_ts to each hugepage.  We have to make
-			 * sure we get the first, for the page
-			 * indexing below to work. */
-			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-			/* the hugetlb file might have been truncated */
-			if (!pte || pte_none(*pte)) {
-				remainder = 0;
-				if (!i)
-					i = -EFAULT;
-				break;
-			}
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage.  We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+		if (!pte || pte_none(*pte)) {
+			int ret;
 
-			WARN_ON(!PageCompound(page));
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, 0);
+			spin_lock(&mm->page_table_lock);
+			if (ret == VM_FAULT_MINOR)
+				continue;
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
+
+		if (pages) {
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 			get_page(page);
 			pages[i] = page;
 		}
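
With this patch, huge pages are no longer populated up front at mmap() time by hugetlb_prefault(); hugetlb_fault() now allocates and maps each huge page on first access. The sketch below is not part of the commit: it is a minimal user-space program that exercises the new demand-fault path by touching a hugetlbfs-backed mapping one huge page at a time. The mount point /mnt/huge, the file name demo, and the 2MB HPAGE_SIZE value are illustrative assumptions, not taken from the diff.

/* Illustrative only: assumes hugetlbfs mounted at /mnt/huge and 2MB huge pages. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */
#define NR_HPAGES	4

int main(void)
{
	char *p;
	unsigned long i;
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	p = mmap(NULL, NR_HPAGES * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Each write below faults in one huge page via hugetlb_fault();
	 * before this patch the whole range was populated by
	 * hugetlb_prefault() when the mapping was created.
	 */
	for (i = 0; i < NR_HPAGES; i++)
		p[i * HPAGE_SIZE] = 1;

	munmap(p, NR_HPAGES * HPAGE_SIZE);
	close(fd);
	unlink("/mnt/huge/demo");
	return 0;
}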