Add __tlb_remove_folio_pages(), which will remove multiple consecutive pages that belong to the same large folio, instead of only a single page.  We'll be using this function when optimizing unmapping/zapping of large folios that are mapped by PTEs.

We're using the remaining spare bit in an encoded_page to indicate that the next encoded page in the array actually contains a shifted "nr_pages" value.  Teach swap/freeing code about putting multiple folio references, and delayed rmap handling to remove page ranges of a folio.

This extension allows for still gathering almost as many small folios as we used to (-1, because we have to prepare for a possibly bigger next entry), while also allowing us to gather consecutive pages that belong to the same large folio.

Note that we don't pass the folio pointer, because it is not required for now.  Further, we don't support page_size != PAGE_SIZE; it won't be required for simple PTE batching.

We have to provide a separate s390 implementation, but it's fairly straightforward.

Another, more invasive and likely more expensive, approach would be to use folio+range or a PFN range instead of page+nr_pages.  But we should do that consistently for the whole mmu_gather.  For now, let's keep it simple and add "nr_pages" only.

Note that it is now possible to gather significantly more pages: in the past, we were able to gather ~10000 pages; now we can also gather ~5000 folio fragments that span multiple pages.  A folio fragment on x86-64 can span up to 512 pages (2 MiB THP) and on arm64 with 64k base pages in theory 8192 pages (512 MiB THP).

Gathering more memory is not considered something we should worry about, especially because these are already corner cases.  While we can gather more total memory, we won't free more folio fragments.  As long as page freeing time primarily only depends on the number of involved folios, there is no effective change for !preempt configurations.  However, we'll adjust tlb_batch_pages_flush() separately to handle corner cases where page freeing time grows proportionally with the actual memory size.

Link: https://lkml.kernel.org/r/20240214204435.167852-9-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
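For context, the consumer side of the "nr_pages" convention described above can be pictured roughly as follows.  This is a minimal sketch, not the actual swap/freeing code: it assumes the encoded_page helpers encoded_page_ptr(), encoded_page_flags() and encoded_nr_pages() from include/linux/mm_types.h, and the helper name put_encoded_page_refs() is purely illustrative.

/*
 * Illustrative sketch only (hypothetical helper, assumes <linux/mm.h>):
 * walk an encoded_page array and drop one folio reference per entry,
 * unless the spare bit says the next entry carries a (shifted) page count.
 */
static void put_encoded_page_refs(struct encoded_page **pages, int nr)
{
        for (int i = 0; i < nr; i++) {
                struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
                unsigned int refs = 1;

                /*
                 * If the spare bit is set, the next array entry is not a
                 * page pointer but the number of consecutive pages of this
                 * folio; drop that many references at once.
                 */
                if (encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)
                        refs = encoded_nr_pages(pages[++i]);

                folio_put_refs(folio, refs);
        }
}

The producer side of the same convention can be seen in the s390 __tlb_remove_folio_pages() implementation below, which emits exactly such a two-entry array.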
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _S390_TLB_H
#define _S390_TLB_H

/*
 * TLB flushing on s390 is complicated. The following requirement
 * from the principles of operation is the most arduous:
 *
 * "A valid table entry must not be changed while it is attached
 * to any CPU and may be used for translation by that CPU except to
 * (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY,
 * or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page
 * table entry, or (3) make a change by means of a COMPARE AND SWAP
 * AND PURGE instruction that purges the TLB."
 *
 * The modification of a pte of an active mm struct therefore is
 * a two step process: i) invalidate the pte, ii) store the new pte.
 * This is true for the page protection bit as well.
 * The only possible optimization is to flush at the beginning of
 * a tlb_gather_mmu cycle if the mm_struct is currently not in use.
 *
 * Pages used for the page tables is a different story. FIXME: more
 */

void __tlb_remove_table(void *_table);
static inline void tlb_flush(struct mmu_gather *tlb);
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
                struct page *page, bool delay_rmap, int page_size);
static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap);

#define tlb_flush tlb_flush
#define pte_free_tlb pte_free_tlb
#define pmd_free_tlb pmd_free_tlb
#define p4d_free_tlb p4d_free_tlb
#define pud_free_tlb pud_free_tlb

#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>

/*
 * Release the page cache reference for a pte removed by
 * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
 * has already been freed, so just do free_page_and_swap_cache.
 *
 * s390 doesn't delay rmap removal.
 */
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
                struct page *page, bool delay_rmap, int page_size)
{
        VM_WARN_ON_ONCE(delay_rmap);

        free_page_and_swap_cache(page);
        return false;
}

static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap)
{
        struct encoded_page *encoded_pages[] = {
                encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT),
                encode_nr_pages(nr_pages),
        };

        VM_WARN_ON_ONCE(delay_rmap);
        VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));

        free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
        return false;
}

static inline void tlb_flush(struct mmu_gather *tlb)
{
        __tlb_flush_mm_lazy(tlb->mm);
}

/*
 * pte_free_tlb frees a pte table and clears the CRSTE for the
 * page table from the tlb.
 */
static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
                unsigned long address)
{
        __tlb_adjust_range(tlb, address, PAGE_SIZE);
        tlb->mm->context.flush_mm = 1;
        tlb->freed_tables = 1;
        tlb->cleared_pmds = 1;
        if (mm_alloc_pgste(tlb->mm))
                gmap_unlink(tlb->mm, (unsigned long *)pte, address);
        tlb_remove_ptdesc(tlb, pte);
}

/*
 * pmd_free_tlb frees a pmd table and clears the CRSTE for the
 * segment table entry from the tlb.
 * If the mm uses a two level page table the single pmd is freed
 * as the pgd. pmd_free_tlb checks the asce_limit against 2GB
 * to avoid the double free of the pmd in this case.
 */
static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
                unsigned long address)
{
        if (mm_pmd_folded(tlb->mm))
                return;
        pagetable_pmd_dtor(virt_to_ptdesc(pmd));
        __tlb_adjust_range(tlb, address, PAGE_SIZE);
        tlb->mm->context.flush_mm = 1;
        tlb->freed_tables = 1;
        tlb->cleared_puds = 1;
        tlb_remove_ptdesc(tlb, pmd);
}

/*
 * p4d_free_tlb frees a pud table and clears the CRSTE for the
 * region second table entry from the tlb.
 * If the mm uses a four level page table the single p4d is freed
 * as the pgd. p4d_free_tlb checks the asce_limit against 8PB
 * to avoid the double free of the p4d in this case.
 */
static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
                unsigned long address)
{
        if (mm_p4d_folded(tlb->mm))
                return;
        __tlb_adjust_range(tlb, address, PAGE_SIZE);
        tlb->mm->context.flush_mm = 1;
        tlb->freed_tables = 1;
        tlb_remove_ptdesc(tlb, p4d);
}

/*
 * pud_free_tlb frees a pud table and clears the CRSTE for the
 * region third table entry from the tlb.
 * If the mm uses a three level page table the single pud is freed
 * as the pgd. pud_free_tlb checks the asce_limit against 4TB
 * to avoid the double free of the pud in this case.
 */
static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
                unsigned long address)
{
        if (mm_pud_folded(tlb->mm))
                return;
        tlb->mm->context.flush_mm = 1;
        tlb->freed_tables = 1;
        tlb->cleared_p4ds = 1;
        tlb_remove_ptdesc(tlb, pud);
}

#endif /* _S390_TLB_H */
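For illustration of how the zapping path is expected to use the new interface, here is a hedged caller-side sketch.  The helper name example_remove_mapped_pages() and its surrounding context are assumptions for this example, not the actual mm/memory.c code.

/*
 * Hypothetical caller sketch: when a run of "nr" PTEs maps "nr" consecutive
 * pages of the same large folio, hand the whole range to the mmu_gather in
 * one call instead of page by page.
 */
static inline bool example_remove_mapped_pages(struct mmu_gather *tlb,
                struct page *page, unsigned int nr, bool delay_rmap)
{
        if (nr == 1)
                return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE);

        /* All "nr" pages must belong to the same folio. */
        return __tlb_remove_folio_pages(tlb, page, nr, delay_rmap);
}

With the s390 implementations above, both calls free the pages immediately and return false; the generic mmu_gather implementation instead queues the (encoded) entries and returns true once the batch is full, signaling the caller to flush.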