commit fa41ba0d08 ("s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs") introduced an undesired side effect when combined with memory ballooning and VM migration: memory that is part of the inflated memory balloon will consume memory.

Assume we have a 100GiB VM and inflate the balloon to 40GiB. Our VM will consume ~60GiB of memory. If we now trigger a VM migration, hypervisors like QEMU will read all VM memory. As s390x does not support the shared zeropage, we'll end up allocating memory for everything that was previously inflated as part of the memory balloon: 40 GiB. So we might easily (and unexpectedly) crash the VM on the migration source.

Even worse, hypervisors like QEMU optimize zeropage migration to not consume memory on the migration destination: when migrating a "page full of zeroes", they check whether the target memory on the destination is already zero (by reading it) and avoid writing to it, so that no memory gets allocated. However, s390x will also allocate memory for these reads, implying that on the migration destination, too, we will end up allocating all memory that was previously inflated as part of the memory balloon.

This is especially bad if actual memory overcommit was not desired and memory ballooning is used for dynamic VM memory resizing: setting aside some memory during boot that can be added later on demand. Alternatives like virtio-mem, which would avoid this issue, are not yet available on s390x.

There could be ways to optimize some cases in user space: before reading memory in an anonymous private mapping on the migration source, check via /proc/self/pagemap whether anything is already populated. Similarly, check on the migration destination before reading. While that would avoid populating page tables full of shared zeropages on all architectures, it is harder to get right and performant, and it requires user space changes. Further, with postcopy live migration we must place a page, so "avoid touching memory to avoid allocating memory" is not really possible there. (Note that, previously, we would have falsely inserted shared zeropages into processes using UFFDIO_ZEROPAGE where mm_forbids_zeropage() would have actually forbidden it.)

PV is currently incompatible with memory ballooning, and in the common case, KVM guests don't make use of storage keys. Instead of zapping zeropages when enabling storage keys / PV, which turned out to be problematic in the past, let's do exactly what we do with KSM pages: trigger unsharing faults to replace the shared zeropages with proper anonymous folios.

What about the added latency when enabling storage keys? Having a lot of zeropages in applicable environments (PV, legacy guests, unittests) is unexpected. Further, KSM could today already unshare the zeropages, and unmerging KSM pages when enabling storage keys would unshare the KSM-placed zeropages in the same way, resulting in the same latency.

[ agordeev: Fixed sparse and checkpatch complaints and error handling ]

Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Fixes: fa41ba0d08 ("s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs")
Signed-off-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20240411161441.910170-3-david@redhat.com
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
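To illustrate the user-space idea sketched above (this is not part of the patch): a hypervisor could consult /proc/self/pagemap and only touch pages of an anonymous private mapping that are already present or swapped. The following is a minimal, hypothetical sketch in plain C; the helper name page_is_populated() and the scaffolding around it are illustrative assumptions, not code from QEMU or the kernel.

#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* One 64-bit pagemap entry per virtual page: bit 63 = present, bit 62 = swapped. */
static bool page_is_populated(int pagemap_fd, const void *addr, long page_size)
{
	off_t offset = (off_t)((uintptr_t)addr / page_size) * sizeof(uint64_t);
	uint64_t entry = 0;

	if (pread(pagemap_fd, &entry, sizeof(entry), offset) != sizeof(entry))
		return true;	/* on error, be conservative and treat the page as populated */
	return entry & ((UINT64_C(1) << 63) | (UINT64_C(1) << 62));
}

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/self/pagemap");
		return 1;
	}
	/*
	 * A migration loop would walk guest RAM here and only read pages for
	 * which page_is_populated() returns true, leaving never-populated
	 * (e.g. balloon-inflated) ranges untouched.
	 */
	close(fd);
	return 0;
}

As noted above, such a check could help on the migration source and on a precopy destination, but not for postcopy, where a page must actually be placed (e.g. via UFFDIO_ZEROPAGE).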
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  S390 version
 *
 *  Derived from "include/asm-i386/mmu_context.h"
 */

#ifndef __S390_MMU_CONTEXT_H
#define __S390_MMU_CONTEXT_H

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <linux/mm_types.h>
#include <asm/tlbflush.h>
#include <asm/ctlreg.h>
#include <asm-generic/mm_hooks.h>

#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	unsigned long asce_type, init_entry;

	spin_lock_init(&mm->context.lock);
	INIT_LIST_HEAD(&mm->context.gmap_list);
	cpumask_clear(&mm->context.cpu_attach_mask);
	atomic_set(&mm->context.flush_count, 0);
	atomic_set(&mm->context.protected_count, 0);
	mm->context.gmap_asce = 0;
	mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
	mm->context.alloc_pgste = page_table_allocate_pgste ||
		test_thread_flag(TIF_PGSTE) ||
		(current->mm && current->mm->context.alloc_pgste);
	mm->context.has_pgste = 0;
	mm->context.uses_skeys = 0;
	mm->context.uses_cmm = 0;
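	/*
	 * COW sharing of KSM pages and of the shared zeropage is allowed by
	 * default; it is disabled once storage keys or protected
	 * virtualization (PV) are enabled, at which point already shared
	 * pages are replaced by proper anonymous folios via unsharing faults.
	 */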
	mm->context.allow_cow_sharing = 1;
	mm->context.allow_gmap_hpage_1m = 0;
#endif
	switch (mm->context.asce_limit) {
	default:
		/*
		 * context created by exec, the value of asce_limit can
		 * only be zero in this case
		 */
		VM_BUG_ON(mm->context.asce_limit);
		/* continue as 3-level task */
		mm->context.asce_limit = _REGION2_SIZE;
		fallthrough;
	case _REGION2_SIZE:
		/* forked 3-level task */
		init_entry = _REGION3_ENTRY_EMPTY;
		asce_type = _ASCE_TYPE_REGION3;
		break;
	case TASK_SIZE_MAX:
		/* forked 5-level task */
		init_entry = _REGION1_ENTRY_EMPTY;
		asce_type = _ASCE_TYPE_REGION1;
		break;
	case _REGION1_SIZE:
		/* forked 4-level task */
		init_entry = _REGION2_ENTRY_EMPTY;
		asce_type = _ASCE_TYPE_REGION2;
		break;
	}
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | asce_type;
	crst_table_init((unsigned long *) mm->pgd, init_entry);
	return 0;
}
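
/*
 * Note that the new mm's ASCE is only remembered in lowcore here: CR7 is
 * loaded with the invalid ASCE, and the user ASCE is only installed later,
 * in finish_arch_post_lock_switch(), once pending TLB flushes have
 * completed.
 */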
static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
				      struct task_struct *tsk)
{
	int cpu = smp_processor_id();

	if (next == &init_mm)
		S390_lowcore.user_asce = s390_invalid_asce;
	else
		S390_lowcore.user_asce.val = next->context.asce;
	cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
	/* Clear previous user-ASCE from CR7 */
	local_ctl_load(7, &s390_invalid_asce);
	if (prev != next)
		cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
}
#define switch_mm_irqs_off switch_mm_irqs_off

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}

#define finish_arch_post_lock_switch finish_arch_post_lock_switch
static inline void finish_arch_post_lock_switch(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	if (mm) {
		preempt_disable();
		while (atomic_read(&mm->context.flush_count))
			cpu_relax();
		cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
		__tlb_flush_mm_lazy(mm);
		preempt_enable();
	}
	local_ctl_load(7, &S390_lowcore.user_asce);
}

#define activate_mm activate_mm
static inline void activate_mm(struct mm_struct *prev,
			       struct mm_struct *next)
{
	switch_mm(prev, next, current);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
	local_ctl_load(7, &S390_lowcore.user_asce);
}

#include <asm-generic/mmu_context.h>

#endif /* __S390_MMU_CONTEXT_H */