x86/nmi: Accumulate NMI-progress evidence in exc_nmi()
CPUs ignoring NMIs is often a sign of those CPUs going bad, but there are quite a few other reasons why a CPU might ignore NMIs. Therefore, accumulate evidence within exc_nmi() as to what might be preventing a given CPU from responding to an NMI. [ paulmck: Apply Peter Zijlstra feedback. ] Signed-off-by: Paul E. McKenney <paulmck@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <x86@kernel.org> Reviewed-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
1b929c02af
commit
1a3ea611fc
2 changed files with 45 additions and 1 deletions
|
@ -69,6 +69,15 @@ struct nmi_stats {
|
||||||
unsigned int unknown;
|
unsigned int unknown;
|
||||||
unsigned int external;
|
unsigned int external;
|
||||||
unsigned int swallow;
|
unsigned int swallow;
|
||||||
|
unsigned long recv_jiffies;
|
||||||
|
unsigned long idt_seq;
|
||||||
|
unsigned long idt_nmi_seq;
|
||||||
|
unsigned long idt_ignored;
|
||||||
|
atomic_long_t idt_calls;
|
||||||
|
unsigned long idt_seq_snap;
|
||||||
|
unsigned long idt_nmi_seq_snap;
|
||||||
|
unsigned long idt_ignored_snap;
|
||||||
|
long idt_calls_snap;
|
||||||
};
|
};
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
|
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
|
||||||
|
@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
|
||||||
DEFINE_IDTENTRY_RAW(exc_nmi)
|
DEFINE_IDTENTRY_RAW(exc_nmi)
|
||||||
{
|
{
|
||||||
irqentry_state_t irq_state;
|
irqentry_state_t irq_state;
|
||||||
|
struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-enable NMIs right here when running as an SEV-ES guest. This might
|
* Re-enable NMIs right here when running as an SEV-ES guest. This might
|
||||||
* cause nested NMIs, but those can be handled safely.
|
* cause nested NMIs, but those can be handled safely.
|
||||||
*/
|
*/
|
||||||
sev_es_nmi_complete();
|
sev_es_nmi_complete();
|
||||||
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
|
||||||
|
arch_atomic_long_inc(&nsp->idt_calls);
|
||||||
|
|
||||||
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
|
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
|
||||||
return;
|
return;
|
||||||
|
@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
|
||||||
}
|
}
|
||||||
this_cpu_write(nmi_state, NMI_EXECUTING);
|
this_cpu_write(nmi_state, NMI_EXECUTING);
|
||||||
this_cpu_write(nmi_cr2, read_cr2());
|
this_cpu_write(nmi_cr2, read_cr2());
|
||||||
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
|
||||||
|
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
|
||||||
|
WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
|
||||||
|
WRITE_ONCE(nsp->recv_jiffies, jiffies);
|
||||||
|
}
|
||||||
nmi_restart:
|
nmi_restart:
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -509,8 +526,19 @@ nmi_restart:
|
||||||
|
|
||||||
inc_irq_stat(__nmi_count);
|
inc_irq_stat(__nmi_count);
|
||||||
|
|
||||||
if (!ignore_nmis)
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
|
||||||
|
WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
|
||||||
|
} else if (!ignore_nmis) {
|
||||||
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
|
||||||
|
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
|
||||||
|
WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
|
||||||
|
}
|
||||||
default_do_nmi(regs);
|
default_do_nmi(regs);
|
||||||
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
|
||||||
|
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
|
||||||
|
WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
irqentry_nmi_exit(regs, irq_state);
|
irqentry_nmi_exit(regs, irq_state);
|
||||||
|
|
||||||
|
@ -525,6 +553,11 @@ nmi_restart:
|
||||||
|
|
||||||
if (user_mode(regs))
|
if (user_mode(regs))
|
||||||
mds_user_clear_cpu_buffers();
|
mds_user_clear_cpu_buffers();
|
||||||
|
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
|
||||||
|
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
|
||||||
|
WARN_ON_ONCE(nsp->idt_seq & 0x1);
|
||||||
|
WRITE_ONCE(nsp->recv_jiffies, jiffies);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
|
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
|
||||||
|
|
|
@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI
|
||||||
depends on TRACE_IRQFLAGS
|
depends on TRACE_IRQFLAGS
|
||||||
depends on TRACE_IRQFLAGS_NMI_SUPPORT
|
depends on TRACE_IRQFLAGS_NMI_SUPPORT
|
||||||
|
|
||||||
|
config NMI_CHECK_CPU
|
||||||
|
bool "Debugging for CPUs failing to respond to backtrace requests"
|
||||||
|
depends on DEBUG_KERNEL
|
||||||
|
depends on X86
|
||||||
|
default n
|
||||||
|
help
|
||||||
|
Enables debug prints when a CPU fails to respond to a given
|
||||||
|
backtrace NMI. These prints provide some reasons why a CPU
|
||||||
|
might legitimately be failing to respond, for example, if it
|
||||||
|
is offline or if ignore_nmis is set.
|
||||||
|
|
||||||
config DEBUG_IRQFLAGS
|
config DEBUG_IRQFLAGS
|
||||||
bool "Debug IRQ flag manipulation"
|
bool "Debug IRQ flag manipulation"
|
||||||
help
|
help
|
||||||
|
|
Loading…
Add table
Reference in a new issue