linux/arch/powerpc/include/asm/topology.h
Scott Cheloha 72cdd117c4 pseries/hotplug-memory: hot-add: skip redundant LMB lookup
During memory hot-add, dlpar_add_lmb() calls memory_add_physaddr_to_nid()
to determine which node id (nid) to use when later calling __add_memory().

This is wasteful.  On pseries, memory_add_physaddr_to_nid() finds an
appropriate nid for a given address by looking up the LMB containing the
address and then passing that LMB to of_drconf_to_nid_single() to get the
nid.  In dlpar_add_lmb() we get this address from the LMB itself.

In short, we have a pointer to an LMB and then we are searching for
that LMB *again* in order to find its nid.
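
For reference, that lookup is a linear scan over the drmem array.  A
simplified sketch of the pseries lookup path behind
memory_add_physaddr_to_nid() (function name illustrative; flag checks
and details elided):

static int addr_to_nid(unsigned long scn_addr)
{
        unsigned long lmb_size = drmem_lmb_size();
        struct drmem_lmb *lmb;
        int nid = NUMA_NO_NODE;

        /* Walk the drmem array until we find the LMB containing
         * the address. */
        for_each_drmem_lmb(lmb) {
                if (scn_addr < lmb->base_addr ||
                    scn_addr >= lmb->base_addr + lmb_size)
                        continue;

                nid = of_drconf_to_nid_single(lmb);
                break;
        }

        return nid;
}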

If we call of_drconf_to_nid_single() directly from dlpar_add_lmb() we
can skip the redundant lookup.  The only error handling we need to
duplicate from memory_add_physaddr_to_nid() is the fallback to the
default nid when of_drconf_to_nid_single() returns -1 (NUMA_NO_NODE) or
an invalid nid.
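
The core of the change is small.  A sketch of the nid selection in
dlpar_add_lmb() after the patch (surrounding hot-add code elided;
first_online_node serves as the default nid):

        /* Find the node id for this LMB.  Fake one if necessary. */
        nid = of_drconf_to_nid_single(lmb);
        if (nid < 0 || !node_possible(nid))
                nid = first_online_node;

        rc = __add_memory(nid, lmb->base_addr, block_sz);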

Skipping the extra lookup makes hot-add operations faster, especially
on machines with many LMBs.

Consider an LPAR with 126976 LMBs.  In one test, hot-adding 126000
LMBs on an unpatched kernel took ~3.5 hours while a patched kernel
completed the same operation in ~2 hours:

Unpatched (12450 seconds):
Sep  9 04:06:31 ltc-brazos1 drmgr[810169]: drmgr: -c mem -a -q 126000
Sep  9 04:06:31 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 126000 LMB(s)
[...]
Sep  9 07:34:01 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 20000000 (drc index 80000002) was hot-added

Patched (7065 seconds):
Sep  8 21:49:57 ltc-brazos1 drmgr[877703]: drmgr: -c mem -a -q 126000
Sep  8 21:49:57 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 126000 LMB(s)
[...]
Sep  8 23:27:42 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 20000000 (drc index 80000002) was hot-added

Note that the speedup is more substantial when hot-adding LMBs at the
end of the drconf range.  This is because we are skipping a linear LMB
search whose cost grows with the LMB's position in that range.

To see the distinction, consider a smaller hot-add test on the same
LPAR.  A perf-stat run with 10 iterations showed that hot-adding 4096
LMBs completed less than a second faster on a patched kernel:

Unpatched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

        104,753.42 msec task-clock                #    0.992 CPUs utilized            ( +-  0.55% )
             4,708      context-switches          #    0.045 K/sec                    ( +-  0.69% )
             2,444      cpu-migrations            #    0.023 K/sec                    ( +-  1.25% )
               394      page-faults               #    0.004 K/sec                    ( +-  0.22% )
   445,902,503,057      cycles                    #    4.257 GHz                      ( +-  0.55% )  (66.67%)
     8,558,376,740      stalled-cycles-frontend   #    1.92% frontend cycles idle     ( +-  0.88% )  (49.99%)
   300,346,181,651      stalled-cycles-backend    #   67.36% backend cycles idle      ( +-  0.76% )  (50.01%)
   258,091,488,691      instructions              #    0.58  insn per cycle
                                                  #    1.16  stalled cycles per insn  ( +-  0.22% )  (66.67%)
    70,568,169,256      branches                  #  673.660 M/sec                    ( +-  0.17% )  (50.01%)
     3,100,725,426      branch-misses             #    4.39% of all branches          ( +-  0.20% )  (49.99%)

           105.583 +- 0.589 seconds time elapsed  ( +-  0.56% )

Patched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

        104,055.69 msec task-clock                #    0.993 CPUs utilized            ( +-  0.32% )
             4,606      context-switches          #    0.044 K/sec                    ( +-  0.20% )
             2,463      cpu-migrations            #    0.024 K/sec                    ( +-  0.93% )
               394      page-faults               #    0.004 K/sec                    ( +-  0.25% )
   442,951,129,921      cycles                    #    4.257 GHz                      ( +-  0.32% )  (66.66%)
     8,710,413,329      stalled-cycles-frontend   #    1.97% frontend cycles idle     ( +-  0.47% )  (50.06%)
   299,656,905,836      stalled-cycles-backend    #   67.65% backend cycles idle      ( +-  0.39% )  (50.02%)
   252,731,168,193      instructions              #    0.57  insn per cycle
                                                  #    1.19  stalled cycles per insn  ( +-  0.20% )  (66.66%)
    68,902,851,121      branches                  #  662.173 M/sec                    ( +-  0.13% )  (49.94%)
     3,100,242,882      branch-misses             #    4.50% of all branches          ( +-  0.15% )  (49.98%)

           104.829 +- 0.325 seconds time elapsed  ( +-  0.31% )

This is consistent.  An add-by-count hot-add operation adds LMBs
greedily, so LMBs near the start of the drconf range are considered
first.  On an otherwise idle LPAR with so many LMBs we would expect to
find the LMBs we need near the start of the drconf range, hence the
smaller speedup.
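
A simplified sketch of that greedy scan (DRC acquisition, error
handling, and reservation bookkeeping elided):

        for_each_drmem_lmb(lmb) {
                if (lmb->flags & DRCONF_MEM_ASSIGNED)
                        continue;

                if (dlpar_add_lmb(lmb))
                        continue;

                /* The first free LMBs in the drconf range satisfy
                 * the request; later LMBs are never visited. */
                if (++lmbs_added == lmbs_to_add)
                        break;
        }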

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200916145122.3408129-1-cheloha@linux.ibm.com
2020-10-06 23:22:27 +11:00


/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_POWERPC_TOPOLOGY_H
#define _ASM_POWERPC_TOPOLOGY_H
#ifdef __KERNEL__

struct device;
struct device_node;

#ifdef CONFIG_NUMA

/*
 * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that
 * all zones on all nodes will be eligible for zone_reclaim().
 */
#define RECLAIM_DISTANCE 10

#include <asm/mmzone.h>

#define cpumask_of_node(node) ((node) == -1 ?                          \
                               cpu_all_mask :                          \
                               node_to_cpumask_map[node])

struct pci_bus;
#ifdef CONFIG_PCI
extern int pcibus_to_node(struct pci_bus *bus);
#else
static inline int pcibus_to_node(struct pci_bus *bus)
{
        return -1;
}
#endif

#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ?            \
                                cpu_all_mask :                         \
                                cpumask_of_node(pcibus_to_node(bus)))

extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)

extern void __init dump_numa_cpu_topology(void);

extern int sysfs_add_device_to_node(struct device *dev, int nid);
extern void sysfs_remove_device_from_node(struct device *dev, int nid);

static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
        numa_cpu_lookup_table[cpu] = node;
}

static inline int early_cpu_to_node(int cpu)
{
        int nid;

        nid = numa_cpu_lookup_table[cpu];

        /*
         * Fall back to node 0 if nid is unset (it should be, except bugs).
         * This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)).
         */
        return (nid < 0) ? 0 : nid;
}

#else

static inline int early_cpu_to_node(int cpu) { return 0; }

static inline void dump_numa_cpu_topology(void) {}

static inline int sysfs_add_device_to_node(struct device *dev, int nid)
{
        return 0;
}

static inline void sysfs_remove_device_from_node(struct device *dev,
                                                 int nid)
{
}

static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}

static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
        return 0;
}

#endif /* CONFIG_NUMA */

struct drmem_lmb;
int of_drconf_to_nid_single(struct drmem_lmb *lmb);

#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
extern int find_and_online_cpu_nid(int cpu);
extern int cpu_to_coregroup_id(int cpu);
#else
static inline int find_and_online_cpu_nid(int cpu)
{
        return 0;
}

static inline int cpu_to_coregroup_id(int cpu)
{
#ifdef CONFIG_SMP
        return cpu_to_core_id(cpu);
#else
        return 0;
#endif
}

#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */

#include <asm-generic/topology.h>

#ifdef CONFIG_SMP
#include <asm/cputable.h>

#ifdef CONFIG_PPC64
#include <asm/smp.h>

#define topology_physical_package_id(cpu)       (cpu_to_chip_id(cpu))
#define topology_sibling_cpumask(cpu)           (per_cpu(cpu_sibling_map, cpu))
#define topology_core_cpumask(cpu)              (cpu_cpu_mask(cpu))
#define topology_core_id(cpu)                   (cpu_to_core_id(cpu))

#endif
#endif

#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_TOPOLOGY_H */