From 3bfe57165b4bf86a431099078df422f54598f5c6 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 2 May 2017 18:29:55 +0200 Subject: numa: equally distribute memory on nodes When there are more nodes than available memory to put the minimum allowed memory by node, all the memory is put on the last node. This is because we put (ram_size / nb_numa_nodes) & ~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this case the value is 0. This is particularly true with pseries, as the memory must be aligned to 256MB. To avoid this problem, this patch uses an error diffusion algorithm [1] to distribute equally the memory on nodes. We introduce numa_auto_assign_ram() function in MachineClass to keep compatibility between machine type versions. The legacy function is used with pseries-2.9, pc-q35-2.9 and pc-i440fx-2.9 (and previous), the new one with all others. Example: qemu-system-ppc64 -S -nographic -nodefaults -monitor stdio -m 1G -smp 8 \ -numa node -numa node -numa node \ -numa node -numa node -numa node Before: (qemu) info numa 6 nodes node 0 cpus: 0 6 node 0 size: 0 MB node 1 cpus: 1 7 node 1 size: 0 MB node 2 cpus: 2 node 2 size: 0 MB node 3 cpus: 3 node 3 size: 0 MB node 4 cpus: 4 node 4 size: 0 MB node 5 cpus: 5 node 5 size: 1024 MB After: (qemu) info numa 6 nodes node 0 cpus: 0 6 node 0 size: 0 MB node 1 cpus: 1 7 node 1 size: 256 MB node 2 cpus: 2 node 2 size: 0 MB node 3 cpus: 3 node 3 size: 256 MB node 4 cpus: 4 node 4 size: 256 MB node 5 cpus: 5 node 5 size: 256 MB [1] https://en.wikipedia.org/wiki/Error_diffusion Signed-off-by: Laurent Vivier Message-Id: <20170502162955.1610-2-lvivier@redhat.com> Reviewed-by: Eduardo Habkost [ehabkost: s/ram_size/size/ at numa_default_auto_assign_ram()] Signed-off-by: Eduardo Habkost --- hw/core/machine.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'hw/core') diff --git a/hw/core/machine.c b/hw/core/machine.c index ada9eea483..2482c630c1 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -17,6 +17,7 @@ #include "qapi/visitor.h" #include "hw/sysbus.h" #include "sysemu/sysemu.h" +#include "sysemu/numa.h" #include "qemu/error-report.h" #include "qemu/cutils.h" @@ -400,6 +401,7 @@ static void machine_class_init(ObjectClass *oc, void *data) * On Linux, each node's border has to be 8MB aligned */ mc->numa_mem_align_shift = 23; + mc->numa_auto_assign_ram = numa_default_auto_assign_ram; object_class_property_add_str(oc, "accel", machine_get_accel, machine_set_accel, &error_abort); -- cgit v1.2.3-55-g7522 From 7c88e65d9e9ff7df7fa9cff1869d64a0eaac63a1 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:50 +0200 Subject: numa: mirror cpu to node mapping in MachineState::possible_cpus Introduce machine_set_cpu_numa_node() helper that stores node mapping for CPU in MachineState::possible_cpus. CPU and node it belongs to is specified by 'props' argument. Patch doesn't remove old way of storing mapping in numa_info[X].node_cpu as removing it at the same time makes patch rather big. Instead it just mirrors mapping in possible_cpus and follow up per target patches will switch to possible_cpus and numa_info[X].node_cpu will be removed once there isn't any users left. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-7-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- hw/core/machine.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/hw/boards.h | 3 ++ numa.c | 8 +++++ 3 files changed, 107 insertions(+) (limited to 'hw/core') diff --git a/hw/core/machine.c b/hw/core/machine.c index 2482c630c1..420c8c4d16 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -389,6 +389,102 @@ HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine) return head; } +/** + * machine_set_cpu_numa_node: + * @machine: machine object to modify + * @props: specifies which cpu objects to assign to + * numa node specified by @props.node_id + * @errp: if an error occurs, a pointer to an area to store the error + * + * Associate NUMA node specified by @props.node_id with cpu slots that + * match socket/core/thread-ids specified by @props. It's recommended to use + * query-hotpluggable-cpus.props values to specify affected cpu slots, + * which would lead to exact 1:1 mapping of cpu slots to NUMA node. + * + * However for CLI convenience it's possible to pass in subset of properties, + * which would affect all cpu slots that match it. + * Ex for pc machine: + * -smp 4,cores=2,sockets=2 -numa node,nodeid=0 -numa node,nodeid=1 \ + * -numa cpu,node-id=0,socket_id=0 \ + * -numa cpu,node-id=1,socket_id=1 + * will assign all child cores of socket 0 to node 0 and + * of socket 1 to node 1. + * + * On attempt of reassigning (already assigned) cpu slot to another NUMA node, + * return error. + * Empty subset is disallowed and function will return with error in this case. + */ +void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, Error **errp) +{ + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool match = false; + int i; + + if (!mc->possible_cpu_arch_ids) { + error_setg(errp, "mapping of CPUs to NUMA node is not supported"); + return; + } + + /* disabling node mapping is not supported, forbid it */ + assert(props->has_node_id); + + /* force board to initialize possible_cpus if it hasn't been done yet */ + mc->possible_cpu_arch_ids(machine); + + for (i = 0; i < machine->possible_cpus->len; i++) { + CPUArchId *slot = &machine->possible_cpus->cpus[i]; + + /* reject unsupported by board properties */ + if (props->has_thread_id && !slot->props.has_thread_id) { + error_setg(errp, "thread-id is not supported"); + return; + } + + if (props->has_core_id && !slot->props.has_core_id) { + error_setg(errp, "core-id is not supported"); + return; + } + + if (props->has_socket_id && !slot->props.has_socket_id) { + error_setg(errp, "socket-id is not supported"); + return; + } + + /* skip slots with explicit mismatch */ + if (props->has_thread_id && props->thread_id != slot->props.thread_id) { + continue; + } + + if (props->has_core_id && props->core_id != slot->props.core_id) { + continue; + } + + if (props->has_socket_id && props->socket_id != slot->props.socket_id) { + continue; + } + + /* reject assignment if slot is already assigned, for compatibility + * of legacy cpu_index mapping with SPAPR core based mapping do not + * error out if cpu thread and matched core have the same node-id */ + if (slot->props.has_node_id && + slot->props.node_id != props->node_id) { + error_setg(errp, "CPU is already assigned to node-id: %" PRId64, + slot->props.node_id); + return; + } + + /* assign slot to node as it's matched '-numa cpu' key */ + match = true; + slot->props.node_id = props->node_id; + slot->props.has_node_id = props->has_node_id; + } + + if (!match) { + error_setg(errp, "no match found"); + } +} + static void machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); diff --git a/include/hw/boards.h b/include/hw/boards.h index 3ffa255fb8..4e14ff060e 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -42,6 +42,9 @@ bool machine_dump_guest_core(MachineState *machine); bool machine_mem_merge(MachineState *machine); void machine_register_compat_props(MachineState *machine); HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine); +void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, + Error **errp); /** * CPUArchId: diff --git a/numa.c b/numa.c index 718248161c..7db5dde873 100644 --- a/numa.c +++ b/numa.c @@ -170,6 +170,7 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, exit(1); } for (cpus = node->cpus; cpus; cpus = cpus->next) { + CpuInstanceProperties props; if (cpus->value >= max_cpus) { error_setg(errp, "CPU index (%" PRIu16 ")" @@ -178,6 +179,10 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, return; } bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); + props = mc->cpu_index_to_instance_props(ms, cpus->value); + props.node_id = nodenr; + props.has_node_id = true; + machine_set_cpu_numa_node(ms, &props, &error_fatal); } if (node->has_mem && node->has_memdev) { @@ -528,9 +533,12 @@ void parse_numa_opts(MachineState *ms) if (i == nb_numa_nodes) { for (i = 0; i < max_cpus; i++) { CpuInstanceProperties props; + /* fetch default mapping from board and enable it */ props = mc->cpu_index_to_instance_props(ms, i); + props.has_node_id = true; set_bit(i, numa_info[props.node_id].node_cpu); + machine_set_cpu_numa_node(ms, &props, &error_fatal); } } -- cgit v1.2.3-55-g7522 From 482dfe9a9e8fe72d6a96c927e23078808f9cacd2 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:58 +0200 Subject: machine: call machine init from wrapper add machine_run_board_init() wrapper that calls machine init for now but in follow up patches it will be used to run generic machine code that should run before machine init. Signed-off-by: Igor Mammedov Reviewed-by: Andrew Jones Reviewed-by: David Gibson Reviewed-by: Eduardo Habkost Message-Id: <1494415802-227633-15-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- hw/core/machine.c | 6 ++++++ include/hw/boards.h | 1 + vl.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'hw/core') diff --git a/hw/core/machine.c b/hw/core/machine.c index 420c8c4d16..64e2a4ff2c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -678,6 +678,12 @@ bool machine_mem_merge(MachineState *machine) return machine->mem_merge; } +void machine_run_board_init(MachineState *machine) +{ + MachineClass *machine_class = MACHINE_GET_CLASS(machine); + machine_class->init(machine); +} + static void machine_class_finalize(ObjectClass *klass, void *data) { MachineClass *mc = MACHINE_CLASS(klass); diff --git a/include/hw/boards.h b/include/hw/boards.h index 4e14ff060e..76ce0219ff 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -32,6 +32,7 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, MachineClass *find_default_machine(void); extern MachineState *current_machine; +void machine_run_board_init(MachineState *machine); bool machine_usb(MachineState *machine); bool machine_kernel_irqchip_allowed(MachineState *machine); bool machine_kernel_irqchip_required(MachineState *machine); diff --git a/vl.c b/vl.c index 5cd0c17ba0..3d8c140011 100644 --- a/vl.c +++ b/vl.c @@ -4559,7 +4559,7 @@ int main(int argc, char **argv, char **envp) current_machine->boot_order = boot_order; current_machine->cpu_model = cpu_model; - machine_class->init(current_machine); + machine_run_board_init(current_machine); realtime_init(); -- cgit v1.2.3-55-g7522 From ec78f8114bc4c133fc56fefa7f2af99725e42857 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 May 2017 13:29:59 +0200 Subject: numa: use possible_cpus for not mapped CPUs check and remove corresponding part in numa.c that uses node_cpu bitmaps. Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Reviewed-by: Andrew Jones Message-Id: <1494415802-227633-16-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- hw/core/machine.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ numa.c | 10 ---------- 2 files changed, 58 insertions(+), 10 deletions(-) (limited to 'hw/core') diff --git a/hw/core/machine.c b/hw/core/machine.c index 64e2a4ff2c..fd6a436064 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -20,6 +20,7 @@ #include "sysemu/numa.h" #include "qemu/error-report.h" #include "qemu/cutils.h" +#include "sysemu/numa.h" static char *machine_get_accel(Object *obj, Error **errp) { @@ -678,9 +679,66 @@ bool machine_mem_merge(MachineState *machine) return machine->mem_merge; } +static char *cpu_slot_to_string(const CPUArchId *cpu) +{ + GString *s = g_string_new(NULL); + if (cpu->props.has_socket_id) { + g_string_append_printf(s, "socket-id: %"PRId64, cpu->props.socket_id); + } + if (cpu->props.has_core_id) { + if (s->len) { + g_string_append_printf(s, ", "); + } + g_string_append_printf(s, "core-id: %"PRId64, cpu->props.core_id); + } + if (cpu->props.has_thread_id) { + if (s->len) { + g_string_append_printf(s, ", "); + } + g_string_append_printf(s, "thread-id: %"PRId64, cpu->props.thread_id); + } + return g_string_free(s, false); +} + +static void machine_numa_validate(MachineState *machine) +{ + int i; + GString *s = g_string_new(NULL); + MachineClass *mc = MACHINE_GET_CLASS(machine); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(machine); + + assert(nb_numa_nodes); + for (i = 0; i < possible_cpus->len; i++) { + const CPUArchId *cpu_slot = &possible_cpus->cpus[i]; + + /* at this point numa mappings are initilized by CLI options + * or with default mappings so it's sufficient to list + * all not yet mapped CPUs here */ + /* TODO: make it hard error in future */ + if (!cpu_slot->props.has_node_id) { + char *cpu_str = cpu_slot_to_string(cpu_slot); + g_string_append_printf(s, "%sCPU %d [%s]", s->len ? ", " : "", i, + cpu_str); + g_free(cpu_str); + } + } + if (s->len) { + error_report("warning: CPU(s) not present in any NUMA nodes: %s", + s->str); + error_report("warning: All CPU(s) up to maxcpus should be described " + "in NUMA config, ability to start up with partial NUMA " + "mappings is obsoleted and will be removed in future"); + } + g_string_free(s, true); +} + void machine_run_board_init(MachineState *machine) { MachineClass *machine_class = MACHINE_GET_CLASS(machine); + + if (nb_numa_nodes) { + machine_numa_validate(machine); + } machine_class->init(machine); } diff --git a/numa.c b/numa.c index dc739eadfa..63bff5a54d 100644 --- a/numa.c +++ b/numa.c @@ -337,16 +337,6 @@ static void validate_numa_cpus(void) bitmap_or(seen_cpus, seen_cpus, numa_info[i].node_cpu, max_cpus); } - - if (!bitmap_full(seen_cpus, max_cpus)) { - char *msg; - bitmap_complement(seen_cpus, seen_cpus, max_cpus); - msg = enumerate_cpus(seen_cpus, max_cpus); - error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg); - error_report("warning: All CPU(s) up to maxcpus should be described " - "in NUMA config"); - g_free(msg); - } g_free(seen_cpus); } -- cgit v1.2.3-55-g7522