Add a -numa command line option.

Adds "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]", the globals
nb_numa_nodes, node_mem[] and node_cpumask[], a numa_node field in every
CPUState, and a delimiter argument to get_opt_name().

Notes on this revision:
 - numa_add() no longer steps past the end of a bare "node" argument,
   rejects a nodeid outside 0..MAX_NODES-1, and builds CPU masks with
   64-bit shifts so CPUs 31..63 and the range 0-63 work.
 - main() uses 64-bit shifts for the round-robin CPU assignment and for
   the CPU-to-node lookup, and never shifts by more than 63.
 - Indentation of context lines was reconstructed from a copy whose
   whitespace had been collapsed; use "patch -l" if a hunk is rejected
   because of whitespace.  The blob IDs on the "index" lines are those of
   the earlier revision of this patch.

diff --git a/cpu-defs.h b/cpu-defs.h
index b462a9fa0c..7cbf85d1e1 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -205,6 +205,7 @@ typedef struct CPUWatchpoint {
                                                                         \
     CPUState *next_cpu; /* next CPU sharing TB cache */                 \
     int cpu_index; /* CPU index (informative) */                        \
+    int numa_node; /* NUMA node this CPU belongs to */                  \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
     /* user data */                                                     \
     void *opaque;                                                       \
diff --git a/exec.c b/exec.c
index fc7e08ca73..8245ac0409 100644
--- a/exec.c
+++ b/exec.c
@@ -554,6 +554,7 @@ void cpu_exec_init(CPUState *env)
         cpu_index++;
     }
     env->cpu_index = cpu_index;
+    env->numa_node = 0;
     TAILQ_INIT(&env->breakpoints);
     TAILQ_INIT(&env->watchpoints);
     *penv = env;
diff --git a/qemu-options.hx b/qemu-options.hx
index 5c594fae2e..2738a7a66a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
 to 4.
 ETEXI
 
+DEF("numa", HAS_ARG, QEMU_OPTION_numa,
+    "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
+STEXI
+@item -numa @var{opts}
+Simulate a multi-node NUMA system. If mem and cpus are omitted, resources
+are split equally.
+ETEXI
+
 DEF("fda", HAS_ARG, QEMU_OPTION_fda,
     "-fda/-fdb file  use 'file' as floppy disk 0/1 image\n")
 DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
diff --git a/sysemu.h b/sysemu.h
index 24b4bd100f..cbfbb8e6d1 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -108,6 +108,10 @@ extern int old_param;
 extern int kqemu_allowed;
 #endif
 
+#define MAX_NODES 64
+extern int nb_numa_nodes;
+extern uint64_t node_mem[MAX_NODES];
+
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];
 extern int nb_option_roms;
@@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname);
 void do_usb_del(Monitor *mon, const char *devname);
 void usb_info(Monitor *mon);
 
-const char *get_opt_name(char *buf, int buf_size, const char *p);
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
 const char *get_opt_value(char *buf, int buf_size, const char *p);
 int get_param_value(char *buf, int buf_size,
                     const char *tag, const char *str);
diff --git a/vl.c b/vl.c
index 0a5605d211..f596553656 100644
--- a/vl.c
+++ b/vl.c
@@ -265,6 +265,10 @@ const char *prom_envs[MAX_PROM_ENVS];
 int nb_drives_opt;
 struct drive_opt drives_opt[MAX_DRIVES];
 
+int nb_numa_nodes;
+uint64_t node_mem[MAX_NODES];
+uint64_t node_cpumask[MAX_NODES];
+
 static CPUState *cur_cpu;
 static CPUState *next_cpu;
 static int event_pending = 1;
@@ -1865,12 +1869,12 @@ static int socket_init(void)
 }
 #endif
 
-const char *get_opt_name(char *buf, int buf_size, const char *p)
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
 {
     char *q;
 
     q = buf;
-    while (*p != '\0' && *p != '=') {
+    while (*p != '\0' && *p != delim) {
         if (q && (q - buf) < buf_size - 1)
             *q++ = *p;
         p++;
@@ -1910,7 +1914,7 @@ int get_param_value(char *buf, int buf_size,
 
     p = str;
     for(;;) {
-        p = get_opt_name(option, sizeof(option), p);
+        p = get_opt_name(option, sizeof(option), p, '=');
         if (*p != '=')
             break;
         p++;
@@ -1935,7 +1939,7 @@ int check_params(char *buf, int buf_size,
 
     p = str;
     while (*p != '\0') {
-        p = get_opt_name(buf, buf_size, p);
+        p = get_opt_name(buf, buf_size, p, '=');
         if (*p != '=')
             return -1;
         p++;
@@ -2628,6 +2632,87 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
     return drives_table_idx;
 }
 
+/*
+ * Parse one "-numa" argument of the form
+ *     node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]
+ * and record it in node_mem[] and node_cpumask[].  Without a nodeid= value
+ * the node index is nb_numa_nodes.  mem= is scaled by its suffix (none, M
+ * or m: << 20; G or g: << 30; anything else: unscaled).  When
+ * get_param_value() returns 0 for mem= or cpus=, that entry is set to 0.
+ * Arguments whose first element is not "node" are ignored.  An out-of-range
+ * nodeid or a descending CPU range terminates qemu.
+ */
+static void numa_add(const char *optarg)
+{
+    char option[128];
+    char *endptr;
+    unsigned long long value, endvalue;
+    int nodenr;
+
+    optarg = get_opt_name(option, 128, optarg, ',');
+    /* Step over the ',' only if present: a bare "node" ends at the NUL and
+     * the parameter lookups below must not start beyond it. */
+    if (*optarg == ',') {
+        optarg++;
+    }
+    if (strcmp(option, "node") != 0) {
+        return;
+    }
+
+    if (get_param_value(option, 128, "nodeid", optarg) == 0) {
+        nodenr = nb_numa_nodes;
+    } else {
+        value = strtoull(option, NULL, 10);
+        if (value >= MAX_NODES) {
+            fprintf(stderr, "qemu: invalid NUMA nodeid '%s'\n", option);
+            exit(1);
+        }
+        nodenr = value;
+    }
+
+    if (get_param_value(option, 128, "mem", optarg) == 0) {
+        node_mem[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 0);
+        switch (*endptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        }
+        node_mem[nodenr] = value;
+    }
+
+    if (get_param_value(option, 128, "cpus", optarg) == 0) {
+        node_cpumask[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 10);
+        if (value >= 64) {
+            value = 63;
+            fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
+        }
+        endvalue = value;
+        if (*endptr == '-') {
+            endvalue = strtoull(endptr + 1, &endptr, 10);
+            if (endvalue >= 64) {
+                endvalue = 63;
+                fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
+            }
+            if (endvalue < value) {
+                fprintf(stderr, "qemu: invalid NUMA cpu range '%s'\n", option);
+                exit(1);
+            }
+        }
+        /* Set bits value..endvalue inclusive.  Shifting an all-ones 64-bit
+         * value keeps every shift count in 0..63, even for the range 0-63. */
+        node_cpumask[nodenr] =
+            (~(uint64_t)0 >> (63 - (endvalue - value))) << value;
+    }
+    nb_numa_nodes++;
+}
+
 /***********************************************************/
 /* USB devices */
 
@@ -4290,6 +4375,7 @@ int main(int argc, char **argv, char **envp)
     const char *chroot_dir = NULL;
     const char *run_as = NULL;
 #endif
+    CPUState *env;
 
     qemu_cache_utils_init(envp);
 
@@ -4353,12 +4439,18 @@ int main(int argc, char **argv, char **envp)
         virtio_consoles[i] = NULL;
     virtio_console_index = 0;
 
+    for (i = 0; i < MAX_NODES; i++) {
+        node_mem[i] = 0;
+        node_cpumask[i] = 0;
+    }
+
     usb_devices_index = 0;
 
     nb_net_clients = 0;
     nb_bt_opts = 0;
     nb_drives = 0;
     nb_drives_opt = 0;
+    nb_numa_nodes = 0;
     hda_index = -1;
 
     nb_nics = 0;
@@ -4508,6 +4600,13 @@ int main(int argc, char **argv, char **envp)
                                      ",trans=none" : "");
                 }
                 break;
+            case QEMU_OPTION_numa:
+                if (nb_numa_nodes >= MAX_NODES) {
+                    fprintf(stderr, "qemu: too many NUMA nodes\n");
+                    exit(1);
+                }
+                numa_add(optarg);
+                break;
             case QEMU_OPTION_nographic:
                 nographic = 1;
                 break;
@@ -5211,6 +5310,50 @@ int main(int argc, char **argv, char **envp)
         }
     }
 
+    if (nb_numa_nodes > 0) {
+        int i;
+
+        if (nb_numa_nodes > smp_cpus) {
+            nb_numa_nodes = smp_cpus;
+        }
+
+        /* If no memory size is given for any node, assume the default case
+         * and distribute the available memory equally across all nodes
+         */
+        for (i = 0; i < nb_numa_nodes; i++) {
+            if (node_mem[i] != 0)
+                break;
+        }
+        if (i == nb_numa_nodes) {
+            uint64_t usedmem = 0;
+
+            /* On Linux, each node's border has to be 8MB aligned,
+             * the final node gets the rest.
+             */
+            for (i = 0; i < nb_numa_nodes - 1; i++) {
+                node_mem[i] = (ram_size / nb_numa_nodes) & ~((1ULL << 23) - 1);
+                usedmem += node_mem[i];
+            }
+            node_mem[i] = ram_size - usedmem;
+        }
+
+        for (i = 0; i < nb_numa_nodes; i++) {
+            if (node_cpumask[i] != 0)
+                break;
+        }
+        /* assigning the VCPUs round-robin is easier to implement, guest OSes
+         * must cope with this anyway, because there are BIOSes out there in
+         * real machines which also use this scheme.
+         */
+        if (i == nb_numa_nodes) {
+            /* node_cpumask[] entries are 64 bits wide: shift a 64-bit one
+             * and stop at bit 63 so the shift count stays in range. */
+            for (i = 0; i < smp_cpus && i < 64; i++) {
+                node_cpumask[i % nb_numa_nodes] |= 1ULL << i;
+            }
+        }
+    }
+
     if (kvm_enabled()) {
         int ret;
 
@@ -5274,6 +5417,19 @@ int main(int argc, char **argv, char **envp)
     machine->init(ram_size, vga_ram_size, boot_devices,
                   kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
 
+
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        /* CPUs beyond bit 63 cannot be in any mask; they keep node 0 */
+        if (env->cpu_index >= 64) {
+            continue;
+        }
+        for (i = 0; i < nb_numa_nodes; i++) {
+            if (node_cpumask[i] & (1ULL << env->cpu_index)) {
+                env->numa_node = i;
+            }
+        }
+    }
+
     current_machine = machine;
 
     /* Set KVM's vcpu state to qemu's initial CPUState. */