Faster barrier implementation.

The old barrier implementation was very slow when running on a multi-socket
machine (pcmemtest issue 16).

The new implementation provides two options:

  - when blocked, spin on a thread-local flag
  - when blocked, execute an HLT instruction and wait for an NMI

The first option might be faster, but we need to measure it to find out. A
new boot command-line option selects between the two, with a third setting
that uses a mixture of the two.
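
As the diff below shows, the option is named "powersave" and its three
settings map onto the barriers as follows:

  - powersave=off  - always spin
  - powersave=low  - spin at short barriers, halt at long ones
  - powersave=high - halt whenever possible (the default)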
Martin Whitaker, 2022-02-28 20:21:50 +00:00
parent 311a597766, commit 4078b7760e
20 changed files with 292 additions and 121 deletions


@ -92,10 +92,11 @@ cpu_state_t cpu_state[MAX_CPUS];
bool enable_temperature = false;
bool enable_trace = false;
-bool enable_halt = true;
bool pause_at_start = true;
+power_save_t power_save = POWER_SAVE_HIGH;
//------------------------------------------------------------------------------
// Private Functions
//------------------------------------------------------------------------------
@ -115,8 +116,14 @@ static void parse_option(const char *option, const char *params)
}
} else if (strncmp(option, "nopause", 8) == 0) {
pause_at_start = false;
} else if (strncmp(option, "nohalt", 7) == 0) {
enable_halt = false;
} else if (strncmp(option, "powersave", 10) == 0) {
if (strncmp(params, "off", 4) == 0) {
power_save = POWER_SAVE_OFF;
} else if (strncmp(params, "low", 4) == 0) {
power_save = POWER_SAVE_LOW;
} else if (strncmp(params, "high", 5) == 0) {
power_save = POWER_SAVE_HIGH;
}
} else if (strncmp(option, "smp", 4) == 0) {
smp_enabled = true;
} else if (strncmp(option, "trace", 6) == 0) {
@ -653,6 +660,8 @@ void config_init(void)
enable_temperature = !no_temperature;
+power_save = POWER_SAVE_HIGH;
const boot_params_t *boot_params = (boot_params_t *)boot_params_addr;
uintptr_t cmd_line_addr = boot_params->cmd_line_ptr;


@ -28,6 +28,12 @@ typedef enum {
ERROR_MODE_BADRAM
} error_mode_t;
+typedef enum {
+POWER_SAVE_OFF,
+POWER_SAVE_LOW,
+POWER_SAVE_HIGH
+} power_save_t;
extern uintptr_t pm_limit_lower;
extern uintptr_t pm_limit_upper;
@ -41,10 +47,11 @@ extern cpu_state_t cpu_state[MAX_CPUS];
extern bool enable_temperature;
extern bool enable_trace;
-extern bool enable_halt;
extern bool pause_at_start;
+extern power_save_t power_save;
void config_init(void);
void config_menu(bool initial);


@ -231,12 +231,21 @@ void scroll(void)
void do_tick(int my_cpu)
{
-barrier_wait(run_barrier);
+bool use_spin_wait = (power_save < POWER_SAVE_HIGH);
+if (use_spin_wait) {
+barrier_spin_wait(run_barrier);
+} else {
+barrier_halt_wait(run_barrier);
+}
if (master_cpu == my_cpu) {
check_input();
error_update();
}
-barrier_wait(run_barrier);
+if (use_spin_wait) {
+barrier_spin_wait(run_barrier);
+} else {
+barrier_halt_wait(run_barrier);
+}
// Only the master CPU does the update.
if (master_cpu != my_cpu) {
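
The same spin-or-halt choice is repeated at every run_barrier call site
(see also flush_caches() and the run_test() macro further down). A
hypothetical helper like the one below could centralise that policy; it is
not part of the commit, just a restatement of the pattern the call sites
share:

    static inline void run_barrier_wait(void)
    {
        if (power_save < POWER_SAVE_HIGH) {
            barrier_spin_wait(run_barrier);
        } else {
            barrier_halt_wait(run_barrier);
        }
    }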


@ -118,21 +118,42 @@ uintptr_t test_addr[MAX_CPUS];
// Private Functions
//------------------------------------------------------------------------------
-#define BARRIER \
+#define SHORT_BARRIER \
if (TRACE_BARRIERS) { \
trace(my_cpu, "Start barrier wait at %s line %i", __FILE__, __LINE__); \
} \
-barrier_wait(start_barrier);
+if (power_save < POWER_SAVE_HIGH) { \
+barrier_spin_wait(start_barrier); \
+} else { \
+barrier_halt_wait(start_barrier); \
+}
+#define LONG_BARRIER \
+if (TRACE_BARRIERS) { \
+trace(my_cpu, "Start barrier wait at %s line %i", __FILE__, __LINE__); \
+} \
+if (power_save > POWER_SAVE_OFF) { \
+barrier_halt_wait(start_barrier); \
+} else { \
+barrier_spin_wait(start_barrier); \
+}
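
The two macros differ only in their bias. SHORT_BARRIER guards short waits,
where the overhead of restarting a halted core would dominate, so it spins
unless power saving is at its maximum; LONG_BARRIER guards long waits, so
it halts unless power saving is fully disabled:

    power_save       SHORT_BARRIER    LONG_BARRIER
    POWER_SAVE_OFF   spin             spin
    POWER_SAVE_LOW   spin             halt
    POWER_SAVE_HIGH  halt             halt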
static void run_at(uintptr_t addr, int my_cpu)
{
uintptr_t *new_start_addr = (uintptr_t *)(addr + startup - _start);
if (my_cpu == 0) {
// Copy the program code and all data except the stacks.
memcpy((void *)addr, &_start, _stacks - _start);
memcpy((void *)addr, (void *)_start, _stacks - _start);
// Copy the thread-local storage.
size_t locals_offset = _stacks - _start + BSP_STACK_SIZE - LOCALS_SIZE;
for (int cpu_num = 0; cpu_num < num_available_cpus; cpu_num++) {
memcpy((void *)(addr + locals_offset), (void *)(_start + locals_offset), LOCALS_SIZE);
locals_offset += AP_STACK_SIZE;
}
}
-BARRIER;
+LONG_BARRIER;
#ifndef __x86_64__
// The 32-bit startup code needs to know where it is located.
@ -317,7 +338,7 @@ static void test_all_windows(int my_cpu)
display_active_cpu(my_cpu);
}
}
-barrier_init(run_barrier, num_active_cpus);
+barrier_reset(run_barrier, num_active_cpus);
}
int iterations = test_list[test_num].iterations;
@ -328,7 +349,7 @@ static void test_all_windows(int my_cpu)
// Loop through all possible windows.
do {
-BARRIER;
+LONG_BARRIER;
if (bail) {
break;
}
@ -344,7 +365,7 @@ static void test_all_windows(int my_cpu)
window_num = 1;
}
}
-BARRIER;
+SHORT_BARRIER;
// Relocate if necessary.
if (window_num > 0) {
@ -374,16 +395,9 @@ static void test_all_windows(int my_cpu)
}
setup_vm_map(window_start, window_end);
}
-BARRIER;
+SHORT_BARRIER;
-// There is a significant overhead in restarting halted CPU cores, so only enable
-// halting if the memory present in the window is a reasonable size.
-bool halt_if_inactive = enable_halt && num_enabled_cpus > num_active_cpus && num_mapped_pages > PAGE_C(16,MB);
if (!i_am_active) {
-if (!dummy_run && halt_if_inactive) {
-cpu_state[my_cpu] = CPU_STATE_HALTED;
-__asm__ __volatile__ ("hlt");
-}
continue;
}
@ -408,29 +422,6 @@ static void test_all_windows(int my_cpu)
}
if (i_am_master) {
if (!dummy_run && halt_if_inactive) {
int cpu_num = 0;
int retries = 0;
while (cpu_num < num_available_cpus) {
if (cpu_num == my_cpu) {
cpu_num++;
continue;
}
if (cpu_state[cpu_num] == CPU_STATE_ENABLED) {
// This catches a potential race between the inactive CPU halting and the master CPU waking
// it up. This should be an unlikely event, so just spin until the inactive CPU catches up.
usleep(10);
if (++retries < 1000) {
continue;
}
}
if (cpu_state[cpu_num] == CPU_STATE_HALTED) {
smp_send_nmi(cpu_num);
}
retries = 0;
cpu_num++;
}
}
window_num++;
}
} while (window_end < pm_map[pm_map_size - 1].end);
@ -467,7 +458,7 @@ void main(void)
set_scroll_lock(false);
trace(0, "starting other CPUs");
}
-barrier_init(start_barrier, num_enabled_cpus);
+barrier_reset(start_barrier, num_enabled_cpus);
int failed = smp_start(cpu_state);
if (failed) {
const char *message = "Failed to start CPU core %i. Press any key to reboot...";
@ -501,7 +492,7 @@ void main(void)
// where we left off after each relocation.
while (1) {
-BARRIER;
+SHORT_BARRIER;
if (my_cpu == 0) {
if (start_run) {
pass_num = 0;
@ -542,11 +533,11 @@ void main(void)
start_test = false;
rerun_test = false;
}
-BARRIER;
+SHORT_BARRIER;
if (test_list[test_num].enabled) {
test_all_windows(my_cpu);
}
-BARRIER;
+SHORT_BARRIER;
if (my_cpu != 0) {
continue;
}


@ -24,6 +24,8 @@
#define STACKS_SIZE (BSP_STACK_SIZE + MAX_APS * AP_STACK_SIZE)
+#define LOCALS_SIZE 16 /* Stack region reserved for thread-local storage */
#define LOW_LOAD_ADDR 0x00010000 /* The low load address for the main program */
#define HIGH_LOAD_ADDR 0x00100000 /* The high load address for the main program */
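
Together with the startup code changes below, this reserves the top 16
bytes of each CPU's stack for its thread-local flags. CPU n's initial
stack pointer, which is also the base of its thread-local storage, works
out as:

    _stacks + BSP_STACK_SIZE - LOCALS_SIZE + n * AP_STACK_SIZE

The stack grows downwards from that address, so the LOCALS_SIZE bytes
above it are never touched by ordinary stack usage.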


@ -122,7 +122,7 @@ startup:
call smp_my_cpu_num
movl $AP_STACK_SIZE, %edx
mul %edx
-addl $BSP_STACK_SIZE, %eax
+addl $(BSP_STACK_SIZE - LOCALS_SIZE), %eax
leal _stacks@GOTOFF(%ebx), %esp
addl %eax, %esp


@ -158,7 +158,7 @@ startup:
call smp_my_cpu_num
movl $AP_STACK_SIZE, %edx
mul %edx
-addq $BSP_STACK_SIZE, %rax
+addq $(BSP_STACK_SIZE - LOCALS_SIZE), %rax
leaq _stacks(%rip), %rsp
addq %rax, %rsp


@ -8,6 +8,7 @@ INC_DIRS = -I../boot -I../system -I../lib -I../tests -I../app
SYS_OBJS = system/cpuid.o \
system/cpuinfo.o \
+system/cpulocal.o \
system/ehci.o \
system/font.o \
system/hwctrl.o \


@ -8,6 +8,7 @@ INC_DIRS = -I../boot -I../system -I../lib -I../tests -I../app
SYS_OBJS = system/cpuid.o \
system/cpuinfo.o \
+system/cpulocal.o \
system/ehci.o \
system/font.o \
system/hwctrl.o \

lib/assert.h (new file, 25 lines)

@ -0,0 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef ASSERT_H
#define ASSERT_H
/**
* \file
*
* Provides a function to terminate the program if an unexpected and fatal
* error is detected.
*
*//*
* Copyright (C) 2022 Martin Whitaker.
*/
/*
* Terminates the program (using a breakpoint exception) if expr is equal
* to zero.
*/
static inline void assert(int expr)
{
if (!expr) {
__asm__ __volatile__ ("int $3");
}
}
#endif // ASSERT_H
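
A minimal usage sketch (mirroring how barrier_init() uses it later in this
commit); the example function itself is hypothetical:

    #include "assert.h"

    void example(int flag_num)
    {
        // Raises a breakpoint exception (int $3) if flag_num is negative.
        assert(flag_num >= 0);
    }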


@ -1,19 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2020 Martin Whitaker.
-//
-// Derived from an extract of memtest86+ smp.c:
-//
-// MemTest86+ V5 Specific code (GPL V2.0)
-// By Samuel DEMEULEMEESTER, sdemeule@memtest.org
-// http://www.canardpc.com - http://www.memtest.org
-// ------------------------------------------------
-// smp.c - MemTest-86 Version 3.5
-//
-// Released under version 2 of the Gnu Public License.
-// By Chris Brady
+// Copyright (C) 2020-2022 Martin Whitaker.
#include <stdbool.h>
#include <stddef.h>
+#include "cpulocal.h"
#include "smp.h"
+#include "assert.h"
#include "barrier.h"
//------------------------------------------------------------------------------
@ -22,34 +17,67 @@
void barrier_init(barrier_t *barrier, int num_threads)
{
-barrier->num_threads = num_threads;
-barrier->count = num_threads;
-spin_unlock(&barrier->lock);
-spin_unlock(&barrier->st1);
-spin_unlock(&barrier->st2);
-spin_lock(&barrier->st2);
+barrier->flag_num = allocate_local_flag();
+assert(barrier->flag_num >= 0);
+barrier_reset(barrier, num_threads);
}
-void barrier_wait(barrier_t *barrier)
+void barrier_reset(barrier_t *barrier, int num_threads)
{
+barrier->num_threads = num_threads;
+barrier->count = num_threads;
+local_flag_t *waiting_flags = local_flags(barrier->flag_num);
+for (int cpu_num = 0; cpu_num < num_available_cpus; cpu_num++) {
+waiting_flags[cpu_num].flag = false;
+}
+}
+void barrier_spin_wait(barrier_t *barrier)
+{
if (barrier == NULL || barrier->num_threads < 2) {
return;
}
-spin_wait(&barrier->st1); // Wait if the barrier is active.
-spin_lock(&barrier->lock); // Get lock for barrier struct.
-if (--barrier->count == 0) { // Last process?
-spin_lock(&barrier->st1); // Hold up any processes re-entering.
-spin_unlock(&barrier->st2); // Release the other processes.
-barrier->count++;
-spin_unlock(&barrier->lock);
-} else {
-spin_unlock(&barrier->lock);
-spin_wait(&barrier->st2); // Wait for peers to arrive.
-spin_lock(&barrier->lock);
-if (++barrier->count == barrier->num_threads) {
-spin_unlock(&barrier->st1);
-spin_lock(&barrier->st2);
+local_flag_t *waiting_flags = local_flags(barrier->flag_num);
+int my_cpu = smp_my_cpu_num();
+waiting_flags[my_cpu].flag = true;
+if (__sync_fetch_and_sub(&barrier->count, 1) > 1) {
+volatile bool *i_am_blocked = &waiting_flags[my_cpu].flag;
+while (*i_am_blocked) {
+__builtin_ia32_pause();
+}
+return;
+}
+// Last one here, so reset the barrier and wake the others. No need to
+// check if a CPU core is actually waiting - just clear all the flags.
+barrier->count = barrier->num_threads;
+__sync_synchronize();
+for (int cpu_num = 0; cpu_num < num_available_cpus; cpu_num++) {
+waiting_flags[cpu_num].flag = false;
+}
}
+void barrier_halt_wait(barrier_t *barrier)
+{
+if (barrier == NULL || barrier->num_threads < 2) {
+return;
+}
+local_flag_t *waiting_flags = local_flags(barrier->flag_num);
+int my_cpu = smp_my_cpu_num();
+waiting_flags[my_cpu].flag = true;
+if (__sync_fetch_and_sub(&barrier->count, 1) > 1) {
+__asm__ __volatile__ ("hlt");
+return;
+}
+// Last one here, so reset the barrier and wake the others.
+barrier->count = barrier->num_threads;
+__sync_synchronize();
+waiting_flags[my_cpu].flag = false;
+for (int cpu_num = 0; cpu_num < num_available_cpus; cpu_num++) {
+if (waiting_flags[cpu_num].flag) {
+waiting_flags[cpu_num].flag = false;
+smp_send_nmi(cpu_num);
+}
-spin_unlock(&barrier->lock);
}
}
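
Both wait functions follow the same arrival protocol; only the blocking
step differs. A condensed restatement of the logic above (not code from
the commit):

    // Every CPU:
    //   waiting_flags[my_cpu].flag = true;         // publish "I am waiting"
    //   if (__sync_fetch_and_sub(&count, 1) > 1)   // others still to arrive
    //       block: spin on own flag, or hlt until an NMI arrives
    //   else (last to arrive):
    //       count = num_threads;                   // reset for reuse
    //       __sync_synchronize();                  // commit reset before release
    //       clear every flag (spin variant), or
    //       clear-and-NMI each still-flagged CPU (halt variant)

Because each CPU waits on a flag in its own cache line (see cpulocal.h
below), waiting generates no cross-socket cache traffic; the old
implementation funnelled every CPU through a single spinlock-protected
structure, which is what made it slow on multi-socket machines (issue 16).
This also removes the need for the CPU_STATE_HALTED bookkeeping and the
NMI retry loop deleted from main.c above: the halt variant's release path
clears a CPU's flag before sending it an NMI, so only cores still marked
as waiting are woken.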


@ -10,6 +10,8 @@
* Copyright (C) 2020-2022 Martin Whitaker.
*/
#include "cpulocal.h"
#include "spinlock.h"
/**
@ -17,21 +19,31 @@
*/
typedef struct
{
-int num_threads;
-volatile int count;
-spinlock_t lock;
-spinlock_t st1;
-spinlock_t st2;
+int flag_num;
+int num_threads;
+int count;
} barrier_t;
/**
-* Initialises the barrier to block the specified number of threads.
+* Initialises a new barrier to block the specified number of threads.
*/
void barrier_init(barrier_t *barrier, int num_threads);
/**
-* Waits for all threads to arrive at the barrier.
+* Resets an existing barrier to block the specified number of threads.
*/
-void barrier_wait(barrier_t *barrier);
+void barrier_reset(barrier_t *barrier, int num_threads);
+/**
+* Waits for all threads to arrive at the barrier. A CPU core spins in an
+* idle loop when waiting.
+*/
+void barrier_spin_wait(barrier_t *barrier);
+/**
+* Waits for all threads to arrive at the barrier. A CPU core halts when
+* waiting.
+*/
+void barrier_halt_wait(barrier_t *barrier);
#endif // BARRIER_H
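
A usage sketch of the new API, following the calling pattern visible in
main.c above (the example function and 'bar' are illustrative; 'bar' must
point to memory shared by all CPUs):

    void example(barrier_t *bar, int my_cpu, int num_cpus)
    {
        if (my_cpu == 0) {
            barrier_init(bar, num_cpus);   // once, before the other CPUs start
        }
        // ... all participating CPUs arrive here ...
        barrier_spin_wait(bar);            // or barrier_halt_wait(bar)
    }

Note that all CPUs must use the same wait flavour at a given crossing: the
spin variant's release path clears the flags without sending NMIs, so a
core that had chosen to halt would never be woken. The SHORT_BARRIER and
LONG_BARRIER macros guarantee this by deriving the choice from the global
power_save setting.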

system/cpulocal.c (new file, 26 lines)

@ -0,0 +1,26 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2022 Martin Whitaker.
#include <stdbool.h>
#include "boot.h"
#include "cpulocal.h"
//------------------------------------------------------------------------------
// Variables
//------------------------------------------------------------------------------
int local_bytes_used = 0;
//------------------------------------------------------------------------------
// Public Functions
//------------------------------------------------------------------------------
int allocate_local_flag(void)
{
if (local_bytes_used == LOCALS_SIZE) {
return -1;
}
return local_bytes_used += sizeof(bool);
}
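
With LOCALS_SIZE set to 16 in boot.h and sizeof(bool) == 1, at most 16
flag arrays can be allocated before the function returns -1. A usage
sketch, mirroring barrier_init() above:

    int flag_num = allocate_local_flag();
    assert(flag_num >= 0);                 // -1 means out of thread-local space
    local_flag_t *flags = local_flags(flag_num);
    flags[smp_my_cpu_num()].flag = true;   // each CPU writes only its own entry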

system/cpulocal.h (new file, 46 lines)

@ -0,0 +1,46 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef CPULOCAL_H
#define CPULOCAL_H
/**
* \file
*
* Provides functions to allocate and access thread-local flags.
*
*//*
* Copyright (C) 2022 Martin Whitaker.
*/
#include <stdbool.h>
#include <stdint.h>
#include "boot.h"
/**
* A single thread-local flag. These are spaced out in memory to ensure each
* flag occupies a different cache line.
*/
typedef struct __attribute__((packed)) {
bool flag;
uint8_t spacing[AP_STACK_SIZE - sizeof(bool)];
} local_flag_t;
/**
* Allocates an array of thread-local flags, one per CPU core, and returns
* an ID number that identifies the allocated array. Returns -1 if there is
* insufficient thread-local storage remaining to allocate a new array of
* flags.
*/
int allocate_local_flag(void);
/**
* Returns a pointer to the previously allocated array of thread-local flags
* identified by flag_num.
*/
static inline local_flag_t *local_flags(int flag_num)
{
// The number returned by allocate_local_flag is the byte offset of the
// flag from the start of the thread-local storage.
return (local_flag_t *)(_stacks + BSP_STACK_SIZE - LOCALS_SIZE + flag_num);
}
#endif // CPULOCAL_H
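
The AP_STACK_SIZE spacing in local_flag_t means that indexing the array by
CPU number lands each entry in the corresponding CPU's own stack region:

    &local_flags(flag_num)[n]
        == _stacks + BSP_STACK_SIZE - LOCALS_SIZE + flag_num + n * AP_STACK_SIZE

i.e. byte flag_num of the LOCALS_SIZE region that the startup code reserves
at the top of CPU n's stack. Since AP_STACK_SIZE is far larger than a cache
line, no two CPUs' flags share a line, which is what makes the wait loop in
barrier_spin_wait() contention-free.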


@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2020 Martin Whitaker.
+// Copyright (C) 2020-2022 Martin Whitaker.
//
// Derived from memtest86+ reloc.c:
//
@ -11,6 +11,8 @@
#include <stddef.h>
#include <stdint.h>
#include "assert.h"
//------------------------------------------------------------------------------
// Constants
//------------------------------------------------------------------------------
@ -61,13 +63,6 @@ typedef struct
#define ELF32_R_TYPE(r_info) ((r_info) & 0xff)
-static inline void assert(int expr)
-{
-if (!expr) {
-__asm__ __volatile__ ("int $3");
-}
-}
/*
* Return the run-time load address of the shared object. This must be inlined
* in a function which uses global data.


@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2020 Martin Whitaker.
+// Copyright (C) 2020-2022 Martin Whitaker.
//
// Derived from memtest86+ reloc.c:
//
@ -11,6 +11,8 @@
#include <stddef.h>
#include <stdint.h>
#include "assert.h"
//------------------------------------------------------------------------------
// Constants
//------------------------------------------------------------------------------
@ -62,13 +64,6 @@ typedef struct
#define ELF64_R_TYPE(r_info) ((r_info) & 0xffffffff)
-static inline void assert(int expr)
-{
-if (!expr) {
-__asm__ __volatile__ ("int $3");
-}
-}
/*
* Return the run-time load address of the shared object.
*/


@ -63,6 +63,10 @@
#define APIC_DELMODE_STARTUP 6
#define APIC_DELMODE_EXTINT 7
+// APIC ICR busy flag
+#define APIC_ICR_BUSY (1 << 12)
// IA32_APIC_BASE MSR bits
#define IA32_APIC_ENABLED (1 << 11)
@ -614,18 +618,23 @@ static bool find_cpus_in_rsdp(void)
return false;
}
-static bool send_ipi(int apic_id, int trigger, int level, int mode, uint8_t vector, int delay_before_poll)
+static inline void send_ipi(int apic_id, int trigger, int level, int mode, uint8_t vector)
{
apic_write(APIC_REG_ICRHI, apic_id << 24);
apic_write(APIC_REG_ICRLO, trigger << 15 | level << 14 | mode << 8 | vector);
+}
+static bool send_ipi_and_wait(int apic_id, int trigger, int level, int mode, uint8_t vector, int delay_before_poll)
+{
+send_ipi(apic_id, trigger, level, mode, vector);
usleep(delay_before_poll);
// Wait for send complete or timeout after 100ms.
int timeout = 1000;
while (timeout > 0) {
bool send_pending = (apic_read(APIC_REG_ICRLO) & 0x00001000);
bool send_pending = (apic_read(APIC_REG_ICRLO) & APIC_ICR_BUSY);
if (!send_pending) {
return true;
}
@ -663,13 +672,13 @@ static bool start_cpu(int cpu_num)
(void)read_apic_esr(is_p5);
// Pulse the INIT IPI.
-if (!send_ipi(apic_id, APIC_TRIGGER_LEVEL, 1, APIC_DELMODE_INIT, 0, 0)) {
+if (!send_ipi_and_wait(apic_id, APIC_TRIGGER_LEVEL, 1, APIC_DELMODE_INIT, 0, 0)) {
return false;
}
if (use_long_delays) {
usleep(10*1000); // 10ms
}
-if (!send_ipi(apic_id, APIC_TRIGGER_LEVEL, 0, APIC_DELMODE_INIT, 0, 0)) {
+if (!send_ipi_and_wait(apic_id, APIC_TRIGGER_LEVEL, 0, APIC_DELMODE_INIT, 0, 0)) {
return false;
}
@ -679,7 +688,7 @@ static bool start_cpu(int cpu_num)
(void)read_apic_esr(is_p5);
// Send the STARTUP IPI.
-if (!send_ipi(apic_id, 0, 0, APIC_DELMODE_STARTUP, AP_TRAMPOLINE_PAGE, use_long_delays ? 300 : 10)) {
+if (!send_ipi_and_wait(apic_id, 0, 0, APIC_DELMODE_STARTUP, AP_TRAMPOLINE_PAGE, use_long_delays ? 300 : 10)) {
return false;
}
@ -785,9 +794,12 @@ int smp_start(cpu_state_t cpu_state[MAX_CPUS])
#endif
}
-bool smp_send_nmi(int cpu_num)
+void smp_send_nmi(int cpu_num)
{
-return send_ipi(cpu_num_to_apic_id[cpu_num], 0, 0, APIC_DELMODE_NMI, 0, 200);
+while (apic_read(APIC_REG_ICRLO) & APIC_ICR_BUSY) {
+__builtin_ia32_pause();
+}
+send_ipi(cpu_num_to_apic_id[cpu_num], 0, 0, APIC_DELMODE_NMI, 0);
}
int smp_my_cpu_num(void)
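
smp_send_nmi() is now fire-and-forget: it waits for any previous IPI to
leave the ICR before writing, but does not poll for delivery afterwards.
That is sufficient for its caller in this commit, barrier_halt_wait(),
which wakes several halted cores in a row (condensed from barrier.c above):

    for (int cpu_num = 0; cpu_num < num_available_cpus; cpu_num++) {
        if (waiting_flags[cpu_num].flag) {   // still blocked in hlt
            waiting_flags[cpu_num].flag = false;
            smp_send_nmi(cpu_num);           // serialises on APIC_ICR_BUSY internally
        }
    }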


@ -29,8 +29,7 @@
typedef enum __attribute__ ((packed)) {
CPU_STATE_DISABLED = 0,
CPU_STATE_ENABLED = 1,
-CPU_STATE_RUNNING = 2,
-CPU_STATE_HALTED = 3
+CPU_STATE_RUNNING = 2
} cpu_state_t;
/**
@ -63,7 +62,7 @@ int smp_start(cpu_state_t cpu_state[MAX_CPUS]);
* Sends a non-maskable interrupt to the CPU core whose ordinal number
* is cpu_num.
*/
-bool smp_send_nmi(int cpu_num);
+void smp_send_nmi(int cpu_num);
/**
* Returns the ordinal number of the calling CPU core.


@ -117,10 +117,19 @@ void calculate_chunk(testword_t **start, testword_t **end, int my_cpu, int segme
void flush_caches(int my_cpu)
{
if (my_cpu >= 0) {
-barrier_wait(run_barrier);
+bool use_spin_wait = (power_save < POWER_SAVE_HIGH);
+if (use_spin_wait) {
+barrier_spin_wait(run_barrier);
+} else {
+barrier_halt_wait(run_barrier);
+}
if (my_cpu == master_cpu) {
cache_flush();
}
-barrier_wait(run_barrier);
+if (use_spin_wait) {
+barrier_spin_wait(run_barrier);
+} else {
+barrier_halt_wait(run_barrier);
+}
}
}


@ -79,7 +79,11 @@ int ticks_per_test[NUM_PASS_TYPES][NUM_TEST_PATTERNS];
if (TRACE_BARRIERS) { \
trace(my_cpu, "Run barrier wait at %s line %i", __FILE__, __LINE__); \
} \
-barrier_wait(run_barrier); \
+if (power_save < POWER_SAVE_HIGH) { \
+barrier_spin_wait(run_barrier); \
+} else { \
+barrier_halt_wait(run_barrier); \
+} \
}
int run_test(int my_cpu, int test, int stage, int iterations)