Preliminary ECC support for AMD Zen CPUs (#353)

* Initial commit for ECC support. Preliminary support for AMD Zen.

* Clear ECC registers at startup

* Add config flag (enable_ecc_polling) to toggle ECC polling. (Currently disabled by default for v7 release)
This commit is contained in:
Sam Demeulemeester 2023-11-29 12:53:05 +01:00 committed by GitHub
parent 9b9c65b968
commit 5dde13b0a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 286 additions and 35 deletions

View File

@ -99,6 +99,8 @@ bool enable_sm = true;
bool enable_bench = true;
bool enable_mch_read = true;
// Gates ECC detection (and therefore ECC error polling). Off by default.
bool enable_ecc_polling = false;
bool pause_at_start = true;
power_save_t power_save = POWER_SAVE_HIGH;

View File

@ -59,6 +59,7 @@ extern bool enable_sm;
extern bool enable_tty;
extern bool enable_bench;
extern bool enable_mch_read;
// When true, ECC detection (and therefore ECC error polling) is attempted.
extern bool enable_ecc_polling;
extern bool pause_at_start;

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2020-2022 Martin Whitaker.
// Copyright (C) 2004-2022 Sam Demeulemeester.
// Copyright (C) 2004-2023 Sam Demeulemeester.
#include <stdbool.h>
#include <stdint.h>
@ -119,6 +119,10 @@ void display_init(void)
prints(8, 0, "Using: | Pass: Errors:");
// prints(9, 0, "--------------------------------------------------------------------------------");
if (ecc_status.ecc_enabled) {
prints(8, 57, "Err: ECC:");
}
for (int i = 0;i < 80; i++) {
print_char(6, i, 0xc4);
print_char(9, i, 0xc4);
@ -293,11 +297,20 @@ void display_start_run(void)
clear_message_area();
}
clear_screen_region(7, 49, 7, 57); // run time
clear_screen_region(8, 49, 8, 57); // pass number
clear_screen_region(8, 68, 8, SCREEN_WIDTH - 1); // error count
clear_screen_region(7, 49, 7, 57); // run time
if (ecc_status.ecc_enabled) {
clear_screen_region(8, 49, 8, 53); // pass number
clear_screen_region(8, 61, 8, 68); // error count
clear_screen_region(8, 74, 8, SCREEN_WIDTH - 1); // ecc error count
} else {
clear_screen_region(8, 49, 8, 59); // pass number
clear_screen_region(8, 68, 8, SCREEN_WIDTH - 1); // error count
}
display_pass_count(0);
display_error_count(0);
error_count = 0;
display_error_count();
if (clks_per_msec > 0) {
// If we've measured the CPU speed, we know the TSC is available.
run_start_time = get_tsc();
@ -332,6 +345,15 @@ void display_start_test(void)
test_ticks = 0;
}
/*
 * Redraws the error counter(s) on the status line from the global totals.
 *
 * Without ECC only error_count is shown; with ECC enabled both error_count
 * and the correctable ECC total (error_count_cecc) are shown.
 */
void display_error_count(void)
{
    // Plain layout: a single counter.
    if (!ecc_status.ecc_enabled) {
        display_err_count_without_ecc(error_count);
        return;
    }

    // ECC layout: memory errors followed by correctable ECC errors.
    display_err_count_with_ecc(error_count, error_count_cecc);
}
void display_temperature(void)
{
if (!enable_temperature) {
@ -477,6 +499,7 @@ void do_tick(int my_cpu)
} else {
barrier_halt_wait(run_barrier);
}
if (master_cpu == my_cpu) {
check_input();
error_update();
@ -551,6 +574,9 @@ void do_tick(int my_cpu)
display_big_status(false);
}
// Check ECC Errors
memctrl_poll_ecc();
// Update temperature
display_temperature();
@ -561,6 +587,7 @@ void do_tick(int my_cpu)
tty_partial_redraw();
}
}
timed_update_done = true;
}

View File

@ -9,7 +9,7 @@
*
*//*
* Copyright (C) 2020-2022 Martin Whitaker.
* Copyright (C) 2004-2022 Sam Demeulemeester.
* Copyright (C) 2004-2023 Sam Demeulemeester.
*/
#include <stdbool.h>
@ -188,9 +188,15 @@ typedef enum {
#define display_pass_count(count) \
printi(8, 51, count, 0, false, true)
#define display_error_count(count) \
#define display_err_count_without_ecc(count) \
printi(8, 68, count, 0, false, true)
/*
 * Prints both counters used when ECC is enabled: count_err via printi(8, 62, ...)
 * and count_ecc via printi(8, 74, ...). The leading arguments are presumably
 * (row, column) - TODO confirm against printi().
 *
 * Expands to a brace block, so use it as a full statement.
 */
#define display_err_count_with_ecc(count_err, count_ecc) \
{ \
printi(8, 62, count_err, 0, false, true); \
printi(8, 74, count_ecc, 0, false, true); \
}
#define clear_message_area() \
{ \
clear_screen_region(ROW_MESSAGE_T, 0, ROW_MESSAGE_B, SCREEN_WIDTH - 1); \
@ -245,6 +251,8 @@ void display_start_pass(void);
void display_start_test(void);
void display_error_count(void);
void display_temperature(void);
void display_big_status(bool pass);

View File

@ -22,9 +22,8 @@
#include "test.h"
#include "tests.h"
#include "serial.h"
#include "memctrl.h"
#include "error.h"
//------------------------------------------------------------------------------
@ -39,7 +38,13 @@
// Types
//------------------------------------------------------------------------------
typedef enum { ADDR_ERROR, DATA_ERROR, PARITY_ERROR, NEW_MODE } error_type_t;
// Kind of event passed to common_err().
typedef enum {
    ADDR_ERROR,     // presumably raised by addr_error() - not visible here
    DATA_ERROR,     // data mismatch, raised by data_error()
    PARITY_ERROR,   // parity error detected near an address
    UECC_ERROR,     // presumably an uncorrectable ECC error - TODO confirm
    CECC_ERROR,     // correctable ECC error, raised by ecc_error()
    NEW_MODE        // no new error: the error display mode has changed
} error_type_t;
typedef struct {
uintptr_t page;
@ -71,7 +76,8 @@ static error_info_t error_info;
// Public Variables
//------------------------------------------------------------------------------
uint64_t error_count = 0;
uint64_t error_count = 0;
uint64_t error_count_cecc = 0;
//------------------------------------------------------------------------------
// Private Functions
@ -150,7 +156,7 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
restore_big_status();
bool new_header = (error_count == 0) || (error_mode != last_error_mode);
bool new_header = (error_count == 0 && error_count_cecc == 0) || (error_mode != last_error_mode);
if (new_header) {
clear_message_area();
badram_init();
@ -184,11 +190,17 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
}
if (new_address) {
if (error_count < ERROR_LIMIT) {
error_count++;
}
if (test_list[test_num].errors < INT_MAX) {
test_list[test_num].errors++;
if (type == CECC_ERROR) {
if ((error_count_cecc + ecc_status.count) < 999999) {
error_count_cecc += ecc_status.count;
}
} else {
if (error_count < ERROR_LIMIT) {
error_count++;
}
if (test_list[test_num].errors < INT_MAX) {
test_list[test_num].errors++;
}
}
}
@ -241,7 +253,7 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
test_list[i].errors);
}
display_error_count(error_count);
display_error_count();
}
break;
@ -268,10 +280,15 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
scroll();
set_foreground_colour(YELLOW);
display_scrolled_message(0, " %2i %4i %2i %09x%03x (%kB)",
smp_my_cpu_num(), pass_num, test_num, page, offset, page << 2);
type != CECC_ERROR ? smp_my_cpu_num() : ecc_status.core,
pass_num, test_num, page, offset, page << 2);
if (type == PARITY_ERROR) {
display_scrolled_message(41, "%s", "Parity error detected near this address");
} else if (type == CECC_ERROR) {
display_scrolled_message(41, "%s%2i", "Correctable ECC Error - CH#", ecc_status.channel);
} else {
#if TESTWORD_WIDTH > 32
display_scrolled_message(41, "%016x %016x", good, bad);
@ -279,9 +296,10 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
display_scrolled_message(41, "%08x %08x %08x %i", good, bad, xor, error_count);
#endif
}
set_foreground_colour(WHITE);
display_error_count(error_count);
display_error_count();
}
break;
@ -295,7 +313,7 @@ static void common_err(error_type_t type, uintptr_t addr, testword_t good, testw
break;
}
if (type != PARITY_ERROR) {
if (type != PARITY_ERROR && type != CECC_ERROR) {
error_info.last_addr = addr;
error_info.last_xor = xor;
}
@ -345,6 +363,12 @@ void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_
common_err(DATA_ERROR, (uintptr_t)addr, good, bad, use_for_badram);
}
/*
 * Records the ECC error currently described by the global ecc_status.
 *
 * The error is logged as a correctable ECC error (CECC_ERROR) at
 * ecc_status.addr. There is no good/bad data pattern for an ECC event, so
 * zeros are passed, and the address is not used for BadRAM. The on-screen
 * error state is then refreshed via error_update().
 */
void ecc_error(void)
{
    common_err(CECC_ERROR, ecc_status.addr, 0, 0, false);
    error_update();
}
#if REPORT_PARITY_ERRORS
void parity_error(void)
{
@ -356,7 +380,7 @@ void parity_error(void)
void error_update(void)
{
if (error_count > 0) {
if (error_count > 0 || error_count_cecc > 0) {
if (error_mode != last_error_mode) {
common_err(NEW_MODE, 0, 0, 0, false);
}
@ -365,12 +389,16 @@ void error_update(void)
test_list[test_num].errors == INT_MAX ? '>' : ' ',
test_list[test_num].errors);
}
display_error_count(error_count);
display_status("Failed!");
display_error_count();
// Display FAIL banner on first error
if (error_count == 1) {
display_big_status(false);
// Only fail if error is uncorrected
if (error_count > 0) {
display_status("Failed!");
// Display FAIL banner on first uncorrectable error
if (error_count == 1) {
display_big_status(false);
}
}
if (enable_tty) {

View File

@ -20,6 +20,11 @@
*/
extern uint64_t error_count;
/**
* The number of correctable ECC errors recorded during the current run.
*/
extern uint64_t error_count_cecc;
/**
* Initialises the error records.
*/
@ -35,6 +40,12 @@ void addr_error(testword_t *addr1, testword_t *addr2, testword_t good, testword_
*/
void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_badram);
/**
 * Adds an ECC error to the error reports.
 * The details of the error are taken from the global ecc_status.
 */
void ecc_error(void);
#if REPORT_PARITY_ERRORS
/**
* Adds a parity error to the error reports.

View File

@ -6,6 +6,9 @@
// Platform-specific code for AMD Zen CPUs
//
#include "error.h"
#include "config.h"
#include "cpuinfo.h"
#include "memctrl.h"
#include "msr.h"
@ -13,12 +16,30 @@
#include "imc.h"
#include "display.h" // DEBUG
// SMN address of the UMC register block, and the offset from one channel's
// block to the next.
#define AMD_SMN_UMC_BAR             0x050000
#define AMD_SMN_UMC_CHB_OFFSET      0x100000

// UMC registers, as SMN addresses. Every replacement list is parenthesised so
// that the macro behaves as a single operand in any expression.
#define AMD_SMN_UMC_DRAM_ECC_CTRL   (AMD_SMN_UMC_BAR + 0x14C)
#define AMD_SMN_UMC_DRAM_CONFIG     (AMD_SMN_UMC_BAR + 0x200)
#define AMD_SMN_UMC_DRAM_TIMINGS1   (AMD_SMN_UMC_BAR + 0x204)
#define AMD_SMN_UMC_DRAM_TIMINGS2   (AMD_SMN_UMC_BAR + 0x208)
#define AMD_SMN_UMC_ECC_ERR_CNT_SEL (AMD_SMN_UMC_BAR + 0xD80)
#define AMD_SMN_UMC_ECC_ERR_CNT     (AMD_SMN_UMC_BAR + 0xD84)

// MSR number stride between the MCA registers of consecutive UMCs.
#define AMD_UMC_OFFSET              0x10

// Bits tested in the upper 32 bits of a UMC MCA status MSR.
// Bit 31 uses an unsigned constant: shifting 1 into the sign bit of a signed
// int is undefined behaviour.
#define AMD_UMC_VALID_ERROR_BIT     (1U << 31)
#define AMD_UMC_ERROR_CECC_BIT      (1 << 14)
#define AMD_UMC_ERROR_UECC_BIT      (1 << 13)

// Error counter enable bit in the ECC_ERR_CNT_SEL register.
#define AMD_UMC_ERR_CNT_EN          (1 << 15)

// Bits set in MSR_IA32_MCG_CTL to enable reporting for 2 or 4 UMCs.
#define AMD_MCG_CTL_2_BANKS         ((1 << 16) | (1 << 15))
#define AMD_MCG_CTL_4_BANKS         ((1 << 18) | (1 << 17) | (1 << 16) | (1 << 15))

// Bit set in MSR_AMD64_HW_CONF to allow writes to the MCA status registers.
#define AMD_MCA_STATUS_WR_ENABLE    (1 << 18)

// ECC enable bits tested in the DRAM_ECC_CTRL register.
#define ECC_RD_EN                   (1 << 10)
#define ECC_WR_EN                   (1 << 0)
void get_imc_config_amd_zen(void)
{
uint32_t smn_reg, offset;
@ -68,4 +89,119 @@ void get_imc_config_amd_zen(void)
// RAS Precharge (tRP)
imc.tRP = (smn_reg >> 16) & 0x3F;
// Detect ECC (x64 only)
#if TESTWORD_WIDTH > 32
if (enable_ecc_polling) {
uint32_t regl, regh;
smn_reg = amd_smn_read(AMD_SMN_UMC_DRAM_ECC_CTRL + offset);
if (smn_reg & (ECC_RD_EN | ECC_WR_EN)) {
ecc_status.ecc_enabled = true;
// Number of UMC to init
uint8_t umc = 0, umc_max = 0;
uint32_t umc_banks_bits = 0;
if (imc.family == IMC_K19_VRM || imc.family == IMC_K19_RPL) {
umc_max = 4;
umc_banks_bits = AMD_MCG_CTL_4_BANKS;
} else {
umc_max = 2;
umc_banks_bits = AMD_MCG_CTL_2_BANKS;
}
// Enable ECC reporting
rdmsr(MSR_IA32_MCG_CTL, regl, regh);
wrmsr(MSR_IA32_MCG_CTL, regl | umc_banks_bits, regh);
rdmsr(MSR_AMD64_HW_CONF, regl, regh);
wrmsr(MSR_AMD64_HW_CONF, regl | AMD_MCA_STATUS_WR_ENABLE, regh); // // Enable Write to MCA STATUS Register
for (umc = 0; umc < umc_max; umc++)
{
rdmsr(MSR_AMD64_UMC_MCA_CTRL + (umc * AMD_UMC_OFFSET), regl, regh);
wrmsr(MSR_AMD64_UMC_MCA_CTRL + (umc * AMD_UMC_OFFSET), regl | 1, regh);
}
smn_reg = amd_smn_read(AMD_SMN_UMC_ECC_ERR_CNT_SEL);
amd_smn_write(AMD_SMN_UMC_ECC_ERR_CNT_SEL, smn_reg | AMD_UMC_ERR_CNT_EN); // Enable CH0 Error CNT
smn_reg = amd_smn_read(AMD_SMN_UMC_ECC_ERR_CNT_SEL + AMD_SMN_UMC_CHB_OFFSET);
amd_smn_write(AMD_SMN_UMC_ECC_ERR_CNT_SEL + AMD_SMN_UMC_CHB_OFFSET, smn_reg | AMD_UMC_ERR_CNT_EN); // Enable CH1 Error CNT
poll_ecc_amd_zen(false); // Clear ECC registers
}
}
#endif
}
/*
 * Polls the MCA status register of each UMC for a logged error, then clears
 * whatever was found.
 *
 * report: when true, every error found is passed to ecc_error() before it is
 *         cleared; when false, errors are only cleared (the start-up call
 *         uses this to flush the registers).
 *
 * While an error is being handled its details are published in the global
 * ecc_status, which is reset once the hardware state has been cleared.
 */
void poll_ecc_amd_zen(bool report)
{
uint8_t umc = 0, umc_max = 0;
uint32_t regh, regl;
// Number of UMC to check: 4 for the K19 VRM/RPL families, 2 otherwise
if (imc.family == IMC_K19_VRM || imc.family == IMC_K19_RPL) {
umc_max = 4;
} else {
umc_max = 2;
}
// Check all UMCs
for (umc = 0; umc < umc_max; umc++)
{
// Get Status Register (regl = lower 32 bits, regh = upper 32 bits)
rdmsr(MSR_AMD64_UMC_MCA_STATUS + (AMD_UMC_OFFSET * umc), regl, regh);
// Check if a valid error has been logged
if (regh & AMD_UMC_VALID_ERROR_BIT) {
// Classify the error as corrected, uncorrected or unknown.
// NOTE(review): the original comment here said only corrected ECC errors
// are reported and uncorrected ones are skipped to avoid double detection,
// but ecc_error() below is called for every valid error whatever its
// type - confirm which behaviour is intended.
if (regh & AMD_UMC_ERROR_CECC_BIT) {
ecc_status.type = ECC_ERR_CORRECTED;
} else if (regh & AMD_UMC_ERROR_UECC_BIT) {
ecc_status.type = ECC_ERR_UNCORRECTED;
} else {
ecc_status.type = ERR_UNKNOWN;
}
// Populate Channel Number (the UMC index)
ecc_status.channel = umc;
// Get Core# associated with the error (low 6 bits of the upper status word)
ecc_status.core = regh & 0x3F;
// Get address: low 24 bits of the upper word plus the whole lower word
rdmsr(MSR_AMD64_UMC_MCA_ADDR + (AMD_UMC_OFFSET * umc), regl, regh);
ecc_status.addr = (uint64_t)(regh & 0x00FFFFFF) << 32;
ecc_status.addr |= regl;
// Clear Address n-th LSBs according to MSR bit[61:56]
ecc_status.addr &= ~0ULL << ((regh >> 24) & 0x3F);
// Get ECC Error Count (low 16 bits of the per-channel SMN counter)
ecc_status.count = amd_smn_read(AMD_SMN_UMC_ECC_ERR_CNT + (AMD_SMN_UMC_CHB_OFFSET * umc)) & 0xFFFF;
// A valid error was logged, so count at least one even if the counter reads 0
if (!ecc_status.count) ecc_status.count++;
// Report error
if (report) {
ecc_error();
}
// Clear Error: write the status back with the valid bit cleared, then zero
// the SMN error counter
rdmsr(MSR_AMD64_UMC_MCA_STATUS + (AMD_UMC_OFFSET * umc), regl, regh);
wrmsr(MSR_AMD64_UMC_MCA_STATUS + (AMD_UMC_OFFSET * umc), regl, regh & ~AMD_UMC_VALID_ERROR_BIT);
amd_smn_write(AMD_SMN_UMC_ECC_ERR_CNT + (AMD_SMN_UMC_CHB_OFFSET * umc), 0x0);
// Clear Internal ECC Error status
ecc_status.type = ECC_ERR_NONE;
ecc_status.addr = 0;
ecc_status.count = 0;
ecc_status.core = 0;
ecc_status.channel = 0;
}
}
}

View File

@ -3,6 +3,10 @@
#ifndef _IMC_H_
#define _IMC_H_
/**
* Integrated Memory Controller (IMC) Settings Detection Code
*/
/* Memory configuration Detection for AMD Zen CPUs */
void get_imc_config_amd_zen(void);
@ -21,4 +25,11 @@ void get_imc_config_intel_icl(void);
/* Memory configuration Detection for Intel Alder Lake */
void get_imc_config_intel_adl(void);
/**
* ECC Polling Code for various IMCs
*/
/*
* ECC Polling Code for AMD Zen CPUs.
* Pass report = true to have detected errors added to the error reports,
* or report = false to only clear the ECC error registers.
*/
void poll_ecc_amd_zen(bool report);
#endif /* _IMC_H_ */

View File

@ -14,9 +14,11 @@
#include "memctrl.h"
#include "imc/imc.h"
#include "display.h"
imc_info_t imc = {"UNDEF", 0, 0, 0, 0, 0, 0, 0, 0};
ecc_info_t ecc_status = {false, ECC_ERR_NONE, 0, 0, 0, 0, 0};
ecc_info_t ecc_status = {false, ECC_ERR_NONE, 0, 0, 0, 0};
// ---------------------
// -- Public function --
@ -64,3 +66,21 @@ void memctrl_init(void)
imc.freq = 0;
}
}
/*
 * Polls the memory controller for new ECC errors and reports any found.
 *
 * Does nothing unless ECC was detected as enabled (ecc_status.ecc_enabled).
 * Only the AMD Zen families have a poller; any other family is ignored.
 */
void memctrl_poll_ecc(void)
{
    if (ecc_status.ecc_enabled) {
        // Families served by the AMD Zen poller.
        if (imc.family == IMC_K17
                || imc.family == IMC_K19_VRM
                || imc.family == IMC_K19_RPL
                || imc.family == IMC_K19_RBT) {
            poll_ecc_amd_zen(true);
        }
    }
}

View File

@ -27,17 +27,17 @@ typedef struct __attribute__((packed)) imc_infos {
typedef enum {
ECC_ERR_NONE,
ECC_ERR_CORRECTED,
ECC_ERR_UNCORRECTED
ECC_ERR_UNCORRECTED,
ERR_UNKNOWN
} ecc_error_type_t;
typedef struct __attribute__((packed)) ecc_status {
bool ecc_enabled;
ecc_error_type_t err_type;
uint64_t err_adr;
uint32_t err_col;
uint32_t err_row;
uint32_t err_rank;
uint32_t err_bank;
ecc_error_type_t type;
uint64_t addr;
uint32_t count;
uint16_t core;
uint8_t channel;
} ecc_info_t;
/**
@ -54,4 +54,6 @@ extern ecc_info_t ecc_status;
void memctrl_init(void);
void memctrl_poll_ecc(void);
#endif // MEMCTRL_H

View File

@ -8,6 +8,7 @@
*
*//*
* Copyright (C) 2020-2022 Martin Whitaker.
* Copyright (C) 2020-2023 Sam Demeulemeester.
*/
#define MSR_PLATFORM_INFO 0xce
@ -30,6 +31,10 @@
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_COFVID_STATUS 0xc0010071
// MCA control/status/address MSRs of the first UMC; the ECC code adds a
// per-UMC offset to these to reach the other UMCs.
#define MSR_AMD64_UMC_MCA_CTRL 0xc00020f0
#define MSR_AMD64_UMC_MCA_STATUS 0xc00020f1
#define MSR_AMD64_UMC_MCA_ADDR 0xc00020f2
// Hardware configuration MSR; the ECC code sets a bit here to enable writes
// to the MCA status registers.
#define MSR_AMD64_HW_CONF 0xc0010015
#define MSR_VIA_TEMP_C7 0x1169
#define MSR_VIA_TEMP_NANO 0x1423