From 9f1070514c8adbc7ce06818ba9830d26710711ca Mon Sep 17 00:00:00 2001 From: mjacob Date: Wed, 8 Jul 1998 00:49:06 +0000 Subject: [PATCH] First (prelim placeholder) pass at machine check handler for tlsb --- sys/arch/alpha/alpha/dec_kn8ae.c | 380 ++++++++++++++++++++++++++++++- 1 file changed, 378 insertions(+), 2 deletions(-) diff --git a/sys/arch/alpha/alpha/dec_kn8ae.c b/sys/arch/alpha/alpha/dec_kn8ae.c index bd3aedc915c2..27e6544a5fdb 100644 --- a/sys/arch/alpha/alpha/dec_kn8ae.c +++ b/sys/arch/alpha/alpha/dec_kn8ae.c @@ -1,4 +1,4 @@ -/* $NetBSD: dec_kn8ae.c,v 1.16 1998/04/15 00:46:17 mjacob Exp $ */ +/* $NetBSD: dec_kn8ae.c,v 1.17 1998/07/08 00:49:06 mjacob Exp $ */ /* * Copyright (c) 1997 by Matthew Jacob @@ -32,7 +32,7 @@ #include /* RCS ID & Copyright macro defns */ -__KERNEL_RCSID(0, "$NetBSD: dec_kn8ae.c,v 1.16 1998/04/15 00:46:17 mjacob Exp $"); +__KERNEL_RCSID(0, "$NetBSD: dec_kn8ae.c,v 1.17 1998/07/08 00:49:06 mjacob Exp $"); #include #include @@ -44,6 +44,9 @@ __KERNEL_RCSID(0, "$NetBSD: dec_kn8ae.c,v 1.16 1998/04/15 00:46:17 mjacob Exp $" #include #include #include +#include +#include +#include #include #include @@ -56,9 +59,18 @@ __KERNEL_RCSID(0, "$NetBSD: dec_kn8ae.c,v 1.16 1998/04/15 00:46:17 mjacob Exp $" #include #include +#include +#include +#include +#define KV(_addr) ((caddr_t)ALPHA_PHYS_TO_K0SEG((_addr))) + + void dec_kn8ae_init __P((void)); static void dec_kn8ae_device_register __P((struct device *, void *)); +static void dec_kn8ae_mcheck_handler + __P((unsigned long, struct trapframe *, unsigned long, unsigned long)); + const struct alpha_variation_table dec_kn8ae_variations[] = { { 0, "AlphaServer 8400" }, { 0, NULL }, @@ -80,6 +92,7 @@ dec_kn8ae_init() platform.iobus = "tlsb"; platform.device_register = dec_kn8ae_device_register; + platform.mcheck_handler = dec_kn8ae_mcheck_handler; } /* @@ -215,3 +228,366 @@ dec_kn8ae_device_register(dev, aux) } } } + +/* + * KN8AE Machine Check Handlers. 
+ */
+void kn8ae_harderr __P((unsigned long, unsigned long,
+	unsigned long, struct trapframe *));
+
+static void kn8ae_softerr __P((unsigned long, unsigned long,
+	unsigned long, struct trapframe *));
+
+void kn8ae_mcheck __P((unsigned long, unsigned long,
+	unsigned long, struct trapframe *));
+
+/*
+ * Support routine for clearing errors
+ */
+static void clear_tlsb_ebits __P((int));
+
+/*
+ * clear_tlsb_ebits:
+ *
+ *	Walk every TLSB node recorded in tlsb_found and rewrite each of its
+ *	error registers with the value just read from it.  (Presumably the
+ *	registers are write-one-to-clear; confirm against the TLSB manual.)
+ *
+ *	TLBER, TLFADR0 and TLFADR1 are rewritten on every node that is
+ *	present; TLESR0-3 are rewritten only when TLBER reports one of
+ *	UDE/CWDE/CRDE.  CPU nodes additionally get TLEPAERR, TLEPDERR and
+ *	TLEPMERR rewritten.  The KFT_* registers of the remaining (non-CPU,
+ *	non-memory) nodes are rewritten only when `cpuonly' is zero.
+ */
+static void
+clear_tlsb_ebits(cpuonly)
+	int cpuonly;
+{
+	int node;
+	u_int32_t tldev;
+
+	for (node = 0; node <= TLSB_NODE_MAX; ++node) {
+		if ((tlsb_found & (1 << node)) == 0)
+			continue;
+		tldev = TLSB_GET_NODEREG(node, TLDEV);
+		if (tldev == 0) {
+			/* "cannot happen" */
+			continue;
+		}
+		/*
+		 * Registers to clear for all nodes.  The syndrome
+		 * registers are rewritten before TLBER itself.
+		 */
+		if (TLSB_GET_NODEREG(node, TLBER) &
+		    (TLBER_UDE|TLBER_CWDE|TLBER_CRDE)) {
+			TLSB_PUT_NODEREG(node, TLESR0,
+			    TLSB_GET_NODEREG(node, TLESR0));
+			TLSB_PUT_NODEREG(node, TLESR1,
+			    TLSB_GET_NODEREG(node, TLESR1));
+			TLSB_PUT_NODEREG(node, TLESR2,
+			    TLSB_GET_NODEREG(node, TLESR2));
+			TLSB_PUT_NODEREG(node, TLESR3,
+			    TLSB_GET_NODEREG(node, TLESR3));
+		}
+		TLSB_PUT_NODEREG(node, TLBER,
+		    TLSB_GET_NODEREG(node, TLBER));
+		TLSB_PUT_NODEREG(node, TLFADR0,
+		    TLSB_GET_NODEREG(node, TLFADR0));
+		TLSB_PUT_NODEREG(node, TLFADR1,
+		    TLSB_GET_NODEREG(node, TLFADR1));
+
+		/*
+		 * CPU nodes: rewrite the three TLEP error registers,
+		 * then move on to the next node.
+		 */
+		if (TLDEV_ISCPU(tldev)) {
+			TLSB_PUT_NODEREG(node, TLEPAERR,
+			    TLSB_GET_NODEREG(node, TLEPAERR));
+			TLSB_PUT_NODEREG(node, TLEPDERR,
+			    TLSB_GET_NODEREG(node, TLEPDERR));
+			TLSB_PUT_NODEREG(node, TLEPMERR,
+			    TLSB_GET_NODEREG(node, TLEPMERR));
+			continue;
+		}
+		/*
+		 * If we're only doing CPU nodes, or this was a memory
+		 * node, we're done. Onwards.
+		 */
+		if (cpuonly || TLDEV_ISMEM(tldev)) {
+			continue;
+		}
+
+		/*
+		 * What is left is neither a CPU nor a memory node.
+		 */
+		TLSB_PUT_NODEREG(node, KFT_ICCNSE,
+		    TLSB_GET_NODEREG(node, KFT_ICCNSE));
+		TLSB_PUT_NODEREG(node, KFT_IDPNSE0,
+		    TLSB_GET_NODEREG(node, KFT_IDPNSE0));
+		TLSB_PUT_NODEREG(node, KFT_IDPNSE1,
+		    TLSB_GET_NODEREG(node, KFT_IDPNSE1));
+		if (TLDEV_DTYPE(tldev) == TLDEV_DTYPE_KFTHA) {
+			TLSB_PUT_NODEREG(node, KFT_IDPNSE2,
+			    TLSB_GET_NODEREG(node, KFT_IDPNSE2));
+			TLSB_PUT_NODEREG(node, KFT_IDPNSE3,
+			    TLSB_GET_NODEREG(node, KFT_IDPNSE3));
+		}
+		/*
+		 * Digital Unix clears the Mailbox Transaction Register
+		 * here. I don't think we should because we aren't using
+		 * mailboxes yet, and the tech manual makes dire warnings
+		 * about *not* rewriting this register.
+		 */
+	}
+}
+
+/*
+ * Format for one "name = value" line of a register dump.  The value is
+ * consumed as an unsigned long, so callers must pass a long-sized
+ * argument.  Flag and width come before the length modifier (%016lx);
+ * the previous "%l016x" ordering is not a valid standard conversion.
+ */
+static const char *fmt1 = " %-25s = 0x%016lx\n";
+
+/*
+ * System Corrected Errors.
+ *
+ * Dumps the logout area at `logout', which is read as an mc_hdr_ev5
+ * header followed by an mc_cc_ev5 and then a struct tlsb_mchk_fatal,
+ * clears the TLSB error bits on all nodes and finally rewrites `mces'
+ * through alpha_pal_wrmces().  `type' and `framep' are not used.
+ */
+void
+kn8ae_harderr(mces, type, logout, framep)
+	unsigned long mces;
+	unsigned long type;
+	unsigned long logout;
+	struct trapframe *framep;
+{
+	int whami;
+	/* unsigned long so that they match the conversion in fmt1 */
+	unsigned long cpuwerr, dof_cnt;
+	mc_hdr_ev5 *hdr;
+	mc_cc_ev5 *mptr;
+	struct tlsb_mchk_fatal *ptr;
+
+	hdr = (mc_hdr_ev5 *) logout;
+	mptr = (mc_cc_ev5 *) (logout + sizeof (*hdr));
+	ptr = (struct tlsb_mchk_fatal *)
+	    (logout + sizeof (*hdr) + sizeof (*mptr));
+	whami = alpha_pal_whami();
+
+	printf("kn8ae: CPU ID %d system correctable error\n", whami);
+
+	printf(" Machine Check Code 0x%lx\n", hdr->mcheck_code);
+	printf(fmt1, "EI Status", mptr->ei_stat);
+	printf(fmt1, "EI Address", mptr->ei_addr);
+	printf(fmt1, "Fill Syndrome", mptr->fill_syndrome);
+	printf(fmt1, "Interrupt Status Reg.", mptr->isr);
+	printf("\n");
+
+	/* Upper 32 bits and low 16 bits of the reserved header word. */
+	dof_cnt = (ptr->rsvdheader & 0xffffffff00000000UL) >> 32;
+	cpuwerr = ptr->rsvdheader & 0xffff;
+
+	printf(fmt1, "CPU W/Error.", cpuwerr);
+	printf(fmt1, "DOF Count.", dof_cnt);
+	printf(fmt1, "TLDEV", ptr->tldev);
+	printf(fmt1, "TLSB Bus Error", ptr->tlber);
+	printf(fmt1, "TLSB CNR", ptr->tlcnr);
+	printf(fmt1, "TLSB VID", ptr->tlvid);
+	printf(fmt1, "TLSB Error Syndrome 0", ptr->tlesr0);
+	printf(fmt1, "TLSB Error Syndrome 1", ptr->tlesr1);
+	printf(fmt1, "TLSB Error Syndrome 2", ptr->tlesr2);
+	printf(fmt1, "TLSB Error Syndrome 3", ptr->tlesr3);
+	printf(fmt1, "TLSB LEP_AERR", ptr->tlepaerr);
+	printf(fmt1, "TLSB MODCONF", ptr->tlmodconfig);
+	printf(fmt1, "TLSB LEP_MERR", ptr->tlepmerr);
+	printf(fmt1, "TLSB LEP_DERR", ptr->tlepderr);
+	printf(fmt1, "TLSB INTRMASK0", ptr->tlintrmask0);
+	printf(fmt1, "TLSB INTRMASK1", ptr->tlintrmask1);
+	printf(fmt1, "TLSB INTRSUM0", ptr->tlintrsum0);
+	printf(fmt1, "TLSB INTRSUM1", ptr->tlintrsum1);
+	printf(fmt1, "TLSB VMG", ptr->tlep_vmg);
+
+	/* CLEAN UP */
+	/*
+	 * Here's what Digital Unix says to do-
+	 *
+	 * 1. Log the ECC error that got us here
+	 *
+	 * 2. Turn off error reporting
+	 *
+	 * 3. Attempt to have CPU read bad memory location (specified by the
+	 *    tlfadr reg of the TIOP or TMEM (depending on type of error,
+	 *    see upcoming code branches) and write data back to location.
+	 *
+	 * 4. When the CPU attempts to read the location, another 620 interrupt
+	 *    should occur for the cpu at which instant PAL will scrub the
+	 *    location. Then the o.s. scrub routine finishes. If the PAL scrubs
+	 *    the location then the scrubbed flag should be 0 (this is what we
+	 *    expect).
+	 *
+	 *    If it's a 1 then the alpha_scrub_long routine did the scrub.
+	 *
+	 * 5. We re-enable correctable error logging and continue
+	 *
+	 * None of the above is implemented yet; we only warn and clear.
+	 */
+	/* Newline-terminated so later console output starts a fresh line. */
+	printf("WARNING THIS IS NOT DONE YET YOU MAY GET DATA CORRUPTION\n");
+	clear_tlsb_ebits(0);
+	/*
+	 * Clear error by rewriting register.
+	 */
+	alpha_pal_wrmces(mces);
+}
+
+/*
+ * Processor Corrected Errors- BCACHE ECC errors.
+ */
+
+/*
+ * kn8ae_softerr:
+ *
+ *	Dumps the logout area at `logout', which is read as an mc_hdr_ev5
+ *	header followed by an mc_cc_ev5 and then a struct tlsb_mchk_soft,
+ *	calls clear_tlsb_ebits(1) and finally rewrites `mces' through
+ *	alpha_pal_wrmces().  `type' and `framep' are not used.
+ */
+static void
+kn8ae_softerr(mces, type, logout, framep)
+	unsigned long mces;
+	unsigned long type;
+	unsigned long logout;
+	struct trapframe *framep;
+{
+	int whami;
+	/* unsigned long so that they match the long conversion in fmt1 */
+	unsigned long cpuwerr, dof_cnt;
+	mc_hdr_ev5 *hdr;
+	mc_cc_ev5 *mptr;
+	struct tlsb_mchk_soft *ptr;
+
+	hdr = (mc_hdr_ev5 *) logout;
+	mptr = (mc_cc_ev5 *) (logout + sizeof (*hdr));
+	ptr = (struct tlsb_mchk_soft *)
+	    (logout + sizeof (*hdr) + sizeof (*mptr));
+	whami = alpha_pal_whami();
+
+	printf("kn8ae: CPU ID %d processor correctable error\n", whami);
+	printf(" Machine Check Code 0x%lx\n", hdr->mcheck_code);
+	printf(fmt1, "EI Status", mptr->ei_stat);
+	printf(fmt1, "EI Address", mptr->ei_addr);
+	printf(fmt1, "Fill Syndrome", mptr->fill_syndrome);
+	printf(fmt1, "Interrupt Status Reg.", mptr->isr);
+	printf("\n");
+
+	/* Upper 32 bits and low 16 bits of the reserved header word. */
+	dof_cnt = (ptr->rsvdheader & 0xffffffff00000000UL) >> 32;
+	cpuwerr = ptr->rsvdheader & 0xffff;
+
+	printf(fmt1, "CPU W/Error.", cpuwerr);
+	printf(fmt1, "DOF Count.", dof_cnt);
+	printf(fmt1, "TLDEV", ptr->tldev);
+	printf(fmt1, "TLSB Bus Error", ptr->tlber);
+	printf(fmt1, "TLSB Error Syndrome 0", ptr->tlesr0);
+	printf(fmt1, "TLSB Error Syndrome 1", ptr->tlesr1);
+	printf(fmt1, "TLSB Error Syndrome 2", ptr->tlesr2);
+	printf(fmt1, "TLSB Error Syndrome 3", ptr->tlesr3);
+
+	/*
+	 * Clear TLSB bits on all CPU TLSB nodes.
+	 */
+	clear_tlsb_ebits(1);
+
+	/*
+	 * Clear error by rewriting register.
+	 */
+	alpha_pal_wrmces(mces);
+}
+
+/*
+ * KN8AE specific machine check handler
+ *
+ * An expected machine check (mc_expected set for this CPU) is passed
+ * straight to machine_check().  Otherwise the EV5 logout frame is
+ * printed; for ALPHA_SYS_MCHECK the struct tlsb_mchk_fatal that follows
+ * the mc_hdr_ev5 and mc_uc_ev5 in the logout area is decoded as well.
+ * The TLSB error bits are then cleared on all nodes and the check is
+ * handed to machine_check() for final disposition.
+ */
+
+void
+kn8ae_mcheck(mces, type, logout, framep)
+	unsigned long mces;
+	unsigned long type;
+	unsigned long logout;
+	struct trapframe *framep;
+{
+	struct mchkinfo *mcp;
+	struct tlsb_mchk_fatal *ptr;
+	mc_hdr_ev5 *hdr;
+	mc_uc_ev5 *mptr;
+
+	/*
+	 * If we expected a machine check, just go handle it in common code.
+	 */
+	mcp = &mchkinfo[alpha_pal_whami()];
+	if (mcp->mc_expected) {
+		machine_check(mces, framep, type, logout);
+		return;
+	}
+
+	hdr = (mc_hdr_ev5 *) logout;
+	mptr = (mc_uc_ev5 *) (logout + sizeof (*hdr));
+
+	/*
+	 * If detected by the system, the logout area carries some TLASER
+	 * registers.  A processor machine check doesn't have information
+	 * with it about any TLSB related failures, so ptr stays NULL.
+	 *
+	 * TODO: a WSPC read error or a WSPC IBOX timeout (the two
+	 * conditions tested below) should eventually trigger a dump of
+	 * the DWLPX registers too.
+	 */
+	ptr = NULL;
+	if (type == ALPHA_SYS_MCHECK) {
+		ptr = (struct tlsb_mchk_fatal *)
+		    (logout + sizeof (*hdr) + sizeof (*mptr));
+	}
+
+	/*
+	 * Now we can finally print some stuff...
+	 */
+	ev5_logout_print(hdr, mptr);
+	if (ptr != NULL) {
+		if (ptr->tlepaerr & TLEPAERR_WSPC_RD) {
+			printf("\tWSPC READ error\n");
+		}
+		if ((ptr->tlepaerr & TLEPAERR_IBOX_TMO) &&
+		    (mptr->ic_perr_stat & EV5_IC_PERR_IBOXTMO) &&
+		    (ptr->tlepderr & TLEPDERR_GBTMO)) {
+			printf ("\tWSPC IBOX timeout detected\n");
+		}
+#ifdef	DIAGNOSTIC
+		printf(fmt1, "TLDEV", ptr->tldev);
+		printf(fmt1, "TLSB Bus Error", ptr->tlber);
+		printf(fmt1, "TLSB CNR", ptr->tlcnr);
+		printf(fmt1, "TLSB VID", ptr->tlvid);
+		printf(fmt1, "TLSB Error Syndrome 0", ptr->tlesr0);
+		printf(fmt1, "TLSB Error Syndrome 1", ptr->tlesr1);
+		printf(fmt1, "TLSB Error Syndrome 2", ptr->tlesr2);
+		printf(fmt1, "TLSB Error Syndrome 3", ptr->tlesr3);
+		printf(fmt1, "TLSB LEP_AERR", ptr->tlepaerr);
+		printf(fmt1, "TLSB MODCONF", ptr->tlmodconfig);
+		printf(fmt1, "TLSB LEP_MERR", ptr->tlepmerr);
+		printf(fmt1, "TLSB LEP_DERR", ptr->tlepderr);
+		printf(fmt1, "TLSB INTRMASK0", ptr->tlintrmask0);
+		printf(fmt1, "TLSB INTRMASK1", ptr->tlintrmask1);
+		printf(fmt1, "TLSB INTRSUM0", ptr->tlintrsum0);
+		printf(fmt1, "TLSB INTRSUM1", ptr->tlintrsum1);
+		printf(fmt1, "TLSB VMG", ptr->tlep_vmg);
+#endif
+	}
+
+	/*
+	 * Now that we've printed all sorts of useful information
+	 * and have decided that we really can't do any more to
+	 * respond to the error, go on to the common code for
+	 * final disposition. Usually this means that we die.
+	 */
+	clear_tlsb_ebits(0);
+
+	machine_check(mces, framep, type, logout);
+}
+
+/*
+ * dec_kn8ae_mcheck_handler:
+ *
+ *	Dispatch on `vector': ALPHA_SYS_ERROR goes to kn8ae_harderr(),
+ *	ALPHA_PROC_ERROR to kn8ae_softerr(), and both ALPHA_SYS_MCHECK
+ *	and ALPHA_PROC_MCHECK to kn8ae_mcheck().  Any other vector is
+ *	reported and passed to the common machine_check() code.
+ *	`param' is handed on to the handlers as the logout area address.
+ */
+static void
+dec_kn8ae_mcheck_handler(mces, framep, vector, param)
+	unsigned long mces;
+	struct trapframe *framep;
+	unsigned long vector;
+	unsigned long param;
+{
+	switch (vector) {
+	case ALPHA_SYS_ERROR:
+		kn8ae_harderr(mces, vector, param, framep);
+		break;
+
+	case ALPHA_PROC_ERROR:
+		kn8ae_softerr(mces, vector, param, framep);
+		break;
+
+	case ALPHA_SYS_MCHECK:
+	case ALPHA_PROC_MCHECK:
+		kn8ae_mcheck(mces, vector, param, framep);
+		break;
+	default:
+		/* vector is an unsigned long, hence %lx rather than %x */
+		printf("KN8AE_MCHECK: unknown check vector %lx\n", vector);
+		machine_check(mces, framep, vector, param);
+		break;
+	}
+}