1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decode callbacks; allocated and populated in mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask applied to the extended error code, (status >> 16), by the bank decoders. */
8 static u8 xec_mask = 0xf;
/* Mask applied to the upper status word when amd_decode_nb_mce() reports the core. */
9 static u8 nb_err_cpumask = 0xf;
/* When false, amd_filter_mce() matches bank 4 / xec 0x5 signatures. */
11 static bool report_gart_errors;
/* Optional callback invoked from amd_decode_nb_mce(); cleared on unregister. */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * amd_report_gart_errors - set the report_gart_errors flag.
 * @v: new value of the flag consulted by amd_filter_mce().
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * amd_register_ecc_decoder - register an ECC decoder callback.
 * @f: callback taking (node id, struct mce *, nbcfg).
 *
 * NOTE(review): the function body is not visible in this listing; presumably
 * it stores @f in nb_bus_decoder - confirm against the full source.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * amd_unregister_ecc_decoder - drop the registered ECC decoder callback.
 * @f: callback the caller believes is currently registered.
 *
 * Warns when @f differs from the stored nb_bus_decoder and resets
 * nb_bus_decoder to NULL. NOTE(review): the condition guarding these two
 * statements is not visible in this listing.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
/* A mismatch means the caller is unregistering a decoder it did not install. */
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
/*
 * Exported string tables for the fields of an MCA error code. They are
 * presumably indexed through the TT_MSG()/LL_MSG()/RRRR_MSG()/PP_MSG()/
 * TO_MSG()/II_MSG() helpers used further down; those macros are defined
 * outside this file view - confirm.
 */
41 /* transaction type */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout state */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/* memory or I/O classification of the transaction */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Northbridge error descriptions printed by f10h_nb_mce(), which indexes
 * this table with (xec - offset). NOTE(review): the first entry of the
 * table is not visible in this listing.
 */
67 static const char *f10h_nb_mce_desc[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction cache error descriptions for f15h_ic_mce(). That function
 * indexes the table with xec, xec - 2 or xec - 4 depending on the xec range,
 * so the table is dense even though the xec encoding has gaps.
 * NOTE(review): entries after "patch RAM" are not visible in this listing.
 */
78 static const char * const f15h_ic_mce_desc[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 0x10 */
/*
 * Combined unit error descriptions for amd_decode_cu_mce(), which indexes
 * the table with (xec - 0x4) or (xec - 0x7) depending on the xec range.
 * NOTE(review): some entries of the table are not visible in this listing.
 */
98 static const char * const f15h_cu_mce_desc[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * Descriptions printed by amd_decode_fr_mce(), indexed directly by xec.
 * NOTE(review): some entries of the table are not visible in this listing.
 */
115 static const char * const fr_ex_mce_desc[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
/*
 * f12h_dc_mce - data cache decoder installed as fam_ops->dc_mce by
 * mce_amd_init() and also used as the fallback of f10h_dc_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Continues the already started log line with pr_cont(). The return value is
 * tested by amd_decode_dc_mce(); presumably true means "decoded" - confirm.
 * NOTE(review): the first branch condition and the return statements are not
 * visible in this listing.
 */
131 static bool f12h_dc_mce(u16 ec, u8 xec)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll == LL_L1)
142 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
/*
 * f10h_dc_mce - data cache decoder layered on top of f12h_dc_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Reports a data scrub error for a generic (R4_GEN) L1 transaction; the
 * visible tail of the function hands everything else to f12h_dc_mce().
 * NOTE(review): the declaration of ll and the return after the scrub message
 * are not visible in this listing.
 */
149 static bool f10h_dc_mce(u16 ec, u8 xec)
/* Bits 7:4 of the error code. */
151 u8 r4 = (ec >> 4) & 0xf;
154 if (r4 == R4_GEN && ll == LL_L1) {
155 pr_cont("during data scrub.\n");
158 return f12h_dc_mce(ec, xec);
/*
 * k8_dc_mce - data cache decoder layered on top of f10h_dc_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Prints "during system linefill." for one signature and delegates to
 * f10h_dc_mce(). NOTE(review): the condition selecting the linefill message
 * is not visible in this listing.
 */
161 static bool k8_dc_mce(u16 ec, u8 xec)
164 pr_cont("during system linefill.\n");
168 return f10h_dc_mce(ec, xec);
/*
 * f14h_dc_mce - data cache decoder installed together with f14h_ic_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Splits the error code into its r4 (bits 7:4) and tt (bits 3:2) fields and
 * prints a description for the memory and bus error signatures it
 * recognizes. NOTE(review): the declarations of ll/ii, the switch/case
 * structure and the return statements are not visible in this listing.
 */
171 static bool f14h_dc_mce(u16 ec, u8 xec)
173 u8 r4 = (ec >> 4) & 0xf;
175 u8 tt = (ec >> 2) & 0x3;
/* Only data transactions at cache level L1 pass this check. */
181 if (tt != TT_DATA || ll != LL_L1)
187 pr_cont("Data/Tag parity error due to %s.\n",
188 (r4 == R4_DRD ? "load/hw prf" : "store"));
191 pr_cont("Copyback parity error on a tag miss.\n");
194 pr_cont("Tag parity error during snoop.\n");
199 } else if (BUS_ERROR(ec)) {
/* Bus errors must be MEM or IO transactions at level LL_LG to pass this check. */
201 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
204 pr_cont("System read data error on a ");
208 pr_cont("TLB reload.\n");
/*
 * f15h_dc_mce - data cache decoder installed together with f15h_ic_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Prints one of several data cache error descriptions, plus the bus error
 * and internal livelock/deadlock cases. NOTE(review): the conditions/case
 * labels selecting each message and the return statements are not visible
 * in this listing.
 */
226 static bool f15h_dc_mce(u16 ec, u8 xec)
234 pr_cont("Data Array access error.\n");
238 pr_cont("UC error during a linefill from L2/NB.\n");
243 pr_cont("STQ access error.\n");
247 pr_cont("SCB access error.\n");
251 pr_cont("Tag error.\n");
255 pr_cont("LDQ access error.\n");
261 } else if (BUS_ERROR(ec)) {
264 pr_cont("during system linefill.\n");
/* xec == 1 is reported as a livelock, any other value reaching here as a deadlock. */
266 pr_cont(" Internal %s condition.\n",
267 ((xec == 1) ? "livelock" : "deadlock"));
/*
 * amd_decode_dc_mce - decode a data cache MCE.
 * @m: machine check record; only m->status is read here.
 *
 * Starts the "Data Cache Error: " log line, decodes TLB errors itself and
 * passes the remaining signatures to the family specific fam_ops->dc_mce()
 * callback. The "Corrupted DC MCE info?" message is the visible error path.
 * NOTE(review): the TLB branch condition and the return statements are not
 * visible in this listing.
 */
274 static void amd_decode_dc_mce(struct mce *m)
/* Low 16 bits: error code; bits from 16 up, limited by xec_mask: extended code. */
276 u16 ec = m->status & 0xffff;
277 u8 xec = (m->status >> 16) & xec_mask;
279 pr_emerg(HW_ERR "Data Cache Error: ");
281 /* TLB error signatures are the same across families */
283 u8 tt = (ec >> 2) & 0x3;
/* xec 2 -> "locked miss", any other non-zero xec -> "multimatch", 0 -> "parity". */
286 pr_cont("%s TLB %s.\n", LL_MSG(ec),
287 ((xec == 2) ? "locked miss"
288 : (xec ? "multimatch" : "parity")));
291 } else if (fam_ops->dc_mce(ec, xec))
294 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction cache decoder installed as fam_ops->ic_mce for
 * several of the configurations set up in mce_amd_init().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Prints the linefill message or, for ll == 0x1, one of the parity/copyback/
 * snoop messages. NOTE(review): the declaration of ll, the conditions/case
 * labels selecting each message and the return statements are not visible
 * in this listing.
 */
297 static bool k8_ic_mce(u16 ec, u8 xec)
300 u8 r4 = (ec >> 4) & 0xf;
307 pr_cont("during a linefill from L2.\n");
308 else if (ll == 0x1) {
311 pr_cont("Parity error during data load.\n");
315 pr_cont("Copyback Parity/Victim error.\n");
319 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction cache decoder installed together with
 * f14h_dc_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Extracts tt (bits 3:2) and r4 (bits 7:4) from the error code and prints a
 * tag hit or snoop/victimization message. NOTE(review): the declaration of
 * ll, the first r4 condition and the return statements are not visible in
 * this listing.
 */
332 static bool f14h_ic_mce(u16 ec, u8 xec)
335 u8 tt = (ec >> 2) & 0x3;
336 u8 r4 = (ec >> 4) & 0xf;
/* Only tt == 0 with ll == 1 passes this check. */
340 if (tt != 0 || ll != 1)
344 pr_cont("Data/tag array parity error for a tag hit.\n");
345 else if (r4 == R4_SNOOP)
346 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_ic_mce - instruction cache decoder installed together with
 * f15h_dc_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Looks the description up in f15h_ic_mce_desc[], using index xec, xec - 2
 * or xec - 4; the last form is printed as "Decoder <desc> parity error.".
 * NOTE(review): the xec range checks choosing between the three forms and
 * the return statements are not visible in this listing.
 */
353 static bool f15h_ic_mce(u16 ec, u8 xec)
362 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
366 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
370 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * amd_decode_ic_mce - decode an instruction cache MCE.
 * @m: machine check record; only m->status is read here.
 *
 * Starts the "Instruction Cache Error: " log line, decodes TLB and bus
 * errors itself and passes the remaining signatures to fam_ops->ic_mce().
 * The "Corrupted IC MCE info?" message is the visible error path.
 * NOTE(review): the TLB branch condition and the return statements are not
 * visible in this listing.
 */
379 static void amd_decode_ic_mce(struct mce *m)
381 u16 ec = m->status & 0xffff;
382 u8 xec = (m->status >> 16) & xec_mask;
384 pr_emerg(HW_ERR "Instruction Cache Error: ");
387 pr_cont("%s TLB %s.\n", LL_MSG(ec),
388 (xec ? "multimatch" : "parity error"));
389 else if (BUS_ERROR(ec)) {
/* Family 0xf with status bit 58 set is reported as a system linefill. */
390 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
392 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
393 } else if (fam_ops->ic_mce(ec, xec))
396 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - decode a bus unit MCE.
 * @m: machine check record; only m->status is read here.
 *
 * Starts the "Bus Unit Error" log line and appends a description selected
 * by xec and by the error class (TLB/bus/memory) of the error code. The
 * "Corrupted BU MCE info?" message is the visible error path.
 * NOTE(review): several branch conditions and the return statements are not
 * visible in this listing. Unlike the sibling decoders, ec and xec are held
 * in u32 locals here.
 */
399 static void amd_decode_bu_mce(struct mce *m)
401 u32 ec = m->status & 0xffff;
402 u32 xec = (m->status >> 16) & xec_mask;
404 pr_emerg(HW_ERR "Bus Unit Error");
407 pr_cont(" in the write data buffers.\n");
409 pr_cont(" in the victim data buffers.\n");
410 else if (xec == 0x2 && MEM_ERROR(ec))
411 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
/* xec == 0 is split further by the class of the error code. */
412 else if (xec == 0x0) {
414 pr_cont(": %s error in a Page Descriptor Cache or "
415 "Guest TLB.\n", TT_MSG(ec));
416 else if (BUS_ERROR(ec))
417 pr_cont(": %s/ECC error in data read from NB: %s.\n",
418 RRRR_MSG(ec), PP_MSG(ec));
419 else if (MEM_ERROR(ec)) {
/* Bits 7:4 of the error code. */
420 u8 rrrr = (ec >> 4) & 0xf;
423 pr_cont(": %s error during data copyback.\n",
425 else if (rrrr <= 0x1)
426 pr_cont(": %s parity/ECC error during data "
427 "access from L2.\n", RRRR_MSG(ec));
438 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * amd_decode_cu_mce - decode a combined unit MCE; amd_decode_mce() calls it
 * instead of amd_decode_bu_mce() on family 0x15.
 * @m: machine check record; only m->status is read here.
 *
 * Starts the "Combined Unit Error: " log line and appends a TLB, bus or
 * memory error description; memory errors are looked up in
 * f15h_cu_mce_desc[] with index xec - 0x4 or xec - 0x7. The "Corrupted CU
 * MCE info?" message is the visible error path.
 * NOTE(review): the xec checks selecting each message and the return
 * statements are not visible in this listing.
 */
441 static void amd_decode_cu_mce(struct mce *m)
443 u16 ec = m->status & 0xffff;
444 u8 xec = (m->status >> 16) & xec_mask;
446 pr_emerg(HW_ERR "Combined Unit Error: ");
450 pr_cont("Data parity TLB read error.\n");
452 pr_cont("Poison data provided for TLB fill.\n");
455 } else if (BUS_ERROR(ec)) {
459 pr_cont("Error during attempted NB data read.\n");
460 } else if (MEM_ERROR(ec)) {
463 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
467 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
478 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - decode a load/store unit MCE.
 * @m: machine check record; only m->status is read here.
 *
 * On family 0x14 and later a notice is logged that such an MCE is not
 * expected. Otherwise the "Load Store Error" line is started and completed
 * with the memory transaction type. The "Corrupted LS MCE info?" message is
 * the visible error path.
 * NOTE(review): the statement following the family notice, the xec check
 * around the r4 block and the return/goto statements are not visible in
 * this listing.
 */
481 static void amd_decode_ls_mce(struct mce *m)
483 u16 ec = m->status & 0xffff;
484 u8 xec = (m->status >> 16) & xec_mask;
486 if (boot_cpu_data.x86 >= 0x14) {
487 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
488 " please report on LKML.\n");
492 pr_emerg(HW_ERR "Load Store Error");
494 u8 r4 = (ec >> 4) & 0xf;
/* Only bus errors with a DRD or DWR transaction pass this check. */
497 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
500 pr_cont(" during %s.\n", RRRR_MSG(ec));
507 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * k8_nb_mce - northbridge decoder; installed as fam_ops->nb_mce and also
 * called first by f10h_nb_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Prints the description of the northbridge signatures it knows about.
 * NOTE(review): the switch on xec, its case labels, the statement guarded by
 * the family 0x11 check and the return statements are not visible in this
 * listing.
 */
510 static bool k8_nb_mce(u16 ec, u8 xec)
516 pr_cont("CRC error detected on HT link.\n");
520 pr_cont("Invalid GART PTE entry during GART table walk.\n");
524 pr_cont("Unsupported atomic RMW received from an IO link.\n");
529 if (boot_cpu_data.x86 == 0x11)
532 pr_cont("DRAM ECC error detected on the NB.\n");
536 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h_nb_mce - northbridge decoder that extends k8_nb_mce().
 * @ec:  error code (low 16 bits of the MCi status).
 * @xec: extended error code.
 *
 * Tries k8_nb_mce() first, then handles further signatures itself; the
 * generic ones are printed from f10h_nb_mce_desc[xec - offset].
 * NOTE(review): the switch on xec, the computation of offset and the return
 * statements are not visible in this listing.
 */
547 static bool f10h_nb_mce(u16 ec, u8 xec)
552 if (k8_nb_mce(ec, xec))
566 pr_cont("GART Table Walk data error.\n");
567 else if (BUS_ERROR(ec))
568 pr_cont("DMA Exclusion Vector Table Walk error.\n");
576 if (boot_cpu_data.x86 == 0x15)
577 pr_cont("Compute Unit Data Error.\n");
595 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * nb_noop_mce - placeholder northbridge decoder; mce_amd_init() installs it
 * as fam_ops->nb_mce in two of its configurations.
 * NOTE(review): the body is not visible in this listing; the name suggests
 * it performs no decoding - confirm the value it returns.
 */
601 static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * amd_decode_nb_mce - decode a northbridge MCE.
 * @node_id: node number printed in the log line and passed to the ECC decoder.
 * @m:       machine check record.
 * @nbcfg:   value forwarded unchanged to the registered nb_bus_decoder.
 *
 * Starts the "Northbridge Error, node N: " log line, optionally reports the
 * associated core, prints the signatures handled here and passes the rest
 * to fam_ops->nb_mce(). On families 0xf and 0x10, xec values 0x8 and 0x0 are
 * additionally handed to nb_bus_decoder when one is registered. The
 * "Corrupted NB MCE info?" message is the visible error path.
 * NOTE(review): the switch on xec, its case labels and the return/goto
 * statements are not visible in this listing.
 */
606 void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
/* Uses a fixed 5-bit mask here rather than xec_mask. */
608 u8 xec = (m->status >> 16) & 0x1f;
609 u16 ec = m->status & 0xffff;
/* Upper 32 bits of the status word. */
610 u32 nbsh = (u32)(m->status >> 32);
612 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
615 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
616 * value encoding has changed so interpret those differently
618 if ((boot_cpu_data.x86 == 0x10) &&
619 (boot_cpu_data.x86_model > 7)) {
620 if (nbsh & K8_NBSH_ERR_CPU_VAL)
621 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
623 u8 assoc_cpus = nbsh & nb_err_cpumask;
/* Reports the index of the highest set bit of the masked value. */
626 pr_cont(", core: %d", fls(assoc_cpus) - 1);
631 pr_cont("Sync error (sync packets on HT link detected).\n");
635 pr_cont("HT Master abort.\n");
639 pr_cont("HT Target abort.\n");
643 pr_cont("NB Watchdog timeout.\n");
647 pr_cont("SVM DMA Exclusion Vector error.\n");
654 if (!fam_ops->nb_mce(ec, xec))
657 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
658 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
659 nb_bus_decoder(node_id, m, nbcfg);
664 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
666 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * amd_decode_fr_mce - decode an MCE reported as "Execution Unit" on family
 * 0x15 and as "FIROB" on the other families.
 * @m: machine check record; only m->status is read here.
 *
 * Families 0xf and 0x11 are special-cased first, and on families other than
 * 0x15 only xec == 0 gets past the second check. The description comes from
 * fr_ex_mce_desc[xec]; for xec 0x0 and 0xc it is printed as is, on the other
 * visible path " parity error" is appended. The "Corrupted FR MCE info?"
 * message is the visible error path.
 * NOTE(review): the statements guarded by the two family checks, the
 * condition in front of the second pr_cont() and the return statements are
 * not visible in this listing.
 */
668 static void amd_decode_fr_mce(struct mce *m)
670 struct cpuinfo_x86 *c = &boot_cpu_data;
671 u8 xec = (m->status >> 16) & xec_mask;
673 if (c->x86 == 0xf || c->x86 == 0x11)
676 if (c->x86 != 0x15 && xec != 0x0)
679 pr_emerg(HW_ERR "%s Error: ",
680 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
682 if (xec == 0x0 || xec == 0xc)
683 pr_cont("%s.\n", fr_ex_mce_desc[xec]);
685 pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
692 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * amd_decode_err_code - print the generic fields of an MCA error code.
 * @ec: error code (low 16 bits of the MCi status).
 *
 * Emits one line whose fields depend on the error class: transaction type
 * and cache level on the first path, additionally the memory transaction
 * type for memory errors, and the bus error fields (memory/IO, timeout,
 * cache level, participating processor) for bus errors. Codes matching no
 * class are logged as unknown, in hex.
 * NOTE(review): the condition of the first branch is not visible in this
 * listing; presumably it is the TLB error check - confirm.
 */
695 static inline void amd_decode_err_code(u16 ec)
698 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
699 TT_MSG(ec), LL_MSG(ec));
700 } else if (MEM_ERROR(ec)) {
701 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
702 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
703 } else if (BUS_ERROR(ec)) {
704 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
705 "Participating Processor: %s\n",
706 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
709 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
713 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - tell amd_decode_mce() whether an MCE should be skipped.
 * @m: machine check record; m->bank and m->status are read.
 *
 * The only visible rule matches bank 4 with extended error code 0x5 while
 * report_gart_errors is false.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably true means "filter this MCE out" - confirm.
 */
715 static bool amd_filter_mce(struct mce *m)
/* Fixed 5-bit mask, as in amd_decode_nb_mce(), rather than xec_mask. */
717 u8 xec = (m->status >> 16) & 0x1f;
720 * NB GART TLB error reporting is disabled by default.
722 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * amd_decode_mce - notifier callback that decodes one MCE into the log.
 * @nb:   notifier block (not referenced in the visible lines).
 * @val:  notifier value (not referenced in the visible lines).
 * @data: pointer to the struct mce to decode.
 *
 * After the amd_filter_mce() check it prints the MCi_STATUS summary
 * (corrected/uncorrected, overflow, context corrupt, ECC kind), calls the
 * decoder for the reporting unit and finishes with the generic error code
 * fields.
 * NOTE(review): the switch on the bank number, its case labels and the
 * return statements (including the one taken when the MCE is filtered) are
 * not visible in this listing.
 */
728 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
730 struct mce *m = (struct mce *)data;
733 if (amd_filter_mce(m))
736 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
738 pr_cont("%sorrected error, other errors lost: %s, "
739 "CPU context corrupt: %s",
740 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
741 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
742 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
744 /* do the two bits[14:13] together (bits 46:45 of the full 64-bit status) */
745 ecc = (m->status >> 45) & 0x3;
/* ecc == 2 is printed as "CECC Error", other values reaching here as "UECC Error". */
747 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
753 amd_decode_dc_mce(m);
757 amd_decode_ic_mce(m);
/* Family 0x15 uses the combined unit decoder in place of the bus unit one. */
761 if (boot_cpu_data.x86 == 0x15)
762 amd_decode_cu_mce(m);
764 amd_decode_bu_mce(m);
768 amd_decode_ls_mce(m);
/* The NB decoder is called with nbcfg == 0 from this path. */
772 node = amd_get_nb_id(m->extcpu);
773 amd_decode_nb_mce(node, m, 0);
777 amd_decode_fr_mce(m);
784 amd_decode_err_code(m->status & 0xffff);
788 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that hooks amd_decode_mce() into x86_mce_decoder_chain;
 * registered in mce_amd_init() and unregistered in mce_amd_exit().
 */
790 static struct notifier_block amd_mce_dec_nb = {
791 .notifier_call = amd_decode_mce,
/*
 * mce_amd_init - set up in-kernel MCE decoding on AMD CPUs.
 *
 * Checks vendor and family, allocates fam_ops, fills in the data cache,
 * instruction cache and northbridge callbacks matching the CPU family and
 * registers amd_mce_dec_nb on x86_mce_decoder_chain. Runs as an
 * early_initcall.
 * NOTE(review): the return statements, the allocation failure check, the
 * case labels of the switch and its default handling are not visible in
 * this listing.
 */
794 static int __init mce_amd_init(void)
796 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
/*
 * NOTE(review): family 0x15 satisfies this condition (0x15 > 0x12 and
 * 0x15 != 0x14). If the statement it guards bails out, the f15h_* handler
 * assignments further down can never be reached - confirm against the full
 * source and extend the condition if so.
 */
799 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
800 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
803 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
807 switch (boot_cpu_data.x86) {
809 fam_ops->dc_mce = k8_dc_mce;
810 fam_ops->ic_mce = k8_ic_mce;
811 fam_ops->nb_mce = k8_nb_mce;
815 fam_ops->dc_mce = f10h_dc_mce;
816 fam_ops->ic_mce = k8_ic_mce;
817 fam_ops->nb_mce = f10h_nb_mce;
821 fam_ops->dc_mce = k8_dc_mce;
822 fam_ops->ic_mce = k8_ic_mce;
823 fam_ops->nb_mce = f10h_nb_mce;
827 fam_ops->dc_mce = f12h_dc_mce;
828 fam_ops->ic_mce = k8_ic_mce;
829 fam_ops->nb_mce = nb_noop_mce;
/* This configuration also narrows the core mask used by amd_decode_nb_mce(). */
833 nb_err_cpumask = 0x3;
834 fam_ops->dc_mce = f14h_dc_mce;
835 fam_ops->ic_mce = f14h_ic_mce;
836 fam_ops->nb_mce = nb_noop_mce;
841 fam_ops->dc_mce = f15h_dc_mce;
842 fam_ops->ic_mce = f15h_ic_mce;
843 fam_ops->nb_mce = f10h_nb_mce;
847 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
853 pr_info("MCE: In-kernel MCE decoding enabled.\n");
855 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
859 early_initcall(mce_amd_init);
/*
 * mce_amd_exit - module exit handler; removes amd_mce_dec_nb from
 * x86_mce_decoder_chain.
 * NOTE(review): any further cleanup (for example releasing fam_ops) is not
 * visible in this listing.
 */
862 static void __exit mce_amd_exit(void)
864 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
/* Module metadata and exit hook; initialization is wired up via early_initcall(). */
868 MODULE_DESCRIPTION("AMD MCE decoder");
869 MODULE_ALIAS("edac-mce-amd");
870 MODULE_LICENSE("GPL");
871 module_exit(mce_amd_exit);