代码拉取完成,页面将自动刷新
From ece0a63689ee54498bb0d372cfb568e431ada5d5 Mon Sep 17 00:00:00 2001
From: Ruidong Tian <tianruidong@linux.alibaba.com>
Date: Mon, 9 Dec 2024 16:28:54 +0800
Subject: [PATCH 77/85] rasdaemon: add rasdaemon json exporter
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
Makefile.am | 5 +-
configure.ac | 16 +++
misc/rasdaemon.env | 2 +
ras-aer-handler.c | 9 ++
ras-arm-handler.c | 4 +
ras-mc-handler.c | 9 ++
ras-mce-handler.c | 5 +
ras-mce-handler.h | 1 +
ras-memory-failure-handler.c | 4 +
ras-record.h | 9 ++
ras-report-json.c | 240 +++++++++++++++++++++++++++++++++++
ras-report.h | 14 ++
rasdaemon.c | 9 +-
13 files changed, 325 insertions(+), 2 deletions(-)
create mode 100644 ras-report-json.c
diff --git a/Makefile.am b/Makefile.am
index fb0248e..2582454 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -54,6 +54,9 @@ endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
+if WITH_JSON_REPORT
+ rasdaemon_SOURCES += ras-report-json.c
+endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c
endif
@@ -70,7 +73,7 @@ if WITH_YITIAN_NS_DECODE
rasdaemon_SOURCES += non-standard-yitian.c
endif
-rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
+rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a $(PCI_LIBS)
include_HEADERS = config.h types.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
diff --git a/configure.ac b/configure.ac
index 135af9c..c8b4ab6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -131,6 +131,21 @@ AS_IF([test "x$enable_abrt_report" = "xyes" || test "x$enable_all" == "xyes"], [
AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_ABRT_REPORT], [USE_ABRT_REPORT="yes"], [USE_ABRT_REPORT="no"])
+# JSON exporter: appends a JSON object for each decoded RAS event to the
+# event output. It links against libpci to resolve PCI vendor/device names.
+AC_ARG_ENABLE([json_report],
+    AS_HELP_STRING([--enable-json-report], [enable exporting RAS events as JSON (requires libpci; currently experimental)]))
+
+AS_IF([test "x$enable_json_report" = "xyes" || test "x$enable_all" == "xyes"], [
+    AC_CHECK_LIB([pci], [pci_lookup_name], [echo "found pci"], [AC_MSG_ERROR([*** Unable to find pci library])])
+    PCI_LIBS="-lpci"
+    AC_DEFINE(HAVE_JSON_REPORT,1,[have JSON report])
+    AC_SUBST([WITH_JSON_REPORT])
+])
+
+AM_CONDITIONAL([WITH_JSON_REPORT], [test x$enable_json_report = xyes || test x$enable_all == xyes])
+AM_COND_IF([WITH_JSON_REPORT], [USE_JSON_REPORT="yes"], [USE_JSON_REPORT="no"])
+
+# Empty unless the JSON exporter is enabled; always substituted because
+# Makefile.am references $(PCI_LIBS) unconditionally.
+AC_SUBST([PCI_LIBS])
+
AC_ARG_ENABLE([hisi_ns_decode],
AS_HELP_STRING([--enable-hisi-ns-decode], [enable HISI_NS_DECODE events (currently experimental)]))
@@ -223,4 +238,5 @@ compile time options summary
AMP RAS errors : $USE_AMP_NS_DECODE
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
YITIAN RAS errors : $USE_YITIAN_NS_DECODE
+ Json exporter : $USE_JSON_REPORT
EOF
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index 67f488f..760a42d 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -45,6 +45,8 @@ CPU_ISOLATION_CYCLE="24h"
# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"
+
+# Features to disable at startup.
+# rasdaemon turns the JSON exporter off when this value contains "json_report".
+# NOTE(review): assumes the DISABLE macro passed to getenv() in rasdaemon.c
+# names this variable - confirm.
+DISABLE=""
# Event Trigger
# Event trigger will be executed when the specified event occurs.
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index b00703e..d0eb4df 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -22,6 +22,8 @@
#include "libtrace/kbuffer.h"
#include "ras-aer-handler.h"
#include "types.h"
+#include "ras-events.h"
+#include "ras-record.h"
#include "ras-logger.h"
#include "bitfield.h"
#include "ras-report.h"
@@ -135,18 +137,22 @@ int ras_aer_event_handler(struct trace_seq *s,
case HW_EVENT_AER_UNCORRECTED_NON_FATAL:
ev.error_type = "Uncorrected (Non-Fatal)";
sel_data[0] = 0xca;
+ ev.severity = GHES_SEV_RECOVERABLE;
break;
case HW_EVENT_AER_UNCORRECTED_FATAL:
ev.error_type = "Uncorrected (Fatal)";
sel_data[0] = 0xca;
+ ev.severity = GHES_SEV_PANIC;
break;
case HW_EVENT_AER_CORRECTED:
ev.error_type = "Corrected";
sel_data[0] = 0xbf;
+ ev.severity = GHES_SEV_CORRECTED;
break;
default:
ev.error_type = "Unknown severity";
sel_data[0] = 0xbf;
+ ev.severity = GHES_SEV_NO;
}
trace_seq_puts(s, ev.error_type);
@@ -184,6 +190,9 @@ int ras_aer_event_handler(struct trace_seq *s,
system(ipmi_add_sel);
#endif
+#ifdef HAVE_JSON_REPORT
+ report_aer_event_json(s, &ev);
+#endif
run_aer_event_trigger(&ev);
return 0;
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
index 19150cb..97ebe21 100644
--- a/ras-arm-handler.c
+++ b/ras-arm-handler.c
@@ -264,5 +264,9 @@ int ras_arm_event_handler(struct trace_seq *s,
ras_report_arm_event(ras, &ev);
#endif
+#ifdef HAVE_JSON_REPORT
+ report_arm_event_json(s, &ev);
+#endif
+
return 0;
}
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
index a270637..bb93c9d 100644
--- a/ras-mc-handler.c
+++ b/ras-mc-handler.c
@@ -29,6 +29,7 @@
#include "ras-page-isolation.h"
#include "types.h"
#include "ras-report.h"
+#include "ras-events.h"
#include "trigger.h"
int ras_mc_event_handler(struct trace_seq *s,
@@ -77,16 +78,20 @@ int ras_mc_event_handler(struct trace_seq *s,
switch (val) {
case HW_EVENT_ERR_CORRECTED:
ev.error_type = "Corrected";
+ ev.severity = GHES_SEV_CORRECTED;
break;
case HW_EVENT_ERR_UNCORRECTED:
ev.error_type = "Uncorrected";
+ ev.severity = GHES_SEV_RECOVERABLE;
break;
case HW_EVENT_ERR_FATAL:
ev.error_type = "Fatal";
+ ev.severity = GHES_SEV_PANIC;
break;
case HW_EVENT_ERR_INFO:
default:
ev.error_type = "Info";
+ ev.severity = GHES_SEV_NO;
}
trace_seq_puts(s, ev.error_type);
@@ -202,6 +207,10 @@ int ras_mc_event_handler(struct trace_seq *s,
run_mc_event_trigger(&ev);
+#ifdef HAVE_JSON_REPORT
+ report_mc_event_json(s, &ev);
+#endif
+
return 0;
parse_error:
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 9601704..ecc6468 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -30,6 +30,7 @@
#include "ras-logger.h"
#include "ras-report.h"
#include "trigger.h"
+#include "ras-events.h"
/*
* The code below were adapted from Andi Kleen/Intel/SuSe mcelog code,
@@ -581,6 +582,10 @@ int ras_mce_event_handler(struct trace_seq *s,
ras_report_mce_event(ras, &e);
#endif
+#ifdef HAVE_JSON_REPORT
+ report_mce_event_json(s, &e);
+#endif
+
run_mce_record_trigger(&e);
return 0;
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
index e1064f6..f0dbdab 100644
--- a/ras-mce-handler.h
+++ b/ras-mce-handler.h
@@ -76,6 +76,7 @@ struct mce_event {
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ int severity;
/* Parsed data */
char timestamp[64];
char bank_name[64];
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
index 8bc7a9d..9cd56b4 100644
--- a/ras-memory-failure-handler.c
+++ b/ras-memory-failure-handler.c
@@ -174,5 +174,9 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
#endif
run_mf_event_trigger(&ev);
+#ifdef HAVE_JSON_REPORT
+ report_mf_event_json(s, &ev);
+#endif
+
return 0;
}
diff --git a/ras-record.h b/ras-record.h
index f1edcc0..f48fe37 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -25,6 +25,13 @@
#include "config.h"
#include "types.h"
+/*
+ * Human-readable severity names, indexed by the `severity` field of the
+ * event structures (the handlers store GHES_SEV_* values there).
+ * NOTE(review): assumes GHES_SEV_NO, GHES_SEV_CORRECTED, GHES_SEV_RECOVERABLE
+ * and GHES_SEV_PANIC are 0..3 in this order - confirm against ras-events.h.
+ * NOTE(review): being `static` in a header, every file that includes it gets
+ * its own copy of the table.
+ */
+static const char * const severity_strs[] = {
+ "info",
+ "corrected",
+ "recoverable",
+ "fatal",
+};
+
extern long user_hz;
struct ras_events;
@@ -32,6 +39,7 @@ struct ras_events;
struct ras_mc_event {
char timestamp[64];
int error_count;
+ int severity;
const char *error_type, *msg, *label;
unsigned char mc_index;
signed char top_layer, middle_layer, lower_layer;
@@ -51,6 +59,7 @@ struct ras_mc_offline_event {
struct ras_aer_event {
char timestamp[64];
const char *error_type;
+ int severity;
const char *dev_name;
uint8_t tlp_header_valid;
uint32_t *tlp_header;
diff --git a/ras-report-json.c b/ras-report-json.c
new file mode 100644
index 0000000..f59ca32
--- /dev/null
+++ b/ras-report-json.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pci/pci.h>
+
+#include "libtrace/event-parse.h"
+#include "ras-report.h"
+
+/* Printed in place of a string field that is NULL or empty */
+#define NONE ""
+/*
+ * Runtime switch for the JSON exporter: every report_*_event_json() function
+ * in this file returns without producing output when this is 0.
+ * Enabled by default.
+ */
+int json_report = 1;
+
+/*
+ * Append a JSON object describing a memory-controller event to @s.
+ *
+ * Does nothing when @s or @ev is NULL or when json_report is 0.
+ * String fields that are NULL (or an empty timestamp) are printed as "".
+ */
+void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev)
+{
+	const char *when, *kind, *text, *dimm, *detail;
+
+	if (!json_report || !s || !ev)
+		return;
+
+	when = ev->timestamp[0] ? ev->timestamp : NONE;
+	kind = ev->error_type ? ev->error_type : NONE;
+	text = ev->msg ? ev->msg : NONE;
+	dimm = ev->label ? ev->label : NONE;
+	detail = ev->driver_detail ? ev->driver_detail : NONE;
+
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"mc_event\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"severity\": \"%s\", "
+			 "\"error_count\": %d, "
+			 "\"error_type\": \"%s\", "
+			 "\"msg\": \"%s\", "
+			 "\"label\": \"%s\", "
+			 "\"location\": \"%d:%d:%d:%d\", "
+			 "\"address\": \"%#llx\", "
+			 "\"grain\": \"%#llx\", "
+			 "\"syndrome\": \"%#llx\", "
+			 "\"driver_detail\": \"%s\" }",
+			 JSON_REPORT_KEY,
+			 when,
+			 severity_strs[ev->severity],
+			 ev->error_count,
+			 kind,
+			 text,
+			 dimm,
+			 /* location: controller index followed by the three layers */
+			 ev->mc_index, ev->top_layer, ev->middle_layer, ev->lower_layer,
+			 ev->address,
+			 ev->grain,
+			 ev->syndrome,
+			 detail);
+}
+
+/*
+ * Look up the vendor/device IDs and the human-readable name of a PCI device.
+ *
+ * @bdf:       device address as "domain:bus:dev.fn", "bus:dev.fn" or
+ *             "domain:bus:dev" (hexadecimal fields); an omitted domain or
+ *             function is taken as 0
+ * @pci_name:  output buffer that receives the name from pci_lookup_name();
+ *             set to "" when the device cannot be resolved
+ * @len:       size of @pci_name in bytes
+ * @vendor_id: output; set to 0 when the device cannot be resolved
+ * @device_id: output; set to 0 when the device cannot be resolved
+ */
+static void get_pci_dev_name(const char *bdf, char *pci_name, ssize_t len, u16 *vendor_id, u16 *device_id)
+{
+	struct pci_access *pacc;
+	struct pci_dev *dev;
+	struct pci_filter filter = {0};
+	/* %x stores through unsigned int pointers */
+	unsigned int domain = 0, bus = 0, device = 0, function = 0;
+
+	/* Give every output a defined value so callers can print it on any path */
+	if (vendor_id)
+		*vendor_id = 0;
+	if (device_id)
+		*device_id = 0;
+	if (pci_name && len > 0)
+		pci_name[0] = '\0';
+
+	if (!bdf || !pci_name || len <= 0 || !vendor_id || !device_id)
+		return;
+
+	/*
+	 * Try the most specific form first: "%x:%x:%x" also matches the prefix
+	 * of "domain:bus:dev.fn" and would silently drop the function number.
+	 * A failed attempt may have stored some fields, so each branch resets
+	 * the field its format does not provide.
+	 */
+	if (sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &device, &function) == 4) {
+		/* full address, nothing to default */
+	} else if (sscanf(bdf, "%x:%x.%x", &bus, &device, &function) == 3) {
+		domain = 0;
+	} else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &device) == 3) {
+		function = 0;
+	} else {
+		return;
+	}
+
+	/* Only touch libpci (and scan the bus) once the address is known good */
+	pacc = pci_alloc();
+	if (!pacc)
+		return;
+	pci_init(pacc);
+	pci_scan_bus(pacc);
+
+	pci_filter_init(pacc, &filter);
+	filter.domain = (int)domain;
+	filter.bus = (int)bus;
+	filter.slot = (int)device;
+	filter.func = (int)function;
+
+	for (dev = pacc->devices; dev; dev = dev->next) {
+		if (!pci_filter_match(&filter, dev))
+			continue;
+
+		pci_fill_info(dev, PCI_FILL_IDENT);
+		*vendor_id = dev->vendor_id;
+		*device_id = dev->device_id;
+		pci_lookup_name(pacc, pci_name, (int)len,
+				PCI_LOOKUP_VENDOR | PCI_LOOKUP_DEVICE,
+				dev->vendor_id, dev->device_id);
+		break;
+	}
+
+	pci_cleanup(pacc);
+}
+
+/*
+ * Append a JSON object describing a PCIe AER event to @s.
+ *
+ * Does nothing when @s or @ev is NULL or when json_report is 0.
+ * The PCI name and vendor/device IDs are looked up from ev->dev_name; when
+ * the name is missing or the lookup finds nothing they are reported as ""
+ * and 0. String fields that are NULL are printed as "".
+ */
+void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev)
+{
+	/* Defined defaults: the lookup below may be skipped or find nothing */
+	char pci_name[128] = "";
+	u16 vendor = 0, device = 0;
+
+	if (!s || !ev || !json_report)
+		return;
+
+	/* dev_name may be NULL (it is NULL-checked when printed below) */
+	if (ev->dev_name)
+		get_pci_dev_name(ev->dev_name, pci_name, sizeof(pci_name), &vendor, &device);
+
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"aer_event\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"severity\": \"%s\", "
+			 "\"error_type\": \"%s\", "
+			 "\"dev_name\": \"%s\", "
+			 "\"pci_dev_name\": \"%s\", "
+			 "\"vendor_id\": \"%#x\", "
+			 "\"device_id\": \"%#x\", "
+			 "\"msg\": \"%s\" }",
+			 JSON_REPORT_KEY,
+			 (*ev->timestamp) ? ev->timestamp : NONE,
+			 severity_strs[ev->severity],
+			 (ev->error_type) ? ev->error_type : NONE,
+			 (ev->dev_name) ? ev->dev_name : NONE,
+			 (*pci_name) ? pci_name : NONE,
+			 vendor, device,
+			 (ev->msg) ? ev->msg : NONE);
+}
+
+/*
+ * Append a JSON object describing an ARM processor error event to @s.
+ *
+ * Does nothing when @s or @ev is NULL or when json_report is 0.
+ * An empty timestamp is printed as "".
+ */
+void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev)
+{
+	const char *when;
+
+	if (!json_report || !s || !ev)
+		return;
+
+	when = ev->timestamp[0] ? ev->timestamp : NONE;
+
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"arm_event\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"error_count\": %d, "
+			 "\"affinity\": %d, "
+			 "\"mpidr\": \"%#lx\", "
+			 "\"midr\": \"%#lx\", "
+			 "\"running_state\": %d, "
+			 "\"psci_state\": %d }",
+			 JSON_REPORT_KEY,
+			 when,
+			 ev->error_count, ev->affinity,
+			 ev->mpidr, ev->midr,
+			 ev->running_state, ev->psci_state);
+}
+
+/*
+ * Append a JSON object describing a memory-failure event to @s.
+ *
+ * Does nothing when @s or @ev is NULL or when json_report is 0.
+ * String fields that are NULL or empty are printed as "".
+ */
+void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev)
+{
+	if (!s || !ev || !json_report)
+		return;
+
+	/*
+	 * pfn is stored as text and may be empty, so it is emitted as a JSON
+	 * string: an unquoted or empty token would make the object invalid JSON.
+	 */
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"mf_event\", \"timestamp\": \"%s\", "
+			 "\"pfn\": \"%s\", \"page_type\": \"%s\", "
+			 "\"action_result\": \"%s\" }",
+			 JSON_REPORT_KEY,
+			 (*ev->timestamp) ? ev->timestamp : NONE,
+			 (*ev->pfn) ? ev->pfn : NONE,
+			 (ev->page_type) ? ev->page_type : NONE,
+			 (ev->action_result) ? ev->action_result : NONE);
+}
+
+/*
+ * Append a JSON object describing a machine-check record to @s.
+ *
+ * Does nothing when @s or @ev is NULL or when json_report is 0.
+ * Side effect: overwrites ev->severity, derived from the status bits
+ * (MCI_STATUS_UC or MCI_STATUS_DEFERRED set -> recoverable, otherwise
+ * corrected). Empty string fields are printed as "".
+ */
+void report_mce_event_json(struct trace_seq *s, struct mce_event *ev)
+{
+	if (!s || !ev || !json_report)
+		return;
+
+	if (ev->status & (MCI_STATUS_UC | MCI_STATUS_DEFERRED))
+		ev->severity = GHES_SEV_RECOVERABLE;
+	else
+		ev->severity = GHES_SEV_CORRECTED;
+
+	/*
+	 * The 64-bit register values are cast to unsigned long long and printed
+	 * with %llx so the format matches the argument type on 32-bit targets
+	 * as well, where "long" is narrower than uint64_t.
+	 */
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"mce_record\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"severity\": \"%s\", "
+			 "\"bank\": %d, "
+			 "\"bank_name\": \"%s\", "
+			 "\"status\": \"%#llx\", "
+			 "\"error_msg\": \"%s\", "
+			 "\"mcistatus_msg\": \"%s\", "
+			 "\"mcastatus_msg\": \"%s\", "
+			 "\"user_action\": \"%s\", "
+			 "\"mc_location\": \"%s\", "
+			 "\"cpuid\": \"%#x\", "
+			 "\"cpu\": %d, "
+			 "\"socketid\": %d, "
+			 "\"ip\": \"%#llx\", "
+			 "\"cs\": \"%#x\", "
+			 "\"misc\": \"%#llx\", "
+			 "\"addr\": \"%#llx\", "
+			 "\"synd\": \"%#llx\", "
+			 "\"ipid\": \"%#llx\", "
+			 "\"mcgstatus_msg\": \"%s\", "
+			 "\"mcgstatus\": \"%#llx\", "
+			 "\"mcgcap\": \"%#llx\", "
+			 "\"apicid\": \"%#x\" }",
+			 JSON_REPORT_KEY,
+			 (*ev->timestamp) ? ev->timestamp : NONE,
+			 severity_strs[ev->severity],
+			 ev->bank,
+			 (*ev->bank_name) ? ev->bank_name : NONE,
+			 (unsigned long long)ev->status,
+			 (*ev->error_msg) ? ev->error_msg : NONE,
+			 (*ev->mcistatus_msg) ? ev->mcistatus_msg : NONE,
+			 (*ev->mcastatus_msg) ? ev->mcastatus_msg : NONE,
+			 (*ev->user_action) ? ev->user_action : NONE,
+			 (*ev->mc_location) ? ev->mc_location : NONE,
+			 ev->cpuid,
+			 ev->cpu,
+			 ev->socketid,
+			 (unsigned long long)ev->ip,
+			 ev->cs,
+			 (unsigned long long)ev->misc,
+			 (unsigned long long)ev->addr,
+			 (unsigned long long)ev->synd,
+			 (unsigned long long)ev->ipid,
+			 (*ev->mcgstatus_msg) ? ev->mcgstatus_msg : NONE,
+			 (unsigned long long)ev->mcgstatus,
+			 (unsigned long long)ev->mcgcap,
+			 ev->apicid);
+}
+
diff --git a/ras-report.h b/ras-report.h
index a2edf3c..fb15dc3 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -30,6 +30,12 @@
/* ABRT socket file */
#define ABRT_SOCKET "/var/run/abrt/abrt.socket"
+#ifdef HAVE_JSON_REPORT
+#define JSON_REPORT_KEY "rasdaemon_event_name"
+
+extern int json_report;
+#endif
+
#ifdef HAVE_ABRT_REPORT
int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
@@ -54,4 +60,12 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even
#endif
+#ifdef HAVE_JSON_REPORT
+/*
+ * JSON exporters (ras-report-json.c): each one appends a single JSON object
+ * describing @ev to the trace sequence @s, and does nothing when @s or @ev
+ * is NULL or when json_report is 0.
+ */
+void report_mc_event_json(struct trace_seq *s, struct ras_mc_event *ev);
+void report_aer_event_json(struct trace_seq *s, struct ras_aer_event *ev);
+void report_arm_event_json(struct trace_seq *s, struct ras_arm_event *ev);
+void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
+void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
+#endif
+
#endif
diff --git a/rasdaemon.c b/rasdaemon.c
index 88ba1ca..e0e85c1 100644
--- a/rasdaemon.c
+++ b/rasdaemon.c
@@ -26,6 +26,7 @@
#include "ras-logger.h"
#include "ras-events.h"
#include "ras-record.h"
+#include "ras-report.h"
/*
* Arguments(argp) handling logic and main
@@ -130,10 +131,16 @@ int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;
- choices_disable = getenv(DISABLE);
choices_disable = getenv(DISABLE);
+#ifdef HAVE_JSON_REPORT
+ if (choices_disable &&
+ strlen(choices_disable) != 0 &&
+ strstr(choices_disable, "json_report"))
+ json_report = 0;
+#endif
+
#ifdef HAVE_MCE
const struct argp_option offline_options[] = {
{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
--
2.33.1
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。