代码拉取完成,页面将自动刷新
From 65e6c233804512a40f4626d86a0f3de0041f403b Mon Sep 17 00:00:00 2001
From: Ruidong Tian <tianruidong@linux.alibaba.com>
Date: Tue, 17 Dec 2024 09:36:55 +0800
Subject: [PATCH 82/85] anolis: rasdaemon: add amdgpu ras error monitor
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
Makefile.am | 2 +-
misc/rasdaemon.env | 1 +
ras-events.c | 1 +
ras-kmsg-amdgpu.c | 219 +++++++++++++++++++++++++++++++++++++++++++++
ras-kmsg.c | 4 +
ras-kmsg.h | 25 ++++++
ras-mce-handler.c | 3 +
ras-record.h | 3 +
ras-report-json.c | 80 +++++++++++++++++
ras-report.h | 3 +
rasdaemon.c | 2 +
11 files changed, 342 insertions(+), 1 deletion(-)
create mode 100644 ras-kmsg-amdgpu.c
diff --git a/Makefile.am b/Makefile.am
index 3efcd9e..3d0a315 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -73,7 +73,7 @@ if WITH_YITIAN_NS_DECODE
rasdaemon_SOURCES += non-standard-yitian.c
endif
if WITH_KMSG_MONITOR
- rasdaemon_SOURCES += ras-kmsg.c
+ rasdaemon_SOURCES += ras-kmsg.c ras-kmsg-amdgpu.c
endif
if WITH_ERST
rasdaemon_SOURCES += ras-erst.c
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index 1287183..c001afb 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -103,6 +103,7 @@ SIGNAL_TRIGGER=
SIGNAL_TRIGGER_TIMEOUT=0
ERST_DELETE=1
+AMDGPU_MCA_ENABLED=1
# KMSG MONITOR
KMSG_IGNORE_XID=""
diff --git a/ras-events.c b/ras-events.c
index f61b155..845e879 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -609,6 +609,7 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
goto error;
} else if (size > 0) {
kmsg_match(kmsg_buf);
+ amdgpu_tracer_match(kmsg_buf);
memset(kmsg_buf, 0, PRINTK_MESSAGE_MAX);
} else {
count_nready++;
diff --git a/ras-kmsg-amdgpu.c b/ras-kmsg-amdgpu.c
new file mode 100644
index 0000000..0d9900c
--- /dev/null
+++ b/ras-kmsg-amdgpu.c
@@ -0,0 +1,219 @@
+#define _GNU_SOURCE
+#include <dirent.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syslog.h>
+#include <sys/time.h>
+#include <libtrace/event-parse.h>
+
+#include "ras-time.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+#include "ras-kmsg.h"
+#include "trigger.h"
+
+/*
+ * Regular expressions (compiled with REG_EXTENDED) for the amdgpu ACA
+ * ("Accelerator Check Architecture") lines read from the kernel log.
+ *
+ * In every pattern capture group 1 is the text between "amdgpu " and ": {";
+ * for the header line it is parsed as "domain:bus:dev.func".  The register
+ * patterns additionally capture the hexadecimal register value as group 2.
+ *
+ * NOTE(review): the '.' in front of STATUS/ADDR/MISC0/IPID/SYND is not
+ * escaped, so it matches any single character rather than only a literal dot.
+ */
+#define AMDGPU_ERROR_HEADER ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: Accelerator Check Architecture events logged\n"
+#define AMDGPU_ERROR_STATUS ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].STATUS=(0x[0-9A-Fa-f]+)\n"
+#define AMDGPU_ERROR_ADDR ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].ADDR=(0x[0-9A-Fa-f]+)\n"
+#define AMDGPU_ERROR_MISC0 ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].MISC0=(0x[0-9A-Fa-f]+)\n"
+#define AMDGPU_ERROR_IPID ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].IPID=(0x[0-9A-Fa-f]+)\n"
+#define AMDGPU_ERROR_SYND ".*amdgpu (.*): \\{[0-9]+\\}\\[Hardware Error\\]: aca entry\\[[0-9]+\\].SYND=(0x[0-9A-Fa-f]+)\n"
+
+/*
+ * Name of the environment variable read at init time.  The value "1" sends
+ * records whose IPID and STATUS are both non-zero through the MCE decoder;
+ * any other value (or an unset variable) reports the raw registers as JSON.
+ */
+#define AMDGPU_MCA_ENABLED "AMDGPU_MCA_ENABLED"
+
+/* Compiled patterns, allocated by amdgpu_tracer_init(). */
+static struct amdgpu_tracer *amdgpu_tracer;
+/* The single record currently being assembled from consecutive log lines. */
+static struct amdgpu_error *amdgpu_error;
+/* Non-zero when AMDGPU_MCA_ENABLED is set to "1". */
+static int amdgpu_mca_enable;
+
+/*
+ * Hand one ACA record to the offline MCE decoder.
+ *
+ * The record's registers and PCI address are copied into a
+ * ras_mc_offline_event and passed to ras_offline_mce_event().  When the
+ * daemon is built without HAVE_MCE there is no decoder to call, so the
+ * record is reported as raw JSON instead of being silently discarded.
+ *
+ * NOTE(review): family/model 0x17 and bank 1 are hard-coded; presumably they
+ * select the SMCA decode tables - confirm against the MCE handler.
+ */
+static void report_amdgpu_mca(struct amdgpu_error *e)
+{
+#ifdef HAVE_MCE
+	/*
+	 * Designated initializers zero every member that is not listed, so
+	 * the decoder can never read indeterminate stack contents.
+	 */
+	struct ras_mc_offline_event event = {
+		.smca     = true,
+		.family   = 0x17,
+		.model    = 0x17,
+		.bank     = 1,
+		.status   = e->status,
+		.synd     = e->synd,
+		.ipid     = e->ipid,
+		.addr     = e->addr,
+		.misc0    = e->misc0,
+		.domain   = e->seq,
+		.bus      = e->bus,
+		.device   = e->dev,
+		.function = e->func,
+	};
+
+	ras_offline_mce_event(&event);
+#else
+	/* No MCE decoder in this build: fall back to the raw JSON report. */
+	report_amdgpu_error_json(e);
+#endif
+}
+
+/*
+ * Report one ACA record through the appropriate channel.
+ *
+ * The MCE decoder is used only when it has been enabled via the environment
+ * and the record carries both a non-zero IPID and a non-zero STATUS; every
+ * other record is emitted as raw JSON.
+ */
+static void report_amdgpu_error(struct amdgpu_error *e)
+{
+	int decodable = e->ipid && e->status;
+
+	if (!amdgpu_mca_enable || !decodable) {
+		report_amdgpu_error_json(e);
+		return;
+	}
+
+	report_amdgpu_mca(e);
+}
+
+/*
+ * Copy capture group @i of a successful regexec() on @line into @buf.
+ *
+ * At most @len - 1 bytes are copied and @buf is always NUL-terminated, so an
+ * over-long capture is truncated instead of overrunning the caller's buffer.
+ * When the group did not take part in the match (rm_so == -1) @buf is set to
+ * the empty string.  @len must be non-zero.
+ */
+static void regex_group(const regmatch_t *m, int i, const char *line,
+			char *buf, size_t len)
+{
+	size_t n;
+
+	buf[0] = '\0';
+	if (m[i].rm_so < 0)
+		return;
+
+	n = (size_t)(m[i].rm_eo - m[i].rm_so);
+	if (n >= len)
+		n = len - 1;
+
+	memcpy(buf, line + m[i].rm_so, n);
+	buf[n] = '\0';
+}
+
+/*
+ * Log a regexec() failure other than REG_NOMATCH.  The message goes through
+ * log() rather than printf() because stdout carries the JSON reports.
+ */
+static void amdgpu_regex_error(int err, const regex_t *re)
+{
+	char buf[128];
+
+	regerror(err, re, buf, sizeof(buf));
+	log(ALL, LOG_ERR, "Regex execution error: %s\n", buf);
+}
+
+/*
+ * Try one "aca entry[N].<REG>=0x..." pattern against @msg.
+ *
+ * Returns 0 and stores the parsed value in @val when the line matches,
+ * REG_NOMATCH when it does not, or another regexec() error code after
+ * logging it.
+ */
+static int amdgpu_match_reg(const regex_t *re, const char *msg, uint64_t *val)
+{
+	regmatch_t matches[3];
+	char buf[128];
+	int ret;
+
+	ret = regexec(re, msg, 3, matches, 0);
+	if (ret == REG_NOMATCH)
+		return ret;
+	if (ret) {
+		amdgpu_regex_error(ret, re);
+		return ret;
+	}
+
+	/* Group 2 is the hexadecimal register value. */
+	regex_group(matches, 2, msg, buf, sizeof(buf));
+	*val = strtoull(buf, NULL, 16);
+
+	return 0;
+}
+
+/*
+ * Feed one kernel-log line to the amdgpu ACA tracer.
+ *
+ * A header line flushes any record still being assembled, resets the record
+ * and stores the timestamp and PCI address.  STATUS, ADDR, MISC0 and IPID
+ * lines only update the record; a SYND line updates it and reports it.
+ * Register lines are accepted without a preceding header, so several entries
+ * logged under one header are each reported with that header's address.
+ * Lines matching none of the patterns are ignored.
+ *
+ * The call is a no-op while the tracer is not initialised.
+ */
+void amdgpu_tracer_match(char *msg)
+{
+	unsigned int domain, bus, dev, func;
+	regmatch_t matches[2];
+	char buf[128];
+	int ret;
+
+	/* Init may have failed or not run; callers do not check for that. */
+	if (!msg || !amdgpu_tracer || !amdgpu_error)
+		return;
+
+	ret = regexec(&amdgpu_tracer->header, msg, 2, matches, 0);
+	if (ret && ret != REG_NOMATCH) {
+		amdgpu_regex_error(ret, &amdgpu_tracer->header);
+		return;
+	}
+
+	if (!ret) {
+		/* A new header while tracing means the last record never got SYND. */
+		if (amdgpu_error->tracing)
+			report_amdgpu_error(amdgpu_error);
+
+		memset(amdgpu_error, 0, sizeof(*amdgpu_error));
+		amdgpu_error->tracing = 1;
+
+		get_kmsg_time(msg, amdgpu_error->timestamp);
+
+		/*
+		 * %x stores through unsigned int pointers, so parse into
+		 * temporaries and commit only a complete address.
+		 */
+		regex_group(matches, 1, msg, buf, sizeof(buf));
+		if (sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) {
+			amdgpu_error->seq = domain;
+			amdgpu_error->bus = bus;
+			amdgpu_error->dev = dev;
+			amdgpu_error->func = func;
+		}
+
+		return;
+	}
+
+	/* Anything but REG_NOMATCH means the line was consumed or failed. */
+	if (amdgpu_match_reg(&amdgpu_tracer->status, msg,
+			     &amdgpu_error->status) != REG_NOMATCH)
+		return;
+	if (amdgpu_match_reg(&amdgpu_tracer->addr, msg,
+			     &amdgpu_error->addr) != REG_NOMATCH)
+		return;
+	if (amdgpu_match_reg(&amdgpu_tracer->misc0, msg,
+			     &amdgpu_error->misc0) != REG_NOMATCH)
+		return;
+	if (amdgpu_match_reg(&amdgpu_tracer->ipid, msg,
+			     &amdgpu_error->ipid) != REG_NOMATCH)
+		return;
+
+	/* SYND is the line that completes an entry. */
+	if (!amdgpu_match_reg(&amdgpu_tracer->synd, msg, &amdgpu_error->synd)) {
+		report_amdgpu_error(amdgpu_error);
+		amdgpu_error->tracing = 0;
+	}
+}
+
+/*
+ * Release the tracer state allocated by amdgpu_tracer_init().
+ *
+ * free(NULL) is a no-op, so the pointers are released unconditionally and
+ * then cleared so that a later call cannot free them twice.
+ *
+ * NOTE(review): the compiled patterns are not passed to regfree() here,
+ * because init may leave some of them uncompiled while still returning
+ * success; add regfree() once init guarantees all six are compiled.
+ *
+ * Always returns 0.
+ */
+int amdgpu_tracer_destroy(void)
+{
+	log(ALL, LOG_INFO, "amdgpu tracer destroy\n");
+
+	free(amdgpu_error);
+	amdgpu_error = NULL;
+
+	free(amdgpu_tracer);
+	amdgpu_tracer = NULL;
+
+	return 0;
+}
+
+/*
+ * Compile the extended regular expression @str into @re.
+ *
+ * Returns 0 on success or the non-zero regcomp() error code on failure.  The
+ * failure is logged through log() rather than printf(), since stdout carries
+ * the JSON reports.
+ */
+static int init_reg(regex_t *re, const char *str)
+{
+	char buf[128];
+	int ret;
+
+	ret = regcomp(re, str, REG_EXTENDED);
+	if (ret) {
+		regerror(ret, re, buf, sizeof(buf));
+		log(ALL, LOG_ERR, "Regex compilation error: %s\n", buf);
+	}
+
+	return ret;
+}
+
+/*
+ * Compile all six ACA patterns into @t.
+ *
+ * Returns 0 on success.  On failure the patterns compiled so far are
+ * released with regfree() and -1 is returned.
+ */
+static int amdgpu_compile_patterns(struct amdgpu_tracer *t)
+{
+	const struct {
+		regex_t *re;
+		const char *pattern;
+	} regs[] = {
+		{ &t->header, AMDGPU_ERROR_HEADER },
+		{ &t->status, AMDGPU_ERROR_STATUS },
+		{ &t->addr,   AMDGPU_ERROR_ADDR },
+		{ &t->misc0,  AMDGPU_ERROR_MISC0 },
+		{ &t->ipid,   AMDGPU_ERROR_IPID },
+		{ &t->synd,   AMDGPU_ERROR_SYND },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
+		if (!init_reg(regs[i].re, regs[i].pattern))
+			continue;
+
+		/* Undo the successful compilations before reporting failure. */
+		while (i--)
+			regfree(regs[i].re);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the amdgpu ACA tracer.
+ *
+ * Reads AMDGPU_MCA_ENABLED from the environment (only the exact value "1"
+ * enables MCE decoding), allocates the record and pattern storage and
+ * compiles the patterns.
+ *
+ * Returns 0 on success.  On any failure -1 is returned, nothing stays
+ * allocated and the global pointers are left untouched, so a half-built
+ * tracer is never published.
+ */
+int amdgpu_tracer_init(void)
+{
+	struct amdgpu_tracer *tracer;
+	struct amdgpu_error *record;
+	const char *s;
+
+	s = getenv(AMDGPU_MCA_ENABLED);
+	amdgpu_mca_enable = (s && strcmp(s, "1") == 0);
+
+	record = calloc(1, sizeof(*record));
+	tracer = calloc(1, sizeof(*tracer));
+	if (!record || !tracer)
+		goto fail;
+
+	if (amdgpu_compile_patterns(tracer)) {
+		log(ALL, LOG_ERR, "amdgpu tracer init failed\n");
+		goto fail;
+	}
+
+	amdgpu_error = record;
+	amdgpu_tracer = tracer;
+
+	return 0;
+
+fail:
+	free(tracer);
+	free(record);
+	return -1;
+}
\ No newline at end of file
diff --git a/ras-kmsg.c b/ras-kmsg.c
index 0230180..c288f26 100644
--- a/ras-kmsg.c
+++ b/ras-kmsg.c
@@ -77,6 +77,8 @@ int kmsg_tracer_destroy(void)
}
free(kmsg_tracer);
+ amdgpu_tracer_destroy();
+
return 0;
}
@@ -87,6 +89,8 @@ int kmsg_tracer_init(void)
char buf[1026], *kmsg_tracer_name, *kmsg_tracer_regex, *tmp;
char *kmsg_tracer_group_key, *token;
+ amdgpu_tracer_init();
+
s = getenv(KMSG_TRACE_END);
if (!s)
kmsg_trace_end = 0;
diff --git a/ras-kmsg.h b/ras-kmsg.h
index f31125f..9e34da5 100644
--- a/ras-kmsg.h
+++ b/ras-kmsg.h
@@ -3,6 +3,7 @@
#define __RAS_KMSG_H
#include <regex.h>
+#include <stdint.h>
/**
* Kernel message tracer related definitions
@@ -40,8 +41,32 @@ struct kmsg_tracer_info {
} info;
};
+/*
+ * Compiled regular expressions for the amdgpu ACA kernel-log lines, one per
+ * line type.
+ */
+struct amdgpu_tracer {
+ regex_t header; /* "... events logged" line that starts a record */
+ regex_t status; /* "aca entry[N].STATUS=0x..." */
+ regex_t addr; /* "aca entry[N].ADDR=0x..." */
+ regex_t misc0; /* "aca entry[N].MISC0=0x..." */
+ regex_t ipid; /* "aca entry[N].IPID=0x..." */
+ regex_t synd; /* "aca entry[N].SYND=0x..."; completes a record */
+};
+
+/*
+ * One amdgpu ACA error record assembled from consecutive kernel-log lines.
+ */
+struct amdgpu_error {
+	char timestamp[64];	/* filled by get_kmsg_time() from the header line */
+
+	/*
+	 * PCI address parsed from the header line with "%x:%x:%x.%x".  The
+	 * members are unsigned because %x reads and prints unsigned int.
+	 */
+	unsigned int seq;	/* first component; reported as the PCI domain */
+	unsigned int bus;
+	unsigned int dev;
+	unsigned int func;
+
+	int tracing;		/* non-zero from a header line until SYND is seen */
+
+	/* Raw ACA register values taken from the "aca entry[N].<REG>" lines. */
+	uint64_t status;
+	uint64_t addr;
+	uint64_t misc0;
+	uint64_t ipid;
+	uint64_t synd;
+};
+
int kmsg_tracer_init(void);
int kmsg_tracer_destroy(void);
int kmsg_match(char *msg);
+void amdgpu_tracer_match(char *msg);
+int amdgpu_tracer_destroy(void);
+int amdgpu_tracer_init(void);
+
#endif
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 686c308..e53854d 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -475,6 +475,9 @@ int ras_offline_mce_event(struct ras_mc_offline_event *event)
trace_seq_init(&s);
report_mce_offline(&s, mce, priv);
+#ifdef HAVE_JSON_REPORT
+ report_mce_offline_json(&s, mce, event);
+#endif
trace_seq_do_printf(&s);
fflush(stdout);
trace_seq_destroy(&s);
diff --git a/ras-record.h b/ras-record.h
index 91f9d1c..42fecb8 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -56,6 +56,9 @@ struct ras_mc_offline_event {
uint64_t ipid;
uint64_t synd;
uint64_t status;
+ uint64_t addr;
+ uint64_t misc0;
+ int domain, bus, device, function;
};
struct ras_aer_event {
diff --git a/ras-report-json.c b/ras-report-json.c
index 8dbcd90..6508b60 100644
--- a/ras-report-json.c
+++ b/ras-report-json.c
@@ -324,3 +324,83 @@ out:
fflush(stdout);
trace_seq_destroy(&seq);
}
+
+/*
+ * Append a JSON description of an offline-decoded MCE event to @s.
+ *
+ * The decoded strings come from @mce; the raw registers and the PCI address
+ * used to look up the device name come from @e.  Nothing is appended when any
+ * argument is NULL or JSON reporting is disabled.  The caller prints and
+ * destroys @s.
+ *
+ * NOTE(review): the record is always tagged "amdgpu_ras_event"; confirm that
+ * every caller of ras_offline_mce_event() really supplies an amdgpu event.
+ */
+void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce,
+			     struct ras_mc_offline_event *e)
+{
+	char bdf[128] = {0}, pci_name[128] = {0};
+	/*
+	 * Zeroed so that defined values are printed even if the lookup leaves
+	 * them untouched (get_pci_dev_name() is not visible here - confirm).
+	 */
+	u16 vendor = 0, device = 0;
+
+	if (!s || !e || !mce || !json_report)
+		return;
+
+	snprintf(bdf, sizeof(bdf), "%x:%x:%x.%x",
+		 e->domain, e->bus, e->device, e->function);
+	get_pci_dev_name(bdf, pci_name, sizeof(pci_name), &vendor, &device);
+
+	trace_seq_printf(s,
+			 "\n{ \"%s\": \"amdgpu_ras_event\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"bank_name\": \"%s\", "
+			 "\"bank\": %d, "
+			 "\"mcastatus_msg\": \"%s\", "
+			 "\"mcistatus_msg\": \"%s\", "
+			 "\"mc_location\": \"%s\", "
+			 "\"error_msg\": \"%s\", "
+			 "\"pci_dev_name\": \"%s\", "
+			 "\"vendor_id\": \"%#x\", "
+			 "\"device_id\": \"%#x\", "
+			 "\"status\": \"%#lx\", "
+			 "\"addr\": \"%#lx\", "
+			 "\"misc0\": \"%#lx\", "
+			 "\"ipid\": \"%#lx\", "
+			 "\"synd\": \"%#lx\" }\n",
+			 JSON_REPORT_KEY,
+			 (*mce->timestamp) ? mce->timestamp : NONE,
+			 (*mce->bank_name) ? mce->bank_name : NONE,
+			 mce->bank,
+			 (*mce->mcastatus_msg) ? mce->mcastatus_msg : NONE,
+			 (*mce->mcistatus_msg) ? mce->mcistatus_msg : NONE,
+			 (*mce->mc_location) ? mce->mc_location : NONE,
+			 (*mce->error_msg) ? mce->error_msg : NONE,
+			 pci_name, vendor, device,
+			 e->status, e->addr, e->misc0, e->ipid, e->synd);
+}
+
+/*
+ * Print the raw registers of one amdgpu ACA record as a JSON object on
+ * stdout, followed by a newline.
+ *
+ * The PCI address stored in @e is used to look up the device name and IDs.
+ * Nothing is printed when @e is NULL or JSON reporting is disabled.
+ */
+void report_amdgpu_error_json(struct amdgpu_error *e)
+{
+	char bdf[128] = {0}, pci_name[128] = {0};
+	struct trace_seq seq;
+	/*
+	 * Zeroed so that defined values are printed even if the lookup leaves
+	 * them untouched (get_pci_dev_name() is not visible here - confirm).
+	 */
+	u16 vendor = 0, device = 0;
+
+	if (!e || !json_report)
+		return;
+
+	snprintf(bdf, sizeof(bdf), "%x:%x:%x.%x",
+		 e->seq, e->bus, e->dev, e->func);
+	get_pci_dev_name(bdf, pci_name, sizeof(pci_name), &vendor, &device);
+
+	trace_seq_init(&seq);
+	/*
+	 * The '#' flag already emits the "0x" prefix; writing a literal "0x"
+	 * in front of it produced values such as "0x0xdeadbeef".
+	 */
+	trace_seq_printf(&seq,
+			 "\n{ \"%s\": \"amdgpu_ras_event\", "
+			 "\"timestamp\": \"%s\", "
+			 "\"pci_dev_name\": \"%s\", "
+			 "\"vendor_id\": \"%#x\", "
+			 "\"device_id\": \"%#x\", "
+			 "\"status\": \"%#lx\", "
+			 "\"addr\": \"%#lx\", "
+			 "\"misc0\": \"%#lx\", "
+			 "\"ipid\": \"%#lx\", "
+			 "\"synd\": \"%#lx\" }",
+			 JSON_REPORT_KEY,
+			 (*e->timestamp) ? e->timestamp : "",
+			 pci_name, vendor, device,
+			 e->status,
+			 e->addr,
+			 e->misc0,
+			 e->ipid,
+			 e->synd);
+
+	trace_seq_do_printf(&seq);
+	printf("\n");
+	fflush(stdout);
+	trace_seq_destroy(&seq);
+}
\ No newline at end of file
diff --git a/ras-report.h b/ras-report.h
index 98c4542..b2cd97d 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -69,6 +69,9 @@ void report_mf_event_json(struct trace_seq *s, struct ras_mf_event *ev);
void report_mce_event_json(struct trace_seq *s, struct mce_event *ev);
void report_kmsg_event_json(struct kmsg_tracer_info *kmsg_tracer, const char *msg);
void report_signal_event_json(struct trace_seq *s, struct ras_signal_event *ev);
+void report_mce_offline_json(struct trace_seq *s, struct mce_event *mce,
+ struct ras_mc_offline_event *e);
+void report_amdgpu_error_json(struct amdgpu_error *e);
#endif
#endif
diff --git a/rasdaemon.c b/rasdaemon.c
index 987c544..4f7246f 100644
--- a/rasdaemon.c
+++ b/rasdaemon.c
@@ -257,6 +257,8 @@ int main(int argc, char *argv[])
handle_erst_mce();
#endif
#endif
+ get_boot_time(&boot_time);
+ suspended_time = get_suspended_time();
handle_ras_events(args.record_events);
--
2.33.1
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。