Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions amdsmi_cli/amdsmi_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -7190,7 +7190,7 @@ def partition(self, args, multiple_devices=False, gpu=None, current=None, memory
output_file.write(legend_output + '\n')


def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None, decode=None,
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
"""
Retrieve and process CPER (RAS) entries for a target GPU.
Expand All @@ -7210,6 +7210,8 @@ def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
args.cper = cper
if afid:
args.afid = afid
if decode:
args.decode = decode
if severity:
args.severity = severity
if folder:
Expand All @@ -7225,7 +7227,11 @@ def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,

if args.afid:
if args.cper_file:
afids = self.helpers.pvtDumpAfids(args.cper_file)
if args.decode:
args.cursor = [0]
self.helpers.ras_cper(args, None, self.logger, 0)
return
afids = self.helpers.cper_dump_afids(args.cper_file)
print(' '.join(map(str, afids)))
return
else:
Expand Down
73 changes: 57 additions & 16 deletions amdsmi_cli/amdsmi_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import pwd
import stat
from typing import Tuple, Optional, Union
import tempfile

from enum import Enum
from pathlib import Path
Expand Down Expand Up @@ -1471,7 +1472,9 @@ def display_cper_files_generated(self, entries, device_handle, folder):
for entry_index, entry in enumerate(entries.values()):
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
gpu_id = '-'
if not isinstance(device_handle, Path):
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
prefix = self._severity_as_string(entry.get("error_severity", "Unknown"),
entry.get("notify_type", "Unknown"),
False)
Expand All @@ -1481,7 +1484,7 @@ def display_cper_files_generated(self, entries, device_handle, folder):
entry.get("notify_type", "Unknown"),
True)
cper_data_file = f"{prefix}_{self.get_cper_count() + 1}.cper"
afids = self.pvtDumpAfids(cper_data_file)
afids = self.cper_dump_afids(cper_data_file)
afids_str = ' '.join(map(str, afids))
output += f" {cper_data_file:<17} {afids_str}"

Expand All @@ -1494,7 +1497,8 @@ def _print_header(self, folder):
print(f" {'file_name':<17} {'list of afids'}", end="")
print("")

def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):

def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None, cper_file=None):
"""
Dump CPER entries to files in the specified folder. Handles batch deletion if file limit is exceeded.

Expand All @@ -1504,6 +1508,7 @@ def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limi
cper_data (list): List of CPER data objects with 'bytes' and 'size' keys.
device_handle: Device handle for GPU identification.
file_limit (int, optional): Maximum number of files to retain in the folder.
cper_file (str, optional): cper file name to use when saving to folder
"""
# Initialize header display
if not getattr(self, "_cper_display_initialized", False):
Expand All @@ -1524,7 +1529,10 @@ def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limi

# Generate filenames
count = self.get_cper_count() + 1
cper_name = f"{prefix}-{count}.cper"
if cper_file:
cper_name = cper_file
else:
cper_name = f"{prefix}-{count}.cper"
json_name = f"{prefix}-{count}.json"
cper_path = folder / cper_name
json_path = folder / json_name
Expand Down Expand Up @@ -1553,7 +1561,9 @@ def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limi

# Collect data for printing
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
gpu_id = '-'
if not isinstance(device_handle, Path):
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
severity = self._severity_as_string(error_severity, notify_type, False)
output_rows[cper_path] = [timestamp, gpu_id, severity, cper_name]
self.increment_cper_count()
Expand All @@ -1576,7 +1586,7 @@ def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limi
for cper_path, row in output_rows.items():
timestamp, gpu_id, severity, fname = row
try:
afids = self.pvtDumpAfids(cper_path)
afids = self.cper_dump_afids(cper_path)
afids_str = ' '.join(map(str, afids))
except Exception as e:
afids_str = "Error fetching AFIDs"
Expand All @@ -1593,6 +1603,26 @@ def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limi
))
except Exception as e:
logging.debug(f"Failed to dump entries as JSON: {e}")

def dump_cper_entries_as_json(self, entries, _cper_data, _device_handle):
    """
    Serialize the CPER entries to indented JSON, print the result and return it.

    The parameter list mirrors dump_cper_entries so that callers can reuse the
    same argument list; _cper_data and _device_handle are accepted but never read.

    Parameters:
        entries: The CPER entries to serialize. Values of type bytes are
            converted to text; everything else must be JSON-serializable.
        _cper_data: Unused, retained for API symmetry.
        _device_handle: Unused, retained for API symmetry.

    Returns:
        str: The JSON representation of the CPER entries, or an empty string on failure.
    """
    def _json_default(value):
        # json.dumps only calls this hook for objects it cannot encode natively.
        if isinstance(value, bytes):
            # Binary payloads are not guaranteed to be valid UTF-8; keep the
            # undecodable bytes visible as \xNN escapes instead of failing
            # the whole dump on a single bad byte.
            return value.decode("utf-8", errors="backslashreplace")
        # Returning the object unchanged would make the encoder report a
        # misleading "Circular reference detected"; raising TypeError is the
        # documented contract for unsupported objects.
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    try:
        entries_json = json.dumps(entries, indent=2, default=_json_default)
        print(entries_json)
        return entries_json
    except Exception as e:
        # Deliberately best-effort: a failed dump is logged and reported to
        # the caller as an empty string rather than raised.
        logging.debug("Failed to serialize CPER entries as JSON: %s", e)
        return ""

def write_binary(self, data, size, filepath):
"""
Expand Down Expand Up @@ -1652,7 +1682,7 @@ def binary_to_hexdump_string(self, data: Union[bytes, List[int]]) -> str:

return "\n".join(lines)

def pvtDumpAfids(self, cper_file):
def cper_dump_afids(self, cper_file):
# 1) Fetch the CPER “file” and ensure we have raw bytes
raw_data = cper_file
if hasattr(raw_data, "read"):
Expand Down Expand Up @@ -1743,14 +1773,16 @@ def ras_cper(self, args, device_handle, logger, gpu_idx):

buffer_size = 1048576

gpu_id = self.get_gpu_id_from_device_handle(device_handle)
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
self._cper_follow_prompted = True

primary_partition = self.is_primary_partition(device_handle, gpu_id)
if not primary_partition:
return
if args.decode and args.cper_file:
device_handle = args.cper_file
else:
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
if args.follow and not getattr(self, "_cper_follow_prompted", False):
print("Press CTRL + C to stop.")
self._cper_follow_prompted = True
primary_partition = self.is_primary_partition(device_handle, gpu_id)
if not primary_partition:
return

if args.folder and not getattr(self, "_cper_folder_prompted", False):
self._cper_folder_prompted = True
Expand All @@ -1759,6 +1791,7 @@ def ras_cper(self, args, device_handle, logger, gpu_idx):
self.stop = False

num_entries = 0
entries = {}
while True:
try:
entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
Expand All @@ -1779,7 +1812,15 @@ def ras_cper(self, args, device_handle, logger, gpu_idx):
args.cursor[gpu_idx] = new_cursor
if len(entries) == 0:
break
if args.folder:
if args.decode and args.cper_file:
if args.json:
self.dump_cper_entries_as_json(entries, cper_data, device_handle)
elif args.folder:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
with tempfile.TemporaryDirectory() as tmp_dir:
self.dump_cper_entries(tmp_dir, entries, cper_data, device_handle, args.file_limit, os.path.basename(args.cper_file))
elif args.folder:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.display_cper_files_generated(entries, device_handle, args.folder)
Expand Down
2 changes: 2 additions & 0 deletions amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1534,6 +1534,7 @@ def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func):
# Help text for RAS arguments
cper_help = "Trigger current CPER data retrieval"
afid_help = "Generate an AFID (AMD Field ID) given a CPER record file"
decode_help = "Decode out-of-band CPER files captured by or collected from other systems"
severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"]
severity_choices_str = ", ".join(severity_choices)
severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}"
Expand Down Expand Up @@ -1562,6 +1563,7 @@ def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func):
# AFID Arguments
afid_group = ras_parser.add_argument_group("AFID Arguments")
afid_group.add_argument("--cper-file", action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help)
afid_group.add_argument("--decode", action="store_true", help=decode_help)

# Add common modifiers and device selection arguments.
self._add_device_arguments(ras_parser, required=False)
Expand Down
5 changes: 5 additions & 0 deletions cmake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash -xe
# Configure the project with CMake, using the parent of the current working
# directory as the source tree.
#
# Tracing (-x) and exit-on-error (-e) are also enabled explicitly below so they
# still apply when the script is started as `bash cmake.sh`, which ignores the
# shebang flags.
set -xe
cmake \
    -DCMAKE_BUILD_TYPE=Debug \
    -DCMAKE_INSTALL_PREFIX=/opt/rocm-6.5.0 \
    -DBUILD_TESTS=ON \
    ..
# Optional flag, currently not passed to cmake: -DBUILD_WRAPPER=ON

13 changes: 9 additions & 4 deletions py-interface/amdsmi_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2713,15 +2713,20 @@ def notifyTypeToString(notify_type_b):
return "".join(guid[::-1])

def amdsmi_get_gpu_cper_entries(
processor_handle: processor_handle_t,
device_handle: amdsmi_wrapper.amdsmi_processor_handle | Path,
# processor_handle: Union[amdsmi_wrapper.amdsmi_processor_handle, str],
severity_mask: int,
buffer_size: int = 4 * 1048576,
cursor: int = 0
) -> Tuple[Dict[str, Any], int, List[Dict[str, Any]], int]:

if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
if isinstance(device_handle, Path):
if not os.path.isfile(device_handle):
raise AmdSmiParameterException(device_handle, str)
device_handle = ctypes.c_char_p(str(device_handle).encode("utf-8"))
elif not isinstance(device_handle, amdsmi_wrapper.amdsmi_processor_handle):
raise AmdSmiParameterException(
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
device_handle, amdsmi_wrapper.amdsmi_processor_handle
)

# Allocate a buffer for CPER data.
Expand All @@ -2737,7 +2742,7 @@ def amdsmi_get_gpu_cper_entries(

# Call the underlying AMD-SMI API.
status_code = amdsmi_wrapper.amdsmi_get_gpu_cper_entries(
processor_handle,
device_handle,
ctypes.c_uint32(severity_mask),
buf,
ctypes.byref(buf_size),
Expand Down
10 changes: 8 additions & 2 deletions src/amd_smi/amd_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4404,6 +4404,12 @@ amdsmi_get_gpu_cper_entries(
uint64_t *entry_count,
uint64_t *cursor) {

std::string path;
if(amd::smi::FileExists(static_cast<char const *>(processor_handle))) {
path = std::string(static_cast<char const *>(processor_handle));
}
else {

AMDSMI_CHECK_INIT();
if (!amd::smi::is_sudo_user()) {
return AMDSMI_STATUS_NO_PERM;
Expand All @@ -4414,10 +4420,10 @@ amdsmi_get_gpu_cper_entries(
if (status != AMDSMI_STATUS_SUCCESS) {
return status;
}

std::string path = std::string("/sys/kernel/debug/dri/") +
path = std::string("/sys/kernel/debug/dri/") +
std::to_string(gpu_device->get_card_id()) +
"/amdgpu_ring_cper";
}

return amdsmi_get_gpu_cper_entries_by_path(
path.c_str(),
Expand Down
Loading