diff --git a/symbolic-debuginfo/src/dwarf.rs b/symbolic-debuginfo/src/dwarf.rs index 6de54b5b0..35f755971 100644 --- a/symbolic-debuginfo/src/dwarf.rs +++ b/symbolic-debuginfo/src/dwarf.rs @@ -413,6 +413,52 @@ impl<'d> UnitRef<'d, '_> { self.unit.header.offset() } + /// Returns the source language declared in the root DIE of this compilation unit. + fn language(&self) -> Result, DwarfError> { + let mut entries = self.unit.entries(); + let Some((_, root_entry)) = entries.next_dfs()? else { + return Ok(None); + }; + let Some(AttributeValue::Language(lang)) = + root_entry.attr_value(constants::DW_AT_language)? + else { + return Ok(None); + }; + Ok(Some(language_from_dwarf(lang))) + } + + /// Maximum recursion depth for following `DW_AT_abstract_origin` chains, matching the limit + /// used by elfutils `dwarf_attr_integrate`. + const MAX_ABSTRACT_ORIGIN_DEPTH: u8 = 16; + + /// Resolves the source language for a DIE by following `DW_AT_abstract_origin` chains, + /// including across compilation unit boundaries. `depth` limits recursion to guard against + /// cycles or malformed DWARF. + fn resolve_entry_language( + &self, + entry: &Die<'d, '_>, + depth: u8, + ) -> Result, DwarfError> { + if depth == 0 { + return Ok(None); + } + if let Ok(Some(attr)) = entry.attr(constants::DW_AT_abstract_origin) { + return self.resolve_reference(attr, |ref_unit, ref_entry| { + // Recurse first to follow deeper chains. + if let Some(lang) = ref_unit.resolve_entry_language(ref_entry, depth - 1)? { + return Ok(Some(lang)); + } + // No deeper reference: use the CU language if this is a cross-unit ref. + if self.offset() != ref_unit.offset() { + ref_unit.language() + } else { + Ok(None) + } + }); + } + Ok(None) + } + /// Resolves the function name of a debug entry. fn resolve_function_name( &self, @@ -453,7 +499,7 @@ impl<'d> UnitRef<'d, '_> { if let Some(attr) = reference_target { return self.resolve_reference(attr, |ref_unit, ref_entry| { - // Self-references may have a layer of indircetion. Avoid infinite recursion + // Self-references may have a layer of indirection. Avoid infinite recursion // in this scenario. if let Some(prior) = prior_offset { if self.offset() == ref_unit.offset() && prior == ref_entry.offset() { @@ -719,18 +765,29 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { } /// Resolves the name of a function from the symbol table. - fn resolve_symbol_name(&self, address: u64) -> Option> { + fn resolve_symbol_name(&self, address: u64, language: Language) -> Option> { let symbol = self.inner.info.symbol_map.lookup_exact(address)?; let name = resolve_cow_name(self.bcsymbolmap, symbol.name.clone()?); - Some(Name::new(name, NameMangling::Mangled, self.language)) + Some(Name::new(name, NameMangling::Mangled, language)) } - /// Resolves the name of a function from DWARF debug information. - fn resolve_dwarf_name(&self, entry: &Die<'d, '_>) -> Option> { + /// Resolves the source language for a function by following `DW_AT_abstract_origin` to the + /// origin compilation unit when crossing unit boundaries. + /// + /// With LTO, the linker may create artificial compilation units whose `DW_AT_language` + /// does not reflect the original source language (e.g., a C++ CU containing functions + /// originally written in C). When such a CU's subprogram carries a cross-unit + /// `DW_AT_abstract_origin`, the referenced CU's language is more authoritative. + fn resolve_function_language( + &self, + entry: &Die<'d, '_>, + fallback_language: Language, + ) -> Language { self.inner - .resolve_function_name(entry, self.language, self.bcsymbolmap, None) + .resolve_entry_language(entry, UnitRef::MAX_ABSTRACT_ORIGIN_DEPTH) .ok() .flatten() + .unwrap_or(fallback_language) } /// Parses any DW_TAG_subprogram DIEs in the DIE subtree. @@ -814,17 +871,29 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { // // XXX: Maybe we should actually parse the ranges in the resolve function and always // look at the symbol table based on the start of the DIE range. + + let entry = self.inner.unit.entry(dw_die_offset)?; + // With LTO the current CU may be an artificial unit with an incorrect language. Follow + // DW_AT_abstract_origin cross-unit to find the true source language. The resolved + // language is also propagated to all inlinees of this function. + let language = self.resolve_function_language(&entry, self.language); + let symbol_name = if self.prefer_dwarf_names { None } else { let first_range_begin = ranges.iter().map(|range| range.begin).min().unwrap(); let function_address = offset(first_range_begin, self.inner.info.address_offset); - self.resolve_symbol_name(function_address) + self.resolve_symbol_name(function_address, language) }; let name = symbol_name - .or_else(|| self.resolve_dwarf_name(&self.inner.unit.entry(dw_die_offset).unwrap())) - .unwrap_or_else(|| Name::new("", NameMangling::Unmangled, self.language)); + .or_else(|| { + self.inner + .resolve_function_name(&entry, language, self.bcsymbolmap, None) + .ok() + .flatten() + }) + .unwrap_or_else(|| Name::new("", NameMangling::Unmangled, language)); // Create one function per range. In the common case there is only one range, so // we usually only have one function builder here. @@ -840,7 +909,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { }) .collect(); - self.parse_function_children(depth, 0, entries, &mut builders, output)?; + self.parse_function_children(depth, 0, entries, &mut builders, output, language)?; if let Some(line_program) = &self.line_program { for (range, builder) in &mut builders { @@ -869,6 +938,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { entries: &mut EntriesRaw<'d, '_>, builders: &mut [(Range, FunctionBuilder<'d>)], output: &mut FunctionsOutput<'_, 'd>, + language: Language, ) -> Result<(), DwarfError> { while !entries.is_empty() { let dw_die_offset = entries.next_offset(); @@ -882,6 +952,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { }; match abbrev.tag() { constants::DW_TAG_subprogram => { + // Nested subprograms resolve their own language independently. self.parse_function(dw_die_offset, next_depth, entries, abbrev, output)?; } constants::DW_TAG_inlined_subroutine => { @@ -893,6 +964,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { abbrev, builders, output, + language, )?; } _ => { @@ -921,6 +993,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { abbrev: &gimli::Abbreviation, builders: &mut [(Range, FunctionBuilder<'d>)], output: &mut FunctionsOutput<'_, 'd>, + language: Language, ) -> Result<(), DwarfError> { let (ranges, call_location) = self.parse_ranges(entries, abbrev, &mut output.range_buf)?; @@ -937,9 +1010,18 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { return self.parse_functions(depth, entries, output); } + let entry = self.inner.unit.entry(dw_die_offset)?; + let language = self.resolve_function_language(&entry, language); + + // Use the language resolved for the enclosing top-level subprogram rather than + // self.language: the inlinee's DW_AT_abstract_origin may resolve to a partial unit + // which carries the wrong language (e.g. a C++ LTO partial unit for C code). let name = self - .resolve_dwarf_name(&self.inner.unit.entry(dw_die_offset).unwrap()) - .unwrap_or_else(|| Name::new("", NameMangling::Unmangled, self.language)); + .inner + .resolve_function_name(&entry, language, self.bcsymbolmap, None) + .ok() + .flatten() + .unwrap_or_else(|| Name::new("", NameMangling::Unmangled, language)); let call_file = call_location .call_file @@ -970,7 +1052,7 @@ impl<'d, 'a> DwarfUnit<'d, 'a> { ); } - self.parse_function_children(depth, inline_depth + 1, entries, builders, output) + self.parse_function_children(depth, inline_depth + 1, entries, builders, output, language) } /// Collects all functions within this compilation unit. diff --git a/symbolic-debuginfo/tests/test_objects.rs b/symbolic-debuginfo/tests/test_objects.rs index a7faed811..174a6fbee 100644 --- a/symbolic-debuginfo/tests/test_objects.rs +++ b/symbolic-debuginfo/tests/test_objects.rs @@ -1,6 +1,6 @@ use std::{ffi::CString, fmt, io::BufWriter}; -use symbolic_common::ByteView; +use symbolic_common::{ByteView, Language}; use symbolic_debuginfo::{ elf::ElfObject, pe::PeObject, FileEntry, Function, LineInfo, Object, SymbolMap, }; @@ -935,3 +935,53 @@ fn test_wasm_line_program() -> Result<(), Error> { Ok(()) } + +fn find_functions_by_name<'a>(functions: &'a [Function<'a>], name: &str) -> Vec<&'a Function<'a>> { + let mut result = Vec::new(); + for f in functions { + if f.name.as_str() == name { + result.push(f); + } + result.extend(find_functions_by_name(&f.inlinees, name)); + } + result +} + +#[test] +fn test_lto_language_detection() -> Result<(), Error> { + // libjemalloc is compiled as C but LTO creates artificial CUs with DW_LANG_C_plus_plus. + // The fix follows DW_AT_abstract_origin cross-unit to find the true source language. + let view = ByteView::open(fixture("linux/libjemalloc.so.debug"))?; + let object = Object::parse(&view)?; + let session = object.debug_session()?; + + let functions: Vec<_> = session.functions().filter_map(|f| f.ok()).collect(); + + for name in &["je_tcache_arena_associate", "malloc_mutex_trylock_final"] { + let matches = find_functions_by_name(&functions, name); + assert!(!matches.is_empty(), "{name} should be found"); + for func in matches { + assert_eq!(func.name.language(), Language::C, "{name} should be C"); + } + } + + Ok(()) +} + +#[test] +fn test_cross_language_lto_inlinee_language() -> Result<(), Error> { + // cross_lang_lto is a Rust binary that inlines a C function (my_add) via LTO. + // The inlinee should be detected as C, not Rust. + let view = ByteView::open(fixture("linux/cross_language_lto.debug"))?; + let object = Object::parse(&view)?; + let session = object.debug_session()?; + + let functions: Vec<_> = session.functions().filter_map(|f| f.ok()).collect(); + let matches = find_functions_by_name(&functions, "my_add"); + assert!(!matches.is_empty(), "my_add should be found as an inlinee"); + for func in matches { + assert_eq!(func.name.language(), Language::C, "my_add should be C"); + } + + Ok(()) +} diff --git a/symbolic-testutils/fixtures/linux/cross_language_lto.debug b/symbolic-testutils/fixtures/linux/cross_language_lto.debug new file mode 100755 index 000000000..b363afe90 Binary files /dev/null and b/symbolic-testutils/fixtures/linux/cross_language_lto.debug differ diff --git a/symbolic-testutils/fixtures/linux/libjemalloc.so.debug b/symbolic-testutils/fixtures/linux/libjemalloc.so.debug new file mode 100755 index 000000000..3dda758f4 Binary files /dev/null and b/symbolic-testutils/fixtures/linux/libjemalloc.so.debug differ