Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/diffguard-analytics/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ pub struct TrendRun {
/// Stored as `u64` to avoid silent truncation for very large repositories
/// (those with more than 2^32 - 1 unique files).
pub files_scanned: u64,
pub lines_scanned: u32,
pub lines_scanned: u64,
pub findings: u32,
}

Expand Down
24 changes: 1 addition & 23 deletions crates/diffguard-core/src/render.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use diffguard_types::{
CheckReceipt, Finding, REASON_GIT_UNAVAILABLE, REASON_MISSING_BASE, REASON_NO_DIFF_INPUT,
REASON_TOOL_ERROR, REASON_TRUNCATED, VerdictStatus,
REASON_TOOL_ERROR, REASON_TRUNCATED, VerdictStatus, escape_md,
};

/// Reasons that are meaningful to render in markdown output.
Expand Down Expand Up @@ -114,28 +114,6 @@ fn render_finding_row(f: &Finding) -> String {
)
}

/// Makes arbitrary text safe to embed in a Markdown table cell.
///
/// Each of `|`, `` ` ``, `#`, `*`, `_`, `[`, `]` and `>` is emitted with a
/// leading backslash so it can neither terminate the cell nor trigger inline
/// formatting. A carriage return becomes the two-character text `\r` and a
/// line feed becomes the two-character text `\n`, so the cell always stays on
/// a single physical line. Every other character, including a backslash
/// already present in the input, is copied through unchanged.
fn escape_md(s: &str) -> String {
    // Single pass over the input; the result is at least as long as `s`.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '|' | '`' | '#' | '*' | '_' | '[' | ']' | '>' => {
                escaped.push('\\');
                escaped.push(ch);
            }
            '\r' => escaped.push_str("\\r"),
            '\n' => escaped.push_str("\\n"),
            other => escaped.push(other),
        }
    }
    escaped
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"dir\\ name\\\"quote\\\"\\\\tab\\tnewline\\ncarriage\\rend\"#)"
---
dir name"quote"\tab newline
carriageend
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"octal\\141\\040space\"#)"
---
octala space
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"hello\\041world\"#)"
---
hello!world
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(\"\")"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\77\"#)"
---
?
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\377\"#)"
---
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\001\\002\\003\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(\"plain_string_no_escapes\")"
---
plain_string_no_escapes
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\9\"#)"
---
\9
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\8\"#)"
---
\8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\1\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\2\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\3\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\4\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\5\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\6\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\7\"#)"
---

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\177\"#)"
---

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"endswith\\\"#)"
---
endswith\
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\07\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\10\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\12\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\15\"#)"
---

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\77\"#)"
---
?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\q1\"#)"
---
\q1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: crates/diffguard-diff/src/unified.rs
expression: "unescape_git_path(r#\"\\q\"#)"
---
\q
147 changes: 147 additions & 0 deletions crates/diffguard-diff/tests/integration_octal_escaped_paths.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
//! Integration tests for parsing diffs with octal-escaped paths.
//!
//! These tests exercise the full pipeline: raw diff text with quoted paths
//! containing octal escape sequences → parse_unified_diff → DiffLine with
//! correctly unescaped paths.
//!
//! The change being tested: replacing `u8 as u32` with `u32::from(u8)` in
//! unescape_git_path's octal parsing branch. This is a lossless widening cast
//! that doesn't affect behavior, but these tests verify the full integration
//! path still works correctly.
Comment on lines +7 to +10
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Remove implementation-detail commentary from integration test documentation.

These lines describe an internal casting change (`u8 as u32` → `u32::from(u8)`) in `unescape_git_path`. Integration tests should document the behavior being tested, not internal implementation details. This commentary creates confusion about the test suite's purpose.

♻️ Suggested revision
-//! The change being tested: replacing `u8 as u32` with `u32::from(u8)` in
-//! unescape_git_path's octal parsing branch. This is a lossless widening cast
-//! that doesn't affect behavior, but these tests verify the full integration
-//! path still works correctly.
+//! These tests verify that the parser correctly handles all valid octal escape
+//! sequences, including edge cases like boundary values and non-printable characters.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
//! The change being tested: replacing `u8 as u32` with `u32::from(u8)` in
//! unescape_git_path's octal parsing branch. This is a lossless widening cast
//! that doesn't affect behavior, but these tests verify the full integration
//! path still works correctly.
//! These tests verify that the parser correctly handles all valid octal escape
//! sequences, including edge cases like boundary values and non-printable characters.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/diffguard-diff/tests/integration_octal_escaped_paths.rs` around lines
7 - 10, The test documentation in integration_octal_escaped_paths.rs currently
references an internal casting change in unescape_git_path (u8 as u32 →
u32::from(u8)); remove that implementation-detail commentary and replace it with
a behavior-focused sentence describing what the test validates (e.g., that
octal-escaped path decoding in unescape_git_path correctly unescapes octal
sequences and preserves characters). Keep the test doc concise and high-level,
mention the function under test (unescape_git_path) and the behavior (correct
octal parsing/unescaping), but do not reference specific casts or internal
refactorings.


use diffguard_diff::parse_unified_diff;
use diffguard_types::Scope;

/// Test: Full pipeline with a path containing an octal-escaped space.
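/// The quoted `a/` path spells the space as the octal escape `\040`, while the
/// quoted `b/` path keeps a literal space. Real Git output does not
/// octal-escape plain spaces, so this header is a deliberate mixed-format edge case.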
///
/// Flow: diff text with quoted path → parse_diff_git_line → tokenize_git_paths
/// → unquote_git_token → unescape_git_path → unescaped path in DiffLine
#[test]
fn test_parse_diff_with_octal_escaped_space_in_path() {
// The quoted `a/` path below uses an octal escape where the `b/` path has a literal space
// "\040" is octal for space (ASCII 32)
let diff = r#"
diff --git "a/path\040with spaces/file.rs" "b/path with spaces/file.rs"
--- "a/path\040with spaces/file.rs"
+++ "b/path with spaces/file.rs"
@@ -1 +1,2 @@
fn existing() {}
+fn added() {}
"#;
Comment on lines +24 to +31
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Description: Verify Git's actual escaping behavior for paths with spaces

# Create a temporary test repo
temp_dir=$(mktemp -d)
cd "$temp_dir"
git init -q

# Create a file with spaces in the path
mkdir -p "path with spaces"
echo "original" > "path with spaces/file.txt"
git add .
git commit -q -m "initial"

# Modify the file to generate a diff
echo "modified" >> "path with spaces/file.txt"

# Show how Git formats the diff header
echo "=== Git diff header format for path with spaces ==="
git diff HEAD | grep -A2 "^diff --git"

# Cleanup
cd -
rm -rf "$temp_dir"

Repository: EffortlessMetrics/diffguard

Length of output: 739


🏁 Script executed:

#!/bin/bash
# Description: Verify Git's actual escaping behavior for paths with spaces (corrected)

# Configure git first
git config --global user.email "test@example.com"
git config --global user.name "Test User"

# Create a temporary test repo
temp_dir=$(mktemp -d)
cd "$temp_dir"
git init -q

# Create a file with spaces in the path
mkdir -p "path with spaces"
echo "original" > "path with spaces/file.txt"
git add .
git commit -q -m "initial"

# Modify the file to generate a diff
echo "modified" >> "path with spaces/file.txt"

# Show how Git formats the diff header
echo "=== Git diff header format for path with spaces ==="
git diff HEAD | head -6

# Cleanup
cd -
rm -rf "$temp_dir"

Repository: EffortlessMetrics/diffguard

Length of output: 407


🏁 Script executed:

# Examine the actual test file
head -50 crates/diffguard-diff/tests/integration_octal_escaped_paths.rs

Repository: EffortlessMetrics/diffguard

Length of output: 1970


🏁 Script executed:

# Check the full test file for context
cat crates/diffguard-diff/tests/integration_octal_escaped_paths.rs

Repository: EffortlessMetrics/diffguard

Length of output: 4645


🏁 Script executed:

# Also check if there are other diff-related tests to understand testing patterns
fd -t f "test" crates/diffguard-diff/tests/ | head -10

Repository: EffortlessMetrics/diffguard

Length of output: 53


Correct the misleading comment about Git's escaping behavior.

The inline comment at line 23 incorrectly states "Git outputs paths with spaces as \040". In reality, when Git generates a unified diff for a file with spaces in its path, it outputs literal spaces without octal escaping:

diff --git a/path with spaces/file.txt b/path with spaces/file.txt

The test itself appears designed to verify the parser's robustness in handling mixed escaping formats (a/ path escaped, b/ path literal). If that's the intent, add a comment clarifying this is intentional edge-case testing rather than realistic Git output. Otherwise, update the test data to match actual Git behavior.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/diffguard-diff/tests/integration_octal_escaped_paths.rs` around lines
24 - 31, The inline comment in integration_octal_escaped_paths.rs that claims
"Git outputs paths with spaces as `\040`" is misleading; update the comment
above the diff variable to state that real Git unified diffs use literal spaces
and that this test intentionally mixes an escaped a/ path and a literal b/ path
to exercise the parser's robustness (or alternatively, if you prefer realistic
data, change the test diff string in the diff variable so both file paths use
literal spaces consistently). Ensure the updated comment clearly marks the
mixed-escaping as an intentional edge-case test and keep references to the diff
variable when editing.


let (lines, stats) = parse_unified_diff(diff, Scope::Added).unwrap();

// The path should be unescaped: \040 → ' '
assert_eq!(lines.len(), 1);
assert_eq!(lines[0].path, "path with spaces/file.rs");
assert_eq!(lines[0].content, "fn added() {}");
assert_eq!(stats.files, 1);
assert_eq!(stats.lines, 1);
}

/// Test: Full pipeline with several octal escapes embedded in one path.
///
/// The quoted `a/` path spells two characters as octal escapes, while the
/// quoted `b/` path contains the same characters literally:
///
/// \041 = '!' (ASCII 33)
/// \040 = ' ' (ASCII 32)
///
/// The assertion expects the fully decoded path on the first returned line.
#[test]
fn test_parse_diff_with_multiple_octal_escapes_in_path() {
let diff = r#"
diff --git "a/file\041name\040here.rs" "b/file!name here.rs"
--- "a/file\041name\040here.rs"
+++ "b/file!name here.rs"
@@ -1 +1,2 @@
fn existing() {}
+fn added() {}
"#;

let (lines, _stats) = parse_unified_diff(diff, Scope::Added).unwrap();

// Both escapes must be decoded: `\041` → '!' and `\040` → ' '
assert_eq!(lines[0].path, "file!name here.rs");
}

/// Test: Octal escape as the very first character of the file name.
///
/// \143 = 'c' (ASCII 99) - octal for lowercase 'c'
///
/// NOTE(review): only the leading position is exercised by this fixture;
/// escapes in the middle or at the end of a path are not covered here.
#[test]
fn test_parse_diff_with_octal_escape_at_path_boundaries() {
// \143 = 'c', so the quoted `a/` path "\143at.rs" decodes to "cat.rs"
let diff = r#"
diff --git "a/\143at.rs" "b/cat.rs"
--- "a/\143at.rs"
+++ "b/cat.rs"
@@ -1 +1,2 @@
fn existing() {}
+fn added() {}
"#;

let (lines, _stats) = parse_unified_diff(diff, Scope::Added).unwrap();
// The reported path is the decoded name
assert_eq!(lines[0].path, "cat.rs");
}

/// Test: Three-digit octal escapes at the two ends of the 7-bit ASCII range.
/// \177 = DEL (ASCII 127), \000 = NUL (ASCII 0)
#[test]
fn test_parse_diff_with_octal_edge_cases() {
// \177 = 127 (DEL), \000 = 0 (NUL)
// Edge case: the highest and lowest 7-bit ASCII values, both non-printable
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Remove implementation-detail reference from test comment.

The comment "These are boundary cases for the u8→u32 cast" references internal implementation details. Integration tests should focus on the behavior being tested (edge octal values), not on internal casting operations.

♻️ Suggested revision
-    // \177 = 127 (DEL), \000 = 0 (NUL)
-    // These are boundary cases for the u8→u32 cast
+    // \177 = 127 (DEL), \000 = 0 (NUL)
+    // Edge case: octal values at the boundaries of the valid byte range
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// These are boundary cases for the u8→u32 cast
// \177 = 127 (DEL), \000 = 0 (NUL)
// Edge case: octal values at the boundaries of the valid byte range
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/diffguard-diff/tests/integration_octal_escaped_paths.rs` at line 89,
Replace the implementation-detail comment "These are boundary cases for the
u8→u32 cast" in the integration_octal_escaped_paths test with a behavior-focused
description (e.g., "Boundary cases for octal-escaped byte values" or "Edge octal
values") so the test documents expected behavior rather than internal casting;
locate the comment by searching for the exact string "These are boundary cases
for the u8→u32 cast" in integration_octal_escaped_paths.rs and update it
accordingly.

let diff = r#"
diff --git "a/\177\000file.rs" "b/\177\000file.rs"
--- "a/\177\000file.rs"
+++ "b/\177\000file.rs"
@@ -1 +1,2 @@
fn existing() {}
+fn added() {}
"#;

let (lines, _stats) = parse_unified_diff(diff, Scope::Added).unwrap();
// The path should contain the raw bytes (non-printable but valid)
assert_eq!(lines[0].path, "\x7F\x00file.rs");
}

/// Test: Renamed file whose quoted header paths carry octal escapes.
///
/// In this fixture the `diff --git`, `---` and `+++` lines quote both names
/// and spell the space as `\040`, while the `rename from` / `rename to` lines
/// use unquoted literal spaces. The assertion expects the decoded new name.
#[test]
fn test_parse_diff_rename_with_octal_escaped_path() {
let diff = r#"
diff --git "a/old\040name.rs" "b/new\040name.rs"
rename from old name.rs
rename to new name.rs
--- "a/old\040name.rs"
+++ "b/new\040name.rs"
@@ -1 +1,2 @@
fn existing() {}
+fn added() {}
"#;

let (lines, _stats) = parse_unified_diff(diff, Scope::Added).unwrap();
// The reported path is the new name with `\040` decoded to a space
assert_eq!(lines[0].path, "new name.rs");
}

/// Test: Multiple files with mixed quoted/unquoted paths.
///
/// The first file section uses quoted paths (the `a/` side with a `\040`
/// escape, the `b/` side with a literal space); the second uses plain
/// unquoted paths. Each section contributes one added line, and the test
/// checks that both paths come back in order and that the stats count two
/// files and two lines.
#[test]
fn test_parse_diff_multiple_files_mixed_path_formats() {
let diff = r#"
diff --git "a/quoted\040path.rs" "b/quoted path.rs"
--- "a/quoted\040path.rs"
+++ "b/quoted path.rs"
@@ -1 +1,2 @@
+added to quoted path
diff --git a/normal_path.rs b/normal_path.rs
--- a/normal_path.rs
+++ b/normal_path.rs
@@ -1 +1,2 @@
+added to normal path
"#;

let (lines, stats) = parse_unified_diff(diff, Scope::Added).unwrap();

// One added line per file, returned in the order the files appear in the diff
assert_eq!(lines.len(), 2);
assert_eq!(lines[0].path, "quoted path.rs");
assert_eq!(lines[1].path, "normal_path.rs");
// Stats reflect both file sections and both added lines
assert_eq!(stats.files, 2);
assert_eq!(stats.lines, 2);
}
4 changes: 2 additions & 2 deletions crates/diffguard-domain/src/evaluate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub struct Evaluation {
/// The previous `u32` cast would silently truncate, producing incorrect
/// (often zero) counts for very large codebases.
pub files_scanned: u64,
pub lines_scanned: u32,
pub lines_scanned: u64,
/// Aggregated per-rule hit counts (deterministically sorted by rule ID).
pub rule_hits: Vec<RuleHitStat>,
}
Expand Down Expand Up @@ -102,7 +102,7 @@ pub fn evaluate_lines_with_overrides_and_language(
.iter()
.map(|line| line.path.clone())
.collect::<BTreeSet<_>>();
let lines_scanned = u32::try_from(input_lines.len()).unwrap_or(u32::MAX);
let lines_scanned = u64::try_from(input_lines.len()).unwrap_or(u64::MAX);

let mut current_file: Option<String> = None;
let mut current_lang = Language::Unknown;
Expand Down
2 changes: 2 additions & 0 deletions crates/diffguard-domain/src/preprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,15 @@ pub struct PreprocessOptions {
}

impl PreprocessOptions {
#[must_use]
pub fn none() -> Self {
Self {
mask_comments: false,
mask_strings: false,
}
}

#[must_use]
pub fn comments_only() -> Self {
Self {
mask_comments: true,
Expand Down
Loading
Loading