fix: route archive inspection through byte-safe boundaries

This commit is contained in:
2026-06-30 01:54:57 +04:00
parent d0bc7f2f26
commit 7337492c30
3 changed files with 216 additions and 78 deletions
+80 -68
View File
@@ -35,6 +35,8 @@ use std::collections::{BTreeMap, BTreeSet};
use std::fmt;
use std::fs;
use std::io::Write;
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
@@ -69,6 +71,8 @@ pub struct DiscoverOptions {
pub struct ManifestEntry {
/// Normalized relative path.
pub path: String,
/// Byte-exact relative host path used for reopening corpus files.
pub host_rel_path: PathBuf,
/// File size in bytes.
pub size: u64,
/// SHA-256 content fingerprint.
@@ -188,7 +192,7 @@ pub fn discover(root: &Path, options: DiscoverOptions) -> Result<CorpusManifest,
}
let mut files = Vec::new();
walk(root, root, options, &mut files)?;
files.sort_by(|a, b| a.path.cmp(&b.path));
files.sort_by(|a, b| a.host_rel_path.cmp(&b.host_rel_path));
let kind = classify(root, &files);
let casefold_collisions = detect_casefold_collisions(&files);
@@ -243,17 +247,22 @@ fn walk(
let rel = path
.strip_prefix(root)
.map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?;
let rel_text = rel
#[cfg(unix)]
let rel_bytes = rel.as_os_str().as_bytes();
#[cfg(not(unix))]
let rel_bytes = rel
.to_str()
.ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?;
let normalized = normalize_relative(rel_text.as_bytes(), PathPolicy::HostCompatible)
.map_err(|_| CorpusError::InvalidPath(rel_text.to_string()))?;
.ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?
.as_bytes();
let normalized = normalize_relative(rel_bytes, PathPolicy::HostCompatible)
.map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?;
let bytes = fs::read(&path).map_err(|source| CorpusError::Io {
path: path.clone(),
source,
})?;
out.push(ManifestEntry {
path: normalized.as_str().to_string(),
path: normalized.display_lossy().to_string(),
host_rel_path: rel.to_path_buf(),
size: metadata.len(),
hash: sha256(&bytes),
});
@@ -285,7 +294,7 @@ fn detect_casefold_collisions(files: &[ManifestEntry]) -> Vec<Vec<String>> {
let mut grouped: BTreeMap<Vec<u8>, BTreeSet<String>> = BTreeMap::new();
for file in files {
grouped
.entry(ascii_lookup_key(file.path.as_bytes()).0)
.entry(ascii_lookup_key(path_identity_bytes(&file.host_rel_path)).0)
.or_default()
.insert(file.path.clone());
}
@@ -353,7 +362,7 @@ fn inspect_report_file(
) -> CorpusFileRecord {
let lower = entry.path.to_ascii_lowercase();
let mut variant = inspect_path_metrics(&lower, metrics);
let path = root.join(&entry.path);
let path = root.join(&entry.host_rel_path);
let bytes = match fs::read(&path) {
Ok(bytes) => bytes,
Err(source) => {
@@ -439,6 +448,17 @@ fn inspect_report_file(
}
}
fn path_identity_bytes(path: &Path) -> &[u8] {
#[cfg(unix)]
{
path.as_os_str().as_bytes()
}
#[cfg(not(unix))]
{
path.to_str().unwrap_or_default().as_bytes()
}
}
fn inspect_path_metrics(lower: &str, metrics: &mut BTreeMap<String, u64>) -> String {
let mut variant = "file";
if lower.ends_with("data.tma") {
@@ -767,11 +787,7 @@ mod tests {
fn report_json_contains_metrics_and_hashes_not_paths_or_payloads() {
let manifest = CorpusManifest {
kind: CorpusKind::Part1,
files: vec![ManifestEntry {
path: "secret/payload.bin".to_string(),
size: 4,
hash: sha256(b"DATA"),
}],
files: vec![manifest_entry("secret/payload.bin", 4, sha256(b"DATA"))],
casefold_collisions: Vec::new(),
};
let report = report(Path::new("."), &manifest).expect("report");
@@ -791,11 +807,7 @@ mod tests {
let root = temp_dir("report-missing");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "missing.lib".to_string(),
size: 1,
hash: sha256(b"missing"),
}],
files: vec![manifest_entry("missing.lib", 1, sha256(b"missing"))],
casefold_collisions: Vec::new(),
};
@@ -814,11 +826,7 @@ mod tests {
fs::write(root.join("bad.lib"), b"NRes").expect("bad nres");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "bad.lib".to_string(),
size: 4,
hash: sha256(b"NRes"),
}],
files: vec![manifest_entry("bad.lib", 4, sha256(b"NRes"))],
casefold_collisions: Vec::new(),
};
@@ -857,11 +865,11 @@ mod tests {
fs::write(root.join("archive.lib"), &archive).expect("archive");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "archive.lib".to_string(),
size: u64::try_from(archive.len()).expect("archive size"),
hash: sha256(&archive),
}],
files: vec![manifest_entry(
"archive.lib",
u64::try_from(archive.len()).expect("archive size"),
sha256(&archive),
)],
casefold_collisions: Vec::new(),
};
@@ -886,11 +894,7 @@ mod tests {
fs::write(root.join("WORLD/MAP/land.map"), build_nres(&[])).expect("land map");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "WORLD/MAP/land.map".to_string(),
size: 16,
hash: sha256(b"land.map"),
}],
files: vec![manifest_entry("WORLD/MAP/land.map", 16, sha256(b"land.map"))],
casefold_collisions: Vec::new(),
};
@@ -909,11 +913,7 @@ mod tests {
fs::write(root.join("WORLD/MAP/land.msh"), build_nres(&[])).expect("land msh");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "WORLD/MAP/land.msh".to_string(),
size: 16,
hash: sha256(b"land.msh"),
}],
files: vec![manifest_entry("WORLD/MAP/land.msh", 16, sha256(b"land.msh"))],
casefold_collisions: Vec::new(),
};
@@ -932,11 +932,11 @@ mod tests {
fs::write(root.join("MISSIONS/test/data.tma"), b"malformed tma").expect("tma");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "MISSIONS/test/data.tma".to_string(),
size: 12,
hash: sha256(b"malformed tma"),
}],
files: vec![manifest_entry(
"MISSIONS/test/data.tma",
12,
sha256(b"malformed tma"),
)],
casefold_collisions: Vec::new(),
};
@@ -955,11 +955,7 @@ mod tests {
fs::write(root.join("units/unit.dat"), vec![0u8; 120]).expect("unit");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "units/unit.dat".to_string(),
size: 120,
hash: sha256(&[0u8; 120]),
}],
files: vec![manifest_entry("units/unit.dat", 120, sha256(&[0u8; 120]))],
casefold_collisions: Vec::new(),
};
@@ -977,11 +973,7 @@ mod tests {
fs::write(root.join("patch.nl"), b"NL malformed").expect("rsli");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "patch.nl".to_string(),
size: 12,
hash: sha256(b"NL malformed"),
}],
files: vec![manifest_entry("patch.nl", 12, sha256(b"NL malformed"))],
casefold_collisions: Vec::new(),
};
@@ -1052,16 +1044,8 @@ mod tests {
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![
ManifestEntry {
path: "Textures/Foo.TEX".to_string(),
size: 1,
hash: sha256(b"first"),
},
ManifestEntry {
path: "textures/foo.tex".to_string(),
size: 1,
hash: sha256(b"second"),
},
manifest_entry("Textures/Foo.TEX", 1, sha256(b"first")),
manifest_entry("textures/foo.tex", 1, sha256(b"second")),
],
casefold_collisions: Vec::new(),
};
@@ -1081,11 +1065,7 @@ mod tests {
fn fingerprint_changes() {
let mut manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "a".to_string(),
size: 1,
hash: sha256(b"before"),
}],
files: vec![manifest_entry("a", 1, sha256(b"before"))],
casefold_collisions: Vec::new(),
};
let a = fingerprint(&manifest);
@@ -1118,6 +1098,29 @@ mod tests {
let _ = fs::remove_file(tmp);
}
#[cfg(unix)]
#[test]
fn discover_supports_non_utf8_host_paths() {
use std::ffi::OsString;
use std::os::unix::ffi::OsStringExt;
let root = temp_dir("non-utf8");
let file_name = OsString::from_vec(vec![0xFF, b'.', b'b', b'i', b'n']);
let file_path = root.join(&file_name);
if let Err(err) = fs::write(&file_path, b"raw") {
assert_eq!(err.kind(), std::io::ErrorKind::PermissionDenied);
let _ = fs::remove_dir_all(root);
return;
}
let manifest = discover(&root, DiscoverOptions::default()).expect("manifest");
assert_eq!(manifest.files.len(), 1);
assert_eq!(manifest.files[0].path, "\u{FFFD}.bin");
assert_eq!(manifest.files[0].host_rel_path, PathBuf::from(&file_name));
let _ = fs::remove_dir_all(root);
}
struct TestNresEntry<'a> {
name: &'a str,
type_id: u32,
@@ -1164,6 +1167,15 @@ mod tests {
out
}
fn manifest_entry(path: &str, size: u64, hash: Sha256Digest) -> ManifestEntry {
ManifestEntry {
path: path.to_string(),
host_rel_path: PathBuf::from(path),
size,
hash,
}
}
fn push_u32(out: &mut Vec<u8>, value: u32) {
out.extend_from_slice(&value.to_le_bytes());
}