fix: route archive inspection through byte-safe boundaries

This commit is contained in:
2026-06-30 01:54:57 +04:00
parent d0bc7f2f26
commit 7337492c30
3 changed files with 216 additions and 78 deletions
+80 -68
View File
@@ -35,6 +35,8 @@ use std::collections::{BTreeMap, BTreeSet};
use std::fmt;
use std::fs;
use std::io::Write;
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
@@ -69,6 +71,8 @@ pub struct DiscoverOptions {
pub struct ManifestEntry {
/// Normalized relative path.
pub path: String,
/// Byte-exact relative host path used for reopening corpus files.
pub host_rel_path: PathBuf,
/// File size in bytes.
pub size: u64,
/// SHA-256 content fingerprint.
@@ -188,7 +192,7 @@ pub fn discover(root: &Path, options: DiscoverOptions) -> Result<CorpusManifest,
}
let mut files = Vec::new();
walk(root, root, options, &mut files)?;
files.sort_by(|a, b| a.path.cmp(&b.path));
files.sort_by(|a, b| a.host_rel_path.cmp(&b.host_rel_path));
let kind = classify(root, &files);
let casefold_collisions = detect_casefold_collisions(&files);
@@ -243,17 +247,22 @@ fn walk(
let rel = path
.strip_prefix(root)
.map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?;
let rel_text = rel
#[cfg(unix)]
let rel_bytes = rel.as_os_str().as_bytes();
#[cfg(not(unix))]
let rel_bytes = rel
.to_str()
.ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?;
let normalized = normalize_relative(rel_text.as_bytes(), PathPolicy::HostCompatible)
.map_err(|_| CorpusError::InvalidPath(rel_text.to_string()))?;
.ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?
.as_bytes();
let normalized = normalize_relative(rel_bytes, PathPolicy::HostCompatible)
.map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?;
let bytes = fs::read(&path).map_err(|source| CorpusError::Io {
path: path.clone(),
source,
})?;
out.push(ManifestEntry {
path: normalized.as_str().to_string(),
path: normalized.display_lossy().to_string(),
host_rel_path: rel.to_path_buf(),
size: metadata.len(),
hash: sha256(&bytes),
});
@@ -285,7 +294,7 @@ fn detect_casefold_collisions(files: &[ManifestEntry]) -> Vec<Vec<String>> {
let mut grouped: BTreeMap<Vec<u8>, BTreeSet<String>> = BTreeMap::new();
for file in files {
grouped
.entry(ascii_lookup_key(file.path.as_bytes()).0)
.entry(ascii_lookup_key(path_identity_bytes(&file.host_rel_path)).0)
.or_default()
.insert(file.path.clone());
}
@@ -353,7 +362,7 @@ fn inspect_report_file(
) -> CorpusFileRecord {
let lower = entry.path.to_ascii_lowercase();
let mut variant = inspect_path_metrics(&lower, metrics);
let path = root.join(&entry.path);
let path = root.join(&entry.host_rel_path);
let bytes = match fs::read(&path) {
Ok(bytes) => bytes,
Err(source) => {
@@ -439,6 +448,17 @@ fn inspect_report_file(
}
}
/// Returns the byte-exact identity of `path` for collision grouping.
///
/// Uses the platform's self-synchronizing `OsStr` encoding so that no path is
/// ever collapsed to an empty key: on unix these are the raw path bytes, and
/// on other platforms they are a UTF-8 superset in which every valid Unicode
/// path keeps its plain UTF-8 bytes. A fallible `to_str()` conversion would
/// instead map every non-Unicode path to the same empty slice and report
/// unrelated files as colliding.
fn path_identity_bytes(path: &Path) -> &[u8] {
    // NOTE(review): `OsStr::as_encoded_bytes` requires Rust 1.74+ — confirm
    // against the workspace MSRV.
    path.as_os_str().as_encoded_bytes()
}
fn inspect_path_metrics(lower: &str, metrics: &mut BTreeMap<String, u64>) -> String {
let mut variant = "file";
if lower.ends_with("data.tma") {
@@ -767,11 +787,7 @@ mod tests {
fn report_json_contains_metrics_and_hashes_not_paths_or_payloads() {
let manifest = CorpusManifest {
kind: CorpusKind::Part1,
files: vec![ManifestEntry {
path: "secret/payload.bin".to_string(),
size: 4,
hash: sha256(b"DATA"),
}],
files: vec![manifest_entry("secret/payload.bin", 4, sha256(b"DATA"))],
casefold_collisions: Vec::new(),
};
let report = report(Path::new("."), &manifest).expect("report");
@@ -791,11 +807,7 @@ mod tests {
let root = temp_dir("report-missing");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "missing.lib".to_string(),
size: 1,
hash: sha256(b"missing"),
}],
files: vec![manifest_entry("missing.lib", 1, sha256(b"missing"))],
casefold_collisions: Vec::new(),
};
@@ -814,11 +826,7 @@ mod tests {
fs::write(root.join("bad.lib"), b"NRes").expect("bad nres");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "bad.lib".to_string(),
size: 4,
hash: sha256(b"NRes"),
}],
files: vec![manifest_entry("bad.lib", 4, sha256(b"NRes"))],
casefold_collisions: Vec::new(),
};
@@ -857,11 +865,11 @@ mod tests {
fs::write(root.join("archive.lib"), &archive).expect("archive");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "archive.lib".to_string(),
size: u64::try_from(archive.len()).expect("archive size"),
hash: sha256(&archive),
}],
files: vec![manifest_entry(
"archive.lib",
u64::try_from(archive.len()).expect("archive size"),
sha256(&archive),
)],
casefold_collisions: Vec::new(),
};
@@ -886,11 +894,7 @@ mod tests {
fs::write(root.join("WORLD/MAP/land.map"), build_nres(&[])).expect("land map");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "WORLD/MAP/land.map".to_string(),
size: 16,
hash: sha256(b"land.map"),
}],
files: vec![manifest_entry("WORLD/MAP/land.map", 16, sha256(b"land.map"))],
casefold_collisions: Vec::new(),
};
@@ -909,11 +913,7 @@ mod tests {
fs::write(root.join("WORLD/MAP/land.msh"), build_nres(&[])).expect("land msh");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "WORLD/MAP/land.msh".to_string(),
size: 16,
hash: sha256(b"land.msh"),
}],
files: vec![manifest_entry("WORLD/MAP/land.msh", 16, sha256(b"land.msh"))],
casefold_collisions: Vec::new(),
};
@@ -932,11 +932,11 @@ mod tests {
fs::write(root.join("MISSIONS/test/data.tma"), b"malformed tma").expect("tma");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "MISSIONS/test/data.tma".to_string(),
size: 12,
hash: sha256(b"malformed tma"),
}],
files: vec![manifest_entry(
"MISSIONS/test/data.tma",
12,
sha256(b"malformed tma"),
)],
casefold_collisions: Vec::new(),
};
@@ -955,11 +955,7 @@ mod tests {
fs::write(root.join("units/unit.dat"), vec![0u8; 120]).expect("unit");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "units/unit.dat".to_string(),
size: 120,
hash: sha256(&[0u8; 120]),
}],
files: vec![manifest_entry("units/unit.dat", 120, sha256(&[0u8; 120]))],
casefold_collisions: Vec::new(),
};
@@ -977,11 +973,7 @@ mod tests {
fs::write(root.join("patch.nl"), b"NL malformed").expect("rsli");
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "patch.nl".to_string(),
size: 12,
hash: sha256(b"NL malformed"),
}],
files: vec![manifest_entry("patch.nl", 12, sha256(b"NL malformed"))],
casefold_collisions: Vec::new(),
};
@@ -1052,16 +1044,8 @@ mod tests {
let manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![
ManifestEntry {
path: "Textures/Foo.TEX".to_string(),
size: 1,
hash: sha256(b"first"),
},
ManifestEntry {
path: "textures/foo.tex".to_string(),
size: 1,
hash: sha256(b"second"),
},
manifest_entry("Textures/Foo.TEX", 1, sha256(b"first")),
manifest_entry("textures/foo.tex", 1, sha256(b"second")),
],
casefold_collisions: Vec::new(),
};
@@ -1081,11 +1065,7 @@ mod tests {
fn fingerprint_changes() {
let mut manifest = CorpusManifest {
kind: CorpusKind::Unknown,
files: vec![ManifestEntry {
path: "a".to_string(),
size: 1,
hash: sha256(b"before"),
}],
files: vec![manifest_entry("a", 1, sha256(b"before"))],
casefold_collisions: Vec::new(),
};
let a = fingerprint(&manifest);
@@ -1118,6 +1098,29 @@ mod tests {
let _ = fs::remove_file(tmp);
}
/// Discovery must keep the byte-exact host path for a file whose name is not
/// valid UTF-8 while exposing a lossy, displayable normalized path.
#[cfg(unix)]
#[test]
fn discover_supports_non_utf8_host_paths() {
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    let root = temp_dir("non-utf8");
    let file_name = OsString::from_vec(vec![0xFF, b'.', b'b', b'i', b'n']);
    let file_path = root.join(&file_name);
    if fs::write(&file_path, b"raw").is_err() {
        // Not every unix filesystem accepts arbitrary byte names, and the
        // error kind reported for the rejection is platform-specific (it is
        // not always `PermissionDenied`). The fixture cannot be created here,
        // so there is nothing to verify: clean up and skip instead of failing.
        let _ = fs::remove_dir_all(root);
        return;
    }

    let manifest = discover(&root, DiscoverOptions::default()).expect("manifest");
    assert_eq!(manifest.files.len(), 1);
    // The display path replaces the invalid byte with U+FFFD ...
    assert_eq!(manifest.files[0].path, "\u{FFFD}.bin");
    // ... while the host path still round-trips the original bytes.
    assert_eq!(manifest.files[0].host_rel_path, PathBuf::from(&file_name));

    let _ = fs::remove_dir_all(root);
}
struct TestNresEntry<'a> {
name: &'a str,
type_id: u32,
@@ -1164,6 +1167,15 @@ mod tests {
out
}
/// Test helper: builds a [`ManifestEntry`] whose display path and host path
/// are both derived from the same UTF-8 `path` text.
fn manifest_entry(path: &str, size: u64, hash: Sha256Digest) -> ManifestEntry {
    let host_rel_path = PathBuf::from(path);
    let path = String::from(path);
    ManifestEntry {
        path,
        host_rel_path,
        size,
        hash,
    }
}
fn push_u32(out: &mut Vec<u8>, value: u32) {
out.extend_from_slice(&value.to_le_bytes());
}
+2
View File
@@ -6,8 +6,10 @@ license.workspace = true
repository.workspace = true
[dependencies]
fparkan-diagnostics = { path = "../fparkan-diagnostics" }
fparkan-msh = { path = "../fparkan-msh" }
fparkan-nres = { path = "../fparkan-nres" }
fparkan-path = { path = "../fparkan-path" }
fparkan-rsli = { path = "../fparkan-rsli" }
fparkan-resource = { path = "../fparkan-resource" }
fparkan-terrain-format = { path = "../fparkan-terrain-format" }
+134 -10
View File
@@ -20,14 +20,20 @@
)]
//! Shared inspection helpers for format-backed tooling.
use fparkan_msh::{decode_msh, validate_msh};
use fparkan_diagnostics::{
diagnostic, render_human, Diagnostic, DiagnosticCode, DiagnosticContext, Phase,
};
use fparkan_msh::{decode_msh, validate_msh, ModelAsset};
use fparkan_nres::{decode as decode_nres, NresDocument, ReadProfile};
use fparkan_path::{normalize_relative, PathPolicy};
use fparkan_resource::{archive_path, resource_name, CachedResourceRepository, ResourceRepository};
use fparkan_rsli::decode as decode_rsli;
use fparkan_terrain_format::{decode_land_map, decode_land_msh};
use fparkan_texm::decode_texm;
use fparkan_vfs::DirectoryVfs;
use fparkan_vfs::{DirectoryVfs, Vfs};
use std::fs;
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
use std::path::Path;
use std::sync::Arc;
@@ -131,7 +137,70 @@ pub enum LandFileKind {
///
/// Returns a string error when the archive cannot be read or decoded.
pub fn inspect_archive_file(path: &Path, sample_limit: usize) -> Result<ArchiveInspection, String> {
let bytes = fs::read(path).map_err(|err| format!("{}: {err}", path.display()))?;
inspect_archive_file_diagnostic(path, sample_limit).map_err(|diagnostic| render_human(&diagnostic))
}
/// Inspects a format archive and returns a structured diagnostic on failure.
///
/// The archive's file name is normalized with [`PathPolicy::HostCompatible`]
/// and then read through a [`DirectoryVfs`] rooted at the archive's parent
/// directory; the resulting bytes are handed to `inspect_archive_bytes`
/// together with the original `path` so decode diagnostics keep their source.
///
/// # Errors
///
/// Returns a [`Diagnostic`] when the archive cannot be read or decoded:
/// `S1.VFS.PATH` when the path has no usable file name or fails
/// normalization, `S1.VFS.READ` when the read itself fails, and whatever
/// `inspect_archive_bytes` reports for undecodable content.
pub fn inspect_archive_file_diagnostic(
    path: &Path,
    sample_limit: usize,
) -> Result<ArchiveInspection, Diagnostic> {
    // `Path::parent` yields `Some("")` (not `None`) for a bare file name such
    // as `archive.lib`, so the empty case must be mapped to "." explicitly to
    // keep the VFS rooted at a real directory.
    let parent = match path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir,
        _ => Path::new("."),
    };
    let file_name = path.file_name().ok_or_else(|| {
        archive_read_diagnostic("S1.VFS.PATH", path, "archive path has no file name")
    })?;
    // On unix the raw bytes are used so non-UTF-8 names stay inspectable;
    // elsewhere the name has to be valid text before it can be normalized.
    #[cfg(unix)]
    let raw_name = file_name.as_bytes();
    #[cfg(not(unix))]
    let raw_name = file_name
        .to_str()
        .ok_or_else(|| {
            archive_read_diagnostic("S1.VFS.PATH", path, "archive file name is not valid text")
        })?
        .as_bytes();
    let normalized = normalize_relative(raw_name, PathPolicy::HostCompatible)
        .map_err(|err| archive_read_diagnostic("S1.VFS.PATH", path, err))?;
    let vfs = DirectoryVfs::new(parent);
    let bytes = vfs
        .read(&normalized)
        .map_err(|err| archive_read_diagnostic("S1.VFS.READ", path, err))?;
    inspect_archive_bytes(&bytes, sample_limit, Some(path))
}

/// Builds a read-phase diagnostic whose message is `"<path>: <detail>"` and
/// whose context records the offending `path`.
fn archive_read_diagnostic(
    code: &'static str,
    path: &Path,
    detail: impl std::fmt::Display,
) -> Diagnostic {
    diagnostic(
        DiagnosticCode(code),
        format!("{}: {detail}", path.display()),
    )
    .with_context(DiagnosticContext {
        phase: Some(Phase::Read),
        path: Some(path.display().to_string()),
        ..DiagnosticContext::default()
    })
}
@@ -140,13 +209,13 @@ fn inspect_archive_bytes(
bytes: &[u8],
sample_limit: usize,
source: Option<&Path>,
) -> Result<ArchiveInspection, String> {
) -> Result<ArchiveInspection, Diagnostic> {
if bytes.starts_with(b"NRes") {
let document = decode_nres(
Arc::from(bytes.to_vec().into_boxed_slice()),
ReadProfile::Compatible,
)
.map_err(|err| err.to_string())?;
.map_err(|err| archive_parse_diagnostic("S1.NRES.DECODE", source, err.to_string()))?;
let mut sample = Vec::new();
for entry in document.entries().iter().take(sample_limit) {
sample.push(NresEntrySummary {
@@ -165,15 +234,16 @@ fn inspect_archive_bytes(
Arc::from(bytes.to_vec().into_boxed_slice()),
fparkan_rsli::ReadProfile::Compatible,
)
.map_err(|err| err.to_string())?;
.map_err(|err| archive_parse_diagnostic("S1.RSLI.DECODE", source, err.to_string()))?;
Ok(ArchiveInspection::Rsli {
entries: document.entries().len(),
})
} else {
match source {
Some(path) => Err(format!("{}: unsupported archive magic", path.display())),
None => Err("unsupported archive magic".to_string()),
}
Err(archive_parse_diagnostic(
"S1.RESOURCE.UNSUPPORTED_ARCHIVE",
source,
"unsupported archive magic".to_string(),
))
}
}
@@ -202,6 +272,22 @@ pub fn inspect_model_from_root(
})
}
/// Resolves `resource` inside `archive` under `root`, decodes it as a model
/// and returns the validated asset.
///
/// # Errors
///
/// Returns a string error when the resource cannot be resolved, when it does
/// not decode as a model payload, or when the decoded model fails validation.
pub fn load_model_from_root(
    root: &Path,
    archive: &str,
    resource: &str,
) -> Result<ModelAsset, String> {
    let document = load_model_document_from_root(root, archive, resource)?;
    let decoded = decode_msh(&document).map_err(|err| err.to_string())?;
    let validated = validate_msh(&decoded).map_err(|err| err.to_string())?;
    Ok(validated)
}
/// Inspects a texture through repository-backed resource lookup.
///
/// # Errors
@@ -288,6 +374,27 @@ fn read_resource_bytes(root: &Path, archive: &str, name: &str) -> Result<Arc<[u8
Ok(Arc::from(bytes.into_owned()))
}
/// Reads the bytes of `resource` from `archive` under `root` and decodes them
/// as an NRes document using the compatible read profile.
///
/// Any lookup or decode failure is flattened into its string rendering.
fn load_model_document_from_root(
    root: &Path,
    archive: &str,
    resource: &str,
) -> Result<NresDocument, String> {
    let payload = read_resource_bytes(root, archive, resource)?;
    match decode_nres(payload, ReadProfile::Compatible) {
        Ok(document) => Ok(document),
        Err(err) => Err(err.to_string()),
    }
}
/// Builds a parse-phase diagnostic with the given `code` and `message`,
/// recording the display form of `source` as the context path when present.
fn archive_parse_diagnostic(
    code: &'static str,
    source: Option<&Path>,
    message: String,
) -> Diagnostic {
    let context = DiagnosticContext {
        phase: Some(Phase::Parse),
        path: source.map(|origin| origin.display().to_string()),
        ..DiagnosticContext::default()
    };
    diagnostic(DiagnosticCode(code), message).with_context(context)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -306,6 +413,23 @@ mod tests {
assert!(error.contains("entry table out of bounds"));
}
/// A truncated NRes file must fail with the NRes decode code and keep the
/// path of the offending archive in the diagnostic context.
#[test]
fn archive_diagnostic_preserves_source_path() {
    let dir = temp_dir("inspect-diagnostic");
    let archive = dir.join("broken.nres");
    fs::write(&archive, b"NRes").expect("broken nres");

    let failure = inspect_archive_file_diagnostic(&archive, 0).expect_err("diagnostic failure");

    assert_eq!(failure.code.0, "S1.NRES.DECODE");
    assert_eq!(failure.context.path, Some(archive.display().to_string()));
}
#[test]
fn nres_entry_summary_fields_are_readable() {
let dir = temp_dir("inspect-nres");