fix: make corpus reports explicit and fallible

This commit is contained in:
2026-06-22 16:49:32 +04:00
parent 8b91a0bfbf
commit 91c7a8a14e
5 changed files with 325 additions and 95 deletions
Generated
+1
View File
@@ -82,6 +82,7 @@ name = "fparkan-corpus"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"fparkan-binary", "fparkan-binary",
"fparkan-nres",
"fparkan-path", "fparkan-path",
] ]
+5 -2
View File
@@ -32,7 +32,7 @@ fn run(args: &[String]) -> Result<(), String> {
let root = parse_root(&rest)?; let root = parse_root(&rest)?;
let manifest = let manifest =
discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?; discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?;
let report = report(&root, &manifest); let report = report(&root, &manifest).map_err(|e| e.to_string())?;
println!("{}", render_report_json(&report)); println!("{}", render_report_json(&report));
Ok(()) Ok(())
} }
@@ -41,10 +41,13 @@ fn run(args: &[String]) -> Result<(), String> {
let root = parse_root(&rest)?; let root = parse_root(&rest)?;
let manifest = let manifest =
discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?; discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?;
let report = report(&root, &manifest); let report = report(&root, &manifest).map_err(|e| e.to_string())?;
if report.casefold_collisions > 0 { if report.casefold_collisions > 0 {
return Err("casefold collisions found".to_string()); return Err("casefold collisions found".to_string());
} }
if report.failures > 0 {
return Err(format!("corpus report found {} failures", report.failures));
}
println!("{}", render_report_json(&report)); println!("{}", render_report_json(&report));
Ok(()) Ok(())
} }
+1
View File
@@ -7,6 +7,7 @@ repository.workspace = true
[dependencies] [dependencies]
fparkan-binary = { path = "../fparkan-binary" } fparkan-binary = { path = "../fparkan-binary" }
fparkan-nres = { path = "../fparkan-nres" }
fparkan-path = { path = "../fparkan-path" } fparkan-path = { path = "../fparkan-path" }
[lints] [lints]
+300 -75
View File
@@ -8,6 +8,7 @@ use std::fmt;
use std::fs; use std::fs;
use std::io::Write; use std::io::Write;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc;
/// Corpus kind. /// Corpus kind.
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -72,6 +73,34 @@ pub struct CorpusReport {
pub casefold_collisions: usize, pub casefold_collisions: usize,
/// Manifest fingerprint. /// Manifest fingerprint.
pub fingerprint: Sha256Digest, pub fingerprint: Sha256Digest,
/// Per-file status records.
pub records: Vec<CorpusFileRecord>,
/// Number of files with report errors.
pub failures: usize,
}
/// Outcome of inspecting a single corpus file while building a report.
///
/// Records whose status is [`CorpusFileStatus::Error`] are the ones counted
/// in the report's `failures` total.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CorpusFileStatus {
    /// The file was read and inspected without problems.
    Ok,
    /// The file was inspected, but something non-fatal was noticed.
    Warning,
    /// The file could not be inspected (for example it could not be read or
    /// failed to decode).
    Error,
}
/// Per-file report record.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CorpusFileRecord {
/// Normalized relative path.
pub path: String,
/// Inspection status.
pub status: CorpusFileStatus,
/// Detected file variant.
pub variant: String,
/// Optional status message.
pub message: Option<String>,
} }
/// Corpus error. /// Corpus error.
@@ -88,6 +117,13 @@ pub enum CorpusError {
InvalidRoot(PathBuf), InvalidRoot(PathBuf),
/// Invalid path. /// Invalid path.
InvalidPath(String), InvalidPath(String),
/// Aggregate report failure.
Report {
/// Path where reporting failed.
path: String,
/// Failure message.
message: String,
},
} }
impl fmt::Display for CorpusError { impl fmt::Display for CorpusError {
@@ -96,6 +132,7 @@ impl fmt::Display for CorpusError {
Self::Io { path, source } => write!(f, "{}: {source}", path.display()), Self::Io { path, source } => write!(f, "{}: {source}", path.display()),
Self::InvalidRoot(path) => write!(f, "invalid corpus root: {}", path.display()), Self::InvalidRoot(path) => write!(f, "invalid corpus root: {}", path.display()),
Self::InvalidPath(path) => write!(f, "invalid corpus path: {path}"), Self::InvalidPath(path) => write!(f, "invalid corpus path: {path}"),
Self::Report { path, message } => write!(f, "{path}: {message}"),
} }
} }
} }
@@ -104,7 +141,7 @@ impl std::error::Error for CorpusError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self { match self {
Self::Io { source, .. } => Some(source), Self::Io { source, .. } => Some(source),
Self::InvalidRoot(_) | Self::InvalidPath(_) => None, Self::InvalidRoot(_) | Self::InvalidPath(_) | Self::Report { .. } => None,
} }
} }
} }
@@ -230,8 +267,39 @@ fn detect_casefold_collisions(files: &[ManifestEntry]) -> Vec<Vec<String>> {
} }
/// Builds aggregate report. /// Builds aggregate report.
#[must_use] ///
pub fn report(root: &Path, manifest: &CorpusManifest) -> CorpusReport { /// # Errors
///
/// Returns [`CorpusError`] when the aggregate report cannot be constructed.
/// Per-file inspection failures are represented in [`CorpusReport::records`]
/// and counted in [`CorpusReport::failures`].
pub fn report(root: &Path, manifest: &CorpusManifest) -> Result<CorpusReport, CorpusError> {
let mut metrics = empty_report_metrics();
let mut records = Vec::with_capacity(manifest.files.len());
let mut failures = 0usize;
for entry in &manifest.files {
let record = inspect_report_file(root, entry, &mut metrics);
if record.status == CorpusFileStatus::Error {
failures = failures.saturating_add(1);
}
records.push(record);
}
Ok(CorpusReport {
schema: 1,
kind: manifest.kind,
files: manifest.files.len(),
bytes: manifest.files.iter().map(|f| f.size).sum(),
metrics,
casefold_collisions: manifest.casefold_collisions.len(),
fingerprint: fingerprint(manifest),
records,
failures,
})
}
fn empty_report_metrics() -> BTreeMap<String, u64> {
let mut metrics = BTreeMap::new(); let mut metrics = BTreeMap::new();
metrics.insert("nres_files".to_string(), 0); metrics.insert("nres_files".to_string(), 0);
metrics.insert("nres_entries".to_string(), 0); metrics.insert("nres_entries".to_string(), 0);
@@ -245,67 +313,97 @@ pub fn report(root: &Path, manifest: &CorpusManifest) -> CorpusReport {
metrics.insert("texm_entries".to_string(), 0); metrics.insert("texm_entries".to_string(), 0);
metrics.insert("fxid_entries".to_string(), 0); metrics.insert("fxid_entries".to_string(), 0);
metrics.insert("wear_entries".to_string(), 0); metrics.insert("wear_entries".to_string(), 0);
metrics
}
for entry in &manifest.files { fn inspect_report_file(
root: &Path,
entry: &ManifestEntry,
metrics: &mut BTreeMap<String, u64>,
) -> CorpusFileRecord {
let lower = entry.path.to_ascii_lowercase(); let lower = entry.path.to_ascii_lowercase();
let mut variant = inspect_path_metrics(&lower, metrics);
let path = root.join(&entry.path);
let bytes = match fs::read(&path) {
Ok(bytes) => bytes,
Err(source) => {
return CorpusFileRecord {
path: entry.path.clone(),
status: CorpusFileStatus::Error,
variant,
message: Some(source.to_string()),
};
}
};
if bytes.starts_with(b"NRes") {
variant = "nres".to_string();
bump(metrics, "nres_files", 1);
if let Err(message) = inspect_nres_metrics(bytes, metrics) {
return CorpusFileRecord {
path: entry.path.clone(),
status: CorpusFileStatus::Error,
variant,
message: Some(message),
};
}
} else if bytes.starts_with(b"NL") {
variant = "rsli".to_string();
bump(metrics, "rsli_files", 1);
}
CorpusFileRecord {
path: entry.path.clone(),
status: CorpusFileStatus::Ok,
variant,
message: None,
}
}
fn inspect_path_metrics(lower: &str, metrics: &mut BTreeMap<String, u64>) -> String {
let mut variant = "file";
if lower.ends_with("data.tma") { if lower.ends_with("data.tma") {
bump(&mut metrics, "tma_files", 1); bump(metrics, "tma_files", 1);
variant = "tma";
} }
if lower.ends_with("land.msh") { if lower.ends_with("land.msh") {
bump(&mut metrics, "land_msh_files", 1); bump(metrics, "land_msh_files", 1);
variant = "land_msh";
} }
if lower.ends_with("land.map") { if lower.ends_with("land.map") {
bump(&mut metrics, "land_map_files", 1); bump(metrics, "land_map_files", 1);
variant = "land_map";
} }
if has_extension(&lower, "dat") if has_extension(lower, "dat") && (lower.starts_with("units/") || lower.contains("/units/")) {
&& (lower.starts_with("units/") || lower.contains("/units/")) bump(metrics, "unit_dat_files", 1);
{ variant = "unit_dat";
bump(&mut metrics, "unit_dat_files", 1);
} }
variant.to_string()
}
let path = root.join(&entry.path); fn inspect_nres_metrics(bytes: Vec<u8>, metrics: &mut BTreeMap<String, u64>) -> Result<(), String> {
if let Ok(bytes) = fs::read(path) { let entries = inspect_nres_entries(bytes)?;
if bytes.starts_with(b"NRes") { bump(metrics, "nres_entries", entries.len() as u64);
bump(&mut metrics, "nres_files", 1);
if let Some(entries) = inspect_nres_entries(&bytes) {
bump(&mut metrics, "nres_entries", entries.len() as u64);
for entry in entries { for entry in entries {
let name = entry.name.to_ascii_lowercase(); let name = String::from_utf8_lossy(entry.name_bytes()).to_ascii_lowercase();
if has_extension(&name, "msh") { if has_extension(&name, "msh") {
bump(&mut metrics, "msh_entries", 1); bump(metrics, "msh_entries", 1);
} }
match entry.kind { match entry.meta().type_id {
0x3054_414D => { 0x3054_414D => {
bump(&mut metrics, "mat0_entries", 1); bump(metrics, "mat0_entries", 1);
} }
0x6D78_6554 => { 0x6D78_6554 => {
bump(&mut metrics, "texm_entries", 1); bump(metrics, "texm_entries", 1);
} }
0x4449_5846 => { 0x4449_5846 => {
bump(&mut metrics, "fxid_entries", 1); bump(metrics, "fxid_entries", 1);
} }
0x5241_4557 => { 0x5241_4557 => {
bump(&mut metrics, "wear_entries", 1); bump(metrics, "wear_entries", 1);
} }
_ => {} _ => {}
} }
} }
} Ok(())
} else if bytes.starts_with(b"NL") {
bump(&mut metrics, "rsli_files", 1);
}
}
}
CorpusReport {
schema: 1,
kind: manifest.kind,
files: manifest.files.len(),
bytes: manifest.files.iter().map(|f| f.size).sum(),
metrics,
casefold_collisions: manifest.casefold_collisions.len(),
fingerprint: fingerprint(manifest),
}
} }
fn bump(metrics: &mut BTreeMap<String, u64>, key: &str, delta: u64) { fn bump(metrics: &mut BTreeMap<String, u64>, key: &str, delta: u64) {
@@ -320,35 +418,13 @@ fn has_extension(path: &str, expected: &str) -> bool {
.is_some_and(|extension| extension.eq_ignore_ascii_case(expected)) .is_some_and(|extension| extension.eq_ignore_ascii_case(expected))
} }
#[derive(Clone, Debug)] fn inspect_nres_entries(bytes: Vec<u8>) -> Result<Vec<fparkan_nres::NresEntry>, String> {
struct NresEntryBrief { let document = fparkan_nres::decode(
kind: u32, Arc::from(bytes.into_boxed_slice()),
name: String, fparkan_nres::ReadProfile::Compatible,
} )
.map_err(|err| err.to_string())?;
fn inspect_nres_entries(bytes: &[u8]) -> Option<Vec<NresEntryBrief>> { Ok(document.entries().to_vec())
if bytes.len() < 16 || !bytes.starts_with(b"NRes") {
return None;
}
let count = i32::from_le_bytes(bytes.get(8..12)?.try_into().ok()?);
if count < 0 {
return None;
}
let count = usize::try_from(count).ok()?;
let directory_len = count.checked_mul(64)?;
let directory_offset = bytes.len().checked_sub(directory_len)?;
let mut names = Vec::with_capacity(count);
for index in 0..count {
let base = directory_offset.checked_add(index.checked_mul(64)?)?;
let kind = u32::from_le_bytes(bytes.get(base..base + 4)?.try_into().ok()?);
let raw = bytes.get(base + 20..base + 56)?;
let len = raw.iter().position(|b| *b == 0).unwrap_or(raw.len());
names.push(NresEntryBrief {
kind,
name: String::from_utf8_lossy(&raw[..len]).to_string(),
});
}
Some(names)
} }
/// Computes stable manifest fingerprint. /// Computes stable manifest fingerprint.
@@ -402,13 +478,15 @@ pub fn write_report_atomic(path: &Path, report: &CorpusReport) -> Result<(), Cor
#[must_use] #[must_use]
pub fn render_report_json(report: &CorpusReport) -> String { pub fn render_report_json(report: &CorpusReport) -> String {
let mut out = format!( let mut out = format!(
"{{\"schema_version\":\"fparkan-corpus-report-v1\",\"schema\":{},\"kind\":\"{:?}\",\"files\":{},\"bytes\":{},\"casefold_collisions\":{},\"fingerprint\":\"{}\",\"metrics\":{{", "{{\"schema_version\":\"fparkan-corpus-report-v1\",\"schema\":{},\"kind\":\"{:?}\",\"files\":{},\"bytes\":{},\"casefold_collisions\":{},\"fingerprint\":\"{}\",\"failures\":{},\"record_count\":{},\"metrics\":{{",
report.schema, report.schema,
report.kind, report.kind,
report.files, report.files,
report.bytes, report.bytes,
report.casefold_collisions, report.casefold_collisions,
sha256_hex(&report.fingerprint) sha256_hex(&report.fingerprint),
report.failures,
report.records.len()
); );
for (idx, (key, value)) in report.metrics.iter().enumerate() { for (idx, (key, value)) in report.metrics.iter().enumerate() {
if idx > 0 { if idx > 0 {
@@ -441,7 +519,7 @@ mod tests {
return; return;
} }
let manifest = discover(&root, DiscoverOptions::default()).expect("manifest"); let manifest = discover(&root, DiscoverOptions::default()).expect("manifest");
let report = report(&root, &manifest); let report = report(&root, &manifest).expect("report");
assert!(report.files > 0); assert!(report.files > 0);
assert!(report.metrics["nres_files"] > 0); assert!(report.metrics["nres_files"] > 0);
} }
@@ -451,7 +529,7 @@ mod tests {
fn licensed_part1_manifest_profile_and_counts_match_baseline() { fn licensed_part1_manifest_profile_and_counts_match_baseline() {
let root = testdata_root("IS"); let root = testdata_root("IS");
let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest"); let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest");
let report = report(&root, &manifest); let report = report(&root, &manifest).expect("report");
assert_eq!(manifest.kind, CorpusKind::Part1); assert_eq!(manifest.kind, CorpusKind::Part1);
assert_eq!(report.files, 1_017); assert_eq!(report.files, 1_017);
@@ -468,7 +546,7 @@ mod tests {
fn licensed_part2_manifest_profile_and_counts_match_baseline() { fn licensed_part2_manifest_profile_and_counts_match_baseline() {
let root = testdata_root("IS2"); let root = testdata_root("IS2");
let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest"); let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest");
let report = report(&root, &manifest); let report = report(&root, &manifest).expect("report");
assert_eq!(manifest.kind, CorpusKind::Part2); assert_eq!(manifest.kind, CorpusKind::Part2);
assert_eq!(report.files, 1_302); assert_eq!(report.files, 1_302);
@@ -521,16 +599,111 @@ mod tests {
}], }],
casefold_collisions: Vec::new(), casefold_collisions: Vec::new(),
}; };
let report = report(Path::new("."), &manifest); let report = report(Path::new("."), &manifest).expect("report");
let json = render_report_json(&report); let json = render_report_json(&report);
assert!(json.contains("\"schema_version\":\"fparkan-corpus-report-v1\"")); assert!(json.contains("\"schema_version\":\"fparkan-corpus-report-v1\""));
assert!(json.contains("\"fingerprint\":")); assert!(json.contains("\"fingerprint\":"));
assert!(json.contains("\"failures\":1"));
assert!(json.contains("\"record_count\":1"));
assert!(json.contains("\"metrics\":")); assert!(json.contains("\"metrics\":"));
assert!(!json.contains("secret/payload.bin")); assert!(!json.contains("secret/payload.bin"));
assert!(!json.contains("DATA")); assert!(!json.contains("DATA"));
} }
/// A manifest entry whose file does not exist on disk must not abort the
/// report; it is recorded as an `Error` record and counted as a failure.
#[test]
fn report_records_missing_manifest_files_as_failures() {
    let root = temp_dir("report-missing");
    // The manifest names a file that is never written into `root`.
    let missing = ManifestEntry {
        path: "missing.lib".to_string(),
        size: 1,
        hash: sha256(b"missing"),
    };
    let manifest = CorpusManifest {
        kind: CorpusKind::Unknown,
        files: vec![missing],
        casefold_collisions: Vec::new(),
    };

    let outcome = report(&root, &manifest).expect("report");

    assert_eq!(outcome.failures, 1);
    assert_eq!(outcome.records.len(), 1);
    let record = &outcome.records[0];
    assert_eq!(record.path, "missing.lib");
    assert_eq!(record.status, CorpusFileStatus::Error);

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(root);
}
/// A file that carries only the `NRes` magic (and nothing else) is classified
/// as an NRes archive, fails to decode, and is reported as an `Error` record
/// whose message mentions `NRes`.
#[test]
fn report_records_malformed_nres_as_failure() {
    let root = temp_dir("report-bad-nres");
    fs::write(root.join("bad.lib"), b"NRes").expect("bad nres");
    let truncated = ManifestEntry {
        path: "bad.lib".to_string(),
        size: 4,
        hash: sha256(b"NRes"),
    };
    let manifest = CorpusManifest {
        kind: CorpusKind::Unknown,
        files: vec![truncated],
        casefold_collisions: Vec::new(),
    };

    let outcome = report(&root, &manifest).expect("report");

    assert_eq!(outcome.failures, 1);
    let record = &outcome.records[0];
    assert_eq!(record.status, CorpusFileStatus::Error);
    assert_eq!(record.variant, "nres");
    let message = record.message.as_deref();
    assert!(message.is_some_and(|text| text.contains("NRes")));

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(root);
}
/// A well-formed synthetic NRes archive is decoded and its entries are
/// reflected in the per-entry metrics (`msh`, `mat0`, `texm`).
#[test]
fn report_uses_production_nres_parser_for_entry_metrics() {
    let root = temp_dir("report-nres");
    // Three entries: one matched by its `.msh` extension, two by type id.
    let entries = [
        TestNresEntry {
            name: "mesh.msh",
            type_id: 0,
            payload: b"mesh",
        },
        TestNresEntry {
            name: "mat.bin",
            type_id: 0x3054_414D,
            payload: b"mat0",
        },
        TestNresEntry {
            name: "texture.bin",
            type_id: 0x6D78_6554,
            payload: b"texm",
        },
    ];
    let archive = build_nres(&entries);
    fs::write(root.join("archive.lib"), &archive).expect("archive");
    let manifest = CorpusManifest {
        kind: CorpusKind::Unknown,
        files: vec![ManifestEntry {
            path: "archive.lib".to_string(),
            size: u64::try_from(archive.len()).expect("archive size"),
            hash: sha256(&archive),
        }],
        casefold_collisions: Vec::new(),
    };

    let outcome = report(&root, &manifest).expect("report");

    assert_eq!(outcome.failures, 0);
    assert_eq!(outcome.records.len(), 1);
    let record = &outcome.records[0];
    assert_eq!(record.status, CorpusFileStatus::Ok);
    assert_eq!(record.variant, "nres");

    let metrics = &outcome.metrics;
    assert_eq!(metrics["nres_files"], 1);
    assert_eq!(metrics["nres_entries"], 3);
    assert_eq!(metrics["msh_entries"], 1);
    assert_eq!(metrics["mat0_entries"], 1);
    assert_eq!(metrics["texm_entries"], 1);

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(root);
}
#[test] #[test]
fn deterministic_traversal_is_creation_order_independent() { fn deterministic_traversal_is_creation_order_independent() {
let first = temp_dir("order-first"); let first = temp_dir("order-first");
@@ -648,12 +821,64 @@ mod tests {
metrics: BTreeMap::new(), metrics: BTreeMap::new(),
casefold_collisions: 0, casefold_collisions: 0,
fingerprint: sha256(b"empty-report"), fingerprint: sha256(b"empty-report"),
records: Vec::new(),
failures: 0,
}; };
write_report_atomic(&tmp, &report).expect("write"); write_report_atomic(&tmp, &report).expect("write");
assert!(tmp.is_file()); assert!(tmp.is_file());
let _ = fs::remove_file(tmp); let _ = fs::remove_file(tmp);
} }
/// Description of one entry to place into a synthetic NRes archive built by
/// `build_nres`.
struct TestNresEntry<'src> {
    /// Entry name, stored in the archive directory's fixed-size name field.
    name: &'src str,
    /// Type identifier written as the first word of the directory record.
    type_id: u32,
    /// Raw entry contents written into the archive's data region.
    payload: &'src [u8],
}
/// Builds the bytes of a small NRes archive for tests.
///
/// Layout produced, in order:
/// 1. a 16-byte header (`"NRes"`, the constant `0x100`, entry count, total
///    archive size — all little-endian);
/// 2. each payload, zero-padded to an 8-byte boundary;
/// 3. one 64-byte directory record per entry: type id, two zero words,
///    payload size, one zero word, a 36-byte zero-padded name, the payload
///    offset, and a sort index (indices ordered by entry name bytes).
///
/// Panics if a name is longer than the 36-byte name field or if any size or
/// offset does not fit in `u32`.
fn build_nres(entries: &[TestNresEntry<'_>]) -> Vec<u8> {
    const HEADER_LEN: usize = 16;
    const NAME_FIELD_LEN: usize = 36;

    // Reserve the header up front; it is filled in once the final size is known.
    let mut archive = vec![0; HEADER_LEN];

    // Data region: payloads back to back, each padded to 8 bytes.
    let mut payload_offsets = Vec::with_capacity(entries.len());
    for entry in entries {
        payload_offsets.push(u32::try_from(archive.len()).expect("offset"));
        archive.extend_from_slice(entry.payload);
        let padding = match archive.len() % 8 {
            0 => 0,
            remainder => 8 - remainder,
        };
        archive.resize(archive.len() + padding, 0);
    }

    // Entry indices ordered by name bytes (stable sort, like the directory expects).
    let mut sorted_indices: Vec<usize> = (0..entries.len()).collect();
    sorted_indices.sort_by_key(|&index| entries[index].name.as_bytes());

    // Directory region: one fixed-size record per entry, in input order.
    let directory = entries.iter().zip(&payload_offsets).zip(&sorted_indices);
    for ((entry, &payload_offset), &sorted_index) in directory {
        push_u32(&mut archive, entry.type_id);
        push_u32(&mut archive, 0);
        push_u32(&mut archive, 0);
        let payload_len = u32::try_from(entry.payload.len()).expect("payload size");
        push_u32(&mut archive, payload_len);
        push_u32(&mut archive, 0);

        let mut name_field = [0; NAME_FIELD_LEN];
        let raw_name = entry.name.as_bytes();
        name_field[..raw_name.len()].copy_from_slice(raw_name);
        archive.extend_from_slice(&name_field);

        push_u32(&mut archive, payload_offset);
        push_u32(&mut archive, u32::try_from(sorted_index).expect("sort index"));
    }

    // Header is written last because it records the total archive size.
    archive[0..4].copy_from_slice(b"NRes");
    archive[4..8].copy_from_slice(&0x100_u32.to_le_bytes());
    let entry_count = u32::try_from(entries.len()).expect("count");
    archive[8..12].copy_from_slice(&entry_count.to_le_bytes());
    let total_size = u32::try_from(archive.len()).expect("total size");
    archive[12..16].copy_from_slice(&total_size.to_le_bytes());
    archive
}
/// Appends `value` to `out` as four little-endian bytes.
fn push_u32(out: &mut Vec<u8>, value: u32) {
    let encoded = value.to_le_bytes();
    out.extend(encoded);
}
fn temp_dir(name: &str) -> PathBuf { fn temp_dir(name: &str) -> PathBuf {
let path = std::env::temp_dir().join(format!( let path = std::env::temp_dir().join(format!(
"fparkan-corpus-{name}-{}", "fparkan-corpus-{name}-{}",
+1 -1
View File
@@ -57,7 +57,7 @@ fn run(args: &[String]) -> Result<(), String> {
let root = parse_root(rest)?; let root = parse_root(rest)?;
let manifest = let manifest =
discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?; discover(&root, DiscoverOptions::default()).map_err(|e| e.to_string())?;
let report = report(&root, &manifest); let report = report(&root, &manifest).map_err(|e| e.to_string())?;
println!("{}", render_report_json(&report)); println!("{}", render_report_json(&report));
Ok(()) Ok(())
} }