From f47ec90d05c0dad8d0baceffde4790394b976548 Mon Sep 17 00:00:00 2001 From: Shaun Arman Date: Sun, 31 May 2026 13:50:59 -0500 Subject: [PATCH] feat(upload): add safe file extension validation and binary text extraction - Add extension allowlist (SAFE_TEXT_EXTENSIONS + SAFE_BINARY_EXTENSIONS) rejecting unsupported file types at both upload_log_file and upload_log_file_by_content entry points - Add extract_text_content() with PDF text extraction via lopdf and DOCX extraction via zip+quick-xml - Binary files (PDF/DOCX) get extracted text written to .extracted.txt for downstream PII detection - Expand frontend file input accept list and add collapsible supported-formats disclosure element - Add 11 unit tests covering allowlist logic and extraction paths --- src-tauri/Cargo.toml | 3 + src-tauri/src/commands/analysis.rs | 287 +++++++++++++++++++++++++++-- src/pages/LogUpload/index.tsx | 15 +- 3 files changed, 286 insertions(+), 19 deletions(-) diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 84150c6f..032982db 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -45,6 +45,9 @@ warp = "0.3" urlencoding = "2" infer = "0.15" url = "2.5.8" +lopdf = "0.31" +zip = "0.6" +quick-xml = "0.36" rmcp = { version = "1.7.0", features = [ "client", "transport-child-process", diff --git a/src-tauri/src/commands/analysis.rs b/src-tauri/src/commands/analysis.rs index 7344e081..135fb12f 100644 --- a/src-tauri/src/commands/analysis.rs +++ b/src-tauri/src/commands/analysis.rs @@ -9,6 +9,139 @@ use crate::state::AppState; const MAX_LOG_FILE_BYTES: u64 = 50 * 1024 * 1024; +const SAFE_TEXT_EXTENSIONS: &[&str] = &[ + "log", + "txt", + "out", + "err", + "syslog", + "journal", + "yaml", + "yml", + "json", + "toml", + "xml", + "ini", + "cfg", + "conf", + "config", + "env", + "properties", + "md", + "markdown", + "rst", + "csv", + "tsv", + "ndjson", + "jsonl", + "sql", + "sh", + "bash", + "zsh", + "py", + "js", + "ts", + "rb", + "go", + "rs", + "java", + "html", + "htm", + 
"css", + "diff", + "patch", + "rtf", +]; + +const SAFE_BINARY_EXTENSIONS: &[&str] = &["pdf", "docx", "doc", "xlsx", "xls"]; + +pub fn is_safe_file(path: &Path) -> bool { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_lowercase()); + match ext.as_deref() { + Some(e) => SAFE_TEXT_EXTENSIONS.contains(&e) || SAFE_BINARY_EXTENSIONS.contains(&e), + None => false, + } +} + +pub fn extract_text_content(path: &Path) -> Result { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_lowercase()) + .unwrap_or_default(); + + match ext.as_str() { + "pdf" => extract_pdf_text(path), + "docx" | "doc" => extract_docx_text(path), + "xlsx" | "xls" => Err(format!( + "Spreadsheet format .{ext} is not yet supported for text extraction. \ + Export the sheet as CSV and upload that instead." + )), + _ => std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {e}")), + } +} + +fn extract_pdf_text(path: &Path) -> Result { + let doc = lopdf::Document::load(path).map_err(|e| format!("Failed to parse PDF: {e}"))?; + let mut text = String::new(); + let mut pages: Vec = doc.get_pages().keys().copied().collect(); + pages.sort_unstable(); + for page_num in pages { + if let Ok(content) = doc.extract_text(&[page_num]) { + text.push_str(&content); + text.push('\n'); + } + } + if text.trim().is_empty() { + return Err("PDF contains no extractable text (may be a scanned image)".to_string()); + } + Ok(text) +} + +fn extract_docx_text(path: &Path) -> Result { + use std::io::Read as _; + let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file: {e}"))?; + let mut archive = + zip::ZipArchive::new(file).map_err(|e| format!("Failed to open as ZIP/DOCX: {e}"))?; + let mut xml_content = String::new(); + { + let mut doc_xml = archive + .by_name("word/document.xml") + .map_err(|_| "Not a valid DOCX: missing word/document.xml".to_string())?; + doc_xml + .read_to_string(&mut xml_content) + .map_err(|e| format!("Failed to read 
document.xml: {e}"))?; + } + let mut text = String::new(); + let mut reader = quick_xml::Reader::from_str(&xml_content); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Text(e)) => { + if let Ok(s) = e.unescape() { + let trimmed = s.trim().to_string(); + if !trimmed.is_empty() { + text.push_str(&trimmed); + text.push(' '); + } + } + } + Ok(quick_xml::events::Event::Eof) => break, + Err(e) => return Err(format!("XML parse error: {e}")), + _ => {} + } + buf.clear(); + } + if text.trim().is_empty() { + return Err("DOCX contains no extractable text".to_string()); + } + Ok(text) +} + fn validate_log_file_path(file_path: &str) -> Result { let path = Path::new(file_path); let canonical = std::fs::canonicalize(path).map_err(|_| "Unable to access selected file")?; @@ -35,24 +168,59 @@ pub async fn upload_log_file( state: State<'_, AppState>, ) -> Result { let canonical_path = validate_log_file_path(&file_path)?; - let content = std::fs::read(&canonical_path).map_err(|_| "Failed to read selected log file")?; - let content_hash = format!("{:x}", Sha256::digest(&content)); + + if !is_safe_file(&canonical_path) { + let ext = canonical_path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("(none)"); + return Err(format!( + "File type '.{ext}' is not supported. Supported formats include .log, .txt, .json, .pdf, .docx, .md, and many more." 
+ )); + } + let file_name = canonical_path .file_name() .and_then(|n| n.to_str()) .unwrap_or("unknown") .to_string(); - let file_size = content.len() as i64; - let mime_type = if file_name.ends_with(".json") { - "application/json" - } else if file_name.ends_with(".xml") { - "application/xml" - } else { - "text/plain" + + let file_ext = canonical_path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_lowercase()) + .unwrap_or_default(); + + let extracted_text = extract_text_content(&canonical_path) + .map_err(|e| format!("Failed to read file content: {e}"))?; + let content_bytes = extracted_text.as_bytes(); + let content_hash = format!("{:x}", Sha256::digest(content_bytes)); + let file_size = content_bytes.len() as i64; + + let mime_type = match file_ext.as_str() { + "json" => "application/json", + "xml" => "application/xml", + "yaml" | "yml" => "application/yaml", + "pdf" => "application/pdf", + "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "doc" => "application/msword", + "md" | "markdown" => "text/markdown", + "csv" | "tsv" => "text/csv", + "html" | "htm" => "text/html", + _ => "text/plain", }; - let canonical_file_path = canonical_path.to_string_lossy().to_string(); - let log_file = LogFile::new(issue_id.clone(), file_name, canonical_file_path, file_size); + let is_binary = SAFE_BINARY_EXTENSIONS.contains(&file_ext.as_str()); + let stored_path = if is_binary { + let extracted_path = canonical_path.with_extension("extracted.txt"); + std::fs::write(&extracted_path, &extracted_text) + .map_err(|e| format!("Failed to write extracted text: {e}"))?; + extracted_path.to_string_lossy().to_string() + } else { + canonical_path.to_string_lossy().to_string() + }; + + let log_file = LogFile::new(issue_id.clone(), file_name, stored_path, file_size); let log_file = LogFile { content_hash: content_hash.clone(), mime_type: mime_type.to_string(), @@ -104,17 +272,36 @@ pub async fn upload_log_file_by_content( content: String, 
state: State<'_, AppState>, ) -> Result { + let fake_path = Path::new(&file_name); + if !is_safe_file(fake_path) { + let ext = fake_path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("(none)"); + return Err(format!("File type '.{ext}' is not supported.")); + } + let content_bytes = content.as_bytes(); let content_hash = format!("{:x}", Sha256::digest(content_bytes)); let file_size = content_bytes.len() as i64; - // Determine mime type based on file extension - let mime_type = if file_name.ends_with(".json") { - "application/json" - } else if file_name.ends_with(".xml") { - "application/xml" - } else { - "text/plain" + let file_ext = fake_path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_lowercase()) + .unwrap_or_default(); + + let mime_type = match file_ext.as_str() { + "json" => "application/json", + "xml" => "application/xml", + "yaml" | "yml" => "application/yaml", + "pdf" => "application/pdf", + "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "doc" => "application/msword", + "md" | "markdown" => "text/markdown", + "csv" | "tsv" => "text/csv", + "html" | "htm" => "text/html", + _ => "text/plain", }; // Use the file_name as the file_path for DB storage @@ -328,4 +515,68 @@ mod tests { assert!(result.is_ok()); let _ = std::fs::remove_file(file_path); } + + #[test] + fn test_is_safe_file_allows_txt() { + assert!(is_safe_file(Path::new("file.txt"))); + } + + #[test] + fn test_is_safe_file_allows_md() { + assert!(is_safe_file(Path::new("readme.md"))); + } + + #[test] + fn test_is_safe_file_allows_pdf() { + assert!(is_safe_file(Path::new("report.pdf"))); + } + + #[test] + fn test_is_safe_file_allows_docx() { + assert!(is_safe_file(Path::new("doc.docx"))); + } + + #[test] + fn test_is_safe_file_rejects_exe() { + assert!(!is_safe_file(Path::new("malware.exe"))); + } + + #[test] + fn test_is_safe_file_rejects_dll() { + assert!(!is_safe_file(Path::new("library.dll"))); + } + + #[test] + fn 
test_is_safe_file_rejects_zip_directly() { + assert!(!is_safe_file(Path::new("archive.zip"))); + } + + #[test] + fn test_is_safe_file_case_insensitive() { + assert!(is_safe_file(Path::new("file.TXT"))); + assert!(is_safe_file(Path::new("file.Log"))); + } + + #[test] + fn test_is_safe_file_no_extension_rejected() { + assert!(!is_safe_file(Path::new("Makefile"))); + } + + #[test] + fn test_extract_text_plain_file() { + let dir = std::env::temp_dir(); + let path = dir.join(format!("tftsr-test-extract-{}.txt", uuid::Uuid::now_v7())); + std::fs::write(&path, "hello world").unwrap(); + let result = extract_text_content(&path); + assert!(result.is_ok()); + assert_eq!(result.unwrap().trim(), "hello world"); + let _ = std::fs::remove_file(path); + } + + #[test] + fn test_extract_text_unsupported_binary_returns_error() { + let result = extract_text_content(Path::new("data.xlsx")); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("not yet supported")); + } } diff --git a/src/pages/LogUpload/index.tsx b/src/pages/LogUpload/index.tsx index 140a45ab..57a69c6c 100644 --- a/src/pages/LogUpload/index.tsx +++ b/src/pages/LogUpload/index.tsx @@ -252,8 +252,21 @@ export default function LogUpload() { multiple className="hidden" onChange={handleFileSelect} - accept=".log,.txt,.json,.csv,.xml,.yaml,.yml" + accept=".log,.txt,.out,.err,.syslog,.journal,.yaml,.yml,.json,.toml,.xml,.ini,.cfg,.conf,.config,.env,.properties,.md,.markdown,.rst,.csv,.tsv,.ndjson,.jsonl,.sql,.sh,.bash,.zsh,.py,.js,.ts,.rb,.go,.rs,.java,.html,.htm,.css,.diff,.patch,.pdf,.docx,.doc,.rtf,.xlsx,.xls" /> +
+ + Supported formats + +
+
Logs & text: .log, .txt, .out, .err, .syslog, .journal
+
Config & markup: .yaml, .yml, .json, .toml, .xml, .ini, .cfg, .conf, .env, .properties
+
Documents: .pdf, .docx, .doc, .md, .rst, .rtf
+
Data: .csv, .tsv, .xlsx, .xls, .ndjson, .jsonl, .sql
+
Code & scripts: .sh, .bash, .zsh, .py, .js, .ts, .rb, .go, .rs, .java, .html, .css, .diff, .patch
+

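Binary formats (PDF, DOCX) will have their text extracted automatically. Spreadsheets (XLSX, XLS) are not yet supported for text extraction; export the sheet as CSV and upload that instead.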

+
+
{/* File list */}