use flate2::read::GzDecoder; use flate2::write::GzEncoder; use flate2::Compression; use sha2::{Digest, Sha256}; use std::io::Read as IoRead; use std::io::Write; use std::path::{Path, PathBuf}; use tauri::State; use tracing::warn; use crate::db::models::{AuditEntry, LogFile, LogFileSummary, PiiDetectionResult, PiiSpanRecord}; use crate::pii::{self, PiiDetector, RedactedLogFile}; use crate::state::AppState; const MAX_LOG_FILE_BYTES: u64 = 50 * 1024 * 1024; const SAFE_TEXT_EXTENSIONS: &[&str] = &[ "log", "txt", "out", "err", "syslog", "journal", "yaml", "yml", "json", "toml", "xml", "ini", "cfg", "conf", "config", "env", "properties", "md", "markdown", "rst", "csv", "tsv", "ndjson", "jsonl", "sql", "sh", "bash", "zsh", "py", "js", "ts", "rb", "go", "rs", "java", "html", "htm", "css", "diff", "patch", "rtf", ]; const SAFE_BINARY_EXTENSIONS: &[&str] = &["pdf", "docx", "doc", "xlsx", "xls"]; fn compress_text(text: &str) -> Result, String> { let mut encoder = GzEncoder::new(Vec::new(), Compression::default()); encoder .write_all(text.as_bytes()) .map_err(|e| format!("Compression write error: {e}"))?; encoder .finish() .map_err(|e| format!("Compression finish error: {e}")) } /// 100 MB cap — prevents decompression-bomb attacks on crafted DB entries. const MAX_DECOMPRESSED_BYTES: u64 = 100 * 1024 * 1024; fn decompress_text(bytes: &[u8]) -> Result { let decoder = GzDecoder::new(bytes); let mut limited = decoder.take(MAX_DECOMPRESSED_BYTES + 1); let mut s = String::new(); limited .read_to_string(&mut s) .map_err(|e| format!("Failed to decompress: {e}"))?; if s.len() as u64 > MAX_DECOMPRESSED_BYTES { return Err("Decompressed content exceeds 100 MB limit".to_string()); } Ok(s) } pub fn is_safe_file(path: &Path) -> bool { let ext = path .extension() .and_then(|e| e.to_str()) .map(|e| e.to_lowercase()); match ext.as_deref() { Some(e) => SAFE_TEXT_EXTENSIONS.contains(&e) || SAFE_BINARY_EXTENSIONS.contains(&e), None => false, } } pub fn extract_text_content(path: &Path) -> Result { let ext = path .extension() .and_then(|e| e.to_str()) .map(|e| e.to_lowercase()) .unwrap_or_default(); match ext.as_str() { "pdf" => extract_pdf_text(path), "docx" | "doc" => extract_docx_text(path), "xlsx" | "xls" => Err(format!( "Spreadsheet format .{ext} is not yet supported for text extraction. \ Export the sheet as CSV and upload that instead." )), _ => std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {e}")), } } fn extract_pdf_text(path: &Path) -> Result { let doc = lopdf::Document::load(path).map_err(|e| format!("Failed to parse PDF: {e}"))?; let mut text = String::new(); let mut pages: Vec = doc.get_pages().keys().copied().collect(); pages.sort_unstable(); for page_num in pages { if let Ok(content) = doc.extract_text(&[page_num]) { text.push_str(&content); text.push('\n'); } } if text.trim().is_empty() { return Err("PDF contains no extractable text (may be a scanned image)".to_string()); } Ok(text) } fn extract_docx_text(path: &Path) -> Result { use std::io::Read as _; let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file: {e}"))?; let mut archive = zip::ZipArchive::new(file).map_err(|e| format!("Failed to open as ZIP/DOCX: {e}"))?; let mut xml_content = String::new(); { // Safety: only one hardcoded entry is ever accessed; no arbitrary path extraction is // performed, so zip-slip path traversal attacks cannot apply here. let mut doc_xml = archive .by_name("word/document.xml") .map_err(|_| "Not a valid DOCX: missing word/document.xml".to_string())?; doc_xml .read_to_string(&mut xml_content) .map_err(|e| format!("Failed to read document.xml: {e}"))?; } let mut text = String::new(); let mut reader = quick_xml::Reader::from_str(&xml_content); reader.config_mut().trim_text(true); let mut buf = Vec::new(); loop { match reader.read_event_into(&mut buf) { Ok(quick_xml::events::Event::Text(e)) => { if let Ok(s) = e.unescape() { let trimmed = s.trim().to_string(); if !trimmed.is_empty() { text.push_str(&trimmed); text.push(' '); } } } Ok(quick_xml::events::Event::Eof) => break, Err(e) => return Err(format!("XML parse error: {e}")), _ => {} } buf.clear(); } if text.trim().is_empty() { return Err("DOCX contains no extractable text".to_string()); } Ok(text) } fn validate_log_file_path(file_path: &str) -> Result { let path = Path::new(file_path); let canonical = std::fs::canonicalize(path).map_err(|_| "Unable to access selected file")?; let metadata = std::fs::metadata(&canonical).map_err(|_| "Unable to read file metadata")?; if !metadata.is_file() { return Err("Selected path is not a file".to_string()); } if metadata.len() > MAX_LOG_FILE_BYTES { return Err(format!( "File exceeds maximum supported size ({} MB)", MAX_LOG_FILE_BYTES / 1024 / 1024 )); } Ok(canonical) } #[tauri::command] pub async fn upload_log_file( issue_id: String, file_path: String, state: State<'_, AppState>, ) -> Result { let canonical_path = validate_log_file_path(&file_path)?; if !is_safe_file(&canonical_path) { let ext = canonical_path .extension() .and_then(|e| e.to_str()) .unwrap_or("(none)"); return Err(format!( "File type '.{ext}' is not supported. Supported formats include .log, .txt, .json, .pdf, .docx, .md, and many more." )); } let file_name = canonical_path .file_name() .and_then(|n| n.to_str()) .unwrap_or("unknown") .to_string(); let file_ext = canonical_path .extension() .and_then(|e| e.to_str()) .map(|e| e.to_lowercase()) .unwrap_or_default(); let extracted_text = extract_text_content(&canonical_path) .map_err(|e| format!("Failed to read file content: {e}"))?; let content_bytes = extracted_text.as_bytes(); let content_hash = format!("{:x}", Sha256::digest(content_bytes)); let file_size = content_bytes.len() as i64; let mime_type = match file_ext.as_str() { "json" => "application/json", "xml" => "application/xml", "yaml" | "yml" => "application/yaml", "pdf" => "application/pdf", "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "doc" => "application/msword", "md" | "markdown" => "text/markdown", "csv" | "tsv" => "text/csv", "html" | "htm" => "text/html", _ => "text/plain", }; let is_binary = SAFE_BINARY_EXTENSIONS.contains(&file_ext.as_str()); let stored_path = if is_binary { let extracted_path = canonical_path.with_extension("extracted.txt"); std::fs::write(&extracted_path, &extracted_text) .map_err(|e| format!("Failed to write extracted text: {e}"))?; extracted_path.to_string_lossy().to_string() } else { canonical_path.to_string_lossy().to_string() }; let log_file = LogFile::new(issue_id.clone(), file_name, stored_path, file_size); let log_file = LogFile { content_hash: content_hash.clone(), mime_type: mime_type.to_string(), ..log_file }; let compressed = compress_text(&extracted_text) .map_err(|e| format!("Failed to compress log content: {e}"))?; let db = state.db.lock().map_err(|e| e.to_string())?; db.execute( "INSERT INTO log_files (id, issue_id, file_name, file_path, file_size, mime_type, content_hash, uploaded_at, redacted, content_compressed) \ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)", rusqlite::params![ log_file.id, log_file.issue_id, log_file.file_name, log_file.file_path, log_file.file_size, log_file.mime_type, log_file.content_hash, log_file.uploaded_at, log_file.redacted as i32, compressed, ], ) .map_err(|_| "Failed to store uploaded log metadata".to_string())?; // Audit let entry = AuditEntry::new( "upload_log_file".to_string(), "log_file".to_string(), log_file.id.clone(), serde_json::json!({ "issue_id": issue_id, "file_name": log_file.file_name }).to_string(), ); if let Err(err) = crate::audit::log::write_audit_event( &db, &entry.action, &entry.entity_type, &entry.entity_id, &entry.details, ) { warn!(error = %err, "failed to write upload_log_file audit entry"); } Ok(log_file) } #[tauri::command] pub async fn upload_log_file_by_content( issue_id: String, file_name: String, content: String, state: State<'_, AppState>, ) -> Result { let fake_path = Path::new(&file_name); if !is_safe_file(fake_path) { let ext = fake_path .extension() .and_then(|e| e.to_str()) .unwrap_or("(none)"); return Err(format!("File type '.{ext}' is not supported.")); } let content_bytes = content.as_bytes(); let content_hash = format!("{:x}", Sha256::digest(content_bytes)); let file_size = content_bytes.len() as i64; let file_ext = fake_path .extension() .and_then(|e| e.to_str()) .map(|e| e.to_lowercase()) .unwrap_or_default(); let mime_type = match file_ext.as_str() { "json" => "application/json", "xml" => "application/xml", "yaml" | "yml" => "application/yaml", "pdf" => "application/pdf", "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "doc" => "application/msword", "md" | "markdown" => "text/markdown", "csv" | "tsv" => "text/csv", "html" | "htm" => "text/html", _ => "text/plain", }; // Use the file_name as the file_path for DB storage let log_file = LogFile::new( issue_id.clone(), file_name.clone(), file_name.clone(), file_size, ); let log_file = LogFile { content_hash: content_hash.clone(), mime_type: mime_type.to_string(), ..log_file }; let compressed = compress_text(&content).map_err(|e| format!("Failed to compress log content: {e}"))?; let db = state.db.lock().map_err(|e| e.to_string())?; db.execute( "INSERT INTO log_files (id, issue_id, file_name, file_path, file_size, mime_type, content_hash, uploaded_at, redacted, content_compressed) \ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)", rusqlite::params![ log_file.id, log_file.issue_id, log_file.file_name, log_file.file_path, log_file.file_size, log_file.mime_type, log_file.content_hash, log_file.uploaded_at, log_file.redacted as i32, compressed, ], ) .map_err(|_| "Failed to store uploaded log metadata".to_string())?; // Audit let entry = AuditEntry::new( "upload_log_file".to_string(), "log_file".to_string(), log_file.id.clone(), serde_json::json!({ "issue_id": issue_id, "file_name": log_file.file_name }).to_string(), ); if let Err(err) = crate::audit::log::write_audit_event( &db, &entry.action, &entry.entity_type, &entry.entity_id, &entry.details, ) { warn!(error = %err, "failed to write upload_log_file audit entry"); } Ok(log_file) } #[tauri::command] pub async fn detect_pii( log_file_id: String, state: State<'_, AppState>, ) -> Result { // Load file path from DB let file_path: String = { let db = state.db.lock().map_err(|e| e.to_string())?; db.prepare("SELECT file_path FROM log_files WHERE id = ?1") .and_then(|mut stmt| stmt.query_row([&log_file_id], |row| row.get(0))) .map_err(|_| "Failed to load log file metadata".to_string())? }; let content = std::fs::read_to_string(&file_path).map_err(|_| "Failed to read log file content")?; let detector = PiiDetector::new(); let spans = detector.detect(&content); // Store PII spans in the database for later use { let db = state.db.lock().map_err(|e| e.to_string())?; for span in &spans { let record = PiiSpanRecord { id: span.id.clone(), log_file_id: log_file_id.clone(), pii_type: span.pii_type.clone(), start_offset: span.start as i64, end_offset: span.end as i64, original_value: String::new(), replacement: span.replacement.clone(), }; if let Err(err) = db.execute( "INSERT OR REPLACE INTO pii_spans (id, log_file_id, pii_type, start_offset, end_offset, original_value, replacement) \ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)", rusqlite::params![ record.id, record.log_file_id, record.pii_type, record.start_offset, record.end_offset, record.original_value, record.replacement ], ) { warn!(error = %err, span_id = %span.id, "failed to persist pii span"); } } } let total_pii_found = spans.len(); Ok(PiiDetectionResult { log_file_id, detections: spans, total_pii_found, }) } /// Maximum text size accepted by scan_text_for_pii to prevent DoS on large payloads. const MAX_TEXT_SCAN_BYTES: usize = 32 * 1024; // 32 KB /// Scan arbitrary text for PII without creating any database records. /// Used by the backend before sending typed chat messages to AI providers. #[tauri::command] pub async fn scan_text_for_pii(text: String) -> Result { if text.len() > MAX_TEXT_SCAN_BYTES { return Err(format!( "Text too large for inline PII scan ({} bytes; limit {MAX_TEXT_SCAN_BYTES})", text.len() )); } let detector = PiiDetector::new(); let spans = detector.detect(&text); let total_pii_found = spans.len(); Ok(PiiDetectionResult { log_file_id: String::new(), detections: spans, total_pii_found, }) } #[tauri::command] pub async fn apply_redactions( log_file_id: String, approved_span_ids: Vec, state: State<'_, AppState>, ) -> Result { // Load file path let file_path: String = { let db = state.db.lock().map_err(|e| e.to_string())?; db.prepare("SELECT file_path FROM log_files WHERE id = ?1") .and_then(|mut stmt| stmt.query_row([&log_file_id], |row| row.get(0))) .map_err(|_| "Failed to load log file metadata".to_string())? }; let content = std::fs::read_to_string(&file_path).map_err(|_| "Failed to read log file content")?; // Load PII spans from DB, filtering to only approved ones let spans: Vec = { type Row = (String, String, i64, i64, String, String); let db = state.db.lock().map_err(|e| e.to_string())?; let raw: Vec = db .prepare( "SELECT id, pii_type, start_offset, end_offset, original_value, replacement \ FROM pii_spans WHERE log_file_id = ?1 ORDER BY start_offset ASC", ) .and_then(|mut stmt| { stmt.query_map([&log_file_id], |row| { Ok(( row.get::<_, String>(0)?, row.get::<_, String>(1)?, row.get::<_, i64>(2)?, row.get::<_, i64>(3)?, row.get::<_, String>(4)?, row.get::<_, String>(5)?, )) }) .map(|rows| rows.filter_map(|r| r.ok()).collect::>()) }) .unwrap_or_default(); drop(db); raw.into_iter() .map( |(id, pii_type, start, end, original, replacement)| pii::PiiSpan { id, pii_type, start: start as usize, end: end as usize, original, replacement, }, ) .filter(|span| approved_span_ids.contains(&span.id)) .collect() }; // Apply redactions using the redactor module let redacted_text = pii::apply_redactions(&content, &spans); let data_hash = pii::hash_content(&redacted_text); // Save redacted file alongside original let redacted_path = format!("{file_path}.redacted"); std::fs::write(&redacted_path, &redacted_text) .map_err(|_| "Failed to write redacted output file".to_string())?; // Mark the log file as redacted in DB { let db = state.db.lock().map_err(|e| e.to_string())?; db.execute( "UPDATE log_files SET redacted = 1 WHERE id = ?1", [&log_file_id], ) .map_err(|_| "Failed to mark file as redacted".to_string())?; db.execute( "UPDATE pii_spans SET original_value = '' WHERE log_file_id = ?1", [&log_file_id], ) .map_err(|_| "Failed to finalize redaction metadata".to_string())?; } Ok(RedactedLogFile { log_file_id, redacted_text, data_hash, }) } #[tauri::command] pub async fn get_log_file_content( log_file_id: String, state: State<'_, AppState>, ) -> Result { let row: (Option>, String) = { let db = state.db.lock().map_err(|e| e.to_string())?; db.prepare("SELECT content_compressed, file_path FROM log_files WHERE id = ?1") .and_then(|mut stmt| { stmt.query_row([&log_file_id], |row| Ok((row.get(0)?, row.get(1)?))) }) .map_err(|_| "Log file not found".to_string())? }; let (compressed, file_path) = row; if let Some(bytes) = compressed { return decompress_text(&bytes); } std::fs::read_to_string(&file_path).map_err(|e| format!("Log file not found on disk: {e}")) } #[tauri::command] pub async fn list_all_log_files( search: Option, issue_id: Option, state: State<'_, AppState>, ) -> Result, String> { let db = state.db.lock().map_err(|e| e.to_string())?; let mut query = "SELECT id, issue_id, file_name, file_path, file_size, mime_type, content_hash, uploaded_at, redacted, issue_title \ FROM v_log_files_with_issue WHERE 1=1" .to_string(); let mut params: Vec = Vec::new(); if let Some(ref q) = search { query.push_str(" AND file_name LIKE ?"); params.push(format!("%{q}%")); } if let Some(ref id) = issue_id { query.push_str(" AND issue_id = ?"); params.push(id.clone()); } query.push_str(" ORDER BY uploaded_at DESC"); let mut stmt = db.prepare(&query).map_err(|e| e.to_string())?; let results = stmt .query_map(rusqlite::params_from_iter(params.iter()), |row| { Ok(LogFileSummary { id: row.get(0)?, issue_id: row.get(1)?, file_name: row.get(2)?, file_path: row.get(3)?, file_size: row.get(4)?, mime_type: row.get(5)?, content_hash: row.get(6)?, uploaded_at: row.get(7)?, redacted: row.get::<_, i32>(8)? != 0, issue_title: row.get(9)?, }) }) .map_err(|e| e.to_string())? .filter_map(|r| r.ok()) .collect(); Ok(results) } #[cfg(test)] mod tests { use super::*; #[test] fn test_validate_log_file_path_rejects_non_file() { let dir = std::env::temp_dir(); let result = validate_log_file_path(dir.to_string_lossy().as_ref()); assert!(result.is_err()); } #[test] fn test_validate_log_file_path_accepts_small_file() { let file_path = std::env::temp_dir().join(format!("tftsr-analysis-test-{}.log", uuid::Uuid::now_v7())); std::fs::write(&file_path, "hello").unwrap(); let result = validate_log_file_path(file_path.to_string_lossy().as_ref()); assert!(result.is_ok()); let _ = std::fs::remove_file(file_path); } #[test] fn test_is_safe_file_allows_txt() { assert!(is_safe_file(Path::new("file.txt"))); } #[test] fn test_is_safe_file_allows_md() { assert!(is_safe_file(Path::new("readme.md"))); } #[test] fn test_is_safe_file_allows_pdf() { assert!(is_safe_file(Path::new("report.pdf"))); } #[test] fn test_is_safe_file_allows_docx() { assert!(is_safe_file(Path::new("doc.docx"))); } #[test] fn test_is_safe_file_rejects_exe() { assert!(!is_safe_file(Path::new("malware.exe"))); } #[test] fn test_is_safe_file_rejects_dll() { assert!(!is_safe_file(Path::new("library.dll"))); } #[test] fn test_is_safe_file_rejects_zip_directly() { assert!(!is_safe_file(Path::new("archive.zip"))); } #[test] fn test_is_safe_file_case_insensitive() { assert!(is_safe_file(Path::new("file.TXT"))); assert!(is_safe_file(Path::new("file.Log"))); } #[test] fn test_is_safe_file_no_extension_rejected() { assert!(!is_safe_file(Path::new("Makefile"))); } #[test] fn test_extract_text_plain_file() { let dir = std::env::temp_dir(); let path = dir.join(format!("tftsr-test-extract-{}.txt", uuid::Uuid::now_v7())); std::fs::write(&path, "hello world").unwrap(); let result = extract_text_content(&path); assert!(result.is_ok()); assert_eq!(result.unwrap().trim(), "hello world"); let _ = std::fs::remove_file(path); } #[test] fn test_extract_text_unsupported_binary_returns_error() { let result = extract_text_content(Path::new("data.xlsx")); assert!(result.is_err()); assert!(result.unwrap_err().contains("not yet supported")); } #[test] fn test_compress_decompress_roundtrip() { let original = "Hello, World! This is a log line with some content."; let compressed = compress_text(original).unwrap(); assert!(!compressed.is_empty()); assert!(compressed.len() < original.len() * 3); let decompressed = decompress_text(&compressed).unwrap(); assert_eq!(decompressed, original); } #[test] fn test_compress_returns_error_not_empty_on_failure() { // compress_text returns Result — callers must propagate, not silently discard. // For in-memory gzip this essentially never fails, but the API now allows // callers to surface the error rather than storing empty bytes. let result = compress_text("normal log line"); assert!( result.is_ok(), "compress_text should succeed for normal input" ); assert!(!result.unwrap().is_empty()); } #[test] fn test_compress_large_text_is_smaller() { let original = "INFO server started\n".repeat(1000); let compressed = compress_text(&original).unwrap(); assert!( compressed.len() < original.len(), "gzip should compress repetitive text" ); } #[test] fn test_decompress_invalid_bytes_returns_error() { let result = decompress_text(b"not gzip data"); assert!(result.is_err()); } #[test] fn test_decompress_size_limit_enforced() { assert_eq!( MAX_DECOMPRESSED_BYTES, 100 * 1024 * 1024, "Decompression bomb guard must be 100 MB" ); // A valid small payload must still decompress fine after the guard is in place. let text = "hello world decompression guard test\n".repeat(100); let compressed = compress_text(&text).unwrap(); let result = decompress_text(&compressed); assert!(result.is_ok()); assert_eq!(result.unwrap(), text); } }