feat(upload): add safe file extension validation and binary text extraction

- Add extension allowlist (SAFE_TEXT_EXTENSIONS + SAFE_BINARY_EXTENSIONS) rejecting unsupported file types at both upload_log_file and upload_log_file_by_content entry points
- Add extract_text_content() with PDF text extraction via lopdf and DOCX extraction via zip+quick-xml
- Binary files (PDF/DOCX) get extracted text written to .extracted.txt for downstream PII detection
- Expand frontend file input accept list and add collapsible supported-formats disclosure element
- Add 11 unit tests covering allowlist logic and extraction paths
2026-05-31 13:50:59 -05:00 · 2026-05-31 13:50:59 -05:00 · f47ec90d05
commit f47ec90d05
parent cd67a09a6a
3 changed files with 286 additions and 19 deletions
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@ -45,6 +45,9 @@ warp = "0.3"
 urlencoding = "2"
 infer = "0.15"
 url = "2.5.8"
+lopdf = "0.31"
+zip = "0.6"
+quick-xml = "0.36"
 rmcp = { version = "1.7.0", features = [
    "client",
    "transport-child-process",
--- a/src-tauri/src/commands/analysis.rs
+++ b/src-tauri/src/commands/analysis.rs
@ -9,6 +9,139 @@ use crate::state::AppState;

 const MAX_LOG_FILE_BYTES: u64 = 50 * 1024 * 1024;

+/// Extensions (without the leading dot) of text-based formats accepted for upload.
+///
+/// `is_safe_file` lower-cases a file's extension before looking it up here, so every entry
+/// must itself be lower-case to be matchable.
+const SAFE_TEXT_EXTENSIONS: &[&str] = &[
+    "log",
+    "txt",
+    "out",
+    "err",
+    "syslog",
+    "journal",
+    "yaml",
+    "yml",
+    "json",
+    "toml",
+    "xml",
+    "ini",
+    "cfg",
+    "conf",
+    "config",
+    "env",
+    "properties",
+    "md",
+    "markdown",
+    "rst",
+    "csv",
+    "tsv",
+    "ndjson",
+    "jsonl",
+    "sql",
+    "sh",
+    "bash",
+    "zsh",
+    "py",
+    "js",
+    "ts",
+    "rb",
+    "go",
+    "rs",
+    "java",
+    "html",
+    "htm",
+    "css",
+    "diff",
+    "patch",
+    "rtf",
+];
+
+/// Extensions (without the leading dot) of document formats accepted for upload.
+///
+/// `extract_text_content` has extraction paths for `pdf`, `docx` and `doc`; `xlsx` and `xls`
+/// pass the allowlist but are rejected there with a "not yet supported" error.
+const SAFE_BINARY_EXTENSIONS: &[&str] = &["pdf", "docx", "doc", "xlsx", "xls"];
+
+/// Reports whether `path` has an extension listed in either upload allowlist.
+///
+/// The comparison is case-insensitive. A path with no extension, or with an extension that is
+/// not valid UTF-8, is rejected.
+pub fn is_safe_file(path: &Path) -> bool {
+    match path.extension().and_then(|raw| raw.to_str()) {
+        None => false,
+        Some(raw) => {
+            let normalized = raw.to_lowercase();
+            SAFE_TEXT_EXTENSIONS
+                .iter()
+                .chain(SAFE_BINARY_EXTENSIONS.iter())
+                .any(|allowed| *allowed == normalized.as_str())
+        }
+    }
+}
+
+/// Returns the textual content of the file at `path`, dispatching on its lower-cased extension.
+///
+/// - `pdf` is handled by `extract_pdf_text`.
+/// - `docx` and `doc` are handled by `extract_docx_text`.
+/// - `xlsx` and `xls` are always rejected with a hint to export the sheet as CSV.
+/// - Anything else (including a missing extension) is read from disk as text.
+///
+/// Text files are decoded as UTF-8. Invalid byte sequences are replaced with U+FFFD instead of
+/// failing the whole read, so a log containing a few stray non-UTF-8 bytes can still be
+/// uploaded.
+///
+/// # Errors
+///
+/// Returns a human-readable message when the file cannot be read or parsed, or when the format
+/// has no extractor.
+pub fn extract_text_content(path: &Path) -> Result<String, String> {
+    let ext = path
+        .extension()
+        .and_then(|e| e.to_str())
+        .map(|e| e.to_lowercase())
+        .unwrap_or_default();
+
+    match ext.as_str() {
+        "pdf" => extract_pdf_text(path),
+        // NOTE(review): `doc` goes through the ZIP-based DOCX reader; a legacy binary .doc is
+        // presumably not a ZIP archive and would fail there with a ZIP error - confirm intent.
+        "docx" | "doc" => extract_docx_text(path),
+        "xlsx" | "xls" => Err(format!(
+            "Spreadsheet format .{ext} is not yet supported for text extraction. \
+             Export the sheet as CSV and upload that instead."
+        )),
+        _ => {
+            let bytes = std::fs::read(path).map_err(|e| format!("Failed to read file: {e}"))?;
+            // Valid UTF-8 reuses the buffer as-is; only invalid input pays for a lossy copy.
+            Ok(match String::from_utf8(bytes) {
+                Ok(text) => text,
+                Err(invalid) => String::from_utf8_lossy(invalid.as_bytes()).into_owned(),
+            })
+        }
+    }
+}
+
+/// Extracts the text of every page of the PDF at `path` using `lopdf`.
+///
+/// Pages are visited in ascending page-number order and each page's text is followed by a
+/// newline. A page whose extraction fails is skipped rather than aborting the whole document.
+///
+/// # Errors
+///
+/// Returns an error when the PDF cannot be loaded, or when the collected text is empty or
+/// whitespace-only.
+fn extract_pdf_text(path: &Path) -> Result<String, String> {
+    let document =
+        lopdf::Document::load(path).map_err(|e| format!("Failed to parse PDF: {e}"))?;
+
+    let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
+    page_numbers.sort_unstable();
+
+    let mut extracted = String::new();
+    for number in page_numbers {
+        match document.extract_text(&[number]) {
+            Ok(page_text) => {
+                extracted.push_str(&page_text);
+                extracted.push('\n');
+            }
+            // Best effort: an unreadable page does not invalidate the others.
+            Err(_) => continue,
+        }
+    }
+
+    if extracted.trim().is_empty() {
+        Err("PDF contains no extractable text (may be a scanned image)".to_string())
+    } else {
+        Ok(extracted)
+    }
+}
+
+/// Extracts text from a DOCX file by reading `word/document.xml` out of its ZIP container.
+///
+/// Character data is collected from `<w:t>`, `<w:delText>` and `<w:instrText>` elements and
+/// concatenated verbatim. No separator is inserted between runs, because a single word (or an
+/// e-mail address, ID, ...) is often split across several runs and an injected space would
+/// break it apart. Each closing `</w:p>` ends the current line, `<w:tab/>` becomes a tab and
+/// `<w:br/>` / `<w:cr/>` become newlines. `<w:tab>` entries inside a `<w:tabs>` tab-stop list
+/// are ignored because they describe formatting, not content.
+///
+/// # Errors
+///
+/// Returns an error when the file cannot be opened, is not a ZIP archive, lacks
+/// `word/document.xml`, contains malformed XML, or yields no non-whitespace text.
+fn extract_docx_text(path: &Path) -> Result<String, String> {
+    use quick_xml::events::Event;
+    use std::io::Read as _;
+
+    let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
+    let mut archive =
+        zip::ZipArchive::new(file).map_err(|e| format!("Failed to open as ZIP/DOCX: {e}"))?;
+    let mut xml_content = String::new();
+    {
+        // Scoped so the entry's borrow of `archive` ends as soon as the XML has been read.
+        let mut doc_xml = archive
+            .by_name("word/document.xml")
+            .map_err(|_| "Not a valid DOCX: missing word/document.xml".to_string())?;
+        doc_xml
+            .read_to_string(&mut xml_content)
+            .map_err(|e| format!("Failed to read document.xml: {e}"))?;
+    }
+
+    let mut text = String::new();
+    // Reader-level trimming is left at its default (off): whitespace inside run text is content.
+    let mut reader = quick_xml::Reader::from_str(&xml_content);
+    let mut buf = Vec::new();
+    // True while inside an element whose character data is document text.
+    let mut in_text = false;
+    // True while inside a <w:tabs> tab-stop definition list.
+    let mut in_tab_stops = false;
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Start(e)) => match e.name().as_ref() {
+                b"w:t" | b"w:delText" | b"w:instrText" => in_text = true,
+                b"w:tabs" => in_tab_stops = true,
+                b"w:tab" if !in_tab_stops => text.push('\t'),
+                b"w:br" | b"w:cr" => text.push('\n'),
+                _ => {}
+            },
+            Ok(Event::Empty(e)) => match e.name().as_ref() {
+                b"w:tab" if !in_tab_stops => text.push('\t'),
+                b"w:br" | b"w:cr" => text.push('\n'),
+                _ => {}
+            },
+            Ok(Event::End(e)) => match e.name().as_ref() {
+                b"w:t" | b"w:delText" | b"w:instrText" => in_text = false,
+                b"w:tabs" => in_tab_stops = false,
+                b"w:p" => text.push('\n'),
+                _ => {}
+            },
+            Ok(Event::Text(e)) if in_text => {
+                // Best effort: a text node that fails to unescape is skipped, not fatal.
+                if let Ok(s) = e.unescape() {
+                    text.push_str(&s);
+                }
+            }
+            Ok(Event::Eof) => break,
+            Err(e) => return Err(format!("XML parse error: {e}")),
+            _ => {}
+        }
+        buf.clear();
+    }
+    if text.trim().is_empty() {
+        return Err("DOCX contains no extractable text".to_string());
+    }
+    Ok(text)
+}
+
 fn validate_log_file_path(file_path: &str) -> Result<PathBuf, String> {
    let path = Path::new(file_path);
    let canonical = std::fs::canonicalize(path).map_err(|_| "Unable to access selected file")?;
@ -35,24 +168,59 @@ pub async fn upload_log_file(
    state: State<'_, AppState>,
 ) -> Result<LogFile, String> {
    let canonical_path = validate_log_file_path(&file_path)?;
-    let content = std::fs::read(&canonical_path).map_err(|_| "Failed to read selected log file")?;
-    let content_hash = format!("{:x}", Sha256::digest(&content));
+
+    if !is_safe_file(&canonical_path) {
+        let ext = canonical_path
+            .extension()
+            .and_then(|e| e.to_str())
+            .unwrap_or("(none)");
+        return Err(format!(
+            "File type '.{ext}' is not supported. Supported formats include .log, .txt, .json, .pdf, .docx, .md, and many more."
+        ));
+    }
+
    let file_name = canonical_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown")
        .to_string();
-    let file_size = content.len() as i64;
-    let mime_type = if file_name.ends_with(".json") {
-        "application/json"
-    } else if file_name.ends_with(".xml") {
-        "application/xml"
-    } else {
-        "text/plain"
+
+    let file_ext = canonical_path
+        .extension()
+        .and_then(|e| e.to_str())
+        .map(|e| e.to_lowercase())
+        .unwrap_or_default();
+
+    let extracted_text = extract_text_content(&canonical_path)
+        .map_err(|e| format!("Failed to read file content: {e}"))?;
+    let content_bytes = extracted_text.as_bytes();
+    let content_hash = format!("{:x}", Sha256::digest(content_bytes));
+    let file_size = content_bytes.len() as i64;
+
+    let mime_type = match file_ext.as_str() {
+        "json" => "application/json",
+        "xml" => "application/xml",
+        "yaml" | "yml" => "application/yaml",
+        "pdf" => "application/pdf",
+        "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "doc" => "application/msword",
+        "md" | "markdown" => "text/markdown",
+        "csv" | "tsv" => "text/csv",
+        "html" | "htm" => "text/html",
+        _ => "text/plain",
    };

-    let canonical_file_path = canonical_path.to_string_lossy().to_string();
-    let log_file = LogFile::new(issue_id.clone(), file_name, canonical_file_path, file_size);
+    let is_binary = SAFE_BINARY_EXTENSIONS.contains(&file_ext.as_str());
+    let stored_path = if is_binary {
+        let extracted_path = canonical_path.with_extension("extracted.txt");
+        std::fs::write(&extracted_path, &extracted_text)
+            .map_err(|e| format!("Failed to write extracted text: {e}"))?;
+        extracted_path.to_string_lossy().to_string()
+    } else {
+        canonical_path.to_string_lossy().to_string()
+    };
+
+    let log_file = LogFile::new(issue_id.clone(), file_name, stored_path, file_size);
    let log_file = LogFile {
        content_hash: content_hash.clone(),
        mime_type: mime_type.to_string(),
@ -104,17 +272,36 @@ pub async fn upload_log_file_by_content(
    content: String,
    state: State<'_, AppState>,
 ) -> Result<LogFile, String> {
+    let fake_path = Path::new(&file_name);
+    if !is_safe_file(fake_path) {
+        let ext = fake_path
+            .extension()
+            .and_then(|e| e.to_str())
+            .unwrap_or("(none)");
+        return Err(format!("File type '.{ext}' is not supported."));
+    }
+
    let content_bytes = content.as_bytes();
    let content_hash = format!("{:x}", Sha256::digest(content_bytes));
    let file_size = content_bytes.len() as i64;

-    // Determine mime type based on file extension
-    let mime_type = if file_name.ends_with(".json") {
-        "application/json"
-    } else if file_name.ends_with(".xml") {
-        "application/xml"
-    } else {
-        "text/plain"
+    let file_ext = fake_path
+        .extension()
+        .and_then(|e| e.to_str())
+        .map(|e| e.to_lowercase())
+        .unwrap_or_default();
+
+    let mime_type = match file_ext.as_str() {
+        "json" => "application/json",
+        "xml" => "application/xml",
+        "yaml" | "yml" => "application/yaml",
+        "pdf" => "application/pdf",
+        "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "doc" => "application/msword",
+        "md" | "markdown" => "text/markdown",
+        "csv" | "tsv" => "text/csv",
+        "html" | "htm" => "text/html",
+        _ => "text/plain",
    };

    // Use the file_name as the file_path for DB storage
@ -328,4 +515,68 @@ mod tests {
        assert!(result.is_ok());
        let _ = std::fs::remove_file(file_path);
    }
+
+    #[test]
+    fn test_is_safe_file_allows_txt() {
+        // `txt` is on the text allowlist.
+        let candidate = Path::new("file.txt");
+        assert!(is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_allows_md() {
+        // `md` is on the text allowlist.
+        let candidate = Path::new("readme.md");
+        assert!(is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_allows_pdf() {
+        // `pdf` is on the binary allowlist.
+        let candidate = Path::new("report.pdf");
+        assert!(is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_allows_docx() {
+        // `docx` is on the binary allowlist.
+        let candidate = Path::new("doc.docx");
+        assert!(is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_rejects_exe() {
+        // `exe` appears on neither allowlist.
+        let candidate = Path::new("malware.exe");
+        assert!(!is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_rejects_dll() {
+        // `dll` appears on neither allowlist.
+        let candidate = Path::new("library.dll");
+        assert!(!is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_rejects_zip_directly() {
+        // `zip` appears on neither allowlist, so a bare archive is refused.
+        let candidate = Path::new("archive.zip");
+        assert!(!is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_is_safe_file_case_insensitive() {
+        // Upper-case and mixed-case spellings of allowed extensions must both pass.
+        for name in ["file.TXT", "file.Log"].iter() {
+            assert!(is_safe_file(Path::new(name)));
+        }
+    }
+
+    #[test]
+    fn test_is_safe_file_no_extension_rejected() {
+        // A file name without any extension cannot match the allowlists.
+        let candidate = Path::new("Makefile");
+        assert!(!is_safe_file(candidate));
+    }
+
+    #[test]
+    fn test_extract_text_plain_file() {
+        // A uniquely named temp file avoids collisions between concurrent test runs.
+        let path = std::env::temp_dir()
+            .join(format!("tftsr-test-extract-{}.txt", uuid::Uuid::now_v7()));
+        std::fs::write(&path, "hello world").unwrap();
+        let result = extract_text_content(&path);
+        // Clean up before asserting so a failing assertion does not leave the file behind.
+        let _ = std::fs::remove_file(&path);
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap().trim(), "hello world");
+    }
+
+    #[test]
+    fn test_extract_text_unsupported_binary_returns_error() {
+        // The spreadsheet branch is chosen from the extension alone; the file need not exist.
+        let outcome = extract_text_content(Path::new("data.xlsx"));
+        assert!(outcome.is_err());
+        let message = outcome.unwrap_err();
+        assert!(message.contains("not yet supported"));
+    }
 }
--- a/src/pages/LogUpload/index.tsx
+++ b/src/pages/LogUpload/index.tsx
@ -252,8 +252,21 @@ export default function LogUpload() {
          multiple
          className="hidden"
          onChange={handleFileSelect}
-          accept=".log,.txt,.json,.csv,.xml,.yaml,.yml"
+          accept=".log,.txt,.out,.err,.syslog,.journal,.yaml,.yml,.json,.toml,.xml,.ini,.cfg,.conf,.config,.env,.properties,.md,.markdown,.rst,.csv,.tsv,.ndjson,.jsonl,.sql,.sh,.bash,.zsh,.py,.js,.ts,.rb,.go,.rs,.java,.html,.htm,.css,.diff,.patch,.pdf,.docx,.doc,.rtf,.xlsx,.xls"
        />
+        <details className="mt-2 text-sm text-gray-500 dark:text-gray-400">
+          <summary className="cursor-pointer hover:text-gray-700 dark:hover:text-gray-200">
+            Supported formats
+          </summary>
+          <div className="mt-1 pl-3 space-y-1">
+            <div><span className="font-medium">Logs &amp; text:</span> .log, .txt, .out, .err, .syslog, .journal</div>
+            <div><span className="font-medium">Config &amp; markup:</span> .yaml, .yml, .json, .toml, .xml, .ini, .cfg, .conf, .env, .properties</div>
+            <div><span className="font-medium">Documents:</span> .pdf, .docx, .doc, .md, .rst, .rtf</div>
+            <div><span className="font-medium">Data:</span> .csv, .tsv, .xlsx, .xls, .ndjson, .jsonl, .sql</div>
+            <div><span className="font-medium">Code &amp; scripts:</span> .sh, .bash, .zsh, .py, .js, .ts, .rb, .go, .rs, .java, .html, .css, .diff, .patch</div>
+            <p className="mt-1 italic">Binary formats (PDF, DOCX, XLSX) will have their text extracted automatically.</p>
+          </div>
+        </details>
      </div>

      {/* File list */}