feat(upload): add safe file extension validation and binary text extraction

- Add extension allowlist (SAFE_TEXT_EXTENSIONS + SAFE_BINARY_EXTENSIONS)
  rejecting unsupported file types at both upload_log_file and
  upload_log_file_by_content entry points
- Add extract_text_content() with PDF text extraction via lopdf and
  DOCX extraction via zip+quick-xml
- Binary files (PDF/DOCX) get extracted text written to .extracted.txt
  for downstream PII detection
- Expand frontend file input accept list and add collapsible
  supported-formats disclosure element
- Add 11 unit tests covering allowlist logic and extraction paths
This commit is contained in:
Shaun Arman 2026-05-31 13:50:59 -05:00
parent cd67a09a6a
commit f47ec90d05
3 changed files with 286 additions and 19 deletions

View File

@ -45,6 +45,9 @@ warp = "0.3"
urlencoding = "2"
infer = "0.15"
url = "2.5.8"
lopdf = "0.31"
zip = "0.6"
quick-xml = "0.36"
rmcp = { version = "1.7.0", features = [
"client",
"transport-child-process",

View File

@ -9,6 +9,139 @@ use crate::state::AppState;
const MAX_LOG_FILE_BYTES: u64 = 50 * 1024 * 1024;
/// Extensions (lowercase, without the leading dot) of text formats accepted for upload.
///
/// `is_safe_file` lowercases the candidate extension before comparing, so every
/// entry here must stay lowercase. Files with these extensions fall through to the
/// plain `read_to_string` branch of `extract_text_content`.
///
/// NOTE(review): `rtf` is listed as text, so it is read verbatim and RTF control
/// words end up in the extracted content — confirm that is intended.
const SAFE_TEXT_EXTENSIONS: &[&str] = &[
"log",
"txt",
"out",
"err",
"syslog",
"journal",
"yaml",
"yml",
"json",
"toml",
"xml",
"ini",
"cfg",
"conf",
"config",
"env",
"properties",
"md",
"markdown",
"rst",
"csv",
"tsv",
"ndjson",
"jsonl",
"sql",
"sh",
"bash",
"zsh",
"py",
"js",
"ts",
"rb",
"go",
"rs",
"java",
"html",
"htm",
"css",
"diff",
"patch",
"rtf",
];
/// Extensions (lowercase, without the leading dot) of binary document formats
/// accepted for upload.
///
/// Being listed here only means `is_safe_file` accepts the file; it does not imply
/// that text can be extracted. `extract_text_content` currently returns an error
/// for `xlsx` and `xls`.
const SAFE_BINARY_EXTENSIONS: &[&str] = &["pdf", "docx", "doc", "xlsx", "xls"];
/// Returns `true` when `path` names a file whose extension is on the upload allowlist
/// (`SAFE_TEXT_EXTENSIONS` or `SAFE_BINARY_EXTENSIONS`).
///
/// The comparison is case-insensitive. Only the name is inspected; the file is not
/// opened and does not need to exist.
///
/// Dotfiles are handled explicitly: `Path::extension` reports no extension for a
/// name such as `.env`, so for those the text after the leading dot is used as the
/// extension. Without this, a file literally named `.env` would be rejected even
/// though `env` is allowlisted. Names with no dot at all (e.g. `Makefile`) and
/// non-UTF-8 names are rejected.
pub fn is_safe_file(path: &Path) -> bool {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .or_else(|| {
            // `.env` has a file name but no extension; treat "env" as the extension.
            path.file_name()
                .and_then(|n| n.to_str())
                .and_then(|n| n.strip_prefix('.'))
        })
        .map(str::to_lowercase);
    match ext.as_deref() {
        Some(e) => SAFE_TEXT_EXTENSIONS.contains(&e) || SAFE_BINARY_EXTENSIONS.contains(&e),
        None => false,
    }
}
/// Returns the textual content of the file at `path`, choosing an extraction
/// strategy from the (case-insensitive) file extension.
///
/// * `pdf` — delegated to `extract_pdf_text`.
/// * `docx` / `doc` — delegated to `extract_docx_text`, unless the file carries the
///   OLE compound-file signature (see below).
/// * `xlsx` / `xls` — always an error; no file access is performed.
/// * anything else, including no extension — read as UTF-8 text.
///
/// # Errors
///
/// Returns a human-readable message when the file cannot be read, is not valid
/// UTF-8 (text branch), cannot be parsed by the selected extractor, or is in a
/// format for which extraction is not supported.
pub fn extract_text_content(path: &Path) -> Result<String, String> {
    /// Reports whether the file starts with the 8-byte OLE compound-file signature.
    /// Unreadable or too-short files yield `false` so the regular extractor can
    /// report the underlying I/O problem.
    fn has_ole_signature(path: &Path) -> bool {
        use std::io::Read as _;
        const OLE_SIGNATURE: [u8; 8] = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
        let mut header = [0u8; 8];
        match std::fs::File::open(path) {
            Ok(mut file) => file.read_exact(&mut header).is_ok() && header == OLE_SIGNATURE,
            Err(_) => false,
        }
    }

    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_lowercase())
        .unwrap_or_default();
    match ext.as_str() {
        "pdf" => extract_pdf_text(path),
        "docx" | "doc" => {
            // Legacy Word 97-2003 files (and password-protected OOXML) are OLE
            // containers, not ZIP archives. Feeding them to the DOCX extractor only
            // produces a confusing "Failed to open as ZIP/DOCX" error, so reject
            // them up front with guidance the user can act on.
            if has_ole_signature(path) {
                return Err(
                    "File appears to be a legacy Word (.doc) or password-protected document, \
                     which is not supported for text extraction. Save it as an unprotected \
                     .docx or PDF and upload that instead."
                        .to_string(),
                );
            }
            extract_docx_text(path)
        }
        "xlsx" | "xls" => Err(format!(
            "Spreadsheet format .{ext} is not yet supported for text extraction. \
             Export the sheet as CSV and upload that instead."
        )),
        _ => std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {e}")),
    }
}
/// Extracts the text of every page of the PDF at `path`, in ascending page order.
///
/// Each page's text is followed by a newline. Pages whose text cannot be extracted
/// are skipped rather than failing the whole document.
///
/// # Errors
///
/// Fails when the PDF cannot be loaded, or when no page yields any
/// non-whitespace text.
fn extract_pdf_text(path: &Path) -> Result<String, String> {
    let document = match lopdf::Document::load(path) {
        Ok(document) => document,
        Err(e) => return Err(format!("Failed to parse PDF: {e}")),
    };

    let page_map = document.get_pages();
    let mut page_numbers: Vec<u32> = page_map.keys().copied().collect();
    page_numbers.sort_unstable();

    let mut extracted = String::new();
    let readable_pages = page_numbers
        .into_iter()
        .filter_map(|number| document.extract_text(&[number]).ok());
    for page_text in readable_pages {
        extracted.push_str(&page_text);
        extracted.push('\n');
    }

    if extracted.trim().is_empty() {
        Err("PDF contains no extractable text (may be a scanned image)".to_string())
    } else {
        Ok(extracted)
    }
}
/// Extracts the readable text from the `word/document.xml` part of the DOCX at `path`.
///
/// Word routinely splits a single word across several runs (formatting changes,
/// spell-check marks, revision ids). Text inside run-text elements is therefore
/// concatenated verbatim — no trimming and no injected separators — so tokens such
/// as e-mail addresses survive intact. Paragraph ends and line breaks become
/// newlines, tabs become tab characters. Character data found in any other element
/// is still kept, set off by spaces, so nothing the document contains is hidden
/// from downstream scanning.
///
/// # Errors
///
/// Fails when the file cannot be opened, is not a ZIP archive, lacks
/// `word/document.xml`, contains malformed XML, or yields no non-whitespace text.
fn extract_docx_text(path: &Path) -> Result<String, String> {
    use quick_xml::events::Event;
    use std::io::Read as _;

    /// Elements (by local name) whose character data is literal document text:
    /// regular run text, tracked-change deleted text, and field instructions.
    fn is_run_text(local_name: &[u8]) -> bool {
        matches!(local_name, b"t" | b"delText" | b"instrText")
    }

    let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
    let mut archive =
        zip::ZipArchive::new(file).map_err(|e| format!("Failed to open as ZIP/DOCX: {e}"))?;

    let mut xml_content = String::new();
    {
        // Scoped so the borrow of `archive` ends once the part has been read.
        let mut doc_xml = archive
            .by_name("word/document.xml")
            .map_err(|_| "Not a valid DOCX: missing word/document.xml".to_string())?;
        doc_xml
            .read_to_string(&mut xml_content)
            .map_err(|e| format!("Failed to read document.xml: {e}"))?;
    }

    // Text trimming is deliberately left off: whitespace inside run text is
    // significant (it is frequently the only thing separating two words).
    let mut reader = quick_xml::Reader::from_str(&xml_content);
    let mut text = String::new();
    let mut in_run_text = false;
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                if is_run_text(e.local_name().as_ref()) {
                    in_run_text = true;
                }
            }
            Ok(Event::End(e)) => match e.local_name().as_ref() {
                b"p" => text.push('\n'),
                name if is_run_text(name) => in_run_text = false,
                _ => {}
            },
            Ok(Event::Empty(e)) => match e.local_name().as_ref() {
                // Also matches tab-stop definitions in paragraph properties; the
                // stray tab that produces is harmless whitespace.
                b"tab" => text.push('\t'),
                b"br" | b"cr" | b"p" => text.push('\n'),
                _ => {}
            },
            Ok(Event::Text(e)) => {
                // Keep the raw bytes if entity unescaping fails rather than
                // silently dropping content.
                let decoded = match e.unescape() {
                    Ok(s) => s,
                    Err(_) => String::from_utf8_lossy(&e),
                };
                if in_run_text {
                    text.push_str(&decoded);
                } else {
                    let other = decoded.trim();
                    if !other.is_empty() {
                        text.push(' ');
                        text.push_str(other);
                        text.push(' ');
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(format!("XML parse error: {e}")),
            _ => {}
        }
        buf.clear();
    }

    if text.trim().is_empty() {
        return Err("DOCX contains no extractable text".to_string());
    }
    Ok(text)
}
fn validate_log_file_path(file_path: &str) -> Result<PathBuf, String> {
let path = Path::new(file_path);
let canonical = std::fs::canonicalize(path).map_err(|_| "Unable to access selected file")?;
@ -35,24 +168,59 @@ pub async fn upload_log_file(
state: State<'_, AppState>,
) -> Result<LogFile, String> {
let canonical_path = validate_log_file_path(&file_path)?;
let content = std::fs::read(&canonical_path).map_err(|_| "Failed to read selected log file")?;
let content_hash = format!("{:x}", Sha256::digest(&content));
if !is_safe_file(&canonical_path) {
let ext = canonical_path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("(none)");
return Err(format!(
"File type '.{ext}' is not supported. Supported formats include .log, .txt, .json, .pdf, .docx, .md, and many more."
));
}
let file_name = canonical_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("unknown")
.to_string();
let file_size = content.len() as i64;
let mime_type = if file_name.ends_with(".json") {
"application/json"
} else if file_name.ends_with(".xml") {
"application/xml"
} else {
"text/plain"
let file_ext = canonical_path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase())
.unwrap_or_default();
let extracted_text = extract_text_content(&canonical_path)
.map_err(|e| format!("Failed to read file content: {e}"))?;
let content_bytes = extracted_text.as_bytes();
let content_hash = format!("{:x}", Sha256::digest(content_bytes));
let file_size = content_bytes.len() as i64;
let mime_type = match file_ext.as_str() {
"json" => "application/json",
"xml" => "application/xml",
"yaml" | "yml" => "application/yaml",
"pdf" => "application/pdf",
"docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"doc" => "application/msword",
"md" | "markdown" => "text/markdown",
"csv" | "tsv" => "text/csv",
"html" | "htm" => "text/html",
_ => "text/plain",
};
let canonical_file_path = canonical_path.to_string_lossy().to_string();
let log_file = LogFile::new(issue_id.clone(), file_name, canonical_file_path, file_size);
let is_binary = SAFE_BINARY_EXTENSIONS.contains(&file_ext.as_str());
let stored_path = if is_binary {
let extracted_path = canonical_path.with_extension("extracted.txt");
std::fs::write(&extracted_path, &extracted_text)
.map_err(|e| format!("Failed to write extracted text: {e}"))?;
extracted_path.to_string_lossy().to_string()
} else {
canonical_path.to_string_lossy().to_string()
};
let log_file = LogFile::new(issue_id.clone(), file_name, stored_path, file_size);
let log_file = LogFile {
content_hash: content_hash.clone(),
mime_type: mime_type.to_string(),
@ -104,17 +272,36 @@ pub async fn upload_log_file_by_content(
content: String,
state: State<'_, AppState>,
) -> Result<LogFile, String> {
let fake_path = Path::new(&file_name);
if !is_safe_file(fake_path) {
let ext = fake_path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("(none)");
return Err(format!("File type '.{ext}' is not supported."));
}
let content_bytes = content.as_bytes();
let content_hash = format!("{:x}", Sha256::digest(content_bytes));
let file_size = content_bytes.len() as i64;
// Determine mime type based on file extension
let mime_type = if file_name.ends_with(".json") {
"application/json"
} else if file_name.ends_with(".xml") {
"application/xml"
} else {
"text/plain"
let file_ext = fake_path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase())
.unwrap_or_default();
let mime_type = match file_ext.as_str() {
"json" => "application/json",
"xml" => "application/xml",
"yaml" | "yml" => "application/yaml",
"pdf" => "application/pdf",
"docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"doc" => "application/msword",
"md" | "markdown" => "text/markdown",
"csv" | "tsv" => "text/csv",
"html" | "htm" => "text/html",
_ => "text/plain",
};
// Use the file_name as the file_path for DB storage
@ -328,4 +515,68 @@ mod tests {
assert!(result.is_ok());
let _ = std::fs::remove_file(file_path);
}
#[test]
fn test_is_safe_file_allows_txt() {
    // Plain `.txt` is the baseline text format on the allowlist.
    let candidate = Path::new("file.txt");
    assert!(is_safe_file(candidate), "expected .txt to be accepted");
}
#[test]
fn test_is_safe_file_allows_md() {
    // Markdown is listed among the text extensions.
    let candidate = Path::new("readme.md");
    assert!(is_safe_file(candidate), "expected .md to be accepted");
}
#[test]
fn test_is_safe_file_allows_pdf() {
    // PDF is accepted through the binary-extension list.
    let candidate = Path::new("report.pdf");
    assert!(is_safe_file(candidate), "expected .pdf to be accepted");
}
#[test]
fn test_is_safe_file_allows_docx() {
    // DOCX is accepted through the binary-extension list.
    let candidate = Path::new("doc.docx");
    assert!(is_safe_file(candidate), "expected .docx to be accepted");
}
#[test]
fn test_is_safe_file_rejects_exe() {
    // Executables are on neither allowlist.
    let candidate = Path::new("malware.exe");
    assert!(!is_safe_file(candidate), "expected .exe to be rejected");
}
#[test]
fn test_is_safe_file_rejects_dll() {
    // Shared libraries are on neither allowlist.
    let candidate = Path::new("library.dll");
    assert!(!is_safe_file(candidate), "expected .dll to be rejected");
}
#[test]
fn test_is_safe_file_rejects_zip_directly() {
    // A bare `.zip` must be rejected: only the listed document extensions are allowed.
    let candidate = Path::new("archive.zip");
    assert!(!is_safe_file(candidate), "expected .zip to be rejected");
}
#[test]
fn test_is_safe_file_case_insensitive() {
    // Upper- and mixed-case extensions must match their lowercase allowlist entries.
    for name in ["file.TXT", "file.Log"] {
        assert!(is_safe_file(Path::new(name)), "expected {name} to be accepted");
    }
}
#[test]
fn test_is_safe_file_no_extension_rejected() {
    // A name without any dot has no extension to check, so it is rejected.
    let candidate = Path::new("Makefile");
    assert!(
        !is_safe_file(candidate),
        "expected extension-less name to be rejected"
    );
}
#[test]
fn test_extract_text_plain_file() {
    // A uniquely named temp file keeps parallel test runs from colliding.
    let dir = std::env::temp_dir();
    let path = dir.join(format!("tftsr-test-extract-{}.txt", uuid::Uuid::now_v7()));
    std::fs::write(&path, "hello world").unwrap();

    let result = extract_text_content(&path);
    // Remove the fixture before asserting so a failing assertion cannot leak it.
    let _ = std::fs::remove_file(&path);

    let text = result.expect("plain text extraction should succeed");
    assert_eq!(text.trim(), "hello world");
}
#[test]
fn test_extract_text_unsupported_binary_returns_error() {
    // No fixture is needed: the spreadsheet branch is selected from the extension
    // alone and returns its error without touching the file system.
    match extract_text_content(Path::new("data.xlsx")) {
        Ok(_) => panic!("expected spreadsheet extraction to fail"),
        Err(message) => assert!(message.contains("not yet supported")),
    }
}
}

View File

@ -252,8 +252,21 @@ export default function LogUpload() {
multiple
className="hidden"
onChange={handleFileSelect}
accept=".log,.txt,.json,.csv,.xml,.yaml,.yml"
accept=".log,.txt,.out,.err,.syslog,.journal,.yaml,.yml,.json,.toml,.xml,.ini,.cfg,.conf,.config,.env,.properties,.md,.markdown,.rst,.csv,.tsv,.ndjson,.jsonl,.sql,.sh,.bash,.zsh,.py,.js,.ts,.rb,.go,.rs,.java,.html,.htm,.css,.diff,.patch,.pdf,.docx,.doc,.rtf,.xlsx,.xls"
/>
{/* Keep these lists in sync with the file input's accept attribute and the backend extension allowlists. */}
<details className="mt-2 text-sm text-gray-500 dark:text-gray-400">
  <summary className="cursor-pointer hover:text-gray-700 dark:hover:text-gray-200">
    Supported formats
  </summary>
  <div className="mt-1 pl-3 space-y-1">
    <div><span className="font-medium">Logs &amp; text:</span> .log, .txt, .out, .err, .syslog, .journal</div>
    <div><span className="font-medium">Config &amp; markup:</span> .yaml, .yml, .json, .toml, .xml, .ini, .cfg, .conf, .config, .env, .properties</div>
    <div><span className="font-medium">Documents:</span> .pdf, .docx, .doc, .md, .markdown, .rst, .rtf</div>
    <div><span className="font-medium">Data:</span> .csv, .tsv, .xlsx, .xls, .ndjson, .jsonl, .sql</div>
    <div><span className="font-medium">Code &amp; scripts:</span> .sh, .bash, .zsh, .py, .js, .ts, .rb, .go, .rs, .java, .html, .htm, .css, .diff, .patch</div>
    {/* The backend returns an error for .xlsx/.xls, so do not promise extraction for them. */}
    <p className="mt-1 italic">Text is extracted automatically from PDF and DOCX files. Spreadsheets (.xlsx, .xls) are not yet supported for text extraction — export the sheet as CSV and upload that instead.</p>
  </div>
</details>
</div>
{/* File list */}