feat(upload): add safe file extension validation and binary text extraction
- Add an extension allowlist (SAFE_TEXT_EXTENSIONS + SAFE_BINARY_EXTENSIONS) that rejects unsupported file types at both the upload_log_file and upload_log_file_by_content entry points
- Add extract_text_content() with PDF text extraction via lopdf and DOCX extraction via zip + quick-xml
- For binary files (PDF/DOCX), write the extracted text to a sibling .extracted.txt file for downstream PII detection
- Expand the frontend file input accept list and add a collapsible supported-formats disclosure element
- Add 11 unit tests covering allowlist logic and extraction paths
This commit is contained in:
parent
cd67a09a6a
commit
f47ec90d05
@ -45,6 +45,9 @@ warp = "0.3"
|
||||
urlencoding = "2"
|
||||
infer = "0.15"
|
||||
url = "2.5.8"
|
||||
lopdf = "0.31"
|
||||
zip = "0.6"
|
||||
quick-xml = "0.36"
|
||||
rmcp = { version = "1.7.0", features = [
|
||||
"client",
|
||||
"transport-child-process",
|
||||
|
||||
@ -9,6 +9,139 @@ use crate::state::AppState;
|
||||
|
||||
/// 50 MiB, expressed in bytes.
const MAX_LOG_FILE_BYTES: u64 = 50 * 1024 * 1024;

/// Lower-case extensions (without the leading dot) of formats that are
/// accepted for upload and read directly as text.
#[rustfmt::skip]
const SAFE_TEXT_EXTENSIONS: &[&str] = &[
    // Logs and plain text
    "log", "txt", "out", "err", "syslog", "journal",
    // Configuration and markup
    "yaml", "yml", "json", "toml", "xml", "ini", "cfg", "conf", "config", "env", "properties",
    // Documentation
    "md", "markdown", "rst",
    // Tabular / line-oriented data
    "csv", "tsv", "ndjson", "jsonl", "sql",
    // Source code and scripts
    "sh", "bash", "zsh", "py", "js", "ts", "rb", "go", "rs", "java",
    // Web
    "html", "htm", "css",
    // Patches
    "diff", "patch",
    // Rich text
    "rtf",
];

/// Lower-case extensions (without the leading dot) of accepted document
/// formats whose content is not read directly as text.
const SAFE_BINARY_EXTENSIONS: &[&str] = &[
    "pdf",
    "docx",
    "doc",
    "xlsx",
    "xls",
];
|
||||
|
||||
pub fn is_safe_file(path: &Path) -> bool {
|
||||
let ext = path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.map(|e| e.to_lowercase());
|
||||
match ext.as_deref() {
|
||||
Some(e) => SAFE_TEXT_EXTENSIONS.contains(&e) || SAFE_BINARY_EXTENSIONS.contains(&e),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extract_text_content(path: &Path) -> Result<String, String> {
|
||||
let ext = path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.map(|e| e.to_lowercase())
|
||||
.unwrap_or_default();
|
||||
|
||||
match ext.as_str() {
|
||||
"pdf" => extract_pdf_text(path),
|
||||
"docx" | "doc" => extract_docx_text(path),
|
||||
"xlsx" | "xls" => Err(format!(
|
||||
"Spreadsheet format .{ext} is not yet supported for text extraction. \
|
||||
Export the sheet as CSV and upload that instead."
|
||||
)),
|
||||
_ => std::fs::read_to_string(path).map_err(|e| format!("Failed to read file: {e}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_pdf_text(path: &Path) -> Result<String, String> {
|
||||
let doc = lopdf::Document::load(path).map_err(|e| format!("Failed to parse PDF: {e}"))?;
|
||||
let mut text = String::new();
|
||||
let mut pages: Vec<u32> = doc.get_pages().keys().copied().collect();
|
||||
pages.sort_unstable();
|
||||
for page_num in pages {
|
||||
if let Ok(content) = doc.extract_text(&[page_num]) {
|
||||
text.push_str(&content);
|
||||
text.push('\n');
|
||||
}
|
||||
}
|
||||
if text.trim().is_empty() {
|
||||
return Err("PDF contains no extractable text (may be a scanned image)".to_string());
|
||||
}
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
fn extract_docx_text(path: &Path) -> Result<String, String> {
|
||||
use std::io::Read as _;
|
||||
let file = std::fs::File::open(path).map_err(|e| format!("Failed to open file: {e}"))?;
|
||||
let mut archive =
|
||||
zip::ZipArchive::new(file).map_err(|e| format!("Failed to open as ZIP/DOCX: {e}"))?;
|
||||
let mut xml_content = String::new();
|
||||
{
|
||||
let mut doc_xml = archive
|
||||
.by_name("word/document.xml")
|
||||
.map_err(|_| "Not a valid DOCX: missing word/document.xml".to_string())?;
|
||||
doc_xml
|
||||
.read_to_string(&mut xml_content)
|
||||
.map_err(|e| format!("Failed to read document.xml: {e}"))?;
|
||||
}
|
||||
let mut text = String::new();
|
||||
let mut reader = quick_xml::Reader::from_str(&xml_content);
|
||||
reader.config_mut().trim_text(true);
|
||||
let mut buf = Vec::new();
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(quick_xml::events::Event::Text(e)) => {
|
||||
if let Ok(s) = e.unescape() {
|
||||
let trimmed = s.trim().to_string();
|
||||
if !trimmed.is_empty() {
|
||||
text.push_str(&trimmed);
|
||||
text.push(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::Eof) => break,
|
||||
Err(e) => return Err(format!("XML parse error: {e}")),
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
if text.trim().is_empty() {
|
||||
return Err("DOCX contains no extractable text".to_string());
|
||||
}
|
||||
Ok(text)
|
||||
}
|
||||
|
||||
fn validate_log_file_path(file_path: &str) -> Result<PathBuf, String> {
|
||||
let path = Path::new(file_path);
|
||||
let canonical = std::fs::canonicalize(path).map_err(|_| "Unable to access selected file")?;
|
||||
@ -35,24 +168,59 @@ pub async fn upload_log_file(
|
||||
state: State<'_, AppState>,
|
||||
) -> Result<LogFile, String> {
|
||||
let canonical_path = validate_log_file_path(&file_path)?;
|
||||
let content = std::fs::read(&canonical_path).map_err(|_| "Failed to read selected log file")?;
|
||||
let content_hash = format!("{:x}", Sha256::digest(&content));
|
||||
|
||||
if !is_safe_file(&canonical_path) {
|
||||
let ext = canonical_path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.unwrap_or("(none)");
|
||||
return Err(format!(
|
||||
"File type '.{ext}' is not supported. Supported formats include .log, .txt, .json, .pdf, .docx, .md, and many more."
|
||||
));
|
||||
}
|
||||
|
||||
let file_name = canonical_path
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
let file_size = content.len() as i64;
|
||||
let mime_type = if file_name.ends_with(".json") {
|
||||
"application/json"
|
||||
} else if file_name.ends_with(".xml") {
|
||||
"application/xml"
|
||||
} else {
|
||||
"text/plain"
|
||||
|
||||
let file_ext = canonical_path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.map(|e| e.to_lowercase())
|
||||
.unwrap_or_default();
|
||||
|
||||
let extracted_text = extract_text_content(&canonical_path)
|
||||
.map_err(|e| format!("Failed to read file content: {e}"))?;
|
||||
let content_bytes = extracted_text.as_bytes();
|
||||
let content_hash = format!("{:x}", Sha256::digest(content_bytes));
|
||||
let file_size = content_bytes.len() as i64;
|
||||
|
||||
let mime_type = match file_ext.as_str() {
|
||||
"json" => "application/json",
|
||||
"xml" => "application/xml",
|
||||
"yaml" | "yml" => "application/yaml",
|
||||
"pdf" => "application/pdf",
|
||||
"docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"doc" => "application/msword",
|
||||
"md" | "markdown" => "text/markdown",
|
||||
"csv" | "tsv" => "text/csv",
|
||||
"html" | "htm" => "text/html",
|
||||
_ => "text/plain",
|
||||
};
|
||||
|
||||
let canonical_file_path = canonical_path.to_string_lossy().to_string();
|
||||
let log_file = LogFile::new(issue_id.clone(), file_name, canonical_file_path, file_size);
|
||||
let is_binary = SAFE_BINARY_EXTENSIONS.contains(&file_ext.as_str());
|
||||
let stored_path = if is_binary {
|
||||
let extracted_path = canonical_path.with_extension("extracted.txt");
|
||||
std::fs::write(&extracted_path, &extracted_text)
|
||||
.map_err(|e| format!("Failed to write extracted text: {e}"))?;
|
||||
extracted_path.to_string_lossy().to_string()
|
||||
} else {
|
||||
canonical_path.to_string_lossy().to_string()
|
||||
};
|
||||
|
||||
let log_file = LogFile::new(issue_id.clone(), file_name, stored_path, file_size);
|
||||
let log_file = LogFile {
|
||||
content_hash: content_hash.clone(),
|
||||
mime_type: mime_type.to_string(),
|
||||
@ -104,17 +272,36 @@ pub async fn upload_log_file_by_content(
|
||||
content: String,
|
||||
state: State<'_, AppState>,
|
||||
) -> Result<LogFile, String> {
|
||||
let fake_path = Path::new(&file_name);
|
||||
if !is_safe_file(fake_path) {
|
||||
let ext = fake_path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.unwrap_or("(none)");
|
||||
return Err(format!("File type '.{ext}' is not supported."));
|
||||
}
|
||||
|
||||
let content_bytes = content.as_bytes();
|
||||
let content_hash = format!("{:x}", Sha256::digest(content_bytes));
|
||||
let file_size = content_bytes.len() as i64;
|
||||
|
||||
// Determine mime type based on file extension
|
||||
let mime_type = if file_name.ends_with(".json") {
|
||||
"application/json"
|
||||
} else if file_name.ends_with(".xml") {
|
||||
"application/xml"
|
||||
} else {
|
||||
"text/plain"
|
||||
let file_ext = fake_path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.map(|e| e.to_lowercase())
|
||||
.unwrap_or_default();
|
||||
|
||||
let mime_type = match file_ext.as_str() {
|
||||
"json" => "application/json",
|
||||
"xml" => "application/xml",
|
||||
"yaml" | "yml" => "application/yaml",
|
||||
"pdf" => "application/pdf",
|
||||
"docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"doc" => "application/msword",
|
||||
"md" | "markdown" => "text/markdown",
|
||||
"csv" | "tsv" => "text/csv",
|
||||
"html" | "htm" => "text/html",
|
||||
_ => "text/plain",
|
||||
};
|
||||
|
||||
// Use the file_name as the file_path for DB storage
|
||||
@ -328,4 +515,68 @@ mod tests {
|
||||
assert!(result.is_ok());
|
||||
let _ = std::fs::remove_file(file_path);
|
||||
}
|
||||
|
||||
/// A `.txt` file name passes the allowlist check.
#[test]
fn test_is_safe_file_allows_txt() {
    let candidate = Path::new("file.txt");
    assert!(is_safe_file(candidate));
}
|
||||
|
||||
/// A Markdown file name passes the allowlist check.
#[test]
fn test_is_safe_file_allows_md() {
    let candidate = Path::new("readme.md");
    assert!(is_safe_file(candidate));
}
|
||||
|
||||
/// A PDF file name passes the allowlist check.
#[test]
fn test_is_safe_file_allows_pdf() {
    let candidate = Path::new("report.pdf");
    assert!(is_safe_file(candidate));
}
|
||||
|
||||
/// A DOCX file name passes the allowlist check.
#[test]
fn test_is_safe_file_allows_docx() {
    let candidate = Path::new("doc.docx");
    assert!(is_safe_file(candidate));
}
|
||||
|
||||
/// An executable file name fails the allowlist check.
#[test]
fn test_is_safe_file_rejects_exe() {
    let candidate = Path::new("malware.exe");
    assert!(!is_safe_file(candidate));
}
|
||||
|
||||
/// A shared-library file name fails the allowlist check.
#[test]
fn test_is_safe_file_rejects_dll() {
    let candidate = Path::new("library.dll");
    assert!(!is_safe_file(candidate));
}
|
||||
|
||||
/// A bare `.zip` archive name fails the allowlist check.
#[test]
fn test_is_safe_file_rejects_zip_directly() {
    let candidate = Path::new("archive.zip");
    assert!(!is_safe_file(candidate));
}
|
||||
|
||||
/// Extension matching ignores letter case.
#[test]
fn test_is_safe_file_case_insensitive() {
    let upper = Path::new("file.TXT");
    assert!(is_safe_file(upper));

    let mixed = Path::new("file.Log");
    assert!(is_safe_file(mixed));
}
|
||||
|
||||
/// A file name with no extension at all fails the allowlist check.
#[test]
fn test_is_safe_file_no_extension_rejected() {
    let candidate = Path::new("Makefile");
    assert!(!is_safe_file(candidate));
}
|
||||
|
||||
/// A plain-text file written to the temp directory is read back unchanged.
#[test]
fn test_extract_text_plain_file() {
    // A fresh UUID in the name keeps separate runs from sharing a file.
    let file_name = format!("tftsr-test-extract-{}.txt", uuid::Uuid::now_v7());
    let path = std::env::temp_dir().join(file_name);
    std::fs::write(&path, "hello world").unwrap();

    let result = extract_text_content(&path);
    assert!(result.is_ok());
    assert_eq!(result.unwrap().trim(), "hello world");

    // Best-effort cleanup; a failure to delete does not fail the test.
    let _ = std::fs::remove_file(path);
}
|
||||
|
||||
/// Spreadsheet extensions yield an error mentioning the missing support; the
/// path used here does not need to exist.
#[test]
fn test_extract_text_unsupported_binary_returns_error() {
    let outcome = extract_text_content(Path::new("data.xlsx"));
    assert!(outcome.is_err());

    let message = outcome.unwrap_err();
    assert!(message.contains("not yet supported"));
}
|
||||
}
|
||||
|
||||
@ -252,8 +252,21 @@ export default function LogUpload() {
|
||||
multiple
|
||||
className="hidden"
|
||||
onChange={handleFileSelect}
|
||||
accept=".log,.txt,.json,.csv,.xml,.yaml,.yml"
|
||||
accept=".log,.txt,.out,.err,.syslog,.journal,.yaml,.yml,.json,.toml,.xml,.ini,.cfg,.conf,.config,.env,.properties,.md,.markdown,.rst,.csv,.tsv,.ndjson,.jsonl,.sql,.sh,.bash,.zsh,.py,.js,.ts,.rb,.go,.rs,.java,.html,.htm,.css,.diff,.patch,.pdf,.docx,.doc,.rtf,.xlsx,.xls"
|
||||
/>
|
||||
<details className="mt-2 text-sm text-gray-500 dark:text-gray-400">
|
||||
<summary className="cursor-pointer hover:text-gray-700 dark:hover:text-gray-200">
|
||||
Supported formats
|
||||
</summary>
|
||||
<div className="mt-1 pl-3 space-y-1">
|
||||
<div><span className="font-medium">Logs & text:</span> .log, .txt, .out, .err, .syslog, .journal</div>
|
||||
<div><span className="font-medium">Config & markup:</span> .yaml, .yml, .json, .toml, .xml, .ini, .cfg, .conf, .env, .properties</div>
|
||||
<div><span className="font-medium">Documents:</span> .pdf, .docx, .doc, .md, .rst, .rtf</div>
|
||||
<div><span className="font-medium">Data:</span> .csv, .tsv, .xlsx, .xls, .ndjson, .jsonl, .sql</div>
|
||||
<div><span className="font-medium">Code & scripts:</span> .sh, .bash, .zsh, .py, .js, .ts, .rb, .go, .rs, .java, .html, .css, .diff, .patch</div>
|
||||
<p className="mt-1 italic">Binary formats (PDF, DOCX, XLSX) will have their text extracted automatically.</p>
|
||||
</div>
|
||||
</details>
|
||||
</div>
|
||||
|
||||
{/* File list */}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user