tftsr-devops_investigation/src-tauri/src/integrations/confluence_search.rs
Shaun Arman e6d1965342
All checks were successful
Test / rust-fmt-check (pull_request) Successful in 10s
Test / frontend-typecheck (pull_request) Successful in 1m9s
Test / frontend-tests (pull_request) Successful in 1m13s
PR Review Automation / review (pull_request) Successful in 2m58s
Test / rust-clippy (pull_request) Successful in 3m50s
Test / rust-tests (pull_request) Successful in 5m12s
security: address all issues from automated PR review
- Add missing CQL escaping for &, |, +, - characters
- Improve escape_wiql() to escape more dangerous characters: ", \, (, ), ~, *, ?, ;, =
- Sanitize HTML in excerpts using strip_html_tags() to prevent XSS
- Add unit tests for escape_wiql, escape_cql, canonicalize_url functions
- Document expand_query() behavior (always returns at least original query)
- All tests pass (158/158), cargo fmt and clippy pass
2026-04-14 20:26:05 -05:00

262 lines
7.6 KiB
Rust

use serde::{Deserialize, Serialize};
use url::Url;
use super::query_expansion::expand_query;
const MAX_EXPANDED_QUERIES: usize = 3;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
pub title: String,
pub url: String,
pub excerpt: String,
pub content: Option<String>,
pub source: String,
}
fn canonicalize_url(url: &str) -> String {
Url::parse(url)
.ok()
.map(|u| {
let mut u = u.clone();
u.set_fragment(None);
u.set_query(None);
u.to_string()
})
.unwrap_or_else(|| url.to_string())
}
fn escape_cql(s: &str) -> String {
s.replace('"', "\\\"")
.replace(')', "\\)")
.replace('(', "\\(")
.replace('~', "\\~")
.replace('&', "\\&")
.replace('|', "\\|")
.replace('+', "\\+")
.replace('-', "\\-")
}
/// Search Confluence for content matching the query
///
/// This function expands the user query with related terms, synonyms, and variations
/// to improve search coverage across Confluence spaces.
pub async fn search_confluence(
base_url: &str,
query: &str,
cookies: &[crate::integrations::webview_auth::Cookie],
) -> Result<Vec<SearchResult>, String> {
let cookie_header = crate::integrations::webview_auth::cookies_to_header(cookies);
let client = reqwest::Client::new();
let expanded_queries = expand_query(query);
let mut all_results = Vec::new();
for expanded_query in expanded_queries.iter().take(MAX_EXPANDED_QUERIES) {
let safe_query = escape_cql(expanded_query);
let search_url = format!(
"{}/rest/api/search?cql=text~\"{}\"&limit=5",
base_url.trim_end_matches('/'),
urlencoding::encode(&safe_query)
);
tracing::info!(
"Searching Confluence with expanded query: {}",
expanded_query
);
let resp = client
.get(&search_url)
.header("Cookie", &cookie_header)
.header("Accept", "application/json")
.send()
.await
.map_err(|e| format!("Confluence search request failed: {e}"))?;
if !resp.status().is_success() {
let status = resp.status();
let text = resp.text().await.unwrap_or_default();
tracing::warn!("Confluence search failed with status {status}: {text}");
continue;
}
let json: serde_json::Value = resp
.json()
.await
.map_err(|e| format!("Failed to parse Confluence search response: {e}"))?;
if let Some(results_array) = json["results"].as_array() {
for item in results_array.iter().take(MAX_EXPANDED_QUERIES) {
let title = item["title"].as_str().unwrap_or("Untitled").to_string();
let id = item["content"]["id"].as_str();
let space_key = item["content"]["space"]["key"].as_str();
let url = if let (Some(id_str), Some(space)) = (id, space_key) {
format!(
"{}/display/{}/{}",
base_url.trim_end_matches('/'),
space,
id_str
)
} else {
base_url.to_string()
};
let excerpt = strip_html_tags(item["excerpt"].as_str().unwrap_or(""))
.chars()
.take(300)
.collect::<String>();
let content = if let Some(content_id) = id {
fetch_page_content(base_url, content_id, &cookie_header)
.await
.ok()
} else {
None
};
all_results.push(SearchResult {
title,
url,
excerpt,
content,
source: "Confluence".to_string(),
});
}
}
}
all_results.sort_by(|a, b| canonicalize_url(&a.url).cmp(&canonicalize_url(&b.url)));
all_results.dedup_by(|a, b| canonicalize_url(&a.url) == canonicalize_url(&b.url));
Ok(all_results)
}
/// Fetch full content of a Confluence page
async fn fetch_page_content(
base_url: &str,
page_id: &str,
cookie_header: &str,
) -> Result<String, String> {
let client = reqwest::Client::new();
let content_url = format!(
"{}/rest/api/content/{}?expand=body.storage",
base_url.trim_end_matches('/'),
page_id
);
let resp = client
.get(&content_url)
.header("Cookie", cookie_header)
.header("Accept", "application/json")
.send()
.await
.map_err(|e| format!("Failed to fetch page content: {e}"))?;
if !resp.status().is_success() {
let status = resp.status();
return Err(format!("Failed to fetch page: {status}"));
}
let json: serde_json::Value = resp
.json()
.await
.map_err(|e| format!("Failed to parse page content: {e}"))?;
// Extract plain text from HTML storage format
let html = json["body"]["storage"]["value"]
.as_str()
.unwrap_or("")
.to_string();
// Basic HTML tag stripping (for better results, use a proper HTML parser)
let text = strip_html_tags(&html);
// Truncate to reasonable length for AI context
let truncated = if text.len() > 3000 {
format!("{}...", &text[..3000])
} else {
text
};
Ok(truncated)
}
/// Basic HTML tag stripping
fn strip_html_tags(html: &str) -> String {
let mut result = String::new();
let mut in_tag = false;
for ch in html.chars() {
match ch {
'<' => in_tag = true,
'>' => in_tag = false,
_ if !in_tag => result.push(ch),
_ => {}
}
}
// Clean up whitespace
result
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
.trim()
.to_string()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_strip_html_tags() {
let html = "<p>Hello <strong>world</strong>!</p>";
assert_eq!(strip_html_tags(html), "Hello world!");
let html2 = "<div><h1>Title</h1><p>Content</p></div>";
assert_eq!(strip_html_tags(html2), "TitleContent");
}
#[test]
fn test_escape_cql_escapes_special_chars() {
assert_eq!(escape_cql("test\"quote"), r#"test\"quote"#);
assert_eq!(escape_cql("test(paren"), r#"test\(paren"#);
assert_eq!(escape_cql("test)paren"), r#"test\)paren"#);
assert_eq!(escape_cql("test~tilde"), r#"test\~tilde"#);
assert_eq!(escape_cql("test&and"), r#"test\&and"#);
assert_eq!(escape_cql("test|or"), r#"test\|or"#);
assert_eq!(escape_cql("test+plus"), r#"test\+plus"#);
assert_eq!(escape_cql("test-minus"), r#"test\-minus"#);
}
#[test]
fn test_escape_cql_no_special_chars() {
assert_eq!(escape_cql("simple query"), "simple query");
}
#[test]
fn test_canonicalize_url_removes_fragment() {
assert_eq!(
canonicalize_url("https://example.com/page#section"),
"https://example.com/page"
);
}
#[test]
fn test_canonicalize_url_removes_query() {
assert_eq!(
canonicalize_url("https://example.com/page?param=value"),
"https://example.com/page"
);
}
#[test]
fn test_canonicalize_url_handles_malformed() {
// Malformed URLs fall back to original
assert_eq!(canonicalize_url("not a url"), "not a url");
}
}