mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 10:33:51 +00:00
1211 lines
44 KiB
TypeScript
1211 lines
44 KiB
TypeScript
"use client"
|
|
|
|
import type React from "react"
|
|
|
|
import { createContext, useContext, useState, useEffect } from "react"
|
|
import { type Triple, processTextWithChunking, processTextWithChunkingPyG, triplesToGraph } from "@/utils/text-processing"
|
|
import { useRouter } from "next/navigation"
|
|
import { toast } from "@/hooks/use-toast"
|
|
import { type PromptConfigurations } from "@/components/prompt-configuration"
|
|
|
|
export type Document = {
|
|
id: string
|
|
name: string
|
|
status: "New" | "Processing" | "Processed" | "Error"
|
|
uploadStatus: "Uploading" | "Uploaded"
|
|
size: string
|
|
file: File
|
|
content?: string
|
|
triples?: Triple[]
|
|
graph?: {
|
|
nodes: Array<{ id: string; label: string }>
|
|
edges: Array<{ source: string; target: string; label: string }>
|
|
}
|
|
error?: string
|
|
chunkCount?: number
|
|
extractedDate?: Date
|
|
processingMethod?: 'default' | 'langchain' | 'graphtransformer' | 'fallback'
|
|
embeddings?: {
|
|
count: number
|
|
generated: Date
|
|
status: "New" | "Processing" | "Processed" | "Error"
|
|
error?: string
|
|
}
|
|
}
|
|
|
|
// Back-end LLM providers extraction requests can be routed to.
export type LLMProvider = 'nvidia' | 'ollama';
|
// Options accepted by processDocuments; every field is optional and
// defaulted inside processDocuments / processDocumentsImpl.
export type ProcessingOptions = {
  useLangChain?: boolean;          // route through the LangChain pipeline
  useGraphTransformer?: boolean;   // use the GraphTransformer variant (LangChain path)
  promptConfigs?: PromptConfigurations;  // custom system/extraction prompts
  llmProvider?: LLMProvider;
  ollamaModel?: string;            // e.g. "qwen3:1.7b"
  ollamaBaseUrl?: string;          // Ollama-compatible endpoint
  chunkSize?: number;              // characters per chunk
  overlapSize?: number;            // characters of overlap between chunks
  chunkingMethod?: 'optimized' | 'pyg';  // 'pyg' mirrors PyTorch Geometric's chunker
};
|
|
|
|
// Shape of the value exposed by DocumentContext.
type DocumentContextType = {
  documents: Document[]
  addDocuments: (files: File[]) => void
  deleteDocuments: (documentIds: string[]) => void
  clearDocuments: () => void
  // Extract triples for the selected documents. No-op when selectedDocIds is omitted.
  processDocuments: (selectedDocIds?: string[], options?: ProcessingOptions) => Promise<void>
  // Legacy method for backward compatibility
  processDocumentsLegacy: (useLangChain: boolean, selectedDocIds?: string[], useGraphTransformer?: boolean, promptConfigs?: PromptConfigurations) => Promise<void>
  isProcessing: boolean
  // Triple CRUD; each call also rebuilds the document's graph.
  updateTriples: (documentId: string, triples: Triple[]) => void
  addTriple: (documentId: string, triple: Triple) => void
  editTriple: (documentId: string, index: number, triple: Triple) => void
  deleteTriple: (documentId: string, index: number) => void
  openGraphVisualization: (documentId?: string) => Promise<void>
  generateEmbeddings: (documentId: string) => Promise<void>
  isGeneratingEmbeddings: boolean
  viewTriples?: (documentId: string) => void
}
|
|
|
|
// Default is undefined so consumers can detect use outside a DocumentProvider.
const DocumentContext = createContext<DocumentContextType | undefined>(undefined)
|
// Utility function to generate UUID with fallback
|
|
const generateUUID = (): string => {
|
|
// Check if crypto.randomUUID is available
|
|
if (typeof crypto !== 'undefined' && crypto.randomUUID) {
|
|
try {
|
|
return crypto.randomUUID();
|
|
} catch (error) {
|
|
console.warn('crypto.randomUUID failed, using fallback:', error);
|
|
}
|
|
}
|
|
|
|
// Fallback UUID generation
|
|
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
|
|
const r = Math.random() * 16 | 0;
|
|
const v = c == 'x' ? r : (r & 0x3 | 0x8);
|
|
return v.toString(16);
|
|
});
|
|
};
|
|
|
|
// Context provider owning the document list, processing state and persistence.
export function DocumentProvider({ children }: { children: React.ReactNode }) {
  const router = useRouter()
  // All uploaded documents; persisted to localStorage by an effect below.
  const [documents, setDocuments] = useState<Document[]>([])
  // Guards against saving before the initial localStorage load has run.
  const [isInitialized, setIsInitialized] = useState(false)
  const [isProcessing, setIsProcessing] = useState(false)
  const [isGeneratingEmbeddings, setIsGeneratingEmbeddings] = useState(false)
  // Optional API key forwarded as X-API-Key on extraction requests.
  const [apiKey, setApiKey] = useState<string | null>(null)

  // Load API key from localStorage on client-side only
  useEffect(() => {
    if (typeof window !== 'undefined') {
      // API key loading removed - xAI integration has been removed
    }
  }, []);
|
|
|
|
  // Load from localStorage on client-side only.
  // Runs once: restores saved documents and rebuilds File objects, since File
  // instances cannot survive JSON serialization.
  useEffect(() => {
    if (!isInitialized) {
      try {
        const savedDocuments = localStorage.getItem('txt2kg_documents')
        if (savedDocuments) {
          const parsedDocuments = JSON.parse(savedDocuments)

          // Reconstruct documents with placeholder File objects
          const reconstructedDocs = parsedDocuments.map((doc: any) => {
            // Create a blob from the content if available
            let file: File;
            if (doc.content) {
              // Create a File object from the content string we previously saved
              const blob = new Blob([doc.content], { type: 'text/plain' });
              file = new File([blob], doc.name, { type: 'text/plain' });
            } else {
              // Create an empty placeholder if no content is available.
              // Such placeholders are rejected later by readFileContent.
              file = new File([], doc.name, { type: 'text/plain' });
            }

            return {
              ...doc,
              file
            };
          });

          console.log(`Restored ${reconstructedDocs.length} documents from localStorage`);
          setDocuments(reconstructedDocs);
        }
      } catch (error) {
        console.error('Error loading documents from localStorage:', error);
      }

      // Mark initialization done even on failure, so the save effect can run.
      setIsInitialized(true);
    }
  }, [isInitialized]);
|
|
|
|
  // Save documents to localStorage whenever they change, but only after initialization
  useEffect(() => {
    if (isInitialized) {
      try {
        if (documents.length > 0) {
          // Serialize documents for localStorage storage
          // We need to ensure large documents don't exceed localStorage limits
          // Focus on saving processed data (triples & graph) rather than raw content for large files
          const documentsToSave = documents.map(doc => {
            // Don't save content for very large documents to avoid localStorage limits
            // But keep it for smaller ones to avoid reprocessing
            const shouldSaveContent = !doc.content || doc.content.length < 100000;

            return {
              ...doc,
              // Omit the actual File object as it can't be serialized
              file: {
                name: doc.file.name,
                size: doc.file.size,
                type: doc.file.type
              },
              // Only include content for smaller documents
              content: shouldSaveContent ? doc.content : undefined
            };
          });

          localStorage.setItem('txt2kg_documents', JSON.stringify(documentsToSave));
          console.log(`Saved ${documents.length} documents to localStorage`);
        } else {
          // Clear localStorage if documents array is empty
          localStorage.removeItem('txt2kg_documents');
          console.log('Cleared documents from localStorage');
        }
      } catch (error) {
        // Quota overflows land here; the in-memory state is unaffected.
        console.error('Error saving documents to localStorage:', error);
      }
    }
  }, [documents, isInitialized])
|
|
|
|
const addDocuments = (files: File[]) => {
|
|
const newDocuments = files.map((file) => ({
|
|
id: generateUUID(),
|
|
name: file.name,
|
|
status: "New" as const,
|
|
uploadStatus: "Uploaded" as const,
|
|
size: (file.size / 1024).toFixed(2), // Convert to KB
|
|
file,
|
|
}))
|
|
|
|
setDocuments((prev) => [...prev, ...newDocuments])
|
|
}
|
|
|
|
const deleteDocuments = (documentIds: string[]) => {
|
|
setDocuments((prev) => prev.filter((doc) => !documentIds.includes(doc.id)))
|
|
}
|
|
|
|
  // Remove all documents; the save effect then clears localStorage too.
  const clearDocuments = () => {
    setDocuments([])
  }
|
|
|
|
const updateDocumentStatus = (id: string, status: Document["status"], updates: Partial<Document> = {}) => {
|
|
console.log(`Updating document ${id} status to: ${status}`);
|
|
setDocuments((prev) => {
|
|
const updated = prev.map((doc) => (doc.id === id ? { ...doc, status, ...updates } : doc));
|
|
|
|
// Force UI refresh by adding timestamp to document state
|
|
// This ensures React detects the change and re-renders components
|
|
const timestamped = updated.map(doc => ({
|
|
...doc,
|
|
_lastUpdated: Date.now() // Adding timestamp helps React detect changes
|
|
}));
|
|
|
|
return timestamped;
|
|
});
|
|
|
|
// Trigger a custom event for components that need to refresh
|
|
if (typeof window !== 'undefined') {
|
|
console.log('Dispatching document-status-changed event');
|
|
window.dispatchEvent(new CustomEvent('document-status-changed', {
|
|
detail: { documentId: id, status }
|
|
}));
|
|
}
|
|
}
|
|
|
|
const updateTriples = (documentId: string, triples: Triple[]) => {
|
|
// Helper function to normalize text
|
|
const normalizeText = (text: string): string => {
|
|
return text.replace(/['"()]/g, '').trim();
|
|
};
|
|
|
|
// Normalize triples before saving
|
|
const normalizedTriples = triples.map(triple => ({
|
|
subject: normalizeText(triple.subject),
|
|
predicate: normalizeText(triple.predicate),
|
|
object: normalizeText(triple.object)
|
|
}));
|
|
|
|
setDocuments((prev) =>
|
|
prev.map((doc) => {
|
|
if (doc.id === documentId) {
|
|
const graph = triplesToGraph(normalizedTriples)
|
|
return { ...doc, triples: normalizedTriples, graph }
|
|
}
|
|
return doc
|
|
}),
|
|
)
|
|
}
|
|
|
|
const addTriple = (documentId: string, triple: Triple) => {
|
|
// Helper function to normalize text with null/undefined checks
|
|
const normalizeText = (text: string | null | undefined): string => {
|
|
if (!text || typeof text !== 'string') return '';
|
|
return text.replace(/['"()]/g, '').trim();
|
|
};
|
|
|
|
// Normalize the new triple
|
|
const normalizedTriple = {
|
|
subject: normalizeText(triple.subject),
|
|
predicate: normalizeText(triple.predicate),
|
|
object: normalizeText(triple.object)
|
|
};
|
|
|
|
setDocuments((prev) =>
|
|
prev.map((doc) => {
|
|
if (doc.id === documentId && doc.triples) {
|
|
const newTriples = [...doc.triples, normalizedTriple]
|
|
const graph = triplesToGraph(newTriples)
|
|
return { ...doc, triples: newTriples, graph }
|
|
}
|
|
return doc
|
|
}),
|
|
)
|
|
}
|
|
|
|
const editTriple = (documentId: string, index: number, triple: Triple) => {
|
|
// Helper function to normalize text with null/undefined checks
|
|
const normalizeText = (text: string | null | undefined): string => {
|
|
if (!text || typeof text !== 'string') return '';
|
|
return text.replace(/['"()]/g, '').trim();
|
|
};
|
|
|
|
// Normalize the edited triple
|
|
const normalizedTriple = {
|
|
subject: normalizeText(triple.subject),
|
|
predicate: normalizeText(triple.predicate),
|
|
object: normalizeText(triple.object)
|
|
};
|
|
|
|
setDocuments((prev) =>
|
|
prev.map((doc) => {
|
|
if (doc.id === documentId && doc.triples) {
|
|
const newTriples = [...doc.triples]
|
|
newTriples[index] = normalizedTriple
|
|
const graph = triplesToGraph(newTriples)
|
|
return { ...doc, triples: newTriples, graph }
|
|
}
|
|
return doc
|
|
}),
|
|
)
|
|
}
|
|
|
|
const deleteTriple = (documentId: string, index: number) => {
|
|
setDocuments((prev) =>
|
|
prev.map((doc) => {
|
|
if (doc.id === documentId && doc.triples) {
|
|
const newTriples = doc.triples.filter((_, i) => i !== index)
|
|
const graph = triplesToGraph(newTriples)
|
|
return { ...doc, triples: newTriples, graph }
|
|
}
|
|
return doc
|
|
}),
|
|
)
|
|
}
|
|
|
|
const readFileContent = (file: File): Promise<string> => {
|
|
return new Promise((resolve, reject) => {
|
|
// Check if it's a valid file with size
|
|
if (file.size === 0) {
|
|
// Handle zero-byte files
|
|
console.warn(`File ${file.name} is empty (0 bytes)`);
|
|
reject(new Error('File is empty (0 bytes)'));
|
|
return;
|
|
}
|
|
|
|
// If the file isn't a real file (like from localStorage), handle that case
|
|
if (!(file instanceof Blob) || (file.size === 0 && file.type === '')) {
|
|
console.warn(`File ${file.name} appears to be a placeholder or invalid`);
|
|
reject(new Error('Invalid file reference - likely a placeholder'));
|
|
return;
|
|
}
|
|
|
|
const reader = new FileReader();
|
|
reader.onload = (e) => {
|
|
const content = e.target?.result as string;
|
|
if (!content || content.trim() === '') {
|
|
console.warn(`File ${file.name} content is empty or whitespace only`);
|
|
reject(new Error('File content is empty'));
|
|
return;
|
|
}
|
|
resolve(content);
|
|
};
|
|
reader.onerror = (e) => {
|
|
console.error(`Error reading file ${file.name}:`, e);
|
|
reject(e);
|
|
};
|
|
reader.readAsText(file);
|
|
});
|
|
}
|
|
|
|
  // POST one text chunk to /api/extract-triples and return the extracted
  // triples. Provider routing is read from localStorage ("selectedModel");
  // an optional custom system prompt can override the server default.
  // Throws when the API responds non-OK.
  const extractTriplesFromChunk = async (chunk: string, systemPrompt?: string): Promise<Triple[]> => {
    console.log(`Extracting triples from chunk of length: ${chunk.length}`)

    // Create headers with API key if available
    const headers: Record<string, string> = {
      "Content-Type": "application/json",
    }

    // Add API key to headers if available
    if (apiKey) {
      headers["X-API-Key"] = apiKey
    }

    // Prepare request body with optional custom system prompt
    const requestBody: any = { text: chunk };
    if (systemPrompt) {
      requestBody.systemPrompt = systemPrompt;
    }

    // Add LLM provider information based on selected model
    const selectedModel = localStorage.getItem("selectedModel");
    if (selectedModel) {
      try {
        const model = JSON.parse(selectedModel);
        if (model.provider === "ollama") {
          requestBody.llmProvider = "ollama";
          requestBody.ollamaModel = model.model || "llama3.1:8b";
          console.log(`🦙 Using Ollama model: ${requestBody.ollamaModel}`);
        } else if (model.id === "nvidia-nemotron" || model.id === "nvidia-nemotron-nano") {
          requestBody.llmProvider = "nvidia";
          console.log(`🖥️ Using NVIDIA model: ${model.id}`);
        }
      } catch (e) {
        // Ignore parsing errors, will use default
        console.log(`⚠️ Error parsing selected model, using default`);
      }
    } else {
      console.log(`⚠️ No selected model found, using default`);
    }

    const response = await fetch("/api/extract-triples", {
      method: "POST",
      headers,
      body: JSON.stringify(requestBody),
      // Rely on server-side timeout configuration instead of client-side AbortSignal
    })

    console.log("API response status:", response.status)

    // Body is parsed before the ok-check so error payloads can be surfaced.
    const data = await response.json()

    if (!response.ok) {
      console.error("API error:", data)
      throw new Error(data.error || "Failed to extract triples")
    }

    console.log("API response data:", data)
    console.log("Triples count:", data.triples?.length || 0)

    return data.triples || []
  }
|
|
|
|
// New processDocuments method with better options structure
|
|
const processDocuments = async (
|
|
selectedDocIds?: string[],
|
|
options?: ProcessingOptions
|
|
) => {
|
|
console.log('🔍 processDocuments called with:', {
|
|
selectedDocIds,
|
|
selectedCount: selectedDocIds?.length || 0,
|
|
options
|
|
});
|
|
|
|
const {
|
|
useLangChain = false,
|
|
useGraphTransformer = false,
|
|
promptConfigs,
|
|
llmProvider = 'ollama',
|
|
ollamaModel = 'qwen3:1.7b',
|
|
ollamaBaseUrl = 'http://localhost:11434/v1',
|
|
chunkSize = 64000,
|
|
overlapSize = 2000,
|
|
chunkingMethod = 'optimized'
|
|
} = options || {};
|
|
|
|
return processDocumentsImpl(useLangChain, selectedDocIds, useGraphTransformer, promptConfigs, {
|
|
llmProvider,
|
|
ollamaModel,
|
|
ollamaBaseUrl,
|
|
chunkSize,
|
|
overlapSize,
|
|
chunkingMethod
|
|
});
|
|
};
|
|
|
|
  // Legacy method for backward compatibility.
  // Delegates without llmOptions, so processDocumentsImpl applies its own
  // internal defaults for provider/chunking.
  const processDocumentsLegacy = async (
    useLangChain: boolean,
    selectedDocIds?: string[],
    useGraphTransformer?: boolean,
    promptConfigs?: PromptConfigurations
  ) => {
    return processDocumentsImpl(useLangChain, selectedDocIds, useGraphTransformer, promptConfigs);
  };
|
|
|
|
  // Core processing pipeline. For each selected document:
  //   1. marks it "Processing",
  //   2. reads its content (from memory or the File object),
  //   3. routes it: CSV → row-as-document; LangChain path; or default chunking,
  //   4. marks it "Processed" (with triples/graph) or "Error".
  // Documents are processed sequentially; isProcessing brackets the whole run.
  const processDocumentsImpl = async (
    useLangChain: boolean,
    selectedDocIds?: string[],
    useGraphTransformer?: boolean,
    promptConfigs?: PromptConfigurations,
    llmOptions?: {
      llmProvider?: LLMProvider;
      ollamaModel?: string;
      ollamaBaseUrl?: string;
      chunkSize?: number;
      overlapSize?: number;
      chunkingMethod?: 'optimized' | 'pyg';
    }
  ) => {
    console.log('🔍 processDocumentsImpl called with:', {
      useLangChain,
      selectedDocIds,
      selectedCount: selectedDocIds?.length || 0,
      useGraphTransformer,
      totalDocuments: documents.length
    });

    // If selectedDocIds is explicitly provided, use it
    // If not provided, don't process anything (instead of processing all docs)
    const docIdsToProcess = selectedDocIds || [];

    console.log('🔍 Document IDs to process:', docIdsToProcess);

    // Get selected documents - filter by the provided selectedDocIds array.
    // "Processed"/"Error" docs are allowed so they can be reprocessed.
    const docsToProcess = documents.filter(
      (doc) => docIdsToProcess.includes(doc.id) &&
        (doc.status === "New" || doc.status === "Processed" || doc.status === "Error")
    );

    console.log('🔍 Documents to process:', docsToProcess.map(d => ({ id: d.id, name: d.name, status: d.status })));

    if (docsToProcess.length === 0) {
      console.log("❌ No documents to process - either none selected or none have valid status");
      return;
    }

    setIsProcessing(true);

    try {
      // Process each document sequentially
      for (const doc of docsToProcess) {
        // Update status to Processing before we begin
        updateDocumentStatus(doc.id, "Processing");

        try {
          // Read file content if not already available
          let content = doc.content;
          if (!content) {
            content = await readFileContent(doc.file);
          }

          console.log(`🚀 Processing document ${doc.name}, useLangChain: ${useLangChain}, isCSV: ${doc.name.toLowerCase().endsWith('.csv')}`);

          // Handle CSV files specially - always use row-as-document processing regardless of LangChain setting
          if (doc.name.toLowerCase().endsWith('.csv')) {
            console.log('📊 Processing CSV file with row-as-document approach:', doc.name);

            try {
              const triples = await parseCSVContent(content);
              console.log(`✅ CSV processing complete: ${triples.length} triples extracted`);

              // Send to process-document API
              const response = await fetch('/api/process-document', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                  text: content,
                  filename: doc.name,
                  triples: triples,
                  useLangChain: useLangChain, // Pass through the original setting
                  useGraphTransformer: useGraphTransformer,
                  systemPrompt: promptConfigs?.systemPrompt,
                  extractionPrompt: promptConfigs?.extractionPrompt,
                  graphTransformerPrompt: promptConfigs?.graphTransformerPrompt
                })
              });

              if (!response.ok) {
                throw new Error(`Document processing failed: ${response.statusText}`);
              }

              // Response body is consumed but the locally-extracted triples
              // are what gets stored on the document below.
              const result = await response.json();

              // Update the document with triples and graph
              updateDocumentStatus(doc.id, "Processed", {
                triples: triples,
                graph: triplesToGraph(triples),
                metadata: {
                  totalTriples: triples.length,
                  processingMethod: 'csv_row_as_document',
                  langchainUsed: useLangChain,
                  graphTransformerUsed: useGraphTransformer
                }
              });

              console.log(`✅ Document ${doc.name} processed successfully with ${triples.length} triples`);
            } catch (error) {
              console.error(`❌ Error processing CSV file ${doc.name}:`, error);
              // NOTE(review): verify updateDocumentStatus accepts this
              // 4-argument form; if it only takes (id, status, updates),
              // the error message passed here is silently dropped.
              updateDocumentStatus(doc.id, "Error", undefined, error instanceof Error ? error.message : 'Unknown error');
            }

            continue; // Skip the rest of the processing for CSV files
          }

          if (useLangChain) {
            // Use process-document endpoint with useLangChain flag
            console.log(`Processing document ${doc.name} with LangChain via process-document API...`);

            // Extract triples using the default method first (for fallback)
            let triples: Triple[] = [];
            try {
              // Convert JSON to text if it's a JSON file
              let processedContent = content;
              if (doc.name.toLowerCase().endsWith('.json')) {
                processedContent = convertJsonToText(content);
              }

              // Pass the custom system prompt if available
              const systemPrompt = promptConfigs?.systemPrompt;
              triples = await processTextWithChunking(
                processedContent,
                (chunk) => extractTriplesFromChunk(chunk, systemPrompt)
              );

              // Call the process-document API endpoint with useLangChain flag
              // NOTE: This no longer automatically stores triples in Neo4j.
              // Storage in Neo4j is now handled manually through the UI's "Store in Graph DB" button.
              console.log(`Sending ${triples.length} triples to process-document API with useLangChain=true ${useGraphTransformer ? 'using GraphTransformer' : ''}`);

              // Include prompt configurations in the request body
              const requestBody: any = {
                text: doc.name.toLowerCase().endsWith('.json') ? convertJsonToText(content) : content,
                filename: doc.name,
                triples: triples,
                useLangChain: true,
                useGraphTransformer: useGraphTransformer
              };

              // Add LLM provider options if available
              if (llmOptions) {
                if (llmOptions.llmProvider) {
                  requestBody.llmProvider = llmOptions.llmProvider;
                }
                if (llmOptions.ollamaModel) {
                  requestBody.ollamaModel = llmOptions.ollamaModel;
                }
                if (llmOptions.ollamaBaseUrl) {
                  requestBody.ollamaBaseUrl = llmOptions.ollamaBaseUrl;
                }
              }

              // Add prompt configs if available
              if (promptConfigs) {
                if (useGraphTransformer && promptConfigs.graphTransformerPrompt) {
                  requestBody.graphTransformerPrompt = promptConfigs.graphTransformerPrompt;
                } else if (promptConfigs.defaultExtractionPrompt) {
                  requestBody.extractionPrompt = promptConfigs.defaultExtractionPrompt;
                }
              }

              const response = await fetch('/api/process-document', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify(requestBody)
              });

              if (!response.ok) {
                const errorText = await response.text();
                console.error(`Document processing API error: ${response.status} ${response.statusText}`, errorText);
                throw new Error(`Document processing failed: ${response.statusText} - ${errorText}`);
              }

              const result = await response.json();
              console.log(`Received response from process-document API with ${result.triples?.length || 0} triples`);

              // Update the document with triples and graph
              const resultTriples = result.triples || triples; // Fall back to original triples if none returned
              console.log(`Updating document status to "Processed" with ${resultTriples.length} triples`);
              updateDocumentStatus(doc.id, "Processed", {
                content,
                triples: resultTriples,
                graph: triplesToGraph(resultTriples),
                extractedDate: new Date(),
                processingMethod: useGraphTransformer ? 'graphtransformer' : 'langchain'
              });
            } catch (processingError) {
              console.error(`Error in LangChain processing for ${doc.name}:`, processingError);

              // If we have fallback triples, still mark as processed but include the error
              if (triples.length > 0) {
                console.log(`Using ${triples.length} fallback triples despite processing error`);
                updateDocumentStatus(doc.id, "Processed", {
                  content,
                  triples,
                  graph: triplesToGraph(triples),
                  extractedDate: new Date(),
                  error: processingError instanceof Error ? processingError.message : "Unknown error during LangChain processing",
                  processingMethod: 'fallback'
                });
              } else {
                // If no fallback triples, mark as error (outer catch handles it)
                throw processingError;
              }
            }
          } else {
            // Use default processing (original implementation)
            console.log(`Processing document ${doc.name} using default processor...`);

            // Note: CSV files are handled above, so this only processes non-CSV files
            {
              // For non-CSV files, use the text chunking approach
              console.log(`Processing text document with chunking: ${doc.name}`);

              // Convert JSON to text if it's a JSON file
              let processedContent = content;
              if (doc.name.toLowerCase().endsWith('.json')) {
                processedContent = convertJsonToText(content);
                console.log(`Converted JSON file ${doc.name} to text format for processing`);
              }

              // Use custom system prompt if available.
              // NOTE: defaults here (512 / 0 / 'pyg') differ from the ones
              // processDocuments supplies (64000 / 2000 / 'optimized');
              // they only apply when llmOptions omits the fields.
              const systemPrompt = promptConfigs?.systemPrompt;
              const chunkSize = llmOptions?.chunkSize || 512;
              const overlapSize = llmOptions?.overlapSize || 0;
              const chunkingMethod = llmOptions?.chunkingMethod || 'pyg';

              let triples: Triple[];
              if (chunkingMethod === 'pyg') {
                // Use PyTorch Geometric's exact chunking method with configurable chunk size and overlap
                const pygChunkSize = chunkSize || 512; // Use configured chunk size or default to 512
                const pygOverlapSize = overlapSize || 0; // Use configured overlap or default to 0 (original PyG behavior)
                triples = await processTextWithChunkingPyG(
                  processedContent,
                  (chunk) => extractTriplesFromChunk(chunk, systemPrompt),
                  pygChunkSize,
                  pygOverlapSize
                );
              } else {
                // Use optimized chunking with overlap
                triples = await processTextWithChunking(
                  processedContent,
                  (chunk) => extractTriplesFromChunk(chunk, systemPrompt),
                  chunkSize,
                  overlapSize
                );
              }

              // Send to process-document API - no longer automatically stores in Neo4j
              // Storage in Neo4j is now handled manually through the UI's "Store in Graph DB" button
              const requestBody: any = {
                text: processedContent,
                filename: doc.name,
                triples: triples,
                useLangChain: false
              };

              // Add system prompt if available
              if (promptConfigs?.systemPrompt) {
                requestBody.systemPrompt = promptConfigs.systemPrompt;
              }

              const response = await fetch('/api/process-document', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify(requestBody)
              });

              if (!response.ok) {
                throw new Error(`Document processing failed: ${response.statusText}`);
              }

              // Update the document with triples and graph
              updateDocumentStatus(doc.id, "Processed", {
                content,
                triples,
                graph: triplesToGraph(triples),
                chunkCount: Math.ceil(content.length / 512), // Approximate chunk count
                extractedDate: new Date()
              });
            }
          }
        } catch (error) {
          console.error(`Error processing document ${doc.name}:`, error);
          updateDocumentStatus(doc.id, "Error", {
            error: error instanceof Error ? error.message : "Unknown error"
          });
        }
      }
    } finally {
      // Add a small delay before turning off the processing state
      // This gives time for all UI updates to complete
      console.log("Processing complete, finalizing UI updates...");

      // Force a final UI refresh by dispatching an event immediately
      if (typeof window !== 'undefined') {
        console.log("Dispatching processing-complete event");
        window.dispatchEvent(new CustomEvent('processing-complete'));
      }

      // Reset the processing state
      setIsProcessing(false);
      console.log("Processing state reset, UI should be updated");
    }
  }
|
|
|
|
// Helper function to process CSV content - each row as a document for LLM extraction
|
|
const parseCSVContent = async (csvContent: string): Promise<Triple[]> => {
|
|
console.log('🔍 parseCSVContent called with content length:', csvContent.length);
|
|
console.log('Processing CSV content with row-as-document approach');
|
|
|
|
// Split the CSV content into lines
|
|
const lines = csvContent.split('\n').filter(line => line.trim().length > 0);
|
|
|
|
if (lines.length < 2) {
|
|
throw new Error("CSV file must contain a header row and at least one data row");
|
|
}
|
|
|
|
// Parse the header row
|
|
const header = lines[0].split(',').map(h => h.trim().replace(/^"(.*)"$/, '$1'));
|
|
console.log(`CSV headers: ${header.join(', ')}`);
|
|
|
|
// Get data rows (skip header)
|
|
const dataRows = lines.slice(1);
|
|
console.log(`Processing ${dataRows.length} data rows as individual documents`);
|
|
|
|
let allTriples: Triple[] = [];
|
|
const BATCH_SIZE = 50; // Store every 50 rows
|
|
let currentBatch: Triple[] = [];
|
|
let storedTriples = 0;
|
|
|
|
// Process each row as a separate document
|
|
for (let rowIdx = 0; rowIdx < dataRows.length; rowIdx++) {
|
|
const line = dataRows[rowIdx];
|
|
|
|
try {
|
|
// Parse CSV row into fields
|
|
const fields: string[] = [];
|
|
let fieldStart = 0;
|
|
let inQuotes = false;
|
|
|
|
for (let i = 0; i < line.length; i++) {
|
|
if (line[i] === '"') {
|
|
inQuotes = !inQuotes;
|
|
} else if (line[i] === ',' && !inQuotes) {
|
|
fields.push(line.substring(fieldStart, i).trim().replace(/^"(.*)"$/, '$1'));
|
|
fieldStart = i + 1;
|
|
}
|
|
}
|
|
|
|
// Add the last field
|
|
fields.push(line.substring(fieldStart).trim().replace(/^"(.*)"$/, '$1'));
|
|
|
|
// Create document text from the row data
|
|
let documentText = '';
|
|
for (let i = 0; i < Math.min(header.length, fields.length); i++) {
|
|
if (fields[i] && fields[i].trim()) {
|
|
documentText += `${header[i]}: ${fields[i]}\n`;
|
|
}
|
|
}
|
|
|
|
// Skip empty rows
|
|
if (!documentText.trim()) {
|
|
console.warn(`Skipping empty CSV row ${rowIdx + 1}`);
|
|
continue;
|
|
}
|
|
|
|
console.log(`Processing row ${rowIdx + 1} as document: ${documentText.substring(0, 100)}...`);
|
|
|
|
// Extract triples from this row's text using the existing extraction function
|
|
try {
|
|
console.log(`🔄 Calling extractTriplesFromChunk for row ${rowIdx + 1}`);
|
|
// Note: promptConfigs is not available in this scope, so we'll pass undefined for now
|
|
const rowTriples = await extractTriplesFromChunk(documentText, undefined);
|
|
|
|
console.log(`📥 extractTriplesFromChunk returned:`, rowTriples);
|
|
|
|
if (rowTriples && Array.isArray(rowTriples)) {
|
|
console.log(`✅ Extracted ${rowTriples.length} triples from row ${rowIdx + 1}`);
|
|
allTriples = allTriples.concat(rowTriples);
|
|
currentBatch = currentBatch.concat(rowTriples);
|
|
|
|
// Store batch every BATCH_SIZE rows or on last row
|
|
if (currentBatch.length >= BATCH_SIZE || rowIdx === dataRows.length - 1) {
|
|
try {
|
|
console.log(`💾 Storing batch: ${currentBatch.length} triples (rows ${storedTriples + 1}-${rowIdx + 1})`);
|
|
|
|
// Store batch to database via API
|
|
const batchResponse = await fetch('/api/graph-db/triples', {
|
|
method: 'POST',
|
|
headers: { 'Content-Type': 'application/json' },
|
|
body: JSON.stringify({
|
|
triples: currentBatch,
|
|
source: `CSV batch ${Math.floor(storedTriples / BATCH_SIZE) + 1}`
|
|
})
|
|
});
|
|
|
|
if (batchResponse.ok) {
|
|
storedTriples += currentBatch.length;
|
|
console.log(`✅ Batch stored successfully! Progress: ${storedTriples} total triples stored`);
|
|
} else {
|
|
console.error(`❌ Failed to store batch: ${batchResponse.statusText}`);
|
|
// Continue processing even if storage fails
|
|
}
|
|
|
|
currentBatch = []; // Reset batch
|
|
} catch (batchError) {
|
|
console.error(`❌ Error storing batch at row ${rowIdx + 1}:`, batchError);
|
|
// Continue processing even if one batch fails
|
|
}
|
|
}
|
|
} else {
|
|
console.warn(`⚠️ No valid triples returned for row ${rowIdx + 1}`);
|
|
}
|
|
} catch (error) {
|
|
console.error(`❌ Error extracting triples from row ${rowIdx + 1}:`, error);
|
|
continue;
|
|
}
|
|
|
|
} catch (parseError) {
|
|
console.error(`Error parsing CSV row ${rowIdx + 1}:`, parseError);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
console.log(`🏁 Successfully extracted ${allTriples.length} triples from ${dataRows.length} CSV rows`);
|
|
console.log('Final triples array:', allTriples);
|
|
return allTriples;
|
|
}
|
|
|
|
// Helper function to convert JSON content to readable text format
|
|
const convertJsonToText = (jsonContent: string): string => {
|
|
try {
|
|
// Parse the JSON to validate it
|
|
const jsonData = JSON.parse(jsonContent);
|
|
|
|
// Convert JSON to a readable text format that preserves structure and relationships
|
|
const formatJsonObject = (obj: any, indent: number = 0): string => {
|
|
const spaces = ' '.repeat(indent);
|
|
|
|
if (obj === null || obj === undefined) {
|
|
return 'null';
|
|
}
|
|
|
|
if (typeof obj === 'string' || typeof obj === 'number' || typeof obj === 'boolean') {
|
|
return String(obj);
|
|
}
|
|
|
|
if (Array.isArray(obj)) {
|
|
if (obj.length === 0) return '[]';
|
|
const items = obj.map((item, index) =>
|
|
`${spaces} Item ${index + 1}: ${formatJsonObject(item, indent + 1)}`
|
|
).join('\n');
|
|
return `[\n${items}\n${spaces}]`;
|
|
}
|
|
|
|
if (typeof obj === 'object') {
|
|
const entries = Object.entries(obj);
|
|
if (entries.length === 0) return '{}';
|
|
|
|
const props = entries.map(([key, value]) =>
|
|
`${spaces} ${key}: ${formatJsonObject(value, indent + 1)}`
|
|
).join('\n');
|
|
return `{\n${props}\n${spaces}}`;
|
|
}
|
|
|
|
return String(obj);
|
|
};
|
|
|
|
// Create a descriptive text representation
|
|
let textContent = `JSON Document Content:\n\n`;
|
|
textContent += formatJsonObject(jsonData);
|
|
|
|
return textContent;
|
|
} catch (error) {
|
|
console.warn('Failed to parse JSON, treating as plain text:', error);
|
|
// If JSON parsing fails, return the original content as-is
|
|
return jsonContent;
|
|
}
|
|
}
|
|
|
|
const openGraphVisualization = async (documentId?: string) => {
|
|
// Find the document to visualize
|
|
const doc = documentId
|
|
? documents.find((d) => d.id === documentId && d.status === "Processed" && d.triples && d.triples.length > 0)
|
|
: documents.find((d) => d.status === "Processed" && d.triples && d.triples.length > 0)
|
|
|
|
if (!doc || !doc.triples) {
|
|
console.warn("No suitable document found for graph visualization")
|
|
return
|
|
}
|
|
|
|
try {
|
|
// Create a timestamp to ensure we have unique localStorage keys that don't conflict
|
|
const timestamp = Date.now();
|
|
|
|
// Always store in localStorage as a backup with a timestamp suffix
|
|
try {
|
|
// Store with both the old keys (for backward compatibility) and new timestamped keys
|
|
localStorage.setItem("graphTriples", JSON.stringify(doc.triples))
|
|
localStorage.setItem("graphDocumentName", doc.name)
|
|
|
|
// Also store with timestamp for uniqueness
|
|
localStorage.setItem(`graphTriples_${timestamp}`, JSON.stringify(doc.triples))
|
|
localStorage.setItem(`graphDocumentName_${timestamp}`, doc.name)
|
|
|
|
console.log(`Stored ${doc.triples.length} triples in localStorage for document: ${doc.name}`)
|
|
} catch (localStorageError) {
|
|
console.error("LocalStorage error:", localStorageError);
|
|
alert("Warning: Unable to save graph data to browser storage. The graph may not persist if you navigate away.");
|
|
// Continue with API storage even if localStorage fails
|
|
}
|
|
|
|
// Try the API approach
|
|
try {
|
|
const response = await fetch("/api/graph-data", {
|
|
method: "POST",
|
|
headers: {
|
|
"Content-Type": "application/json",
|
|
},
|
|
body: JSON.stringify({
|
|
triples: doc.triples,
|
|
documentName: doc.name,
|
|
timestamp // Include timestamp for correlation
|
|
}),
|
|
})
|
|
|
|
if (response.ok) {
|
|
const { graphId } = await response.json()
|
|
console.log(`Successfully stored graph data with ID: ${graphId}`)
|
|
// Use Next.js router.replace to avoid building up history stack
|
|
router.replace(`/graph?id=${graphId}&ts=${timestamp}`)
|
|
} else {
|
|
console.warn(`API storage failed (${response.status}): ${await response.text()}`)
|
|
// If API fails, use localStorage fallback with timestamp parameter
|
|
router.replace(`/graph?source=local&ts=${timestamp}`)
|
|
}
|
|
} catch (apiError) {
|
|
console.error("Error with API storage:", apiError)
|
|
// Navigate using localStorage fallback with timestamp
|
|
router.replace(`/graph?source=local&ts=${timestamp}`)
|
|
}
|
|
} catch (error) {
|
|
console.error("Error preparing graph data:", error)
|
|
alert("Failed to prepare graph data. See console for details.")
|
|
}
|
|
}
|
|
|
|
const generateEmbeddings = async (documentId: string) => {
|
|
// Add more detailed diagnostics
|
|
const doc = documents.find(d => d.id === documentId);
|
|
|
|
if (!doc) {
|
|
toast({
|
|
title: "Document Not Found",
|
|
description: `Could not find document with ID: ${documentId}`,
|
|
variant: "destructive",
|
|
duration: 3000,
|
|
});
|
|
return;
|
|
}
|
|
|
|
// If content already exists, use it right away
|
|
if (doc.content && doc.content.trim() !== '') {
|
|
await processEmbeddings(doc.id, doc.name, doc.content);
|
|
return;
|
|
}
|
|
|
|
// Document exists but content is not loaded - log debug info
|
|
console.log(`Attempting to load content for document: ${doc.name}`);
|
|
console.log(`File info: size=${doc.file.size}, type=${doc.file.type}`);
|
|
|
|
// Check if the document was loaded from localStorage and might have a corrupted file reference
|
|
const isLikelyFromLocalStorage = doc.file.size === 0 || !(doc.file instanceof Blob);
|
|
|
|
if (isLikelyFromLocalStorage) {
|
|
toast({
|
|
title: "File Reference Issue",
|
|
description: "This document was restored from browser storage and cannot access its original file. Please re-upload the file or process it again first.",
|
|
variant: "destructive",
|
|
duration: 5000,
|
|
});
|
|
return;
|
|
}
|
|
|
|
try {
|
|
// Document exists but content might not be loaded - try to load it
|
|
const content = await readFileContent(doc.file);
|
|
if (content && content.trim() !== '') {
|
|
// Update the document with content first
|
|
setDocuments(prevDocs =>
|
|
prevDocs.map(d => {
|
|
if (d.id === documentId) {
|
|
return {
|
|
...d,
|
|
content: content
|
|
};
|
|
}
|
|
return d;
|
|
})
|
|
);
|
|
|
|
// Continue with the loaded content
|
|
await processEmbeddings(doc.id, doc.name, content);
|
|
} else {
|
|
toast({
|
|
title: "Empty Document",
|
|
description: "The document file appears to be empty",
|
|
variant: "destructive",
|
|
duration: 3000,
|
|
});
|
|
}
|
|
} catch (error) {
|
|
toast({
|
|
title: "Content Loading Error",
|
|
description: `Failed to load document content: ${error instanceof Error ? error.message : String(error)}`,
|
|
variant: "destructive",
|
|
duration: 5000,
|
|
});
|
|
}
|
|
};
|
|
|
|
// Helper function to handle the actual embeddings processing
|
|
const processEmbeddings = async (documentId: string, documentName: string, content: string) => {
|
|
setIsGeneratingEmbeddings(true);
|
|
try {
|
|
console.log(`Generating embeddings for document: ${documentName}`);
|
|
|
|
// Update embeddings status to show it's processing, without changing main document status
|
|
setDocuments(prevDocs =>
|
|
prevDocs.map(d => {
|
|
if (d.id === documentId) {
|
|
return {
|
|
...d,
|
|
embeddings: {
|
|
count: d.embeddings?.count || 0,
|
|
generated: d.embeddings?.generated || new Date(),
|
|
status: "Processing" as const
|
|
}
|
|
};
|
|
}
|
|
return d;
|
|
})
|
|
);
|
|
|
|
const response = await fetch('/api/embeddings', {
|
|
method: 'POST',
|
|
headers: {
|
|
'Content-Type': 'application/json'
|
|
},
|
|
body: JSON.stringify({
|
|
documentId: documentId,
|
|
content: content,
|
|
documentName: documentName
|
|
})
|
|
});
|
|
|
|
if (!response.ok) {
|
|
throw new Error(`Failed to generate embeddings: ${await response.text()}`);
|
|
}
|
|
|
|
const result = await response.json();
|
|
console.log('Embeddings generation result:', result);
|
|
|
|
// Update embeddings status to show it's processed
|
|
setDocuments(prevDocs =>
|
|
prevDocs.map(d => {
|
|
if (d.id === documentId) {
|
|
return {
|
|
...d,
|
|
embeddings: {
|
|
count: result.embeddings,
|
|
generated: new Date(),
|
|
status: "Processed" as const
|
|
}
|
|
};
|
|
}
|
|
return d;
|
|
})
|
|
);
|
|
|
|
// Show a toast notification
|
|
toast({
|
|
title: "Embeddings Generated",
|
|
description: `Successfully generated ${result.embeddings} embeddings for "${documentName}"`,
|
|
duration: 5000,
|
|
});
|
|
|
|
} catch (error) {
|
|
console.error('Error generating embeddings:', error);
|
|
|
|
// Update embeddings status to show there was an error
|
|
setDocuments(prevDocs =>
|
|
prevDocs.map(d => {
|
|
if (d.id === documentId) {
|
|
return {
|
|
...d,
|
|
embeddings: {
|
|
count: d.embeddings?.count || 0,
|
|
generated: d.embeddings?.generated || new Date(),
|
|
status: "Error" as const,
|
|
error: error instanceof Error ? error.message : String(error)
|
|
}
|
|
};
|
|
}
|
|
return d;
|
|
})
|
|
);
|
|
|
|
toast({
|
|
title: "Embeddings Generation Failed",
|
|
description: `Failed to generate embeddings: ${error instanceof Error ? error.message : String(error)}`,
|
|
variant: "destructive",
|
|
duration: 5000,
|
|
});
|
|
} finally {
|
|
setIsGeneratingEmbeddings(false);
|
|
}
|
|
};
|
|
|
|
// Expose the document store and every document operation to the subtree.
// NOTE(review): the value object is recreated on each render, so all
// consumers re-render whenever this provider re-renders — presumably
// acceptable here since most renders are driven by document-state changes;
// confirm before memoizing.
return (
  <DocumentContext.Provider
    value={{
      documents,
      addDocuments,
      deleteDocuments,
      clearDocuments,
      processDocuments,
      // Legacy entry point kept for backward compatibility.
      processDocumentsLegacy,
      isProcessing,
      updateTriples,
      addTriple,
      editTriple,
      deleteTriple,
      openGraphVisualization,
      generateEmbeddings,
      isGeneratingEmbeddings
    }}
  >
    {children}
  </DocumentContext.Provider>
)
}
|
|
|
|
export function useDocuments() {
|
|
const context = useContext(DocumentContext)
|
|
if (context === undefined) {
|
|
throw new Error("useDocuments must be used within a DocumentProvider")
|
|
}
|
|
return context
|
|
} |