mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 02:23:53 +00:00
600 lines
22 KiB
TypeScript
600 lines
22 KiB
TypeScript
"use client";
|
|
|
|
import { useState } from "react";
|
|
import { Button } from "@/components/ui/button";
|
|
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
|
|
import { AlertCircle, FileText, FileUp, Loader2, Zap } from "lucide-react";
|
|
import { Progress } from "@/components/ui/progress";
|
|
import { Alert, AlertDescription } from "@/components/ui/alert";
|
|
import { useToast } from "@/components/ui/use-toast";
|
|
import { Switch } from "@/components/ui/switch";
|
|
import { Label } from "@/components/ui/label";
|
|
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
|
|
import { useDocuments } from "@/contexts/document-context";
|
|
|
|
/** Props accepted by the DocumentProcessor component. */
interface DocumentProcessorProps {
  /**
   * Optional callback invoked with a result object once processing finishes.
   * The payload shape differs between processing paths, so it is left as `any`
   * to stay compatible with handlers that declare a narrower parameter type.
   */
  onComplete?: (results: any) => void;
  /** Optional class name passed to the root Card element. */
  className?: string;
}
|
|
|
|
/**
 * Card UI for uploading a single document, registering it with the document
 * context, and triggering processing for it.
 *
 * @param onComplete - Optional callback invoked with a result object after processing.
 * @param className - Optional class name applied to the root Card.
 */
export function DocumentProcessor({ onComplete, className }: DocumentProcessorProps) {
  // Document list and actions supplied by the surrounding document context.
  const { addDocuments, processDocuments, documents } = useDocuments();

  // File currently chosen in the upload input; null when nothing is selected.
  const [file, setFile] = useState<File | null>(null);
  // True while a processing run is in flight; drives the spinner and disables the button.
  const [isProcessing, setIsProcessing] = useState(false);
  // Progress bar value (0-100) and the status line rendered next to it.
  const [progress, setProgress] = useState(0);
  const [processingStatus, setProcessingStatus] = useState("");
  // Last error message shown in the destructive alert; null when there is none.
  const [error, setError] = useState<string | null>(null);
  // Active tab value: "triples" or "embeddings".
  const [processingTab, setProcessingTab] = useState<string>("triples");
  // Bound to the "Use sentence-level chunking" switch.
  const [useSentenceChunking, setUseSentenceChunking] = useState(true);
  // NOTE(review): this flag is not read anywhere in the visible component.
  const [useEntityExtraction, setUseEntityExtraction] = useState(true);
  const { toast } = useToast();
|
|
|
|
/**
 * Change handler for the hidden file input.
 *
 * Registers the chosen file with the document context, stores it as the
 * current selection, clears any previous error/progress state and shows a
 * confirmation toast. Does nothing when the event carries no file.
 */
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
  const selectedFile = e.target.files?.[0] ?? null;

  // Clear the native input after capturing the File. Otherwise choosing the
  // same file again (for example after a completed run reset the selection)
  // would not fire another change event.
  e.target.value = "";

  if (!selectedFile) {
    return;
  }

  // Add file to document context for display in document list
  addDocuments([selectedFile]);
  setFile(selectedFile);
  setError(null);
  setProgress(0);
  setProcessingStatus("");

  // Show toast notification
  toast({
    title: "File Uploaded",
    description: `"${selectedFile.name}" added to document list.`,
    duration: 3000,
  });
};
|
|
|
|
/**
 * Processes the currently selected file through the document context.
 *
 * Looks up the document whose name matches the selected file, calls
 * `processDocuments` for its id, then reports success via toast and the
 * optional `onComplete` callback. Failures are logged, stored in `error`
 * and surfaced as a destructive toast. `isProcessing` is always reset.
 */
const processFile = async () => {
  if (!file) {
    setError("Please select a file to process");
    return;
  }

  try {
    setIsProcessing(true);
    setProgress(0);
    setProcessingStatus("Reading file...");
    setError(null);

    // Find the document entry for the file we're processing (matched by name).
    const docToProcess = documents.find(doc => doc.name === file.name);
    if (!docToProcess) {
      throw new Error("Document not found in document list");
    }

    // Use the document context to process documents with the specific ID
    await processDocuments([docToProcess.id], {
      useLangChain: true,
      useGraphTransformer: false,
      promptConfigs: undefined
    });

    setProgress(100);
    setProcessingStatus("Processing complete!");

    // Notify about completion
    toast({
      title: "Processing Complete",
      description: "Document has been processed successfully. You can now generate embeddings from the document table.",
      duration: 5000,
    });

    // Clear the current selection so the upload area returns to its idle label.
    setFile(null);

    // Call onComplete callback if provided
    onComplete?.({
      success: true,
      message: "Document processed successfully"
    });
  } catch (err) {
    console.error("Error processing document:", err);

    // Only Error instances contribute a message; the inline alert and the
    // toast each keep their own fallback text.
    const detail = err instanceof Error ? err.message : undefined;
    setError(detail ?? "Unknown error processing document");

    toast({
      title: "Processing Failed",
      description: detail ?? "Failed to process document",
      variant: "destructive",
      duration: 5000,
    });
  } finally {
    setIsProcessing(false);
  }
};
|
|
|
|
/**
 * Extracts triples from document text and submits them for processing.
 *
 * CSV files are handled row by row through `parseCSVToTriples`, and at most
 * 10000 triples are submitted. Any other file is sent to
 * `/api/extract-triples` as a whole. In both cases the triples are then
 * posted to `/api/process-document`, a toast is shown and `onComplete`
 * (if provided) receives the triples plus any returned embeddings.
 *
 * @param text - Raw text content of the document.
 * @param filename - Name of the source file; the extension selects the CSV path.
 * @throws Error when CSV parsing fails or either API call returns a non-OK status.
 */
const processTriples = async (text: string, filename: string) => {
  setProcessingStatus("Extracting triples with LangChain...");

  // Posts text + triples to the processing endpoint and returns the parsed JSON body.
  // NOTE: This API no longer automatically stores triples in Neo4j.
  // Storage in Neo4j is now handled manually through the UI's "Store in Graph DB" button.
  const requestProcessing = async (payloadText: string, payloadTriples: any[]) => {
    const response = await fetch('/api/process-document', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        text: payloadText,
        filename,
        triples: payloadTriples
      })
    });

    if (!response.ok) {
      throw new Error(`Failed to process document: ${response.statusText}`);
    }

    return response.json();
  };

  // If it's a CSV file, process each row as a document with LLM extraction
  if (filename.toLowerCase().endsWith('.csv')) {
    setProcessingStatus(`Processing CSV file rows as documents (${(text.length / 1024).toFixed(2)} KB)...`);

    let triples: any[];
    try {
      console.log(`🔥 DocumentProcessor: Starting CSV row-by-row processing for file: ${filename}`);
      triples = await parseCSVToTriples(text);
      console.log(`🔥 DocumentProcessor: Extracted ${triples.length} triples from CSV file`);
    } catch (err) {
      // Only genuine CSV parsing/extraction failures are labelled as such;
      // errors from the processing request below keep their own message.
      console.error(`CSV processing error:`, err);
      throw new Error(`Failed to parse CSV file: ${err instanceof Error ? err.message : String(err)}`);
    }
    setProgress(60);

    // For very large triple sets, limit what we send to the API
    const maxTriplesToProcess = 10000;
    let triplesToProcess = triples;

    if (triples.length > maxTriplesToProcess) {
      console.log(`Limiting triples to ${maxTriplesToProcess} out of ${triples.length} total`);
      setProcessingStatus(`Processing ${maxTriplesToProcess} of ${triples.length} triples (limited for performance)...`);
      triplesToProcess = triples.slice(0, maxTriplesToProcess);
    } else {
      setProcessingStatus(`Processing ${triplesToProcess.length} triples...`);
    }

    // Send only a short summary as text instead of the full CSV content.
    const processingData = await requestProcessing(
      `CSV file with ${triples.length} triples`,
      triplesToProcess
    );
    setProgress(100);
    setProcessingStatus("Processing complete!");

    // Notify about completion
    toast({
      title: "CSV Processing Complete",
      description: `Processed ${triplesToProcess.length} triples${triples.length > triplesToProcess.length ? ' (limited from ' + triples.length + ' total)' : ''} from your CSV file.`,
      duration: 5000,
    });

    // Call the onComplete callback with results
    onComplete?.({
      triples: triplesToProcess,
      totalTriples: triples.length,
      embeddings: processingData.embeddings || [],
      filename
    });

    return; // CSV path is complete; skip the standard flow below.
  }

  // Standard processing for non-CSV files
  const extractResponse = await fetch('/api/extract-triples', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text })
  });

  if (!extractResponse.ok) {
    throw new Error(`Failed to extract triples: ${extractResponse.statusText}`);
  }

  const extractData = await extractResponse.json();
  // Fall back to an empty list when the response carries no triples array,
  // so the `.length` reads below cannot throw.
  const triples: any[] = Array.isArray(extractData?.triples) ? extractData.triples : [];
  setProgress(60);

  setProcessingStatus("Generating embeddings...");

  const processingData = await requestProcessing(text, triples);
  setProgress(100);
  setProcessingStatus("Processing complete!");

  // Notify about completion
  toast({
    title: "Triple Extraction Complete",
    description: `Extracted ${triples.length} triples and generated embeddings for the knowledge graph.`,
    duration: 5000,
  });

  // Call the onComplete callback with results
  onComplete?.({
    triples,
    embeddings: processingData.embeddings || [],
    filename
  });
};
|
|
|
|
/**
 * Generates sentence embeddings for a document via `/api/sentence-embeddings`.
 *
 * For CSV files the text sent to the API is built from the triples returned
 * by `parseCSVToTriples`, each rendered as "subject predicate object" and
 * joined with ". ". Other files are sent unchanged. On success a toast is
 * shown and `onComplete` (if provided) receives the sentence count and samples.
 *
 * @param text - Raw text content of the document.
 * @param filename - Name of the source file; also sent as `documentId`.
 * @throws Error when CSV conversion fails or the API returns a non-OK status.
 */
const processSentenceEmbeddings = async (text: string, filename: string) => {
  let processableText = text;

  // If it's a CSV file, we need to convert it to text first
  if (filename.toLowerCase().endsWith('.csv')) {
    setProcessingStatus("Preparing CSV data for embedding...");
    try {
      const csvTriples = await parseCSVToTriples(text);
      // Create a text representation by joining subjects, predicates and objects
      processableText = csvTriples
        .map(t => `${t.subject} ${t.predicate} ${t.object}`)
        .join('. ');
    } catch (err) {
      throw new Error(`Failed to process CSV file: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  setProcessingStatus("Chunking text into sentences...");

  // Call sentence embeddings API
  const embeddingsResponse = await fetch('/api/sentence-embeddings', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      text: processableText,
      documentId: filename
    })
  });

  if (!embeddingsResponse.ok) {
    throw new Error(`Failed to process sentence embeddings: ${embeddingsResponse.statusText}`);
  }

  const embeddingsData = await embeddingsResponse.json();
  setProgress(100);
  setProcessingStatus("Sentence embeddings complete!");

  // Notify about completion
  toast({
    title: "Sentence Embeddings Complete",
    description: `Generated embeddings for ${embeddingsData.count} sentences from your document.`,
    duration: 5000,
  });

  // Show sample sentences in console for debugging
  console.log("Sample sentences:", embeddingsData.samples);

  // Call the onComplete callback with results
  onComplete?.({
    sentenceCount: embeddingsData.count,
    samples: embeddingsData.samples,
    filename
  });
};
|
|
|
|
/**
 * Reads a File as text and resolves with content ready for processing.
 *
 * - `.csv` files resolve with the raw text (only a line count is logged).
 * - `.json` files are passed through `convertJsonToText` first.
 * - Everything else resolves with the raw text.
 *
 * An empty file resolves with an empty string instead of being treated as a
 * read failure.
 *
 * @param file - The file to read.
 * @returns A promise resolving to the text content.
 */
const readFileContent = (file: File): Promise<string> => {
  return new Promise((resolve, reject) => {
    console.log(`Reading file: ${file.name}, size: ${(file.size / 1024).toFixed(2)} KB`);

    const lowerName = file.name.toLowerCase();
    const reader = new FileReader();

    reader.onload = () => {
      const content = reader.result;

      // Check the type rather than truthiness: "" from an empty file is a
      // successful read, not a failure.
      if (typeof content !== 'string') {
        reject(new Error("Failed to read file content"));
        return;
      }
      console.log(`File content loaded, length: ${content.length} characters`);

      // Special handling for CSV files
      if (lowerName.endsWith('.csv')) {
        console.log(`Processing CSV file content...`);
        // Don't parse here, just report the line count
        const lineCount = content.split('\n').length;
        console.log(`CSV file has ${lineCount} lines`);
        resolve(content);
        return;
      }

      if (lowerName.endsWith('.json')) {
        try {
          console.log(`Processing JSON file content...`);
          // Convert JSON to readable text format for processing
          const textContent = convertJsonToText(content);
          console.log(`Converted JSON file to text format, length: ${textContent.length} characters`);
          resolve(textContent);
        } catch (err) {
          console.error(`JSON conversion error:`, err);
          reject(new Error(`Failed to process JSON file: ${err instanceof Error ? err.message : String(err)}`));
        }
        return;
      }

      resolve(content);
    };

    reader.onerror = (error) => {
      console.error(`Error reading file:`, error);
      reject(new Error("Error reading file"));
    };

    reader.readAsText(file);
  });
};
|
|
|
|
/**
 * Converts JSON content into an indented, human-readable text rendering.
 *
 * Objects are rendered as `key: value` lines inside braces, arrays as
 * `Item N: value` lines inside brackets, and primitives via `String()`.
 * `null` renders as "null". The result is prefixed with
 * "JSON Document Content:" and a blank line.
 *
 * @param jsonContent - Raw JSON text.
 * @returns The text rendering, or `jsonContent` unchanged when it cannot be
 *   parsed or formatted (a warning is logged in that case).
 */
const convertJsonToText = (jsonContent: string): string => {
  try {
    // Parse the JSON to validate it
    const jsonData: unknown = JSON.parse(jsonContent);

    // Recursively renders one value; `indent` is the nesting depth and
    // controls the leading whitespace of the lines emitted for this level.
    const formatJsonObject = (obj: unknown, indent: number = 0): string => {
      const spaces = ' '.repeat(indent);

      if (obj === null || obj === undefined) {
        return 'null';
      }

      if (typeof obj === 'string' || typeof obj === 'number' || typeof obj === 'boolean') {
        return String(obj);
      }

      if (Array.isArray(obj)) {
        if (obj.length === 0) return '[]';
        const items = obj
          .map((item, index) => `${spaces} Item ${index + 1}: ${formatJsonObject(item, indent + 1)}`)
          .join('\n');
        return `[\n${items}\n${spaces}]`;
      }

      if (typeof obj === 'object') {
        const entries = Object.entries(obj);
        if (entries.length === 0) return '{}';

        const props = entries
          .map(([key, value]) => `${spaces} ${key}: ${formatJsonObject(value, indent + 1)}`)
          .join('\n');
        return `{\n${props}\n${spaces}}`;
      }

      return String(obj);
    };

    // Create a descriptive text representation
    return `JSON Document Content:\n\n` + formatJsonObject(jsonData);
  } catch (error) {
    console.warn('Failed to parse JSON, treating as plain text:', error);
    // If JSON parsing fails, return the original content as-is
    return jsonContent;
  }
};
|
|
|
|
/**
 * Treats each CSV data row as a small document and extracts triples from it
 * with `/api/extract-triples`.
 *
 * The first non-empty line is the header. Each following row is rendered as
 * "header: value" lines (empty cells omitted) and posted to the extraction
 * endpoint, one request at a time. Rows that are empty or whose request
 * fails are logged and skipped. Status and progress state are updated as
 * rows are handled.
 *
 * Limitation: lines are split on "\n" before quote handling, so quoted
 * fields that contain line breaks are not supported.
 *
 * @param csvContent - Raw CSV text.
 * @returns All triples returned for all rows, in row order.
 * @throws Error when there is no header row plus at least one data row.
 */
const parseCSVToTriples = async (csvContent: string): Promise<any[]> => {
  console.log(`Processing CSV content as individual documents, length: ${csvContent.length} characters`);

  // Trims a raw cell, strips one pair of enclosing double quotes and, for
  // quoted cells, turns doubled quotes ("") back into a single quote.
  const cleanCell = (raw: string): string => {
    const trimmed = raw.trim();
    const quoted = /^"(.*)"$/.exec(trimmed);
    return quoted ? quoted[1].replace(/""/g, '"') : trimmed;
  };

  // Splits one CSV line on commas that are not inside double quotes.
  const splitCsvLine = (line: string): string[] => {
    const cells: string[] = [];
    let cellStart = 0;
    let inQuotes = false;

    for (let i = 0; i < line.length; i++) {
      if (line[i] === '"') {
        inQuotes = !inQuotes;
      } else if (line[i] === ',' && !inQuotes) {
        cells.push(cleanCell(line.substring(cellStart, i)));
        cellStart = i + 1;
      }
    }

    // Add the last field
    cells.push(cleanCell(line.substring(cellStart)));
    return cells;
  };

  // Split the CSV content into lines
  const lines = csvContent.split('\n').filter(line => line.trim().length > 0);
  console.log(`CSV has ${lines.length} non-empty lines`);

  if (lines.length < 2) {
    throw new Error("CSV file must contain a header row and at least one data row");
  }

  // Parse the header with the same quote-aware splitter used for data rows,
  // so quoted header names containing commas line up with their columns.
  const header = splitCsvLine(lines[0]);
  console.log(`CSV headers: ${header.join(', ')}`);

  // Get data rows (skip header)
  const dataRows = lines.slice(1);
  console.log(`Processing ${dataRows.length} data rows as individual documents`);

  const allTriples: any[] = [];

  // Process each row as a separate document, sequentially.
  for (let rowIdx = 0; rowIdx < dataRows.length; rowIdx++) {
    const rowNumber = rowIdx + 1;
    setProcessingStatus(`Processing CSV row ${rowNumber}/${dataRows.length} with LLM...`);

    // Create document text from the row data
    const fields = splitCsvLine(dataRows[rowIdx]);
    const columnCount = Math.min(header.length, fields.length);
    let documentText = '';
    for (let col = 0; col < columnCount; col++) {
      if (fields[col] && fields[col].trim()) {
        documentText += `${header[col]}: ${fields[col]}\n`;
      }
    }

    // Skip empty rows
    if (!documentText.trim()) {
      console.warn(`Skipping empty CSV row ${rowNumber}`);
      continue;
    }

    console.log(`Processing row ${rowNumber} as document: ${documentText.substring(0, 100)}...`);

    // Extract triples from this row's text using LLM. A failure only skips
    // this row; the remaining rows are still processed.
    try {
      const response = await fetch('/api/extract-triples', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          text: documentText,
          useLangChain: true // Use LLM-based extraction
        })
      });

      if (!response.ok) {
        console.error(`Failed to extract triples from row ${rowNumber}: ${response.statusText}`);
      } else {
        const data = await response.json();
        if (Array.isArray(data?.triples)) {
          console.log(`Extracted ${data.triples.length} triples from row ${rowNumber}`);
          for (const triple of data.triples) {
            allTriples.push(triple);
          }
        }
      }
    } catch (error) {
      console.error(`Error processing row ${rowNumber}:`, error);
    }

    // Update progress (20-60 range) whether or not the request succeeded.
    setProgress(20 + (rowIdx / dataRows.length) * 40);
  }

  console.log(`Successfully extracted ${allTriples.length} triples from ${dataRows.length} CSV rows`);
  return allTriples;
};
|
|
|
|
return (
  <Card className={className}>
    <CardHeader>
      <CardTitle>Process Document</CardTitle>
      <CardDescription>
        Extract triples from documents and build a knowledge graph
      </CardDescription>
    </CardHeader>
    <CardContent>
      <div className="space-y-4">
        {/* Upload area: the label wraps a visually hidden file input */}
        <div className="flex items-start gap-4">
          <div className="grid w-full gap-2">
            <label htmlFor="document-upload" className="cursor-pointer">
              <div className="flex h-24 w-full items-center justify-center rounded-md border border-dashed border-input bg-muted/50 p-4 hover:bg-muted/80 transition-colors">
                <div className="flex flex-col items-center gap-2">
                  <FileUp className="h-10 w-10 text-muted-foreground" />
                  <span className="text-sm font-medium text-muted-foreground">
                    {file ? file.name : "Upload document"}
                  </span>
                </div>
              </div>
              <input
                id="document-upload"
                type="file"
                accept=".md,.txt,.csv"
                onChange={handleFileChange}
                className="sr-only"
              />
            </label>
          </div>
        </div>

        {/* Processing mode tabs */}
        <Tabs
          defaultValue="triples"
          value={processingTab}
          onValueChange={setProcessingTab}
          className="w-full"
        >
          <TabsList className="grid w-full grid-cols-2">
            <TabsTrigger value="triples">Knowledge Triples</TabsTrigger>
            <TabsTrigger value="embeddings">Sentence Embeddings</TabsTrigger>
          </TabsList>
          <TabsContent value="triples">
            <div className="space-y-4 py-4">
              <div className="flex items-center space-x-2">
                <Switch
                  id="use-sentence-chunking"
                  checked={useSentenceChunking}
                  onCheckedChange={setUseSentenceChunking}
                />
                <Label htmlFor="use-sentence-chunking">Use sentence-level chunking</Label>
              </div>
            </div>
          </TabsContent>
          <TabsContent value="embeddings">
            <div className="py-4 text-sm text-muted-foreground">
              You can now generate embeddings directly from the document table after processing.
              <div className="flex items-center mt-2 p-2 bg-muted/30 rounded-md">
                <Zap className="h-4 w-4 text-primary mr-2" />
                <span>Click the lightning icon in the document table to generate embeddings</span>
              </div>
            </div>
          </TabsContent>
        </Tabs>

        {/* Last error, if any */}
        {error && (
          <Alert variant="destructive">
            <AlertCircle className="h-4 w-4" />
            <AlertDescription>{error}</AlertDescription>
          </Alert>
        )}

        {/* Status line and progress bar, shown only while processing */}
        {isProcessing && (
          <div className="space-y-2">
            <div className="flex items-center gap-2 text-sm">
              <Loader2 className="h-4 w-4 animate-spin" />
              <span>{processingStatus}</span>
            </div>
            <Progress value={progress} className="h-2 w-full" />
          </div>
        )}

        {/* Disabled until a file is selected and while a run is in flight */}
        <Button
          onClick={processFile}
          className="w-full"
          disabled={!file || isProcessing}
        >
          {isProcessing ? (
            <>
              <Loader2 className="mr-2 h-4 w-4 animate-spin" />
              Processing...
            </>
          ) : (
            <>
              <FileText className="mr-2 h-4 w-4" />
              Process Document & Generate Triples
            </>
          )}
        </Button>
      </div>
    </CardContent>
  </Card>
);
}
|