dgx-spark-playbooks/nvidia/txt2kg/assets/frontend/components/document-processor.tsx

"use client";

import { useState } from "react";
import { Button } from "@/components/ui/button";
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import { AlertCircle, FileText, FileUp, Loader2, Zap } from "lucide-react";
import { Progress } from "@/components/ui/progress";
import { Alert, AlertDescription } from "@/components/ui/alert";
import { useToast } from "@/components/ui/use-toast";
import { Switch } from "@/components/ui/switch";
import { Label } from "@/components/ui/label";
import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
import { useDocuments } from "@/contexts/document-context";

/** Props accepted by the `DocumentProcessor` component. */
interface DocumentProcessorProps {
  /** Optional class name applied to the root `Card` element. */
  className?: string;
  /**
   * Optional callback invoked with a result payload once processing succeeds.
   * The payload shape depends on which processing routine produced it.
   */
  onComplete?: (results: any) => void;
}

export function DocumentProcessor({ onComplete, className }: DocumentProcessorProps) {
  // Document list and actions supplied by the shared document context.
  const { addDocuments, processDocuments, documents } = useDocuments();
  // File currently chosen in the upload control; null when nothing is selected.
  const [file, setFile] = useState<File | null>(null);
  // True while a processing run is in flight; disables the button and shows the spinner.
  const [isProcessing, setIsProcessing] = useState(false);
  // Value fed to the <Progress> bar.
  const [progress, setProgress] = useState(0);
  // Human-readable status line rendered next to the spinner.
  const [processingStatus, setProcessingStatus] = useState("");
  // Last error message, rendered in the destructive <Alert>; null when there is none.
  const [error, setError] = useState<string | null>(null);
  // Active tab of the options panel ("triples" or "embeddings").
  const [processingTab, setProcessingTab] = useState<string>("triples");
  // Bound to the "Use sentence-level chunking" switch.
  // NOTE(review): not read by any processing routine visible in this file.
  const [useSentenceChunking, setUseSentenceChunking] = useState(true);
  // NOTE(review): neither the value nor the setter is referenced elsewhere in this file.
  const [useEntityExtraction, setUseEntityExtraction] = useState(true);
  // Toast notification helper.
  const { toast } = useToast();

  /**
   * Handles a selection made through the hidden file input.
   *
   * Registers the chosen file with the document context, stores it as the
   * current selection, clears any previous error/progress state and shows a
   * confirmation toast. Does nothing when the dialog yields no file.
   *
   * @param e - Change event emitted by the `<input type="file">` element.
   */
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const selectedFile = e.target.files?.[0] ?? null;

    // Clear the native input so that picking the *same* file again still emits
    // a change event. Without this, once the selection state has been cleared
    // the browser sees no change and the user cannot re-select that file.
    // The File object captured above remains valid after the reset.
    e.target.value = "";

    if (!selectedFile) {
      return;
    }

    // Add file to document context for display in document list
    addDocuments([selectedFile]);
    setFile(selectedFile);
    setError(null);
    setProgress(0);
    setProcessingStatus("");

    // Show toast notification
    toast({
      title: "File Uploaded",
      description: `"${selectedFile.name}" added to document list.`,
      duration: 3000,
    });
  };

  /**
   * Processes the currently selected file through the document context.
   *
   * Looks up the document-list entry whose name matches the selected file,
   * hands its id to `processDocuments`, then reports success via toast and the
   * optional `onComplete` callback. Failures are logged, surfaced in the error
   * alert and in a destructive toast. The processing flag is always cleared.
   */
  const processFile = async () => {
    if (file === null) {
      setError("Please select a file to process");
      return;
    }

    try {
      setIsProcessing(true);
      setProgress(0);
      setProcessingStatus("Reading file...");
      setError(null);

      // Resolve the context entry that corresponds to the selected file.
      const targetName = file.name;
      const matchingDocument = documents.find((candidate) => candidate.name === targetName);
      if (!matchingDocument) {
        throw new Error("Document not found in document list");
      }

      // Delegate the actual work to the document context.
      await processDocuments([matchingDocument.id], {
        useLangChain: true,
        useGraphTransformer: false,
        promptConfigs: undefined
      });
      setProgress(100);
      setProcessingStatus("Processing complete!");

      toast({
        title: "Processing Complete",
        description: "Document has been processed successfully. You can now generate embeddings from the document table.",
        duration: 5000,
      });

      // Drop the current selection now that it has been processed.
      setFile(null);

      onComplete?.({
        success: true,
        message: "Document processed successfully"
      });
    } catch (err) {
      console.error("Error processing document:", err);

      // Only Error instances carry a message worth showing to the user.
      const failure = err instanceof Error ? err : null;
      setError(failure ? failure.message : "Unknown error processing document");

      toast({
        title: "Processing Failed",
        description: failure ? failure.message : "Failed to process document",
        variant: "destructive",
        duration: 5000,
      });
    } finally {
      setIsProcessing(false);
    }
  };

  /**
   * Extracts triples from a document and submits them to `/api/process-document`.
   *
   * CSV files (by extension) are converted row-by-row via `parseCSVToTriples`
   * and capped at 10,000 triples before submission; every other file is sent to
   * `/api/extract-triples` as a whole. On success a toast is shown and
   * `onComplete` (if provided) receives the triples, the embeddings returned
   * by the API and the filename.
   *
   * @param text - Raw text content of the document.
   * @param filename - Original file name; its extension selects the CSV path.
   * @throws Error when CSV parsing, triple extraction or document processing fails.
   */
  const processTriples = async (text: string, filename: string) => {
    // For very large triple sets, limit what we send to the API
    const maxTriplesToProcess = 10000;

    // Shared submission step used by both the CSV and the plain-text path.
    // NOTE: This API no longer automatically stores triples in Neo4j.
    // Storage in Neo4j is now handled manually through the UI's "Store in Graph DB" button.
    const submitDocument = async (documentText: string, documentTriples: unknown[]) => {
      const processingResponse = await fetch('/api/process-document', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          text: documentText,
          filename,
          triples: documentTriples
        })
      });

      if (!processingResponse.ok) {
        throw new Error(`Failed to process document: ${processingResponse.statusText}`);
      }

      return processingResponse.json();
    };

    setProcessingStatus("Extracting triples with LangChain...");

    // If it's a CSV file, process each row as a document with LLM extraction
    if (filename.toLowerCase().endsWith('.csv')) {
      setProcessingStatus(`Processing CSV file rows as documents (${(text.length / 1024).toFixed(2)} KB)...`);

      // Only the CSV conversion is wrapped here, so that a failing
      // process-document request is not misreported as a CSV parse failure.
      let csvTriples: unknown[];
      try {
        console.log(`🔥 DocumentProcessor: Starting CSV row-by-row processing for file: ${filename}`);
        csvTriples = await parseCSVToTriples(text);
      } catch (err) {
        console.error(`CSV processing error:`, err);
        throw new Error(`Failed to parse CSV file: ${err instanceof Error ? err.message : String(err)}`);
      }
      console.log(`🔥 DocumentProcessor: Extracted ${csvTriples.length} triples from CSV file`);
      setProgress(60);

      const isLimited = csvTriples.length > maxTriplesToProcess;
      const triplesToProcess = isLimited ? csvTriples.slice(0, maxTriplesToProcess) : csvTriples;

      // Show exactly one status so the "limited" notice is not overwritten.
      if (isLimited) {
        console.log(`Limiting triples to ${maxTriplesToProcess} out of ${csvTriples.length} total`);
        setProcessingStatus(`Processing ${maxTriplesToProcess} of ${csvTriples.length} triples (limited for performance)...`);
      } else {
        setProcessingStatus(`Processing ${triplesToProcess.length} triples...`);
      }

      // Don't send the full CSV content, only a short description of it.
      const csvProcessingData = await submitDocument(
        `CSV file with ${csvTriples.length} triples`,
        triplesToProcess
      );
      setProgress(100);
      setProcessingStatus("Processing complete!");

      // Notify about completion
      toast({
        title: "CSV Processing Complete",
        description: `Processed ${triplesToProcess.length} triples${isLimited ? ' (limited from ' + csvTriples.length + ' total)' : ''} from your CSV file.`,
        duration: 5000,
      });

      // Call the onComplete callback with results
      if (onComplete) {
        onComplete({
          triples: triplesToProcess,
          totalTriples: csvTriples.length,
          embeddings: csvProcessingData.embeddings || [],
          filename
        });
      }

      return; // Early return to skip the standard processing flow
    }

    // Standard processing for non-CSV files
    const extractResponse = await fetch('/api/extract-triples', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text })
    });

    if (!extractResponse.ok) {
      throw new Error(`Failed to extract triples: ${extractResponse.statusText}`);
    }

    const extractData = await extractResponse.json();
    // Guard against a response without a `triples` array so the length lookup
    // below cannot throw a TypeError after the document was already processed.
    const triples: unknown[] = Array.isArray(extractData?.triples) ? extractData.triples : [];
    setProgress(60);

    setProcessingStatus("Generating embeddings...");

    const processingData = await submitDocument(text, triples);
    setProgress(100);
    setProcessingStatus("Processing complete!");

    // Notify about completion
    toast({
      title: "Triple Extraction Complete",
      description: `Extracted ${triples.length} triples and generated embeddings for the knowledge graph.`,
      duration: 5000,
    });

    // Call the onComplete callback with results
    if (onComplete) {
      onComplete({
        triples,
        embeddings: processingData.embeddings || [],
        filename
      });
    }
  };

  /**
   * Sends document text to `/api/sentence-embeddings` and reports the outcome.
   *
   * CSV input is first turned into triples via `parseCSVToTriples`, which are
   * then flattened into "subject predicate object" sentences. On success a
   * toast is shown and `onComplete` (if provided) receives the sentence count,
   * the sample sentences and the filename.
   *
   * @param text - Raw text content of the document.
   * @param filename - Original file name; also sent as the `documentId`.
   * @throws Error when CSV conversion or the embeddings request fails.
   */
  const processSentenceEmbeddings = async (text: string, filename: string) => {
    const isCsv = filename.toLowerCase().endsWith('.csv');
    let processableText = text;

    if (isCsv) {
      // CSV content is not prose, so build sentences out of the extracted triples.
      setProcessingStatus("Preparing CSV data for embedding...");
      try {
        const rowTriples = await parseCSVToTriples(text);
        const sentences = rowTriples.map(
          (triple) => `${triple.subject} ${triple.predicate} ${triple.object}`
        );
        processableText = sentences.join('. ');
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Failed to process CSV file: ${reason}`);
      }
    }

    setProcessingStatus("Chunking text into sentences...");

    const requestBody = JSON.stringify({
      text: processableText,
      documentId: filename
    });
    const embeddingsResponse = await fetch('/api/sentence-embeddings', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: requestBody
    });

    if (!embeddingsResponse.ok) {
      throw new Error(`Failed to process sentence embeddings: ${embeddingsResponse.statusText}`);
    }

    const embeddingsData = await embeddingsResponse.json();
    setProgress(100);
    setProcessingStatus("Sentence embeddings complete!");

    toast({
      title: "Sentence Embeddings Complete",
      description: `Generated embeddings for ${embeddingsData.count} sentences from your document.`,
      duration: 5000,
    });

    // Debug aid: dump the sample sentences returned by the API.
    console.log("Sample sentences:", embeddingsData.samples);

    onComplete?.({
      sentenceCount: embeddingsData.count,
      samples: embeddingsData.samples,
      filename
    });
  };

  /**
   * Reads a file as text via `FileReader`.
   *
   * `.json` files are passed through `convertJsonToText` before resolving;
   * all other files (including `.csv`) resolve with their raw text.
   *
   * @param file - File to read.
   * @returns Promise resolving to the (possibly converted) text content.
   * @throws (rejects) Error when the read fails, yields a non-string result,
   *   or the file is empty.
   */
  const readFileContent = (file: File): Promise<string> => {
    return new Promise((resolve, reject) => {
      console.log(`Reading file: ${file.name}, size: ${(file.size / 1024).toFixed(2)} KB`);

      const lowerName = file.name.toLowerCase();
      const reader = new FileReader();

      reader.onload = (event) => {
        const content = event.target?.result;

        // readAsText yields a string on success; anything else is a read failure.
        if (typeof content !== 'string') {
          reject(new Error("Failed to read file content"));
          return;
        }

        // An empty file was read successfully but has nothing to process;
        // report that explicitly instead of as a read failure.
        if (content.length === 0) {
          reject(new Error("File is empty"));
          return;
        }

        console.log(`File content loaded, length: ${content.length} characters`);

        if (lowerName.endsWith('.csv')) {
          // CSV is not parsed here; only log its size for diagnostics.
          console.log(`Processing CSV file content...`);
          const lineCount = content.split('\n').length;
          console.log(`CSV file has ${lineCount} lines`);
          resolve(content);
          return;
        }

        if (lowerName.endsWith('.json')) {
          try {
            console.log(`Processing JSON file content...`);
            // Convert JSON to readable text format for processing
            const textContent = convertJsonToText(content);
            console.log(`Converted JSON file to text format, length: ${textContent.length} characters`);
            resolve(textContent);
          } catch (err) {
            console.error(`JSON conversion error:`, err);
            reject(new Error(`Failed to process JSON file: ${err instanceof Error ? err.message : String(err)}`));
          }
          return;
        }

        resolve(content);
      };

      reader.onerror = (error) => {
        console.error(`Error reading file:`, error);
        reject(new Error("Error reading file"));
      };

      reader.readAsText(file);
    });
  };

  /**
   * Converts a JSON string into an indented, human-readable text outline.
   *
   * Objects render as `{ key: value }` blocks, arrays as `[ Item N: value ]`
   * blocks, each nesting level indented by two further spaces. The result is
   * prefixed with a "JSON Document Content:" heading.
   *
   * @param jsonContent - Text expected to contain JSON.
   * @returns The formatted outline, or `jsonContent` unchanged when it cannot
   *   be parsed or formatted.
   */
  const convertJsonToText = (jsonContent: string): string => {
    // Recursively renders one JSON value at the given nesting depth.
    const render = (value: unknown, depth: number = 0): string => {
      if (value == null) {
        return 'null';
      }

      const pad = '  '.repeat(depth);
      const childPad = `${pad}  `;

      if (Array.isArray(value)) {
        if (value.length === 0) return '[]';
        const rows = value.map(
          (element, position) => `${childPad}Item ${position + 1}: ${render(element, depth + 1)}`
        );
        return ['[', ...rows, `${pad}]`].join('\n');
      }

      if (typeof value === 'object') {
        const pairs = Object.entries(value as Record<string, unknown>);
        if (pairs.length === 0) return '{}';
        const rows = pairs.map(
          ([key, child]) => `${childPad}${key}: ${render(child, depth + 1)}`
        );
        return ['{', ...rows, `${pad}}`].join('\n');
      }

      // Strings, numbers, booleans and anything else fall back to String().
      return String(value);
    };

    try {
      const parsed: unknown = JSON.parse(jsonContent);
      return `JSON Document Content:\n\n${render(parsed)}`;
    } catch (error) {
      // Not valid JSON: hand the caller the original text untouched.
      console.warn('Failed to parse JSON, treating as plain text:', error);
      return jsonContent;
    }
  };

  /**
   * Parses CSV text and extracts triples from each data row with the LLM.
   *
   * The first non-empty line is the header. Every following non-empty line is
   * rendered as "header: value" lines and posted to `/api/extract-triples`;
   * the triples returned for all rows are concatenated. Rows that are empty or
   * whose request fails are logged and skipped (best effort). Progress is
   * reported between 20 and 60 as rows complete.
   *
   * Header and data lines are split with the same quote-aware logic, and
   * doubled quotes inside a quoted field are unescaped to a single quote.
   * Quoted fields spanning multiple lines are not supported.
   *
   * @param csvContent - Raw CSV text.
   * @returns All triples extracted across the data rows.
   * @throws Error when there is no header row plus at least one data row.
   */
  const parseCSVToTriples = async (csvContent: string): Promise<any[]> => {
    console.log(`Processing CSV content as individual documents, length: ${csvContent.length} characters`);

    // Matches a field that is entirely wrapped in double quotes.
    const quotedField = /^"(.*)"$/;

    // Splits one CSV line on commas that are outside double quotes, trims each
    // field, strips wrapping quotes and unescapes "" to ".
    const splitCsvLine = (line: string): string[] => {
      const cells: string[] = [];
      let cellStart = 0;
      let inQuotes = false;

      const pushCell = (end: number) => {
        const raw = line.substring(cellStart, end).trim();
        const wrapped = quotedField.exec(raw);
        cells.push(wrapped ? (wrapped[1] ?? '').replace(/""/g, '"') : raw);
      };

      for (let i = 0; i < line.length; i++) {
        const ch = line[i];
        if (ch === '"') {
          inQuotes = !inQuotes;
        } else if (ch === ',' && !inQuotes) {
          pushCell(i);
          cellStart = i + 1;
        }
      }
      pushCell(line.length);

      return cells;
    };

    // Split the CSV content into lines (LF or CRLF), dropping blank ones
    const lines = csvContent.split(/\r?\n/).filter(line => line.trim().length > 0);
    console.log(`CSV has ${lines.length} non-empty lines`);

    if (lines.length < 2) {
      throw new Error("CSV file must contain a header row and at least one data row");
    }

    // Parse the header with the same quote-aware splitter used for data rows,
    // so a quoted header containing a comma does not shift the columns.
    const header = splitCsvLine(lines[0] ?? '');
    console.log(`CSV headers: ${header.join(', ')}`);

    // Get data rows (skip header)
    const dataRows = lines.slice(1);
    console.log(`Processing ${dataRows.length} data rows as individual documents`);

    const allTriples: any[] = [];

    // Rows are processed one at a time, in order.
    for (let rowIdx = 0; rowIdx < dataRows.length; rowIdx++) {
      const rowNumber = rowIdx + 1;
      setProcessingStatus(`Processing CSV row ${rowNumber}/${dataRows.length} with LLM...`);

      // Create document text from the row data
      const fields = splitCsvLine(dataRows[rowIdx] ?? '');
      const columnCount = Math.min(header.length, fields.length);
      let documentText = '';
      for (let col = 0; col < columnCount; col++) {
        const value = fields[col];
        if (value && value.trim()) {
          documentText += `${header[col]}: ${value}\n`;
        }
      }

      if (!documentText.trim()) {
        console.warn(`Skipping empty CSV row ${rowNumber}`);
      } else {
        console.log(`Processing row ${rowNumber} as document: ${documentText.substring(0, 100)}...`);

        // Extract triples from this row's text using LLM
        try {
          const response = await fetch('/api/extract-triples', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              text: documentText,
              useLangChain: true // Use LLM-based extraction
            })
          });

          if (!response.ok) {
            console.error(`Failed to extract triples from row ${rowNumber}: ${response.statusText}`);
          } else {
            const data = await response.json();
            if (Array.isArray(data?.triples)) {
              console.log(`Extracted ${data.triples.length} triples from row ${rowNumber}`);
              // Append in place rather than re-copying the accumulator per row.
              for (const triple of data.triples) {
                allTriples.push(triple);
              }
            }
          }
        } catch (error) {
          // A failing row must not abort the whole file.
          console.error(`Error processing row ${rowNumber}:`, error);
        }
      }

      // Advance progress for every row, including skipped or failed ones,
      // so the bar reaches 60 when the last row is done.
      setProgress(20 + (rowNumber / dataRows.length) * 40);
    }

    console.log(`Successfully extracted ${allTriples.length} triples from ${dataRows.length} CSV rows`);
    return allTriples;
  };

  // Rendered UI, top to bottom:
  //  - a dashed drop-zone label wrapping a visually hidden file input
  //    (accepts .md, .txt and .csv) that shows the selected file's name;
  //  - a two-tab options panel ("Knowledge Triples" / "Sentence Embeddings");
  //  - a destructive alert when `error` is set;
  //  - a spinner, status line and progress bar while `isProcessing` is true;
  //  - the action button, disabled when no file is selected or a run is active.
  // NOTE(review): `.json` is handled by readFileContent but is not in the
  // input's accept list; confirm whether that is intentional.
  return (
    <Card className={className}>
      <CardHeader>
        <CardTitle>Process Document</CardTitle>
        <CardDescription>
          Extract triples from documents and build a knowledge graph
        </CardDescription>
      </CardHeader>
      <CardContent>
        <div className="space-y-4">
          <div className="flex items-start gap-4">
            <div className="grid w-full gap-2">
              <label htmlFor="document-upload" className="cursor-pointer">
                <div className="flex h-24 w-full items-center justify-center rounded-md border border-dashed border-input bg-muted/50 p-4 hover:bg-muted/80 transition-colors">
                  <div className="flex flex-col items-center gap-2">
                    <FileUp className="h-10 w-10 text-muted-foreground" />
                    <span className="text-sm font-medium text-muted-foreground">
                      {file ? file.name : "Upload document"}
                    </span>
                  </div>
                </div>
                <input
                  id="document-upload"
                  type="file"
                  accept=".md,.txt,.csv"
                  onChange={handleFileChange}
                  className="sr-only"
                />
              </label>
            </div>
          </div>

          <Tabs
            defaultValue="triples"
            value={processingTab}
            onValueChange={setProcessingTab}
            className="w-full"
          >
            <TabsList className="grid w-full grid-cols-2">
              <TabsTrigger value="triples">Knowledge Triples</TabsTrigger>
              <TabsTrigger value="embeddings">Sentence Embeddings</TabsTrigger>
            </TabsList>
            <TabsContent value="triples">
              <div className="space-y-4 py-4">
                <div className="flex items-center space-x-2">
                  <Switch
                    id="use-sentence-chunking"
                    checked={useSentenceChunking}
                    onCheckedChange={setUseSentenceChunking}
                  />
                  <Label htmlFor="use-sentence-chunking">Use sentence-level chunking</Label>
                </div>
              </div>
            </TabsContent>
            <TabsContent value="embeddings">
              <div className="py-4 text-sm text-muted-foreground">
                You can now generate embeddings directly from the document table after processing.
                <div className="flex items-center mt-2 p-2 bg-muted/30 rounded-md">
                  <Zap className="h-4 w-4 text-primary mr-2" />
                  <span>Click the lightning icon in the document table to generate embeddings</span>
                </div>
              </div>
            </TabsContent>
          </Tabs>

          {error && (
            <Alert variant="destructive">
              <AlertCircle className="h-4 w-4" />
              <AlertDescription>{error}</AlertDescription>
            </Alert>
          )}

          {isProcessing && (
            <div className="space-y-2">
              <div className="flex items-center gap-2 text-sm">
                <Loader2 className="h-4 w-4 animate-spin" />
                <span>{processingStatus}</span>
              </div>
              <Progress value={progress} className="h-2 w-full" />
            </div>
          )}

          <Button
            onClick={processFile}
            className="w-full"
            disabled={!file || isProcessing}
          >
            {isProcessing ? (
              <>
                <Loader2 className="mr-2 h-4 w-4 animate-spin" />
                Processing...
              </>
            ) : (
              <>
                <FileText className="mr-2 h-4 w-4" />
                Process Document & Generate Triples
              </>
            )}
          </Button>
        </div>
      </CardContent>
    </Card>
  );
}