dgx-spark-playbooks/nvidia/txt2kg/assets/frontend/app/api/process-document/route.ts

170 lines
7.4 KiB
TypeScript
Raw Normal View History

2025-12-02 19:43:52 +00:00
//
// SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
2025-10-06 17:05:41 +00:00
import { NextRequest, NextResponse } from 'next/server';
import { RemoteBackendService } from '@/lib/remote-backend';
import { EmbeddingsService } from '@/lib/embeddings';
import type { Triple } from '@/types/graph';
import { BackendService } from '@/lib/backend-service';
import { getGraphDbType } from '../settings/route';
/**
* API endpoint for processing documents with LangChain, generating embeddings,
* and storing in the knowledge graph
* POST /api/process-document
*/
export async function POST(req: NextRequest) {
try {
// Parse request body
const body = await req.json();
const {
text,
filename,
triples,
useLangChain,
useGraphTransformer,
systemPrompt,
extractionPrompt,
graphTransformerPrompt
} = body;
if (!text || typeof text !== 'string') {
return NextResponse.json({ error: 'Text is required' }, { status: 400 });
}
if (!triples || !Array.isArray(triples)) {
return NextResponse.json({ error: 'Triples are required' }, { status: 400 });
}
// Initialize services
const backendService = RemoteBackendService.getInstance();
const embeddingsService = EmbeddingsService.getInstance();
console.log(`🔍 API: Processing document "${filename || 'unnamed'}" (${text.length} chars)`);
console.log(`🔍 API: Processing ${triples.length} triples`);
console.log(`🔍 API: Using LangChain for triple extraction: ${useLangChain ? 'Yes' : 'No'}`);
console.log(`🔍 API: First few triples:`, triples.slice(0, 3));
if (useLangChain) {
console.log(`Using LLMGraphTransformer: ${useGraphTransformer ? 'Yes' : 'No'}`);
}
// Log if custom prompts are being used
if (systemPrompt || extractionPrompt || graphTransformerPrompt) {
console.log('Using custom prompts for extraction');
if (systemPrompt) console.log('Custom system prompt provided');
if (extractionPrompt) console.log('Custom extraction prompt provided');
if (graphTransformerPrompt) console.log('Custom graph transformer prompt provided');
}
// Filter triples to ensure they are valid
const validTriples = triples.filter((triple: any) => {
return (
triple &&
typeof triple.subject === 'string' && triple.subject.trim() !== '' &&
typeof triple.predicate === 'string' && triple.predicate.trim() !== '' &&
typeof triple.object === 'string' && triple.object.trim() !== ''
);
}) as Triple[];
console.log(`Found ${validTriples.length} valid triples`);
// If useLangChain flag is set, we'll extract triples using the LangChain route
let triplesForProcessing = validTriples;
if (useLangChain && !filename?.toLowerCase().endsWith('.csv')) {
try {
console.log('Using LangChain for native triple extraction...');
// Use absolute URL with origin from request to fix URL parsing error
const baseUrl = new URL(req.url).origin;
console.log(`Using base URL: ${baseUrl} for LangChain API call`);
// Call the extract-triples endpoint with useLangChain flag and custom prompts
const requestBody: any = {
text,
useLangChain: true,
useGraphTransformer
};
// Add custom prompts if available
if (systemPrompt) requestBody.systemPrompt = systemPrompt;
if (extractionPrompt) requestBody.extractionPrompt = extractionPrompt;
if (graphTransformerPrompt) requestBody.graphTransformerPrompt = graphTransformerPrompt;
const langchainResponse = await fetch(`${baseUrl}/api/extract-triples`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(requestBody)
});
if (!langchainResponse.ok) {
const errorText = await langchainResponse.text();
console.error(`LangChain API error: ${langchainResponse.status} ${langchainResponse.statusText}`, errorText);
throw new Error(`LangChain extraction failed: ${langchainResponse.statusText} (${langchainResponse.status})`);
}
const langchainResult = await langchainResponse.json();
if (langchainResult.triples && Array.isArray(langchainResult.triples) && langchainResult.triples.length > 0) {
console.log(`Successfully extracted ${langchainResult.triples.length} triples using LangChain${useGraphTransformer ? ' with GraphTransformer' : ''}`);
triplesForProcessing = langchainResult.triples;
} else {
console.warn('LangChain extraction returned no triples, falling back to provided triples');
}
} catch (langchainError) {
console.error('Error using LangChain for triple extraction:', langchainError);
console.log('Falling back to provided triples');
}
}
// Check if this is a CSV file - if so, skip processing
const isCSVFile = filename && filename.toLowerCase().endsWith('.csv');
const isJSONFile = filename && filename.toLowerCase().endsWith('.json');
if (isCSVFile) {
console.log('CSV file detected, skipping text processor');
// NOTE: Neo4j storage is no longer done automatically
// This is now handled manually through the "Store in Graph DB" button in the UI
} else if (isJSONFile) {
console.log('JSON file detected, processed as unstructured text document - embeddings can be generated manually via the UI');
// NOTE: Automatic embeddings generation has been disabled for JSON files.
// Embeddings are now generated only when explicitly requested through the "Generate Embeddings" button in the UI.
} else {
// Regular text processing flow - no automatic embeddings generation
console.log('Document processed successfully - embeddings can be generated manually via the UI');
// NOTE: Automatic embeddings generation has been disabled.
// Embeddings are now generated only when explicitly requested through the "Generate Embeddings" button in the UI.
}
// Return success response
return NextResponse.json({
success: true,
message: 'Document processed successfully',
tripleCount: triplesForProcessing.length,
triples: triplesForProcessing,
documentName: filename || 'unnamed',
langchainUsed: useLangChain,
graphTransformerUsed: useGraphTransformer,
customPromptsUsed: !!(systemPrompt || extractionPrompt || graphTransformerPrompt),
graphDbType: getGraphDbType()
});
} catch (error) {
console.error('Error processing document:', error);
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
return NextResponse.json(
{ error: `Failed to process document: ${errorMessage}` },
{ status: 500 }
);
}
}