import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { StructuredOutputParser } from "langchain/output_parsers";
import { PromptTemplate } from "@langchain/core/prompts";
import { z } from "zod";
import { Triple } from "@/types/graph";
import axios from "axios";
import { ChatOpenAI } from "@langchain/openai";
import { Document } from "langchain/document";
import { LLMGraphTransformer } from "@langchain/community/experimental/graph_transformers/llm";
import { BaseLanguageModel } from "@langchain/core/language_models/base";
import { SystemMessage, HumanMessage } from "@langchain/core/messages";
import { langChainService } from "./langchain-service";
import { getShouldStopProcessing, resetStopProcessing } from "@/app/api/stop-processing/route";

/** A sentence paired with its embedding vector and optional positional metadata. */
export interface SentenceEmbedding {
  sentence: string;
  embedding: number[];
  metadata?: {
    index: number;
    documentId?: string;
    context?: string;
  };
}

/** Metadata attached to every triple produced by this module. */
export interface TripleMetadata {
  entityTypes: string[];
  source: string;
  context: string;
  /** Only populated by the LLMGraphTransformer path. */
  sourceProperties?: Record<string, unknown>;
  /** Only populated by the LLMGraphTransformer path. */
  targetProperties?: Record<string, unknown>;
}

/** A triple enriched with a confidence score and extraction metadata. */
export type ExtractedTriple = Triple & {
  confidence: number;
  metadata: TripleMetadata;
};

// Interfaces describing the graph document shapes read from LLMGraphTransformer output
interface NodeType {
  id: string;
  type: string;
  properties?: Record<string, unknown>;
}

interface RelationshipType {
  source: NodeType;
  target: NodeType;
  type: string;
  properties?: Record<string, unknown>;
}

interface GraphDocument {
  nodes: NodeType[];
  relationships: RelationshipType[];
}

interface LLMGraphTransformerOptions {
  llm: BaseLanguageModel;
  allowedNodes?: string[];
  allowedRelationships?: string[];
  nodeProperties?: string[];
}

/** Optional prompt overrides accepted by `processDocument`. */
interface PromptOptions {
  systemPrompt?: string;
  extractionPrompt?: string;
  graphTransformerPrompt?: string;
}

type LLMProvider = "ollama" | "nvidia" | "vllm";

/** Human-readable provider names used in initialization error messages. */
const PROVIDER_LABELS: Record<LLMProvider, string> = {
  ollama: "Ollama",
  nvidia: "NVIDIA",
  vllm: "vLLM",
};

/** Triples whose confidence is below this value are dropped during post-processing. */
const MIN_TRIPLE_CONFIDENCE = 0.6;

/**
 * Patterns tried, in order, against each line of free-form LLM output.
 * Hoisted so the regexes are compiled once rather than once per line.
 */
const TRIPLE_LINE_PATTERNS: readonly RegExp[] = [
  // Standard format: ('subject', 'relation', 'object')
  /\('([^']+)',\s*'([^']+)',\s*'([^']+)'\)/,
  // Double quotes: ("subject", "relation", "object")
  /\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)/,
  // No parentheses: "subject", "relation", "object"
  /"([^"]+)",\s*"([^"]+)",\s*"([^"]+)"/,
  // Mixed quotes: ('subject', "relation", 'object')
  /\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)/,
  // Plain text: subject, relation, object
  /^([^,]+),\s*([^,]+),\s*(.+)$/,
];

/** Render an unknown thrown value as a message string. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Normalize chat-model message content to plain text.
 * String content is returned as-is; for array content the string entries and
 * the `text` field of object entries are concatenated, other entries are skipped.
 */
function messageContentToText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((part: unknown) => {
        if (typeof part === "string") return part;
        if (part !== null && typeof part === "object") {
          const maybeText = (part as { text?: unknown }).text;
          return typeof maybeText === "string" ? maybeText : "";
        }
        return "";
      })
      .join("");
  }
  return content == null ? "" : String(content);
}

/**
 * Text processing pipeline using LangChain.js that:
 * 1. Chunks documents into optimal sizes
 * 2. Extracts entities and relationships
 * 3. Performs metadata enrichment
 * 4. Outputs structured triples for the knowledge graph
 */
export class TextProcessor {
  private static instance: TextProcessor;
  private sentenceTransformerUrl: string;
  private modelName: string;
  private llm: ChatOpenAI | null = null;
  private tripleParser: StructuredOutputParser<z.ZodTypeAny> | null = null;
  private extractionTemplate: PromptTemplate | null = null;
  private selectedLLMProvider: LLMProvider = "ollama";
  private ollamaModel: string = "llama3.1:8b";
  private ollamaBaseUrl: string = "http://localhost:11434/v1";
  private vllmModel: string = "meta-llama/Llama-3.2-3B-Instruct";
  private vllmBaseUrl: string = "http://localhost:8001/v1";

  private constructor() {
    this.sentenceTransformerUrl = process.env.SENTENCE_TRANSFORMER_URL || "http://localhost:8000";
    this.modelName = process.env.MODEL_NAME || "all-MiniLM-L6-v2";

    // Ollama configuration
    this.ollamaBaseUrl = process.env.OLLAMA_BASE_URL || "http://localhost:11434/v1";
    this.ollamaModel = process.env.OLLAMA_MODEL || "llama3.1:8b";

    // vLLM configuration
    this.vllmBaseUrl = process.env.VLLM_BASE_URL || "http://localhost:8001/v1";
    this.vllmModel = process.env.VLLM_MODEL || "meta-llama/Llama-3.2-3B-Instruct";

    // Provider priority: vLLM > NVIDIA > Ollama (Ollama needs no API key, so it is the default)
    if (process.env.VLLM_BASE_URL) {
      this.selectedLLMProvider = "vllm";
    } else if (process.env.NVIDIA_API_KEY) {
      this.selectedLLMProvider = "nvidia";
    } else {
      this.selectedLLMProvider = "ollama";
    }
  }

  /**
   * Get the singleton instance of TextProcessor
   */
  public static getInstance(): TextProcessor {
    if (!TextProcessor.instance) {
      TextProcessor.instance = new TextProcessor();
    }
    return TextProcessor.instance;
  }

  /**
   * Initialize the LLM for the selected provider, the structured triple parser
   * and the default extraction prompt template.
   * @throws Error if the NVIDIA provider is selected without NVIDIA_API_KEY,
   *         or if the provider's model cannot be created.
   */
  public async initialize(): Promise<void> {
    // Only NVIDIA requires an API key up front
    if (this.selectedLLMProvider === "nvidia" && !process.env.NVIDIA_API_KEY) {
      throw new Error(
        "NVIDIA API key is required when using NVIDIA provider. Please set NVIDIA_API_KEY in your environment variables."
      );
    }

    const label = PROVIDER_LABELS[this.selectedLLMProvider];
    try {
      switch (this.selectedLLMProvider) {
        case "ollama":
          this.llm = await langChainService.getOllamaModel(this.ollamaModel, {
            temperature: 0.1,
            maxTokens: 8192,
            baseURL: this.ollamaBaseUrl,
          });
          break;
        case "nvidia":
          // Use the default Nemotron model for NVIDIA
          this.llm = await langChainService.getNemotronModel({
            temperature: 0.1,
            maxTokens: 8192,
          });
          break;
        case "vllm":
          this.llm = await langChainService.getVllmModel(this.vllmModel, {
            temperature: 0.1,
            maxTokens: 8192,
            baseURL: this.vllmBaseUrl,
          });
          break;
      }
    } catch (error) {
      console.error(`Failed to initialize ${label} model:`, error);
      throw new Error(`Failed to initialize ${label} model: ${describeError(error)}`);
    }

    // Structured parser describing the expected triple array
    this.tripleParser = StructuredOutputParser.fromZodSchema(
      z.array(
        z.object({
          subject: z.string().describe("The subject entity of the triple"),
          predicate: z.string().describe("The relation/predicate connecting subject and object"),
          object: z.string().describe("The object entity of the triple"),
          confidence: z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
          metadata: z.object({
            entityTypes: z.array(z.string()).describe("Entity types for subject and object"),
            source: z.string().describe("The source text this triple was extracted from"),
            context: z.string().describe("Surrounding context for the triple")
          }).describe("Additional metadata about the triple")
        })
      ).describe("Array of knowledge graph triples extracted from the text")
    );

    // Default extraction prompt
    const templateString = `
You are a knowledge graph builder that extracts structured information from text.
Extract subject-predicate-object triples from the following text.

Guidelines:
- Extract only factual triples present in the text
- Normalize entity names to their canonical form
- Assign appropriate confidence scores (0-1)
- Include entity types in metadata
- For each triple, include a brief context from the source text

Text: {text}

{format_instructions}
`;
    this.extractionTemplate = PromptTemplate.fromTemplate(templateString);
  }

  /**
   * Process text to extract structured triples using the default prompt
   * @param text Text to process
   * @returns Deduplicated triples with metadata, sorted by descending confidence
   * @throws Error if the LLM cannot be initialized or the user requested a stop
   */
  public async processText(text: string): Promise<ExtractedTriple[]> {
    if (!this.llm || !this.tripleParser || !this.extractionTemplate) {
      await this.initialize();
    }
    const llm = this.llm;
    const parser = this.tripleParser;
    const template = this.extractionTemplate;
    if (!llm || !parser || !template) {
      throw new Error(`LLM configuration error: ${this.describeMissingLLM()}`);
    }

    // Step 1: Chunk the text into manageable pieces
    const chunks = await this.chunkText(text);
    console.log(`Split text into ${chunks.length} chunks`);

    // Steps 2 and 3: extract per chunk, then deduplicate and normalize
    return this.extractTriplesFromChunks(chunks, "", async (chunk) => {
      const prompt = await template.format({
        text: chunk,
        format_instructions: parser.getFormatInstructions(),
      });
      const response = await llm.invoke(prompt);
      const parsed: ExtractedTriple[] = await parser.parse(messageContentToText(response.content));
      return parsed;
    });
  }

  /**
   * Split text into chunks of appropriate size
   * @param text Text to split
   * @returns Array of text chunks
   */
  private async chunkText(text: string): Promise<string[]> {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 64000, // Large chunks, sized by the original author for Llama 70B models (16K tokens)
      chunkOverlap: 1000, // Overlap to maintain context across chunk boundaries
      separators: ["\n\n", "\n", ". ", " ", ""], // Preferred split locations
    });
    return await splitter.splitText(text);
  }

  /**
   * Split text into sentence-level chunks
   * @param text Text to split into sentences
   * @returns Trimmed sentences; fragments shorter than 10 characters are dropped
   */
  public async splitIntoSentences(text: string): Promise<string[]> {
    const sentenceSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000, // Maximum sentence length (very long to ensure sentences aren't split)
      chunkOverlap: 0, // No overlap for sentences
      separators: [". ", "! ", "? ", "\n", "\t"], // Sentence endings and paragraph breaks
    });

    // First split by paragraphs (two or more newlines), then by sentence delimiters
    const paragraphs = text.split(/\n{2,}/);
    const sentences: string[] = [];
    for (const paragraph of paragraphs) {
      if (paragraph.trim().length === 0) continue;
      const paragraphSentences = await sentenceSplitter.splitText(paragraph);
      sentences.push(...paragraphSentences);
    }

    return sentences
      .map((s) => s.trim())
      .filter((s) => s.length >= 10); // Filter out very short sentences
  }

  /**
   * Generate embeddings using local Sentence Transformer service
   * @param texts Array of texts to embed
   * @returns One embedding per input text, as returned by the service
   * @throws Error if the request fails or the response has no `embeddings` array
   */
  private async generateEmbeddings(texts: string[]): Promise<number[][]> {
    console.log(`Generating embeddings for ${texts.length} texts using local Sentence Transformer service`);

    let response;
    try {
      // POST to the /embed endpoint of the configured sentence-transformer service
      response = await axios.post<{ embeddings?: unknown }>(`${this.sentenceTransformerUrl}/embed`, {
        texts: texts,
        model: this.modelName,
      });
    } catch (error) {
      console.error('Error generating embeddings with Sentence Transformer:', error);
      throw new Error(`Failed to generate embeddings: ${describeError(error)}`);
    }

    if (response.status !== 200) {
      throw new Error(`Failed to generate embeddings: ${response.statusText}`);
    }
    const embeddings = response.data?.embeddings;
    if (!Array.isArray(embeddings)) {
      throw new Error("Failed to generate embeddings: response did not contain an embeddings array");
    }
    return embeddings as number[][];
  }

  /**
   * Generate embeddings for an array of sentences
   * @param sentences Array of sentences to embed
   * @param documentId Optional document identifier for metadata
   * @returns Array of sentence embeddings (empty when no sentences are given)
   * @throws Error if the service returns a different number of embeddings than sentences
   */
  public async generateSentenceEmbeddings(
    sentences: string[],
    documentId?: string
  ): Promise<SentenceEmbedding[]> {
    console.log(`Generating embeddings for ${sentences.length} sentences`);

    // Nothing to embed: skip the service round-trip entirely
    if (sentences.length === 0) {
      return [];
    }

    const embeddings = await this.generateEmbeddings(sentences);
    if (embeddings.length !== sentences.length) {
      throw new Error(
        `Failed to generate embeddings: expected ${sentences.length} embeddings but received ${embeddings.length}`
      );
    }

    // Map embeddings to sentences with metadata
    return sentences.map((sentence, i) => ({
      sentence,
      embedding: embeddings[i],
      metadata: {
        index: i,
        documentId: documentId || undefined,
        context: this.getSentenceContext(sentences, i),
      },
    }));
  }

  /**
   * Build a context window of up to three sentences: the previous sentence,
   * the sentence itself, and the next sentence, joined by single spaces.
   * @private
   */
  private getSentenceContext(sentences: string[], index: number): string {
    const previousSentence = index > 0 ? sentences[index - 1] : '';
    const nextSentence = index < sentences.length - 1 ? sentences[index + 1] : '';

    const parts = [sentences[index]];
    if (previousSentence) {
      parts.unshift(previousSentence);
    }
    if (nextSentence) {
      parts.push(nextSentence);
    }
    return parts.join(' ');
  }

  /**
   * Post-process extracted triples to remove duplicates and normalize
   * @param triples Array of raw triples
   * @returns Lower-cased, deduplicated triples with confidence >= 0.6, sorted by descending confidence
   */
  private postProcessTriples(triples: ExtractedTriple[]): ExtractedTriple[] {
    // Key on the lower-cased, trimmed subject|predicate|object; keep the highest-confidence duplicate
    const tripleMap = new Map<string, ExtractedTriple>();
    for (const raw of triples) {
      const triple: ExtractedTriple = {
        ...raw,
        subject: raw.subject.toLowerCase().trim(),
        predicate: raw.predicate.toLowerCase().trim(),
        object: raw.object.toLowerCase().trim(),
      };
      const key = `${triple.subject}|${triple.predicate}|${triple.object}`;
      const existing = tripleMap.get(key);
      if (!existing || triple.confidence > existing.confidence) {
        tripleMap.set(key, triple);
      }
    }

    return Array.from(tripleMap.values())
      .filter((triple) => triple.confidence >= MIN_TRIPLE_CONFIDENCE) // Only keep reasonably confident triples
      .sort((a, b) => b.confidence - a.confidence); // Highest confidence first
  }

  /**
   * Shared extraction loop: run `extract` on every chunk, collect the results,
   * then deduplicate and normalize them.
   *
   * The user stop flag is checked before each chunk; when set it is reset and an
   * error is thrown. A failure inside `extract` is logged and that chunk is skipped,
   * so one bad chunk does not abort the whole document.
   *
   * @param chunks Text chunks to process
   * @param logSuffix Text appended to log messages to identify the calling variant
   * @param extract Callback producing the triples for one chunk
   */
  private async extractTriplesFromChunks(
    chunks: string[],
    logSuffix: string,
    extract: (chunk: string) => Promise<ExtractedTriple[]>
  ): Promise<ExtractedTriple[]> {
    const allTriples: ExtractedTriple[] = [];

    for (let i = 0; i < chunks.length; i++) {
      // Check if processing should be stopped
      if (getShouldStopProcessing()) {
        console.log(`Processing stopped by user at chunk ${i + 1}/${chunks.length}`);
        resetStopProcessing(); // Reset the flag for next time
        throw new Error('Processing stopped by user');
      }

      const chunk = chunks[i];
      console.log(`Processing chunk ${i + 1}/${chunks.length} (${chunk.length} chars)${logSuffix}`);
      try {
        allTriples.push(...(await extract(chunk)));
      } catch (error) {
        console.error(`Error processing chunk ${i + 1}${logSuffix}:`, error);
      }
    }

    const processedTriples = this.postProcessTriples(allTriples);
    console.log(`Extracted ${processedTriples.length} unique triples after post-processing${logSuffix}`);
    return processedTriples;
  }

  /** Explain, for the currently selected provider, why no LLM is available. */
  private describeMissingLLM(): string {
    switch (this.selectedLLMProvider) {
      case "ollama":
        return "Ollama server connection failed. Please ensure Ollama is running and accessible.";
      case "nvidia":
        return "NVIDIA API key is required. Please set NVIDIA_API_KEY in your environment variables.";
      case "vllm":
        return "vLLM server connection failed. Please ensure the vLLM server is running and accessible.";
    }
  }

  // Make LLM accessible for the LLMGraphTransformer
  public getLLM(): ChatOpenAI | null {
    return this.llm;
  }

  /**
   * Set the LLM provider to use for triple extraction.
   * Clears the cached LLM so the next extraction re-initializes it.
   */
  public setLLMProvider(provider: 'ollama' | 'nvidia' | 'vllm', options?: {
    ollamaModel?: string;
    ollamaBaseUrl?: string;
    vllmModel?: string;
    vllmBaseUrl?: string;
  }): void {
    this.selectedLLMProvider = provider;
    if (provider === 'ollama') {
      this.ollamaModel = options?.ollamaModel || this.ollamaModel;
      this.ollamaBaseUrl = options?.ollamaBaseUrl || this.ollamaBaseUrl;
    } else if (provider === 'vllm') {
      this.vllmModel = options?.vllmModel || this.vllmModel;
      this.vllmBaseUrl = options?.vllmBaseUrl || this.vllmBaseUrl;
    }
    // Reset the LLM so it gets re-initialized with the new provider
    this.llm = null;
  }

  /**
   * Get the current LLM provider
   */
  public getLLMProvider(): 'ollama' | 'nvidia' | 'vllm' {
    return this.selectedLLMProvider;
  }

  /**
   * Process text to extract structured triples with a custom prompt template
   * @param text Text to process
   * @param customPrompt Custom prompt template to use instead of the default;
   *        it is formatted with `text` and `format_instructions` values
   * @returns Deduplicated triples with metadata, sorted by descending confidence
   * @throws Error if the LLM cannot be initialized or the user requested a stop
   */
  public async processTextWithCustomPrompt(text: string, customPrompt: string): Promise<ExtractedTriple[]> {
    if (!this.llm || !this.tripleParser) {
      await this.initialize();
    }
    const llm = this.llm;
    const parser = this.tripleParser;
    if (!llm || !parser) {
      throw new Error("LLM is not initialized. Please ensure your selected provider is properly configured.");
    }

    // Step 1: Chunk the text into manageable pieces
    const chunks = await this.chunkText(text);
    console.log(`Split text into ${chunks.length} chunks`);

    // Step 2: Process each chunk with the custom prompt template
    const customTemplate = PromptTemplate.fromTemplate(customPrompt);
    return this.extractTriplesFromChunks(chunks, " with custom prompt", async (chunk) => {
      const prompt = await customTemplate.format({
        text: chunk,
        format_instructions: parser.getFormatInstructions(),
      });
      const response = await llm.invoke(prompt);
      const parsed: ExtractedTriple[] = await parser.parse(messageContentToText(response.content));
      return parsed;
    });
  }

  /**
   * Process text to extract structured triples with a custom system prompt.
   * The model's free-form reply is parsed line by line rather than with the
   * structured output parser, so every triple gets a fixed confidence of 0.9.
   * @param text Text to process
   * @param customSystemPrompt Custom system prompt to use
   * @returns Deduplicated triples with metadata, sorted by descending confidence
   * @throws Error if the LLM cannot be initialized or the user requested a stop
   */
  public async processTextWithCustomSystemPrompt(text: string, customSystemPrompt: string): Promise<ExtractedTriple[]> {
    if (!this.llm) {
      await this.initialize();
    }
    const llm = this.llm;
    if (!llm) {
      throw new Error("LLM is not initialized. Please ensure your selected provider is properly configured.");
    }

    // Step 1: Chunk the text into manageable pieces
    const chunks = await this.chunkText(text);
    console.log(`Split text into ${chunks.length} chunks for processing with custom system prompt`);

    // Step 2: Send each chunk as the human message under the custom system prompt
    return this.extractTriplesFromChunks(chunks, " with custom system prompt", async (chunk) => {
      const response = await llm.invoke([
        new SystemMessage(customSystemPrompt),
        new HumanMessage(chunk),
      ]);
      const simpleTriples = this.parseTripleLines(messageContentToText(response.content));

      // Convert to the expected format with confidence and metadata
      return simpleTriples.map((triple) => ({
        ...triple,
        confidence: 0.9, // Default confidence for custom prompt extraction
        metadata: {
          entityTypes: [], // Empty entity types as they're not provided
          source: chunk.substring(0, 100) + "...", // First 100 chars as context
          context: `${triple.subject} ${triple.predicate} ${triple.object}`,
        },
      }));
    });
  }

  /**
   * Helper method to parse triple lines from LLM output.
   * Each non-empty line is tested against the known tuple formats in order;
   * the first matching format supplies the triple. Lines matching none are skipped.
   * @private
   */
  private parseTripleLines(text: string): Triple[] {
    const triples: Triple[] = [];

    for (const line of text.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;

      let match: RegExpMatchArray | null = null;
      for (const pattern of TRIPLE_LINE_PATTERNS) {
        match = trimmed.match(pattern);
        if (match) break;
      }
      if (!match) continue;

      const [, subject, predicate, object] = match;
      if (subject === undefined || predicate === undefined || object === undefined) continue;

      triples.push({
        subject: subject.trim(),
        predicate: predicate.trim(),
        object: object.trim(),
      });
    }

    return triples;
  }
}

/**
 * Process a document and extract triples with metadata
 * @param text Document text
 * @param useLangChain Whether to use LangChain's extraction (optional)
 * @param useGraphTransformer Whether to use LLMGraphTransformer; only consulted when `useLangChain` is true (optional)
 * @param options Custom prompt options (optional)
 * @returns Extracted triples with metadata
 */
export async function processDocument(
  text: string,
  useLangChain = false,
  useGraphTransformer = false,
  options?: PromptOptions
): Promise<ExtractedTriple[]> {
  if (useLangChain && useGraphTransformer) {
    // Pass graphTransformerPrompt if available
    return await processDocumentWithGraphTransformer(text, options?.graphTransformerPrompt);
  }

  const processor = TextProcessor.getInstance();

  if (useLangChain) {
    // A custom extraction prompt, when provided, replaces the default template
    if (options?.extractionPrompt) {
      return await processor.processTextWithCustomPrompt(text, options.extractionPrompt);
    }
    return await processor.processText(text);
  }

  // Default processor, with a custom system prompt when provided
  if (options?.systemPrompt) {
    return await processor.processTextWithCustomSystemPrompt(text, options.systemPrompt);
  }
  return await processor.processText(text);
}

/**
 * Process a document using LangChain's LLMGraphTransformer
 * @param text Document text
 * @param customGraphPrompt Optional custom prompt for the graph transformer
 * @returns One triple per extracted relationship, each with a fixed confidence of 0.9
 * @throws Error if no LLM is available or the transformer fails
 */
async function processDocumentWithGraphTransformer(
  text: string,
  customGraphPrompt?: string
): Promise<ExtractedTriple[]> {
  const processor = TextProcessor.getInstance();

  // Initialize LLM if not already done
  if (!processor.getLLM()) {
    await processor.initialize();
  }

  // Ensure we have an LLM
  const llm = processor.getLLM();
  if (!llm) {
    throw new Error(
      `LLM is not initialized for provider "${processor.getLLMProvider()}". Please ensure your selected provider is properly configured.`
    );
  }

  // Use the existing LLM with LLMGraphTransformer.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- options carry the extra `customPrompt` key below
  const llmTransformerOptions: any = {
    llm,
    // Optional configurations
    allowedNodes: ["Person", "Organization", "Concept", "Location", "Event", "Product"],
    allowedRelationships: ["RELATED_TO", "PART_OF", "LOCATED_IN", "WORKS_AT", "CREATED", "BELONGS_TO", "HAS_PROPERTY"],
    nodeProperties: ["name", "type", "description"]
  };

  // Add custom prompt if provided.
  // NOTE(review): confirm that LLMGraphTransformer actually reads a `customPrompt` option;
  // if it only accepts a differently named/typed prompt option, this value is ignored.
  if (customGraphPrompt) {
    llmTransformerOptions.customPrompt = customGraphPrompt;
  }

  const llmTransformer = new LLMGraphTransformer(llmTransformerOptions);

  // Create LangChain document from text
  const documents = [new Document({ pageContent: text })];

  try {
    // Extract graph documents
    const graphDocuments = await llmTransformer.convertToGraphDocuments(documents);

    // Convert every relationship of every returned graph document to a triple
    const triples: ExtractedTriple[] = [];
    for (const graphDoc of graphDocuments) {
      for (const relationship of graphDoc.relationships) {
        // The library's relationship type is narrowed to the fields read here
        const rel = relationship as unknown as RelationshipType;
        triples.push({
          subject: rel.source.id,
          predicate: rel.type.toLowerCase(),
          object: rel.target.id,
          confidence: 0.9, // Default high confidence for LLM-extracted relationships
          metadata: {
            entityTypes: [rel.source.type, rel.target.type],
            source: text.substring(0, 100) + "...", // First 100 chars as source context
            context: `${rel.source.id} ${rel.type} ${rel.target.id}`,
            sourceProperties: rel.source.properties || {},
            targetProperties: rel.target.properties || {}
          }
        });
      }
    }
    return triples;
  } catch (error) {
    console.error("Error processing with LLMGraphTransformer:", error);
    throw new Error(`Failed to process with LangChain: ${describeError(error)}`);
  }
}

/**
 * Extract entity types from a text passage
 * @param text Text to analyze
 * @returns Map of entity names to the distinct types reported for them
 */
export async function extractEntityTypes(text: string): Promise<Map<string, string[]>> {
  const processor = TextProcessor.getInstance();
  const triples = await processor.processText(text);
  const entityTypes = new Map<string, string[]>();

  // Record `type` for `entity` unless the type is missing or already recorded
  const addType = (entity: string, type: string | undefined): void => {
    if (!type) return;
    const known = entityTypes.get(entity) || [];
    if (!known.includes(type)) {
      known.push(type);
    }
    entityTypes.set(entity, known);
  };

  for (const triple of triples) {
    const types = triple.metadata?.entityTypes;
    if (!types) continue;
    addType(triple.subject, types[0]); // first entry describes the subject
    addType(triple.object, types[1]); // second entry describes the object
  }

  return entityTypes;
}

/**
 * Split text into sentences and generate embeddings
 * @param text Text to process
 * @param documentId Optional document identifier
 * @returns Array of sentence embeddings
 */
export async function processSentenceEmbeddings(
  text: string,
  documentId?: string
): Promise<SentenceEmbedding[]> {
  const processor = TextProcessor.getInstance();

  // Split text into sentences
  const sentences = await processor.splitIntoSentences(text);

  // Generate embeddings for the sentences
  return await processor.generateSentenceEmbeddings(sentences, documentId);
}