dgx-spark-playbooks/nvidia/txt2kg/assets/frontend/utils/text-processing.ts
2025-10-06 17:05:41 +00:00

341 lines
10 KiB
TypeScript

/**
* Text processing utilities for knowledge graph extraction
* Matches PyTorch Geometric's txt2kg.py implementation
*/
import type { Triple } from '@/types/graph'
// Default maximum chunk length, in characters, used by chunkText.
// Optimized for Gemma3:27b on DGX Spark
const CHUNK_SIZE = 20000
// Intended overlap, in characters, for context preservation between chunks.
// NOTE(review): not referenced by any code visible in this file — confirm whether it is still needed.
const OVERLAP_SIZE = 1000
/**
 * Splits text into sentence-aligned chunks, following the chunking approach of
 * PyTorch Geometric's txt2kg.py, with optional overlap between consecutive chunks.
 *
 * Each chunk is cut from a window of at most `chunkSize` characters. The cut is
 * moved back to just after a sentence ending ('.', '!' or '?') when one exists in
 * the window, and further back to the preceding space when the cut would
 * otherwise land in front of a letter. Chunks are trimmed and empty chunks are
 * never emitted.
 *
 * @param text Text to split; an empty string yields an empty array
 * @param chunkSize Maximum window length in characters (fractions are floored); must be at least 1
 * @param overlapSize Number of characters by which the next window may reach back
 *   into the current one; values that are not greater than zero disable overlap
 * @returns The chunks in document order
 * @throws RangeError when `chunkSize` is not a number of at least 1
 */
export function chunkTextPyG(text: string, chunkSize: number = 512, overlapSize: number = 0): string[] {
  if (!text) {
    return [];
  }
  // A window shorter than one character can never advance through the text;
  // fail loudly instead of looping forever. (`!(x >= 1)` also rejects NaN.)
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be at least 1, received ${chunkSize}`);
  }

  const windowSize = Math.floor(chunkSize);
  const overlap = overlapSize > 0 ? Math.floor(overlapSize) : 0;
  const sentenceEndings = '.!?';
  const letterPattern = /[a-zA-Z]/;
  const whitespacePattern = /\s/;
  const chunks: string[] = [];
  let startIndex = 0;

  while (startIndex < text.length) {
    const endIndex = Math.min(startIndex + windowSize, text.length);

    // The rest of the text fits in one window: emit it and stop
    if (endIndex >= text.length) {
      const finalChunk = text.slice(startIndex).trim();
      if (finalChunk) {
        chunks.push(finalChunk);
      }
      break;
    }

    const chunk = text.slice(startIndex, endIndex);
    let bestSplit = endIndex;

    // Look at the last occurrence of each sentence ending inside the window.
    // Math.min keeps the earliest of those candidates; this is kept as-is to
    // stay aligned with the PyG behaviour this function replicates.
    for (const ending of sentenceEndings) {
      const lastEnding = chunk.lastIndexOf(ending);
      if (lastEnding !== -1) {
        const absolutePos = startIndex + lastEnding + 1;
        // Keep a single space that directly follows the punctuation with this chunk
        const hasSpace = absolutePos < text.length && text.charAt(absolutePos) === ' ';
        bestSplit = Math.min(bestSplit, absolutePos + (hasSpace ? 1 : 0));
      }
    }

    // Avoid cutting in front of a letter: fall back to the last space in the chunk
    if (bestSplit < text.length && letterPattern.test(text.charAt(bestSplit))) {
      const spaceSplit = text.slice(startIndex, bestSplit).lastIndexOf(' ');
      if (spaceSplit !== -1) {
        bestSplit = startIndex + spaceSplit;
      }
    }

    const currentChunk = text.slice(startIndex, bestSplit).trim();
    if (currentChunk) {
      chunks.push(currentChunk);
    }

    if (overlap === 0) {
      // No overlap: continue right after the cut, skipping leading whitespace
      startIndex = bestSplit;
      while (startIndex < text.length && whitespacePattern.test(text.charAt(startIndex))) {
        startIndex++;
      }
    } else {
      // With overlap: advance by (windowSize - overlap), but never beyond the cut.
      // The chunk just emitted ends at bestSplit, so starting any later would
      // silently drop the text in between. Always advance by at least one character.
      const step = Math.max(1, windowSize - overlap);
      startIndex = Math.max(startIndex + 1, Math.min(startIndex + step, bestSplit));
    }
  }

  return chunks;
}
/**
 * Chunks text into sentence-based segments, following the Python implementation
 * this module mirrors.
 *
 * Each chunk is cut from the first `chunkSize` characters of the remaining text.
 * The cut is moved back to just after a sentence ending ('.', '!' or '?') when
 * one exists in that window, and further back to the preceding space when it
 * would otherwise land in front of a letter. Chunks are trimmed; empty chunks
 * (for example from leading or whitespace-only input) are not emitted.
 *
 * @param text Text to split; an empty string yields an empty array
 * @param chunkSize Maximum window length in characters (fractions are floored); must be at least 1
 * @returns The chunks in document order
 * @throws RangeError when `chunkSize` is not a number of at least 1
 */
export function chunkText(text: string, chunkSize: number = CHUNK_SIZE): string[] {
  if (!text) {
    return []
  }
  // A window shorter than one character never consumes any text, which would
  // loop forever; reject it up front. (`!(x >= 1)` also rejects NaN.)
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be at least 1, received ${chunkSize}`)
  }

  const windowSize = Math.floor(chunkSize)
  const sentenceEndings = '.!?'
  const letterPattern = /[a-zA-Z]/
  const whitespacePattern = /\s/
  const chunks: string[] = []
  let remainingText = text

  while (remainingText) {
    // The rest fits in one window: emit it (if non-blank) and stop
    if (remainingText.length <= windowSize) {
      const tail = remainingText.trim()
      if (tail) {
        chunks.push(tail)
      }
      break
    }

    const chunk = remainingText.slice(0, windowSize)
    let bestSplit = windowSize

    // Look at the last occurrence of each sentence ending inside the window;
    // Math.min keeps the earliest of those candidates.
    for (const ending of sentenceEndings) {
      const lastEnding = chunk.lastIndexOf(ending)
      if (lastEnding !== -1) {
        const afterEnding = lastEnding + 1
        // Keep one whitespace character that directly follows the punctuation
        const includeSpace = afterEnding < chunk.length && whitespacePattern.test(chunk.charAt(afterEnding))
        bestSplit = Math.min(bestSplit, afterEnding + (includeSpace ? 1 : 0))
      }
    }

    // Avoid cutting in front of a letter: fall back to the last space before the cut
    if (bestSplit < remainingText.length && letterPattern.test(remainingText.charAt(bestSplit))) {
      const spaceSplit = remainingText.slice(0, bestSplit).lastIndexOf(' ')
      if (spaceSplit !== -1) {
        bestSplit = spaceSplit
      }
    }

    const piece = remainingText.slice(0, bestSplit).trim()
    if (piece) {
      chunks.push(piece)
    }

    // Drop the processed part; trimming also guarantees progress when the cut
    // landed on a leading space (bestSplit === 0)
    remainingText = remainingText.slice(bestSplit).trim()
  }

  return chunks
}
/**
 * Merges triples from multiple chunks into one list, removing duplicates.
 *
 * Two triples are duplicates when subject, predicate and object are all equal.
 * The first occurrence is kept and the order of first occurrences is preserved.
 *
 * @param triplesArrays One array of triples per processed chunk
 * @returns The de-duplicated triples
 */
export function mergeTriples(triplesArrays: Array<Array<Triple>>): Array<Triple> {
  const uniqueTriples = new Map<string, Triple>()
  for (const triples of triplesArrays) {
    for (const triple of triples) {
      // Serialise the three fields as a JSON array so the key is unambiguous:
      // a plain "s|p|o" join would treat ("a|b", "c") and ("a", "b|c") as equal.
      const key = JSON.stringify([triple.subject, triple.predicate, triple.object])
      if (!uniqueTriples.has(key)) {
        uniqueTriples.set(key, triple)
      }
    }
  }
  return Array.from(uniqueTriples.values())
}
/**
 * Quoted triple syntaxes, tried in order before the plain-text fallback.
 * All of them capture subject, relation and object as groups 1-3.
 */
const QUOTED_TRIPLE_PATTERNS: readonly RegExp[] = [
  // Standard format: ('subject', 'relation', 'object')
  /\('([^']+)',\s*'([^']+)',\s*'([^']+)'\)/,
  // Double quotes: ("subject", "relation", "object")
  /\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)/,
  // No parentheses: "subject", "relation", "object"
  /"([^"]+)",\s*"([^"]+)",\s*"([^"]+)"/,
  // Mixed quotes: ('subject', "relation", 'object')
  /\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)/,
]

/** Plain text fallback: subject, relation, object (the object may itself contain commas) */
const PLAIN_TRIPLE_PATTERN = /^([^,]+),\s*([^,]+),\s*(.+)$/

/**
 * Parses triple strings into structured Triple objects.
 *
 * Two layouts are accepted:
 * - one triple per line (used whenever the input contains a newline), and
 * - a single line of triples separated by whitespace: "(e, r, e) (e, r, e) ...".
 *
 * Lines that cannot be parsed are skipped. Subject, predicate and object are
 * trimmed and lower-cased.
 *
 * @param triplesStr Raw text containing the triples
 * @returns The triples that could be parsed, in input order
 */
export function parseTriples(triplesStr: string): Triple[] {
  const lines = triplesStr.split('\n')
  // For single-line input, put every "(...)" group on its own line. Each group
  // keeps its parentheses so the quoted patterns can still match it, and input
  // without any parentheses is simply parsed as one line.
  const candidates =
    lines.length > 1 ? lines : triplesStr.trim().replace(/\)\s+\(/g, ')\n(').split('\n')

  const processed: Triple[] = []
  for (const candidate of candidates) {
    const triple = parseTripleLine(candidate)
    if (triple) processed.push(triple)
  }
  return processed
}

/**
 * Helper function to parse a single triple line with multiple formats.
 *
 * @param line One candidate triple, with or without surrounding whitespace
 * @returns The parsed triple, or null for blank lines, lines containing "note:"
 *   and lines that match none of the supported formats
 */
function parseTripleLine(line: string): Triple | null {
  const trimmed = line.trim()
  if (!trimmed || trimmed.toLowerCase().includes('note:')) return null

  let match: RegExpMatchArray | null = null
  for (const pattern of QUOTED_TRIPLE_PATTERNS) {
    match = trimmed.match(pattern)
    if (match) break
  }
  if (!match) {
    // Unquoted "(subject, relation, object)": drop the wrapping parentheses so
    // they do not end up inside the subject and object
    match = stripWrappingParens(trimmed).match(PLAIN_TRIPLE_PATTERN)
  }
  if (!match) return null

  return {
    subject: match[1].trim().toLowerCase(),
    predicate: match[2].trim().toLowerCase(),
    object: match[3].trim().toLowerCase(),
  }
}

/**
 * Removes one pair of parentheses that encloses the whole string. The string is
 * returned unchanged when the leading "(" is closed before the final character
 * (e.g. "(a) b, c, (d)") or when the parentheses are unbalanced.
 */
function stripWrappingParens(value: string): string {
  if (!value.startsWith('(') || !value.endsWith(')')) return value
  let depth = 0
  for (let i = 0; i < value.length - 1; i++) {
    const ch = value.charAt(i)
    if (ch === '(') {
      depth++
    } else if (ch === ')') {
      depth--
      // The opening parenthesis closed early, so it does not wrap the whole string
      if (depth === 0) return value
    }
  }
  return depth === 1 ? value.slice(1, -1) : value
}
// Re-export types
export type { Triple }
/**
 * Builds a node/edge graph from a list of triples.
 *
 * Every distinct subject or object becomes one node whose id and label are that
 * string; nodes appear in order of first mention. Every triple becomes one edge
 * from its subject to its object, labelled with the predicate.
 *
 * @param triples Array of triples
 * @returns Graph representation with nodes and edges
 */
export function triplesToGraph(triples: Triple[]) {
  const nodeIndex = new Map<string, { id: string; label: string }>()

  // Registers a node the first time its name is seen; later mentions are no-ops
  const registerNode = (name: string): void => {
    if (nodeIndex.has(name)) return
    nodeIndex.set(name, { id: name, label: name })
  }

  const edges: Array<{ source: string; target: string; label: string }> = triples.map(
    ({ subject, predicate, object }) => {
      // Subject is registered before object so node order follows first mention
      registerNode(subject)
      registerNode(object)
      return { source: subject, target: object, label: predicate }
    },
  )

  return { nodes: Array.from(nodeIndex.values()), edges }
}
/**
 * Extracts triples from text, splitting it with chunkTextPyG when it is longer
 * than one chunk.
 *
 * Text of at most `chunkSize` characters is handed to `extractTriplesFn` in a
 * single call. Longer text is chunked, `extractTriplesFn` is invoked for every
 * chunk without waiting for earlier chunks to finish, and the per-chunk results
 * are merged and de-duplicated by mergeTriples.
 *
 * @param text Text to process
 * @param extractTriplesFn Function to extract triples from a chunk
 * @param chunkSize Maximum size of each chunk (default: 512)
 * @param overlapSize Overlap forwarded to chunkTextPyG (default: 0, no overlap)
 * @returns Array of extracted triples
 */
export async function processTextWithChunkingPyG(
  text: string,
  extractTriplesFn: (chunk: string) => Promise<Triple[]>,
  chunkSize = 512,
  overlapSize = 0,
): Promise<Triple[]> {
  // Short input: a single extraction call, no chunking or merging
  const fitsInOneChunk = text.length <= chunkSize
  if (fitsInOneChunk) {
    const directResult = await extractTriplesFn(text)
    return directResult
  }

  const pieces = chunkTextPyG(text, chunkSize, overlapSize)
  const total = pieces.length

  let overlapNote = ', no overlap'
  if (overlapSize > 0) {
    overlapNote = `, ${overlapSize} char overlap`
  }
  console.log(`PyG Chunking: Split text into ${total} chunks (${chunkSize} chars each${overlapNote})`)

  // Start every extraction up front; results are awaited together below
  const pending: Array<Promise<Triple[]>> = []
  pieces.forEach((piece, index) => {
    console.log(`Processing PyG chunk ${index + 1}/${total}, size: ${piece.length}`)
    pending.push(extractTriplesFn(piece))
  })

  const perChunkTriples = await Promise.all(pending)
  return mergeTriples(perChunkTriples)
}
/**
 * Extracts triples from text, splitting it with chunkText when it is longer than
 * one chunk.
 *
 * Text of at most `chunkSize` characters is handed to `extractTriplesFn` in a
 * single call. Longer text is chunked, `extractTriplesFn` is invoked for every
 * chunk without waiting for earlier chunks to finish, and the per-chunk results
 * are merged and de-duplicated by mergeTriples.
 *
 * @param text Text to process
 * @param extractTriplesFn Function to extract triples from a chunk
 * @param chunkSize Maximum size of each chunk
 * @param overlapSize Accepted for API compatibility; this function does not use
 *   it, because chunkText takes no overlap argument
 * @returns Array of extracted triples
 */
export async function processTextWithChunking(
  text: string,
  extractTriplesFn: (chunk: string) => Promise<Triple[]>,
  chunkSize = 20000,
  overlapSize = 1000,
): Promise<Triple[]> {
  // Short input: a single extraction call, no chunking or merging
  const fitsInOneChunk = text.length <= chunkSize
  if (fitsInOneChunk) {
    const directResult = await extractTriplesFn(text)
    return directResult
  }

  const pieces = chunkText(text, chunkSize)
  const total = pieces.length
  console.log(`Split text into ${total} chunks`)

  // Start every extraction up front; results are awaited together below
  const pending: Array<Promise<Triple[]>> = []
  pieces.forEach((piece, index) => {
    console.log(`Processing chunk ${index + 1}/${total}, size: ${piece.length}`)
    pending.push(extractTriplesFn(piece))
  })

  const perChunkTriples = await Promise.all(pending)
  return mergeTriples(perChunkTriples)
}