dgx-spark-playbooks/nvidia/txt2kg/assets/frontend/utils/text-processing.ts.backup
2025-10-06 17:05:41 +00:00

228 lines
6.6 KiB
Plaintext

/**
* Text processing utilities for knowledge graph extraction
* Matches PyTorch Geometric's txt2kg.py implementation
*/
import type { Triple } from '@/types/graph'
const CHUNK_SIZE = 20000 // Optimized for Gemma3:27b on DGX Spark
const OVERLAP_SIZE = 1000 // For context preservation between chunks
/**
 * Splits text into trimmed chunks of at most `chunkSize` characters.
 *
 * Each chunk is cut right after the last sentence terminator (`.`, `!` or `?`)
 * found within the first `chunkSize` characters of the remaining text. When no
 * terminator is present, or the cut would land directly in front of a letter,
 * the cut falls back to the last space before that point so words stay intact.
 *
 * @param text Text to split. Empty or whitespace-only text yields `[]`.
 * @param chunkSize Maximum chunk length in characters; fractional values are
 *   rounded down. Defaults to `CHUNK_SIZE`.
 * @returns Non-empty, trimmed chunks in their original order.
 * @throws RangeError If `chunkSize` is NaN or rounds down to less than 1
 *   (such a size can never make progress through the text).
 */
export function chunkText(text: string, chunkSize: number = CHUNK_SIZE): string[] {
  const limit = Math.floor(chunkSize)
  if (!(limit >= 1)) {
    throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`)
  }
  if (!text) {
    return []
  }

  // Punctuation marks that typically end sentences
  const sentenceEndings = '.!?'
  const chunks: string[] = []

  // Trimming up front guarantees the remaining text never starts with a space,
  // so every split index below is >= 1 and every emitted chunk is non-empty.
  let remainingText = text.trim()
  while (remainingText) {
    // Whatever is left fits into a single chunk
    if (remainingText.length <= limit) {
      chunks.push(remainingText)
      break
    }

    const head = remainingText.slice(0, limit)

    // Locate the LAST sentence terminator of any kind inside the window
    let lastEnding = -1
    for (const ending of sentenceEndings) {
      lastEnding = Math.max(lastEnding, head.lastIndexOf(ending))
    }

    let bestSplit = limit
    if (lastEnding !== -1) {
      // Include the punctuation and one following whitespace character
      const afterEnding = lastEnding + 1
      bestSplit = afterEnding + (/\s/.test(head.charAt(afterEnding)) ? 1 : 0)
    }

    // Do not break words: if the cut lands right before a letter, move it back
    // to the last space. remainingText is longer than limit, so the index is valid.
    if (/\p{L}/u.test(remainingText.charAt(bestSplit))) {
      const spaceSplit = remainingText.lastIndexOf(' ', bestSplit - 1)
      if (spaceSplit > 0) {
        bestSplit = spaceSplit
      }
    }

    chunks.push(remainingText.slice(0, bestSplit).trim())
    remainingText = remainingText.slice(bestSplit).trim()
  }
  return chunks
}
/**
 * Merges the triples extracted from multiple chunks into one list without duplicates.
 *
 * Two triples are duplicates when their subject, predicate and object are all
 * equal. The first occurrence is kept, so the result preserves first-seen order.
 *
 * @param triplesArrays One array of triples per processed chunk
 * @returns Unique triples in order of first appearance
 */
export function mergeTriples(triplesArrays: Array<Array<Triple>>): Array<Triple> {
  const uniqueTriplesMap = new Map<string, Triple>()
  for (const triples of triplesArrays) {
    for (const triple of triples) {
      // JSON-encode the fields instead of joining them with a delimiter: a joined
      // key such as "a|b|c|d" is ambiguous when a field itself contains the delimiter.
      const key = JSON.stringify([triple.subject, triple.predicate, triple.object])
      if (!uniqueTriplesMap.has(key)) {
        uniqueTriplesMap.set(key, triple)
      }
    }
  }
  return Array.from(uniqueTriplesMap.values())
}
/**
 * Parses a raw triple listing into structured Triple objects.
 *
 * Two layouts are supported:
 * - one triple per line (used whenever the input contains a newline), and
 * - a single line of parenthesised triples: "(e, r, e) (e, r, e) ... (e, r, e)".
 *
 * Candidates that cannot be parsed are skipped.
 *
 * @param triplesStr Raw text containing the triples
 * @returns Parsed triples with trimmed, lower-cased fields
 */
export function parseTriples(triplesStr: string): Triple[] {
  const lines = triplesStr.split('\n')
  // For the single-line layout, split between a closing parenthesis and the next
  // opening one. The parentheses stay attached to each piece so the quoted
  // patterns in parseTripleLine still apply, and the input is trimmed first so
  // stray surrounding whitespace cannot corrupt the first or last triple.
  const candidates = lines.length > 1 ? lines : triplesStr.trim().split(/(?<=\))\s+(?=\()/)

  const processed: Triple[] = []
  for (const candidate of candidates) {
    const triple = parseTripleLine(candidate)
    if (triple) processed.push(triple)
  }
  return processed
}
/**
 * Parses a single candidate triple, trying several notations in order.
 *
 * @param line One candidate triple; surrounding whitespace is ignored
 * @returns The parsed triple, or null when the line is blank, contains "note:",
 *   matches none of the notations, or has an empty field
 */
function parseTripleLine(line: string): Triple | null {
  const trimmed = line.trim()
  if (!trimmed || trimmed.toLowerCase().includes('note:')) return null

  // Quoted notations; unanchored, so they tolerate leading or trailing text
  const quotedPatterns = [
    // Standard format: ('subject', 'relation', 'object')
    /\('([^']+)',\s*'([^']+)',\s*'([^']+)'\)/,
    // Double quotes: ("subject", "relation", "object")
    /\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)/,
    // No parentheses: "subject", "relation", "object"
    /"([^"]+)",\s*"([^"]+)",\s*"([^"]+)"/,
    // Mixed quotes: ('subject', "relation", 'object')
    /\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)/,
    // No parentheses, single quotes: 'subject', 'relation', 'object'
    /'([^']+)',\s*'([^']+)',\s*'([^']+)'/
  ]

  let match: RegExpMatchArray | null = null
  for (const pattern of quotedPatterns) {
    match = trimmed.match(pattern)
    if (match) break
  }

  if (!match) {
    // Plain text: subject, relation, object. One pair of wrapping parentheses is
    // removed first (only when nothing inside is parenthesised) so "(a, b, c)"
    // does not leak "(" into the subject and ")" into the object.
    const unwrapped = trimmed.replace(/^\(([^()]*)\)$/, '$1').trim()
    match = unwrapped.match(/^([^,]+),\s*([^,]+),\s*(.+)$/)
  }
  if (!match) return null

  const [subject, predicate, object] = match
    .slice(1, 4)
    .map((field) => field.trim().toLowerCase())
  // Reject triples with a missing component, e.g. "a, , c"
  if (!subject || !predicate || !object) return null
  return { subject, predicate, object }
}
// Re-export types
export type { Triple }
/**
 * Builds a node/edge graph from a list of triples.
 *
 * Every distinct subject or object becomes one node whose `id` and `label` are
 * both the entity text; nodes appear in order of first mention. Every triple
 * becomes one edge from its subject to its object, labelled with the predicate
 * (repeated triples produce repeated edges).
 *
 * @param triples Array of triples
 * @returns Graph representation with nodes and edges
 */
export function triplesToGraph(triples: Triple[]) {
  const nodeIndex = new Map<string, { id: string; label: string }>()

  // Registers an entity as a node the first time it is seen
  const registerNode = (entity: string): void => {
    if (nodeIndex.has(entity)) return
    nodeIndex.set(entity, { id: entity, label: entity })
  }

  const edges: Array<{ source: string; target: string; label: string }> = triples.map(
    ({ subject, predicate, object }) => {
      // Subject is registered before object to keep first-mention node order
      registerNode(subject)
      registerNode(object)
      return { source: subject, target: object, label: predicate }
    }
  )

  return {
    nodes: [...nodeIndex.values()],
    edges,
  }
}
/**
 * Extracts triples from text, splitting it into chunks first when it is too long.
 *
 * Text of at most `chunkSize` characters is handed to `extractTriplesFn` in a
 * single call. Longer text is split with `chunkText`; all chunks are submitted
 * to `extractTriplesFn` without waiting for earlier ones to finish, and the
 * call rejects if any of them rejects. In both cases the result is passed
 * through `mergeTriples`, so duplicates are removed consistently.
 *
 * @param text Text to process; blank text yields `[]` without calling `extractTriplesFn`
 * @param extractTriplesFn Function to extract triples from a chunk
 * @param chunkSize Maximum size of each chunk
 * @param overlapSize Currently unused: chunks are produced without overlap.
 *   The parameter is kept so existing call sites keep working.
 * @returns Array of extracted triples without duplicates
 */
export async function processTextWithChunking(
  text: string,
  extractTriplesFn: (chunk: string) => Promise<Triple[]>,
  chunkSize = 20000,
  overlapSize = 1000,
): Promise<Triple[]> {
  // Nothing to extract from blank input
  if (!text.trim()) {
    return []
  }

  // If text is small enough, process directly (still deduplicated)
  if (text.length <= chunkSize) {
    return mergeTriples([await extractTriplesFn(text)])
  }

  // Chunk the text, dropping any empty chunk so the extractor is never called on ''
  const chunks = chunkText(text, chunkSize).filter((chunk) => chunk.length > 0)
  console.log(`Split text into ${chunks.length} chunks`)

  // Start extraction for every chunk, then wait for all of them
  const triplesArrays = await Promise.all(
    chunks.map((chunk, i) => {
      console.log(`Processing chunk ${i + 1}/${chunks.length}, size: ${chunk.length}`)
      return extractTriplesFn(chunk)
    })
  )

  // Merge results
  return mergeTriples(triplesArrays)
}