mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 02:23:53 +00:00
228 lines
6.6 KiB
Plaintext
228 lines
6.6 KiB
Plaintext
/**
 * Text processing utilities for knowledge graph extraction.
 * Modeled on PyTorch Geometric's txt2kg.py implementation.
 */
|
|
|
|
import type { Triple } from '@/types/graph'
|
|
|
|
const CHUNK_SIZE = 20000 // Optimized for Gemma3:27b on DGX Spark
const OVERLAP_SIZE = 1000 // For context preservation between chunks

/**
 * Splits text into chunks of at most `chunkSize` characters.
 *
 * Each chunk ends after the last sentence-ending punctuation mark (`.`, `!`
 * or `?`) found inside the size limit; when the limit contains none, the
 * chunk is cut at the limit itself. If the cut would land directly in front
 * of a letter, it is moved back to the previous space so words stay intact.
 *
 * Note: this deliberately splits at the LAST sentence ending in the window.
 * Taking the minimum over the per-punctuation positions (as a literal port of
 * the Python loop does) yields the earliest of them and needlessly small
 * chunks.
 *
 * @param text Text to split. Empty or whitespace-only text yields `[]`.
 * @param chunkSize Maximum chunk length in characters; fractional values are
 *   rounded down.
 * @returns Trimmed, non-empty chunks in document order.
 * @throws RangeError if `text` is non-empty and `chunkSize` is NaN or below 1
 *   (such values could never make progress).
 */
export function chunkText(text: string, chunkSize: number = CHUNK_SIZE): string[] {
  // If the input text is empty, return an empty list
  if (!text) {
    return []
  }

  if (Number.isNaN(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`)
  }

  const limit = Math.floor(chunkSize)

  // Punctuation marks that typically end sentences
  const sentenceEndings = '.!?'

  const chunks: string[] = []

  // Trimming up front keeps a leading space from producing an empty chunk
  let remainingText = text.trim()
  while (remainingText) {
    // Whatever is left fits into a single chunk
    if (remainingText.length <= limit) {
      chunks.push(remainingText)
      break
    }

    // Start with the maximum possible chunk
    const candidate = remainingText.slice(0, limit)

    // Find the last sentence ending of any kind within the candidate
    let sentenceSplit = -1
    for (const ending of sentenceEndings) {
      const lastEnding = candidate.lastIndexOf(ending)
      if (lastEnding === -1) continue

      // Include the punctuation and one following whitespace character
      const afterEnding = lastEnding + 1
      const followedBySpace =
        afterEnding < candidate.length && /\s/.test(candidate.charAt(afterEnding))
      sentenceSplit = Math.max(sentenceSplit, followedBySpace ? afterEnding + 1 : afterEnding)
    }
    let bestSplit = sentenceSplit === -1 ? limit : sentenceSplit

    // Avoid breaking words: if the next character is a letter (any script),
    // fall back to the last space before the split point
    if (/\p{L}/u.test(remainingText.charAt(bestSplit))) {
      const spaceSplit = remainingText.lastIndexOf(' ', bestSplit - 1)
      // A split at index 0 would produce an empty chunk, so it is ignored
      if (spaceSplit > 0) {
        bestSplit = spaceSplit
      }
    }

    const chunk = remainingText.slice(0, bestSplit).trim()
    if (chunk) {
      chunks.push(chunk)
    }

    // bestSplit is always >= 1 here, so the remaining text strictly shrinks
    remainingText = remainingText.slice(bestSplit).trim()
  }

  return chunks
}
|
|
|
|
/**
 * Merges triples from multiple chunks, removing duplicates.
 *
 * Two triples are duplicates when their subject, predicate and object are all
 * equal. The first occurrence is kept, so the result preserves first-seen
 * order across the input arrays.
 *
 * @param triplesArrays One array of triples per processed chunk
 * @returns Flat array of unique triples
 */
export function mergeTriples(triplesArrays: ReadonlyArray<ReadonlyArray<Triple>>): Array<Triple> {
  const uniqueTriplesMap = new Map<string, Triple>()

  for (const triples of triplesArrays) {
    for (const triple of triples) {
      // A JSON-encoded tuple is unambiguous. Joining the fields with a plain
      // separator such as "|" would make ("a|b", "c", "d") collide with
      // ("a", "b|c", "d").
      const key = JSON.stringify([triple.subject, triple.predicate, triple.object])
      if (!uniqueTriplesMap.has(key)) {
        uniqueTriplesMap.set(key, triple)
      }
    }
  }

  return Array.from(uniqueTriplesMap.values())
}
|
|
|
|
/** Quoted triple formats, tried in order; each captures subject, relation, object. */
const QUOTED_TRIPLE_PATTERNS: readonly RegExp[] = [
  // Standard format: ('subject', 'relation', 'object')
  /\('([^']+)',\s*'([^']+)',\s*'([^']+)'\)/,
  // Double quotes: ("subject", "relation", "object")
  /\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)/,
  // No parentheses: "subject", "relation", "object"
  /"([^"]+)",\s*"([^"]+)",\s*"([^"]+)"/,
  // Mixed quotes: ('subject', "relation", 'object')
  /\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)/,
]

/** Plain text fallback: subject, relation, object (the object may contain commas). */
const PLAIN_TRIPLE_PATTERN = /^([^,]+),\s*([^,]+),\s*(.+)$/

/** Whitespace gap between two adjacent parenthesised groups: "(...) (...)". */
const TRIPLE_GROUP_BOUNDARY = /(?<=\))\s+(?=\()/

/**
 * Parses triple strings into structured Triple objects.
 *
 * The input is processed line by line. A line that is wrapped in parentheses
 * may hold several groups in the form "(e, r, e) (e, r, e) ... (e, r, e)";
 * each group is parsed on its own. Any other non-empty line is parsed as a
 * single triple. Lines or groups that cannot be parsed are skipped.
 *
 * @param triplesStr Raw text containing triples
 * @returns Parsed triples in input order (empty for empty input)
 */
export function parseTriples(triplesStr: string): Triple[] {
  const processed: Triple[] = []
  if (!triplesStr) return processed

  for (const rawLine of triplesStr.split('\n')) {
    const line = rawLine.trim()
    if (!line) continue

    // Only split lines that are parenthesised as a whole, so plain-text lines
    // that merely contain parentheses are left intact. The split keeps each
    // group's own parentheses, which the quoted patterns rely on.
    const groups =
      line.startsWith('(') && line.endsWith(')') ? line.split(TRIPLE_GROUP_BOUNDARY) : [line]

    for (const group of groups) {
      const triple = parseTripleLine(group)
      if (triple) processed.push(triple)
    }
  }

  return processed
}

/**
 * Helper function to parse a single triple line with multiple formats.
 *
 * Quoted formats are tried first; otherwise the line is treated as plain
 * "subject, relation, object" text, after removing one pair of parentheses
 * that wraps the whole line and one pair of matching quotes around each field.
 *
 * @param line Text expected to contain one triple
 * @returns The triple with trimmed, lower-cased fields, or null when the line
 *   is blank, contains "note:", matches no format, or has an empty field
 */
function parseTripleLine(line: string): Triple | null {
  const candidate = line.trim()
  if (!candidate || candidate.toLowerCase().includes('note:')) return null

  for (const pattern of QUOTED_TRIPLE_PATTERNS) {
    const match = pattern.exec(candidate)
    if (match) {
      const [, subject = '', predicate = '', object = ''] = match
      return buildTriple(subject, predicate, object)
    }
  }

  const plain = PLAIN_TRIPLE_PATTERN.exec(stripWrappingParens(candidate))
  if (plain) {
    const [, subject = '', predicate = '', object = ''] = plain
    return buildTriple(stripQuotes(subject), stripQuotes(predicate), stripQuotes(object))
  }

  return null
}

/** Normalises the three fields and rejects triples in which any field is empty. */
function buildTriple(subject: string, predicate: string, object: string): Triple | null {
  const triple = {
    subject: subject.trim().toLowerCase(),
    predicate: predicate.trim().toLowerCase(),
    object: object.trim().toLowerCase(),
  }
  return triple.subject && triple.predicate && triple.object ? triple : null
}

/** Removes the outer parentheses only when the leading "(" is closed by the final ")". */
function stripWrappingParens(text: string): string {
  if (!text.startsWith('(') || !text.endsWith(')')) return text

  let depth = 0
  for (let i = 0; i < text.length - 1; i++) {
    const ch = text.charAt(i)
    if (ch === '(') depth++
    else if (ch === ')') depth--
    // The leading "(" closed before the end, e.g. "(former) ceo, of, acme (uk)"
    if (depth === 0) return text
  }
  return text.slice(1, -1)
}

/** Removes one pair of matching single or double quotes surrounding a field. */
function stripQuotes(field: string): string {
  const trimmed = field.trim()
  const first = trimmed.charAt(0)
  const quoted = trimmed.length >= 2 && (first === "'" || first === '"') && trimmed.endsWith(first)
  return quoted ? trimmed.slice(1, -1) : trimmed
}
|
|
|
|
// Re-export types
|
|
export type { Triple }
|
|
|
|
/**
 * Converts triples to a graph representation.
 *
 * Every distinct subject or object string becomes one node whose `id` and
 * `label` are both that string; nodes appear in first-seen order. Every
 * triple becomes one edge from subject to object labelled with the predicate
 * (repeated triples therefore produce repeated edges).
 *
 * @param triples Array of triples; it is not modified
 * @returns Graph representation with nodes and edges
 */
export function triplesToGraph(triples: readonly Triple[]): {
  nodes: Array<{ id: string; label: string }>
  edges: Array<{ source: string; target: string; label: string }>
} {
  const nodes = new Map<string, { id: string; label: string }>()
  const edges: Array<{ source: string; target: string; label: string }> = []

  for (const { subject, predicate, object } of triples) {
    // Register the subject first, then the object, skipping known entities
    for (const entity of [subject, object]) {
      if (!nodes.has(entity)) {
        nodes.set(entity, { id: entity, label: entity })
      }
    }

    edges.push({ source: subject, target: object, label: predicate })
  }

  return {
    nodes: Array.from(nodes.values()),
    edges,
  }
}
|
|
|
|
/**
 * Processes text to extract triples using chunking for large texts.
 *
 * Text no longer than `chunkSize` is passed to `extractTriplesFn` unchanged.
 * Longer text is split with `chunkText`; every chunk after the first is
 * prefixed with the tail of the previous chunk so that statements spanning a
 * chunk boundary keep their context. The chunk budget is reduced by the
 * overlap, so each extraction input still stays within `chunkSize`
 * characters. All chunks are extracted concurrently and the results are
 * de-duplicated with `mergeTriples`.
 *
 * @param text Text to process
 * @param extractTriplesFn Function to extract triples from a chunk
 * @param chunkSize Maximum size of each extraction input, in characters
 * @param overlapSize Number of characters of the previous chunk to repeat at
 *   the start of the next one; values are clamped to the range
 *   0..floor(chunkSize / 2), and non-finite values disable the overlap
 * @returns Array of extracted triples
 * @throws RangeError if the text needs chunking and `chunkSize` is NaN or
 *   below 1
 */
export async function processTextWithChunking(
  text: string,
  extractTriplesFn: (chunk: string) => Promise<Triple[]>,
  chunkSize = 20000,
  overlapSize = 1000,
): Promise<Triple[]> {
  // If text is small enough, process directly
  if (text.length <= chunkSize) {
    return await extractTriplesFn(text)
  }

  const limit = Math.floor(chunkSize)
  if (!(limit >= 1)) {
    // Also catches NaN, which would otherwise never terminate while chunking
    throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`)
  }

  // Cap the overlap at half the budget so every chunk still carries new text
  const requestedOverlap = Number.isFinite(overlapSize) ? Math.floor(overlapSize) : 0
  const overlap = Math.min(Math.max(requestedOverlap, 0), Math.floor(limit / 2))

  // Chunk the text, leaving room for the overlap prefix
  const chunks = chunkText(text, limit - overlap)
  console.log(`Split text into ${chunks.length} chunks`)

  // Process each chunk
  const triplesPromises = chunks.map((chunk, i) => {
    const previous = i > 0 ? chunks[i - 1] : undefined
    // One character of the overlap budget is spent on the joining space
    const context = previous === undefined ? '' : overlapTail(previous, overlap - 1)
    const input = context ? `${context} ${chunk}` : chunk

    console.log(`Processing chunk ${i + 1}/${chunks.length}, size: ${input.length}`)
    return extractTriplesFn(input)
  })

  // Wait for all chunks to be processed
  const triplesArrays = await Promise.all(triplesPromises)

  // Merge results
  return mergeTriples(triplesArrays)
}

/** Returns at most `budget` trailing characters of `chunk`, starting on a word boundary when possible. */
function overlapTail(chunk: string, budget: number): string {
  if (budget <= 0) return ''
  if (chunk.length <= budget) return chunk

  const tail = chunk.slice(-budget)
  // Drop the leading fragment so the overlap does not start mid-word
  const firstSpace = tail.search(/\s/)
  return firstSpace === -1 ? tail : tail.slice(firstSpace + 1).trimStart()
}
|
|
|