// Mirror of https://github.com/NVIDIA/dgx-spark-playbooks.git (synced 2026-04-23)

/**
 * Text processing utilities for knowledge graph extraction.
 * Matches PyTorch Geometric's txt2kg.py implementation.
 */
import type { Triple } from '@/types/graph'
|
|
|
|
const CHUNK_SIZE = 20000 // Optimized for Gemma3:27b on DGX Spark
|
|
const OVERLAP_SIZE = 1000 // For context preservation between chunks
|
|
|
|
/**
|
|
* Chunks text using PyTorch Geometric's exact chunking algorithm
|
|
* Replicates the chunk_text function from PyG's txt2kg.py
|
|
*/
|
|
export function chunkTextPyG(text: string, chunkSize: number = 512, overlapSize: number = 0): string[] {
|
|
if (!text) {
|
|
return [];
|
|
}
|
|
|
|
const chunks: string[] = [];
|
|
const sentenceEndings = '.!?';
|
|
let startIndex = 0;
|
|
|
|
while (startIndex < text.length) {
|
|
// Calculate the end index for this chunk
|
|
const endIndex = Math.min(startIndex + chunkSize, text.length);
|
|
|
|
// If this is the last chunk (remaining text fits in chunk size), add it and break
|
|
if (endIndex >= text.length) {
|
|
const finalChunk = text.slice(startIndex).trim();
|
|
if (finalChunk) {
|
|
chunks.push(finalChunk);
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Start with the maximum possible chunk from current position
|
|
let chunk = text.slice(startIndex, endIndex);
|
|
let bestSplit = endIndex;
|
|
|
|
// Try to find the last sentence ending within the chunk
|
|
for (const ending of sentenceEndings) {
|
|
const lastEnding = chunk.lastIndexOf(ending);
|
|
if (lastEnding !== -1) {
|
|
// Calculate absolute position in the original text
|
|
const absolutePos = startIndex + lastEnding + 1;
|
|
// Check if there's a space after the punctuation
|
|
const hasSpace = absolutePos < text.length && text[absolutePos] === ' ';
|
|
bestSplit = Math.min(bestSplit, absolutePos + (hasSpace ? 1 : 0));
|
|
}
|
|
}
|
|
|
|
// Adjust to ensure we don't break words
|
|
// If the next character is a letter, find the last space
|
|
if (bestSplit < text.length && /[a-zA-Z]/.test(text[bestSplit])) {
|
|
const chunkToSplit = text.slice(startIndex, bestSplit);
|
|
const spaceSplit = chunkToSplit.lastIndexOf(' ');
|
|
if (spaceSplit !== -1) {
|
|
bestSplit = startIndex + spaceSplit;
|
|
}
|
|
}
|
|
|
|
// Extract and add the chunk
|
|
const currentChunk = text.slice(startIndex, bestSplit).trim();
|
|
if (currentChunk) {
|
|
chunks.push(currentChunk);
|
|
}
|
|
|
|
// Calculate next start position
|
|
if (overlapSize === 0) {
|
|
// Original PyG behavior: no overlap
|
|
startIndex = bestSplit;
|
|
// Skip whitespace at the beginning of next chunk
|
|
while (startIndex < text.length && /\s/.test(text[startIndex])) {
|
|
startIndex++;
|
|
}
|
|
} else {
|
|
// With overlap: move forward by (chunkSize - overlapSize)
|
|
const step = Math.max(1, chunkSize - overlapSize);
|
|
startIndex += step;
|
|
}
|
|
}
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
|
|
* Chunks text into sentence-based segments, matching Python implementation
|
|
*/
|
|
export function chunkText(text: string, chunkSize: number = CHUNK_SIZE): string[] {
|
|
// If the input text is empty or None, return an empty list
|
|
if (!text) {
|
|
return []
|
|
}
|
|
|
|
// List of punctuation marks that typically end sentences
|
|
const sentenceEndings = '.!?'
|
|
|
|
// List to store the resulting chunks
|
|
const chunks: string[] = []
|
|
|
|
// Continue processing the entire text
|
|
let remainingText = text
|
|
while (remainingText) {
|
|
// If the remaining text is shorter than chunk_size, add it and break
|
|
if (remainingText.length <= chunkSize) {
|
|
chunks.push(remainingText.trim())
|
|
break
|
|
}
|
|
|
|
// Start with the maximum possible chunk
|
|
let chunk = remainingText.slice(0, chunkSize)
|
|
|
|
// Try to find the last sentence ending within the chunk
|
|
let bestSplit = chunkSize
|
|
for (const ending of sentenceEndings) {
|
|
// Find the last occurrence of the ending punctuation
|
|
const lastEnding = chunk.lastIndexOf(ending)
|
|
if (lastEnding !== -1) {
|
|
// Ensure we include the punctuation and any following space
|
|
bestSplit = Math.min(
|
|
bestSplit,
|
|
lastEnding + 1 + (lastEnding + 1 < chunk.length && /\s/.test(chunk[lastEnding + 1]) ? 1 : 0)
|
|
)
|
|
}
|
|
}
|
|
|
|
// Adjust to ensure we don't break words
|
|
// If the next character is a letter, find the last space
|
|
if (bestSplit < remainingText.length && /[a-zA-Z]/.test(remainingText[bestSplit])) {
|
|
const spaceSplit = remainingText.slice(0, bestSplit).lastIndexOf(' ')
|
|
if (spaceSplit !== -1) {
|
|
bestSplit = spaceSplit
|
|
}
|
|
}
|
|
|
|
// Append the chunk, ensuring it's stripped
|
|
chunks.push(remainingText.slice(0, bestSplit).trim())
|
|
|
|
// Remove the processed part from the text
|
|
remainingText = remainingText.slice(bestSplit).trim()
|
|
}
|
|
|
|
return chunks
|
|
}
|
|
|
|
/**
|
|
* Merges triples from multiple chunks, removing duplicates
|
|
*/
|
|
export function mergeTriples(triplesArrays: Array<Array<Triple>>): Array<Triple> {
|
|
const uniqueTriplesMap = new Map<string, Triple>()
|
|
|
|
for (const triples of triplesArrays) {
|
|
for (const triple of triples) {
|
|
const key = `${triple.subject}|${triple.predicate}|${triple.object}`
|
|
if (!uniqueTriplesMap.has(key)) {
|
|
uniqueTriplesMap.set(key, triple)
|
|
}
|
|
}
|
|
}
|
|
|
|
return Array.from(uniqueTriplesMap.values())
|
|
}
|
|
|
|
/**
|
|
* Parses triple strings into structured Triple objects, matching Python patterns
|
|
*/
|
|
export function parseTriples(triplesStr: string): Triple[] {
|
|
const processed: Triple[] = []
|
|
const splitByNewline = triplesStr.split('\n')
|
|
|
|
// First try newline-separated format
|
|
if (splitByNewline.length > 1) {
|
|
for (const line of splitByNewline) {
|
|
const triple = parseTripleLine(line.trim())
|
|
if (triple) processed.push(triple)
|
|
}
|
|
} else {
|
|
// Handle space-separated format "(e, r, e) (e, r, e) ... (e, r, e)"
|
|
const splitTriples = triplesStr.slice(1, -1).split(') (')
|
|
for (const tripleStr of splitTriples) {
|
|
const triple = parseTripleLine(tripleStr)
|
|
if (triple) processed.push(triple)
|
|
}
|
|
}
|
|
|
|
return processed
|
|
}
|
|
|
|
/**
|
|
* Helper function to parse a single triple line with multiple formats
|
|
*/
|
|
function parseTripleLine(line: string): Triple | null {
|
|
if (!line.trim() || line.toLowerCase().includes('note:')) return null
|
|
|
|
// Try different regex patterns matching Python implementation
|
|
const patterns = [
|
|
// Standard format: ('subject', 'relation', 'object')
|
|
/\('([^']+)',\s*'([^']+)',\s*'([^']+)'\)/,
|
|
// Double quotes: ("subject", "relation", "object")
|
|
/\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)/,
|
|
// No parentheses: "subject", "relation", "object"
|
|
/"([^"]+)",\s*"([^"]+)",\s*"([^"]+)"/,
|
|
// Mixed quotes: ('subject', "relation", 'object')
|
|
/\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)/,
|
|
// Plain text: subject, relation, object
|
|
/^([^,]+),\s*([^,]+),\s*(.+)$/
|
|
]
|
|
|
|
for (const pattern of patterns) {
|
|
const match = line.match(pattern)
|
|
if (match) {
|
|
return {
|
|
subject: match[1].trim().toLowerCase(),
|
|
predicate: match[2].trim().toLowerCase(),
|
|
object: match[3].trim().toLowerCase()
|
|
}
|
|
}
|
|
}
|
|
|
|
return null
|
|
}
|
|
|
|
// Re-export types
|
|
export type { Triple }
|
|
|
|
/**
|
|
* Converts triples to a graph representation
|
|
* @param triples Array of triples
|
|
* @returns Graph representation with nodes and edges
|
|
*/
|
|
export function triplesToGraph(triples: Triple[]) {
|
|
const nodes = new Map<string, { id: string; label: string }>()
|
|
const edges: Array<{ source: string; target: string; label: string }> = []
|
|
|
|
// Process each triple to build nodes and edges
|
|
for (const triple of triples) {
|
|
// Add subject node if not exists
|
|
if (!nodes.has(triple.subject)) {
|
|
nodes.set(triple.subject, {
|
|
id: triple.subject,
|
|
label: triple.subject,
|
|
})
|
|
}
|
|
|
|
// Add object node if not exists
|
|
if (!nodes.has(triple.object)) {
|
|
nodes.set(triple.object, {
|
|
id: triple.object,
|
|
label: triple.object,
|
|
})
|
|
}
|
|
|
|
// Add edge
|
|
edges.push({
|
|
source: triple.subject,
|
|
target: triple.object,
|
|
label: triple.predicate,
|
|
})
|
|
}
|
|
|
|
return {
|
|
nodes: Array.from(nodes.values()),
|
|
edges,
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Processes text using PyTorch Geometric's exact chunking method (no overlap)
|
|
* Replicates the chunking behavior from PyG's txt2kg.py
|
|
* @param text Text to process
|
|
* @param extractTriplesFn Function to extract triples from a chunk
|
|
* @param chunkSize Maximum size of each chunk (default: 512 like PyG)
|
|
* @returns Array of extracted triples
|
|
*/
|
|
export async function processTextWithChunkingPyG(
|
|
text: string,
|
|
extractTriplesFn: (chunk: string) => Promise<Triple[]>,
|
|
chunkSize = 512,
|
|
overlapSize = 0,
|
|
): Promise<Triple[]> {
|
|
// If text is small enough, process directly
|
|
if (text.length <= chunkSize) {
|
|
return await extractTriplesFn(text)
|
|
}
|
|
|
|
// Chunk the text using PyG method with configurable overlap
|
|
const chunks = chunkTextPyG(text, chunkSize, overlapSize)
|
|
const overlapText = overlapSize > 0 ? `, ${overlapSize} char overlap` : ', no overlap'
|
|
console.log(`PyG Chunking: Split text into ${chunks.length} chunks (${chunkSize} chars each${overlapText})`)
|
|
|
|
// Process each chunk
|
|
const triplesPromises = chunks.map((chunk, i) => {
|
|
console.log(`Processing PyG chunk ${i + 1}/${chunks.length}, size: ${chunk.length}`)
|
|
return extractTriplesFn(chunk)
|
|
})
|
|
|
|
// Wait for all chunks to be processed
|
|
const triplesArrays = await Promise.all(triplesPromises)
|
|
|
|
// Merge results (no deduplication needed since PyG doesn't use overlap)
|
|
return mergeTriples(triplesArrays)
|
|
}
|
|
|
|
/**
|
|
* Processes text to extract triples using chunking for large texts
|
|
* @param text Text to process
|
|
* @param extractTriplesFn Function to extract triples from a chunk
|
|
* @param chunkSize Maximum size of each chunk
|
|
* @param overlapSize Size of overlap between chunks
|
|
* @returns Array of extracted triples
|
|
*/
|
|
export async function processTextWithChunking(
|
|
text: string,
|
|
extractTriplesFn: (chunk: string) => Promise<Triple[]>,
|
|
chunkSize = 20000,
|
|
overlapSize = 1000,
|
|
): Promise<Triple[]> {
|
|
// If text is small enough, process directly
|
|
if (text.length <= chunkSize) {
|
|
return await extractTriplesFn(text)
|
|
}
|
|
|
|
// Chunk the text
|
|
const chunks = chunkText(text, chunkSize)
|
|
console.log(`Split text into ${chunks.length} chunks`)
|
|
|
|
// Process each chunk
|
|
const triplesPromises = chunks.map((chunk, i) => {
|
|
console.log(`Processing chunk ${i + 1}/${chunks.length}, size: ${chunk.length}`)
|
|
return extractTriplesFn(chunk)
|
|
})
|
|
|
|
// Wait for all chunks to be processed
|
|
const triplesArrays = await Promise.all(triplesPromises)
|
|
|
|
// Merge results
|
|
return mergeTriples(triplesArrays)
|
|
}
|
|
|